In [1]:
# 데이터 분석 라이브러리
import numpy as np
import pandas as pd

# 시각화 라이브러리
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *

# 모델링 라이브러리
from scipy import stats
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, Normalizer, LabelEncoder
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict, StratifiedKFold, GridSearchCV, RandomizedSearchCV, train_test_split
from category_encoders.ordinal import OrdinalEncoder
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.naive_bayes import GaussianNB #Naive bayes
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, make_scorer, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score
import sklearn.metrics as metrics

# 기타 라이브러리
import warnings
import random
import gc
import os

# Suppress RuntimeWarning messages and use seaborn's "whitegrid" plot style.
warnings.filterwarnings(action="ignore", category=RuntimeWarning)
sns.set_style(style="whitegrid")

In [2]:
# Display settings: show up to 200 rows, keep wide frames on one line,
# and print floats with three decimals.
pd.options.display.max_rows = 200
pd.options.display.width = 10000
pd.set_option('display.float_format', '{:.3f}'.format)

# 데이터 불러오기

In [3]:
# Load the train/test splits, then the earlier submission whose predictions
# are post-processed below (original note on that file: "fnlwgt removed").
train, test = (
    pd.read_csv(csv_path) for csv_path in ('./data/train.csv', './data/test.csv')
)
sample_submission = pd.read_csv('./submission/all_case.csv')

In [4]:
# Attach the existing submission's predictions to the test rows as a working
# 'income' column (the assignment is aligned on the row index).
test['income'] = sample_submission['prediction']

# 후처리

In [5]:
# Remove the row identifier from both frames. Rebinding with errors='ignore'
# (instead of inplace=True) keeps the cell safe to re-run: a second execution
# no longer raises KeyError once 'id' is already gone.
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')

In [6]:
# Names of every remaining train column, in frame order.
columns = list(train.columns)

## income class가 1개이면서 row가 2개 이상인 경우

In [8]:
# Example inspection: all training rows with age 17.
train.loc[train['age'].eq(17)]

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
1,17,Private,101626,9th,5,Never-married,Machine-op-inspct,Own-child,White,Male,0,0,20,United-States,<=50K
73,17,Private,413557,11th,7,Never-married,Adm-clerical,Own-child,White,Female,0,0,32,United-States,<=50K
82,17,Private,186890,10th,6,Married-civ-spouse,Sales,Own-child,White,Female,0,0,30,United-States,<=50K
152,17,Private,136363,10th,6,Never-married,Other-service,Own-child,White,Male,0,0,20,United-States,<=50K
299,17,Private,285169,11th,7,Never-married,Priv-house-serv,Own-child,White,Female,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25669,17,Private,206506,10th,6,Never-married,Handlers-cleaners,Other-relative,White,Male,0,0,10,El-Salvador,<=50K
25675,17,Private,321880,10th,6,Never-married,Other-service,Own-child,Black,Male,0,0,15,United-States,<=50K
25762,17,Private,193748,11th,7,Never-married,Sales,Own-child,White,Male,0,0,15,United-States,<=50K
25980,17,?,112942,10th,6,Never-married,?,Own-child,White,Male,0,0,40,United-States,<=50K


In [7]:
# Scan every feature column for values whose training rows all share one
# income class and occur at least twice. Each hit is stored as a
# (column, value, income, count) rule.
rule_records = []
for col in columns:
    # The target trivially determines itself, so skip it here rather than
    # dropping its rows afterwards by hard-coded positions.
    if col == 'income':
        continue
    for value in train[col].value_counts().index.tolist():
        # Income class distribution among rows holding this value; computed
        # once instead of re-filtering `train` for every lookup.
        income_counts = train.loc[train[col] == value, 'income'].value_counts()
        if len(income_counts) == 1 and income_counts.iloc[0] >= 2:
            rule_records.append({'column': col, 'value': value,
                                 'income': income_counts.index[0],
                                 'count': income_counts.iloc[0]})
# Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
# growing a frame row by row is quadratic. Explicit columns keep the schema
# stable even when no rule is found.
ans = pd.DataFrame(rule_records, columns=['column', 'value', 'income', 'count'])
# Encode the income label: '<=50K' -> 0, anything else -> 1.
ans['income'] = ans['income'].apply(lambda x: 0 if x == '<=50K' else 1)

In [10]:
# Preview the collected single-class rules.
ans

Unnamed: 0,column,value,income,count
0,age,20,0,597
1,age,18,0,429
2,age,17,0,325
3,age,82,0,10
4,age,88,0,3
...,...,...,...,...
3035,hours_per_week,91,0,3
3036,hours_per_week,86,0,2
3037,hours_per_week,61,1,2
3038,native_country,Outlying-US(Guam-USVI-etc),0,13


In [9]:
# Number of rules contributed by each column.
ans['column'].value_counts()

fnlwgt            2847
capital_gain       101
capital_loss        69
hours_per_week      10
age                  6
native_country       2
workclass            2
education            1
occupation           1
education_num        1
Name: column, dtype: int64

## ans에서 fnlwgt를 제거한 경우

In [25]:
# Keep only the rules that do not come from fnlwgt, then count the
# remaining rules per column.
del_fnlwgt = ans.loc[ans['column'].ne('fnlwgt')]
del_fnlwgt['column'].value_counts()

capital_gain      101
capital_loss       69
hours_per_week     10
age                 6
native_country      2
workclass           2
occupation          1
education           1
education_num       1
Name: column, dtype: int64

In [26]:
# Show a few rules at random. A fixed random_state makes the displayed rows
# reproducible across re-runs, and the size is capped so the cell does not
# raise when fewer than six rules exist.
del_fnlwgt.sample(min(6, len(del_fnlwgt)), random_state=42)

Unnamed: 0,column,value,income,count
2927,capital_gain,2346,0,5
2928,capital_gain,1831,0,5
2904,capital_gain,6418,1,8
2949,capital_gain,2036,0,3
2880,capital_gain,13550,1,21
2921,capital_gain,6514,1,5


In [27]:
# Overwrite the test predictions with every remaining rule and print the
# class balance before and after. Rules are applied in frame order, so when
# a row matches several rules the last one applied decides its income.
print('<후처리 전>\n', test['income'].value_counts())
for rule_idx, rule in del_fnlwgt.iterrows():
    matches = test[rule['column']] == rule['value']
    test.loc[matches, 'income'] = rule['income']
print('<후처리 후>\n', test['income'].value_counts())

<후처리 전>
 0    5185
1    1327
Name: income, dtype: int64
<후처리 후>
 0    5190
1    1322
Name: income, dtype: int64


In [28]:
# Look at the first test rows together with their 'income' column.
test.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,28,Private,67661,Some-college,10,Never-married,Adm-clerical,Other-relative,White,Female,0,0,40,United-States,0
1,40,Self-emp-inc,37869,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States,1
2,20,Private,109952,Some-college,10,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,25,United-States,0


# Manual override: set income to 1 for test rows whose capital_gain is 3103.
# NOTE(review): this line has no execution marker in the export, so it is
# unclear whether it ran before the submission was saved — confirm, and record
# why 3103 is singled out.
test.loc[test['capital_gain'] == 3103, 'income'] = 1

In [15]:
# Training rows that exactly repeat an earlier row.
train.loc[train.duplicated()]

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
6816,23,Private,240137,5th-6th,3,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,55,Mexico,<=50K
8305,39,Private,30916,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
8742,42,Private,204235,Some-college,10,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,>50K
9837,21,Private,243368,Preschool,1,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,50,Mexico,<=50K
12159,46,Private,133616,Some-college,10,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
13184,28,Private,274679,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,50,United-States,<=50K
13290,20,Private,107658,Some-college,10,Never-married,Tech-support,Not-in-family,White,Female,0,0,10,United-States,<=50K
13667,35,Private,379959,HS-grad,9,Divorced,Other-service,Not-in-family,White,Female,0,0,40,United-States,<=50K
14029,21,Private,250051,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,10,United-States,<=50K
15249,46,Private,173243,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K


## 저장

In [29]:
# Copy the post-processed income values back into the submission frame.
sample_submission['prediction'] = test['income']

In [30]:
# Class balance of the predictions about to be saved.
sample_submission['prediction'].value_counts()

0    5190
1    1322
Name: prediction, dtype: int64

In [31]:
# Write the post-processed submission without the index column.
sample_submission.to_csv('./submission/all_case_treatment.csv', index = False)

In [32]:
# Upload the saved file to the competition through the Kaggle CLI.
!kaggle competitions submit -c kakr-4th-competition -f ./submission/all_case_treatment.csv -m "모든 경우의 수 후처리"

100%|██████████████████████████████████████| 43.4k/43.4k [00:08<00:00, 4.97kB/s]
Successfully submitted to [T-Academy X KaKr] 성인 인구조사 소득 예측 대회 

In [34]:
# Display the submitted prediction column.
sample_submission['prediction']

0       0
1       1
2       0
3       1
4       0
       ..
6507    0
6508    1
6509    0
6510    0
6511    0
Name: prediction, Length: 6512, dtype: int64

In [35]:
# Load another saved submission to compare against.
best = pd.read_csv('./submission/whochurie_del_flnwgt.csv')

In [52]:
# Row-by-row agreement between the two submissions (True where they match).
agreement = sample_submission['prediction'] == best['prediction']
a = pd.DataFrame({'compare': agreement.tolist()})

In [54]:
# Keep this notebook's predictions next to the agreement flag.
a['all'] = sample_submission['prediction']

In [56]:
# Keep the reference submission's predictions alongside.
a['best'] = best['prediction']

In [None]:
# The two submissions being compared; only the last expression is displayed.
sample_submission # 0.87 — presumably this submission's leaderboard score; TODO confirm
best # 0.88 — presumably the reference submission's leaderboard score; TODO confirm

In [78]:
# Index labels of the rows where the two submissions disagree.
a[a['compare'] == 0].index

Int64Index([   4,   12,   58,   93,  101,  104,  116,  224,  397,  426,
            ...
            6318, 6340, 6351, 6376, 6383, 6423, 6472, 6491, 6498, 6507], dtype='int64', length=190)

In [80]:
# Independent working copies of the submission. Plain assignment would bind
# `zero`, `one` and `sample_submission` to one shared DataFrame, so an edit
# made through `one` would silently change the other two and the files saved
# from `zero` and `one` would be identical.
zero = sample_submission.copy()
one = sample_submission.copy()

In [96]:
# Index labels of the disagreeing rows, as a plain list.
lst = a.loc[a['compare'] == 0].index.tolist()

In [113]:
# Force the prediction to 1 for every row whose id is listed in `lst`.
# One vectorised isin() mask replaces the per-id loop, which rebuilt a
# full-length boolean mask for each id; the rows touched are the same.
# NOTE(review): `lst` holds index labels of `a` but is matched against the
# 'id' column — this presumably relies on id equalling the row position; confirm.
one.loc[one['id'].isin(lst), 'prediction'] = 1

In [118]:
# Predictions held by `one` at the positions where the submissions disagree.
one['prediction'].iloc[a[a['compare'] == 0].index.tolist()]

4       1
12      1
58      1
93      1
101     1
104     1
116     1
224     1
397     1
426     1
517     1
537     1
538     1
547     1
563     1
591     1
668     1
674     1
772     1
779     1
840     1
866     1
877     1
888     1
972     1
992     1
1086    1
1136    1
1201    1
1302    1
1328    1
1415    1
1483    1
1488    1
1491    1
1501    1
1538    1
1564    1
1573    1
1618    1
1647    1
1686    1
1762    1
1861    1
1895    1
1900    1
1944    1
2021    1
2105    1
2125    1
2134    1
2158    1
2169    1
2205    1
2216    1
2368    1
2369    1
2422    1
2439    1
2463    1
2553    1
2597    1
2605    1
2638    1
2641    1
2642    1
2668    1
2678    1
2770    1
2805    1
2849    1
2855    1
2872    1
2908    1
2928    1
2929    1
2944    1
3009    1
3028    1
3053    1
3181    1
3221    1
3241    1
3263    1
3299    1
3473    1
3539    1
3549    1
3584    1
3617    1
3733    1
3751    1
3757    1
3807    1
3809    1
3867    1
3900    1
3925    1
3941    1
3957    1


In [100]:
# Rows of `zero` at the positions where the two submissions disagree.
zero.iloc[a[a['compare'] == 0].index.tolist()]

Unnamed: 0,id,prediction
4,4,0
12,12,0
58,58,0
93,93,0
101,101,0
104,104,0
116,116,0
224,224,0
397,397,0
426,426,0


In [122]:
# Save the `zero` frame without the index column.
zero.to_csv('./submission/zero.csv', index = False)

In [123]:
# Save the `one` frame without the index column.
one.to_csv('./submission/one.csv', index = False)

In [124]:
# Read the saved file back to check what was written.
pd.read_csv('./submission/one.csv')

Unnamed: 0,id,prediction
0,0,0
1,1,1
2,2,0
3,3,1
4,4,1
...,...,...
6507,6507,1
6508,6508,1
6509,6509,0
6510,6510,0
