In [196]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import random
from IPython.core.display import display, HTML


display(HTML("<style>.container { width: 100% !important; }</style>"))

## Data Load & Preprocessing
+ 훈련에 필요없는 index 컬럼 삭제.
+ missing value를 모두 NAN 문자열로 대체
+ dtype object 인 컬럼들을 onehot encoding

In [197]:
train = pd.read_csv('./open/train.csv')
train = train.drop(['index'], axis=1)
train.fillna('NAN', inplace=True) 


test = pd.read_csv('./open/test.csv')
test = test.drop(['index'], axis=1)
test.fillna('NAN', inplace=True)

submit = pd.read_csv('./open/sample_submission.csv')

In [198]:
data=pd.concat([train, test], axis=0)
data.shape

(36457, 19)

In [199]:
data['family_size'].value_counts()

2.0     19463
1.0      6987
3.0      6421
4.0      3106
5.0       397
6.0        58
7.0        19
15.0        3
9.0         2
20.0        1
Name: family_size, dtype: int64

'phone',  'email', 'work_phone' 3가지 컬럼도 추가

In [200]:
# train=train.drop('occyp_type', axis=1)
# test=test.drop('occyp_type', axis=1)

In [201]:
# train=train.drop(['email'], axis=1)
# test=test.drop(['email'], axis=1)

In [202]:
object_col = []
for col in train.columns:
    if train[col].dtype == 'object':
        object_col.append(col)
#     elif col in ['phone',  'email', 'work_phone', 'FLAG_MOBIL']:
#         object_col.append(col)

In [203]:
object_col

['gender',
 'car',
 'reality',
 'income_type',
 'edu_type',
 'family_type',
 'house_type',
 'occyp_type']

In [204]:
enc = OneHotEncoder()
enc.fit(train.loc[:,object_col])

OneHotEncoder()

In [205]:
train_onehot_df = pd.DataFrame(enc.transform(train.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
train.drop(object_col, axis=1, inplace=True)
train = pd.concat([train, train_onehot_df], axis=1)

In [206]:
test_onehot_df = pd.DataFrame(enc.transform(test.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
test.drop(object_col, axis=1, inplace=True)
test = pd.concat([test, test_onehot_df], axis=1)

## 수치형 데이터 feature engineering

In [207]:
train['income_total'] = train['income_total']/10000
train['income_total_dev'] = (train['income_total'] - train['income_total'].mean())**2
train['income_total_log'] = train['income_total'].apply(np.log)

In [208]:
train['DAYS_EMPLOYED_log'] = train['DAYS_EMPLOYED'].map(lambda x: x if x < 0 else 0).apply(lambda x: np.log1p(x*-1))

In [209]:
#train['Month_BIRTH'] = train['DAYS_BIRTH']/30 
#train['q_BIRTH'] = train['DAYS_BIRTH']/90
train['year_BIRTH'] = train['DAYS_BIRTH']/365
train['begin_month2'] = train['begin_month']

In [210]:
train.loc[train['DAYS_EMPLOYED'] < 0,'DAYS_EMPLOYED']=0
train.loc[train['DAYS_EMPLOYED'] > 0,'DAYS_EMPLOYED']=1
train.loc[train['begin_month'] < 0,'begin_month']=0
train.loc[train['begin_month'] > 0,'begin_month']=1
train.loc[train['child_num'] >= 3,'child_num']=3
train.loc[train['family_size'] >= 5,'child_num']=5
train['year_BIRTH'] =- train['year_BIRTH'] 
train['begin_month2'] =- train['begin_month2']

In [211]:
train['total_phone'] = train['work_phone'] + train['phone'] + train['FLAG_MOBIL']
train['diff_child'] = train['family_size'] - train['child_num']

In [212]:
test['income_total'] = test['income_total']/10000
test['income_total_dev'] = (test['income_total'] - test['income_total'].mean())**2
test['income_total_log'] = test['income_total'].apply(np.log)

In [213]:
test['DAYS_EMPLOYED_log'] = test['DAYS_EMPLOYED'].map(lambda x: x if x < 0 else 0).apply(lambda x: np.log1p(x*-1))

In [214]:
#train['Month_BIRTH'] = train['DAYS_BIRTH']/30 
#train['q_BIRTH'] = train['DAYS_BIRTH']/90
test['year_BIRTH'] = test['DAYS_BIRTH']/365
test['begin_month2'] = test['begin_month']

In [215]:
test.loc[test['DAYS_EMPLOYED'] < 0,'DAYS_EMPLOYED']=0
test.loc[test['DAYS_EMPLOYED'] > 0,'DAYS_EMPLOYED']=1
test.loc[test['begin_month'] < 0,'begin_month']=0
test.loc[test['begin_month'] > 0,'begin_month']=1
test.loc[test['child_num'] >= 3,'child_num']=3
test.loc[test['family_size'] >= 5,'child_num']=5
test['year_BIRTH'] =- test['year_BIRTH'] 
#test['year_EMPLOYED'] =- test['year_EMPLOYED']
test['begin_month2'] =- test['begin_month2']
#test['income_total3'] = test['income_total']/1000

In [216]:
test['total_phone'] = test['work_phone'] + test['phone'] + test['FLAG_MOBIL']
test['diff_child'] = test['family_size'] - test['child_num']

In [217]:
#minus 변경하고
#구간화 함수
def make_bin(df, variable, n):
    data = df
    #data[variable] =- data[variable]
    count, bin_dividers = np.histogram(data[variable], bins=n)
    bin_names=[str(i) for i in range(n)]
    data['%s_bin' % variable]=pd.cut(x=data[variable], bins=bin_dividers, labels=bin_names, include_lowest=True)

In [218]:
make_bin(train, 'income_total_log', n=25)
make_bin(train, 'year_BIRTH', n=30)
make_bin(train, 'begin_month2', n=6)
make_bin(train, 'DAYS_EMPLOYED_log', n=20)
#make_bin(train, 'DAYS_EMPLOYED', n=2)
#make_bin(train, 'child_num', n=2)

In [219]:
make_bin(test, 'income_total_log', n=25)
make_bin(test, 'year_BIRTH', n=30)
make_bin(test, 'begin_month2', n=6)
make_bin(test, 'DAYS_EMPLOYED_log', n=20)
#make_bin(test, 'DAYS_EMPLOYED', n=2)
#make_bin(test, 'child_num', n=2)

In [220]:
enc = OneHotEncoder()
object_col = ['income_total_log_bin', 'year_BIRTH_bin', 'begin_month2_bin', 'DAYS_EMPLOYED_log_bin']
enc.fit(train.loc[:,object_col])

OneHotEncoder()

In [221]:
train_onehot_df = pd.DataFrame(enc.transform(train.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
train.drop(object_col, axis=1, inplace=True)
train = pd.concat([train, train_onehot_df], axis=1)
train = train.drop(['work_phone', 'phone', 'FLAG_MOBIL'], axis=1)

In [222]:
test_onehot_df = pd.DataFrame(enc.transform(test.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
test.drop(object_col, axis=1, inplace=True)
test = pd.concat([test, test_onehot_df], axis=1)
test = test.drop(['work_phone', 'phone', 'FLAG_MOBIL'], axis=1)

In [223]:
train

Unnamed: 0,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,email,family_size,begin_month,credit,gender_F,gender_M,...,DAYS_EMPLOYED_log_bin_14,DAYS_EMPLOYED_log_bin_15,DAYS_EMPLOYED_log_bin_16,DAYS_EMPLOYED_log_bin_17,DAYS_EMPLOYED_log_bin_18,DAYS_EMPLOYED_log_bin_19,DAYS_EMPLOYED_log_bin_5,DAYS_EMPLOYED_log_bin_7,DAYS_EMPLOYED_log_bin_8,DAYS_EMPLOYED_log_bin_9
0,0,20.25,-13899,0,0,2.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,24.75,-11380,0,1,3.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,45.00,-19087,0,0,2.0,0.0,2.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,20.25,-15088,0,0,2.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,15.75,-15037,0,0,2.0,0.0,2.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,2,22.50,-12079,0,0,4.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26453,1,18.00,-15291,0,0,2.0,0.0,2.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26454,0,29.25,-10082,0,0,2.0,0.0,2.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26455,0,17.10,-10145,0,0,1.0,0.0,2.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [224]:
train.shape, test.shape

((26457, 136), (10000, 135))

In [225]:
from bayes_opt import BayesianOptimization
import xgboost as xgb

In [226]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train.drop(['credit'],axis=1), train['credit'], 
                                                    stratify=train['credit'], test_size=0.2,
                                                    random_state = 10086)

In [227]:
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_val)

In [228]:
def xgb_evaluate(max_depth, subsample, colsample_bytree,learning_rate):
    params = {'eval_metric': 'mlogloss',
              'objective': 'multi:softprob',
              'num_class' : 3,
              'max_depth': int(max_depth),
              'subsample': subsample,
              'eta': learning_rate,
#               'gamma': gamma,
              'colsample_bytree': colsample_bytree,   
#               'min_child_weight': min_child_weight ,
#               'max_delta_step':max_delta_step
             }
    # Used around 1000 boosting rounds in the full model
    cv_result = xgb.cv(params, dtrain, num_boost_round=200, nfold=5, early_stopping_rounds=50)    
    # Bayesian optimization only knows how to maximize, not minimize, so return the negative RMSE
    return -1.0 * cv_result['test-mlogloss-mean'].iloc[-1]

In [None]:
xgb_bo = BayesianOptimization(xgb_evaluate, {
                                    'max_depth': (2, 12),
#                                      'gamma': (0.001, 10.0),
#                                      'min_child_weight': (0, 20),
#                                      'max_delta_step': (0, 10),
                                     'subsample': (0.4, 1.0),
                                     'colsample_bytree' :(0.4, 1.0),
                                     'learning_rate': (0.01,0.2)})
# Use the expected improvement acquisition function to handle negative numbers
# Optimally needs quite a few more initiation points and number of iterations
xgb_bo.maximize(init_points=5, n_iter = 5, acq='ei', xi=0.01)

|   iter    |  target   | colsam... | learni... | max_depth | subsample |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.798   [0m | [0m 0.4296  [0m | [0m 0.0192  [0m | [0m 6.421   [0m | [0m 0.6183  [0m |
| [0m 2       [0m | [0m-0.8147  [0m | [0m 0.6632  [0m | [0m 0.01182 [0m | [0m 5.024   [0m | [0m 0.9802  [0m |
| [95m 3       [0m | [95m-0.7432  [0m | [95m 0.9641  [0m | [95m 0.1572  [0m | [95m 7.18    [0m | [95m 0.8703  [0m |
| [95m 4       [0m | [95m-0.7372  [0m | [95m 0.706   [0m | [95m 0.1128  [0m | [95m 10.26   [0m | [95m 0.9599  [0m |
| [0m 5       [0m | [0m-0.7942  [0m | [0m 0.4239  [0m | [0m 0.1659  [0m | [0m 2.258   [0m | [0m 0.7552  [0m |
| [0m 6       [0m | [0m-0.7399  [0m | [0m 0.6407  [0m | [0m 0.1659  [0m | [0m 11.66   [0m | [0m 0.6679  [0m |
| [0m 7       [0m | [0m-0.7495  [0m | [0m 1.0     [0m | [0m 0.2     [0m | [0m 8.282   [0m | [0m 1

In [160]:
params = xgb_bo.max['params']
params

{'colsample_bytree': 0.6728347384034667,
 'learning_rate': 0.09359844698912438,
 'max_depth': 9.8806162854336,
 'subsample': 0.8621865275602482}

In [161]:
params['max_depth'] = 10
params['eval_metric'] = 'mlogloss'
params['objective'] = 'multi:softprob'
params['num_class'] = 3
#params['learning_rate'] = 0.06
params

{'colsample_bytree': 0.6728347384034667,
 'learning_rate': 0.09359844698912438,
 'max_depth': 10,
 'subsample': 0.8621865275602482,
 'eval_metric': 'mlogloss',
 'objective': 'multi:softprob',
 'num_class': 3}

In [162]:
model = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    nfold=5,
    early_stopping_rounds=50,
    verbose_eval = 50
)
model

Unnamed: 0,train-mlogloss-mean,train-mlogloss-std,test-mlogloss-mean,test-mlogloss-std
0,1.052303,0.000795,1.056390,0.000643
1,1.014858,0.002119,1.023504,0.001704
2,0.980519,0.003021,0.993844,0.001976
3,0.951515,0.002763,0.969182,0.001711
4,0.925199,0.002408,0.946810,0.002084
...,...,...,...,...
138,0.420911,0.002595,0.734612,0.010575
139,0.419318,0.002464,0.734667,0.010645
140,0.417798,0.002484,0.734595,0.010687
141,0.416301,0.002287,0.734624,0.010735


In [166]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss
from tensorflow.keras.utils import to_categorical

train_x = train.drop(['credit'],axis=1)
train_y = train['credit']
dtest2 = xgb.DMatrix(test)

def run_kfold():
    folds=StratifiedKFold(n_splits=5, shuffle=True, random_state=55)
    outcomes=[]
    sub=np.zeros((test.shape[0], 3))  
    for n_fold, (train_index, val_index) in enumerate(folds.split(train_x, train_y)):
        X_train, X_val = train_x.iloc[train_index], train_x.iloc[val_index]
        y_train, y_val = train_y.iloc[train_index], train_y.iloc[val_index]
        
        dtrain = xgb.DMatrix(X_train, y_train)
        dtest = xgb.DMatrix(X_val)
#         clf.fit(X_train, y_train)
        
#         predictions=clf.predict_proba(X_val)
        
        final_gb = xgb.train(params, dtrain, num_boost_round=160, verbose_eval=50)

        predictions = final_gb.predict(dtest)
        test_predictions = final_gb.predict(dtest2 )
        
        logloss=log_loss(to_categorical(y_val), predictions)
        outcomes.append(logloss)
        print(f"FOLD {n_fold} : logloss:{logloss}")
        
        sub += test_predictions
        
        
    mean_outcome=np.mean(outcomes)
    
    print("Mean:{}".format(mean_outcome))
    return sub/folds.n_splits

my_submission = run_kfold()

FOLD 0 : logloss:0.7205538502718355
FOLD 1 : logloss:0.7168944590064702
FOLD 2 : logloss:0.7239226906693609
FOLD 3 : logloss:0.7087438465037089
FOLD 4 : logloss:0.7126673952366737
Mean:0.7165564483376099


In [150]:
# submit.iloc[:,1:]=0
# for fold in range(5):
#     submit.iloc[:,1:] += lgb_models[fold].predict_proba(test)/5

In [168]:
my_submission

array([[0.06064598, 0.10535355, 0.83400047],
       [0.21153897, 0.18444604, 0.604015  ],
       [0.04703892, 0.10323175, 0.84972931],
       ...,
       [0.02441764, 0.05327652, 0.92230585],
       [0.12509056, 0.32860161, 0.54630783],
       [0.06960583, 0.18498847, 0.74540572]])

In [169]:
submission=pd.read_csv('./open/sample_submission.csv')
submission.loc[:,1:]=my_submission
submission

Unnamed: 0,index,0,1,2
0,26457,0.060646,0.105354,0.834000
1,26458,0.211539,0.184446,0.604015
2,26459,0.047039,0.103232,0.849729
3,26460,0.129369,0.131347,0.739284
4,26461,0.080555,0.146880,0.772565
...,...,...,...,...
9995,36452,0.096725,0.230292,0.672983
9996,36453,0.190241,0.305657,0.504102
9997,36454,0.024418,0.053277,0.922306
9998,36455,0.125091,0.328602,0.546308


In [171]:
submission.to_csv('./submit/5fold_xgb_log_0.7165.csv', index=False) # 0.7272812144

In [172]:
submission.head(20)

Unnamed: 0,index,0,1,2
0,26457,0.060646,0.105354,0.834
1,26458,0.211539,0.184446,0.604015
2,26459,0.047039,0.103232,0.849729
3,26460,0.129369,0.131347,0.739284
4,26461,0.080555,0.14688,0.772565
5,26462,0.063077,0.118842,0.818081
6,26463,0.652723,0.345172,0.002105
7,26464,0.073429,0.103138,0.823433
8,26465,0.067529,0.131824,0.800647
9,26466,0.06378,0.281449,0.654771
