# Library

In [30]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import random
import os

# Data Load & Preprocessing
- 훈련에 필요없는 index 컬럼 삭제.
- missing value를 모두 NAN 문자열로 대체
- dtype object 인 컬럼들을 onehot encoding

In [31]:
d = "C:\kaggle_data\credit_card"
lst = os.listdir(d)
print(lst)
train = pd.read_csv(d + '\\' +lst[3])
test = pd.read_csv(d + '\\' +lst[2])
ss = pd.read_csv(d + '\\' +lst[1])

['.ipynb_checkpoints', 'sample_submission.csv', 'test.csv', 'train.csv', 'Untitled.ipynb']


In [32]:
train = train.drop(['index'], axis=1)
train.fillna('NAN', inplace=True) 
test = test.drop(['index'], axis=1)
test.fillna('NAN', inplace=True)

In [33]:
train['log_income'] = np.log(train['income_total'])
test['log_income'] = np.log(test['income_total'])

In [34]:
train = train.drop('income_total',1)
test = test.drop('income_total',1)

In [35]:
train['liab'] = train['car'] + train['reality']
test['liab'] = test['car'] + test['reality']

In [36]:
train = train.drop(['car', 'reality'],1)
test = test.drop(['car', 'reality'],1)

In [37]:
object_col = []
for col in train.columns:
    if train[col].dtype == 'object':
        object_col.append(col)

In [38]:
enc = OneHotEncoder()
enc.fit(train.loc[:,object_col])


train_onehot_df = pd.DataFrame(enc.transform(train.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
train.drop(object_col, axis=1, inplace=True)
train = pd.concat([train, train_onehot_df], axis=1)

In [39]:
test_onehot_df = pd.DataFrame(enc.transform(test.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
test.drop(object_col, axis=1, inplace=True)
test = pd.concat([test, test_onehot_df], axis=1)

- ㅇAYS_BIRTH : 출생일, 데이터 수집일이 0이고 역으로 계산, int
- DAYS_EMPLOYED: 업무 시작일, 데이터 수집일이 0이고 역으로 계산, int
- begin_month: 신용카드 발급 월, 수집 당시 0 역으로 계산,int형

# 일 -> 연도로 바꿔보면
for c in ['DAYS_BIRTH','DAYS_EMPLOYED']:
    train[c] = (train[c]/12)
    test[c] = (test[c]/12)

# Training
- 데이터 분리는 StratifiedKFold 를 사용하여 y값 분포를 비슷하게 분리시킴. -> 5-fold
- lightgbm의 default parameter로 훈련.
- 30번 이상 개선 없을 경우 중단.
- 각 5개의 fold를 훈련하여 저장

In [45]:
train.columns

Index(['child_num', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'FLAG_MOBIL', 'work_phone',
       'phone', 'email', 'family_size', 'begin_month', 'credit', 'log_income',
       'gender_F', 'gender_M', 'income_type_Commercial associate',
       'income_type_Pensioner', 'income_type_State servant',
       'income_type_Student', 'income_type_Working',
       'edu_type_Academic degree', 'edu_type_Higher education',
       'edu_type_Incomplete higher', 'edu_type_Lower secondary',
       'edu_type_Secondary / secondary special', 'family_type_Civil marriage',
       'family_type_Married', 'family_type_Separated',
       'family_type_Single / not married', 'family_type_Widow',
       'house_type_Co-op apartment', 'house_type_House / apartment',
       'house_type_Municipal apartment', 'house_type_Office apartment',
       'house_type_Rented apartment', 'house_type_With parents',
       'occyp_type_Accountants', 'occyp_type_Cleaning staff',
       'occyp_type_Cooking staff', 'occyp_type_Core staff',
      

In [40]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds=[]
for train_idx, valid_idx in skf.split(train, train['credit']):
    folds.append((train_idx, valid_idx))

In [41]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds=[]
for train_idx, valid_idx in skf.split(train, train['credit']):
    folds.append((train_idx, valid_idx))
random.seed(42)
lgb_models={}
for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values 
    lgb = LGBMClassifier(n_estimators=1000)
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_valid, y_valid)], 
            early_stopping_rounds=30,
           verbose=100)
    lgb_models[fold]=lgb
    print(f'================================================================================\n\n')

Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.65054	valid_1's multi_logloss: 0.752322
[200]	training's multi_logloss: 0.565407	valid_1's multi_logloss: 0.737783
[300]	training's multi_logloss: 0.504073	valid_1's multi_logloss: 0.73312
[400]	training's multi_logloss: 0.452307	valid_1's multi_logloss: 0.73208
Early stopping, best iteration is:
[383]	training's multi_logloss: 0.459756	valid_1's multi_logloss: 0.731807


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.644877	valid_1's multi_logloss: 0.760468
[200]	training's multi_logloss: 0.558568	valid_1's multi_logloss: 0.747751
Early stopping, best iteration is:
[233]	training's multi_logloss: 0.536143	valid_1's multi_logloss: 0.745926


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.649316	valid_1's multi_logloss: 0.757925
[200]	training's multi_logloss: 0.562021	valid_1's multi_logloss: 0.747

In [46]:
sum([0.731807, 0.745926,0.743879, 0.736054, 0.736006])/5

0.7387344

# Test inference
- 각 fold를 훈련시킨 lightgbm model로 predict.
- 해당 대회는 logloss score를 겨루는 것이기 때문에 각 class의 probability를 얻어야함.
- 대부분의 머신러닝 모델에서 predict, predict_proba를 구분하여 사용함.
- predict는 class 출력을 해주고 predict_proba는 class별 probability를 출력해줌.
- predict_proba를 사용하여 예측한 것을 5-fold 더하여 평균내어 앙상블.

In [42]:
ss.iloc[:,1:]=0
for fold in range(5):
    ss.iloc[:,1:] += lgb_models[fold].predict_proba(test)/5

In [44]:
# log_income을 사용했을 때의 점수
from sklearn.model_selection import cross_val_score

print(cross_val_score(lgb_models[0], train.drop(['credit'],axis=1), train['credit'], cv=skf).mean())

0.7063157848872134


In [43]:
ss.to_csv('add_liab.csv', index=False) # 

In [34]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds=[]
for train_idx, valid_idx in skf.split(train.drop('log_income',1), train['credit']):
    folds.append((train_idx, valid_idx))

In [35]:
random.seed(42)
lgb_models={}
for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['log_income','credit'],axis=1).iloc[train_idx].values, train.drop(['log_income','credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values 
    lgb = LGBMClassifier(n_estimators=1000)
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_valid, y_valid)], 
            early_stopping_rounds=30,
           verbose=100)
    lgb_models[fold]=lgb
    print(f'================================================================================\n\n')

Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.649825	valid_1's multi_logloss: 0.754342
[200]	training's multi_logloss: 0.565196	valid_1's multi_logloss: 0.738368
[300]	training's multi_logloss: 0.503465	valid_1's multi_logloss: 0.731678
Early stopping, best iteration is:
[307]	training's multi_logloss: 0.499573	valid_1's multi_logloss: 0.731416


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.646021	valid_1's multi_logloss: 0.764365
[200]	training's multi_logloss: 0.560632	valid_1's multi_logloss: 0.751211
[300]	training's multi_logloss: 0.497513	valid_1's multi_logloss: 0.748437
Early stopping, best iteration is:
[330]	training's multi_logloss: 0.480923	valid_1's multi_logloss: 0.747248


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.650293	valid_1's multi_logloss: 0.758441
[200]	training's multi_logloss: 0.562092	valid_1's multi_logloss: 0.

In [28]:
train['liab']

KeyError: 'liab'

In [38]:
sum([0.735772, 0.735804, 0.742915, 0.747248, 0.731416])/5

0.7386309999999999

In [37]:
# log_income을 사용하지 않았을 때의 점수
from sklearn.model_selection import cross_val_score
print(cross_val_score(lgb_models[0], train.drop(['income_total','credit'],axis=1), train['credit'], cv=skf).mean())

0.7049552192409335


In [39]:
pd.DataFrame([['no_log', 0.7386309999999999, 0.7049552192409335],['yes_log', 0.7386307999999999,0.7049552192409335]],columns=['status','self','cv'])

Unnamed: 0,status,self,cv
0,no_log,0.738631,0.704955
1,yes_log,0.738631,0.704955
