# Library

In [1]:
import warnings
warnings.filterwarnings('ignore')
import glob
import os
import random

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from lightgbm import early_stopping, log_evaluation
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder

# Data Load & Preprocessing
- 훈련에 필요없는 index 컬럼 삭제.
- missing value를 모두 NAN 문자열로 대체
- dtype object 인 컬럼들을 onehot encoding

In [2]:
# Locate and read the competition CSVs.
# The directory is a raw string so its backslashes are never parsed as escape
# sequences, and every file is opened by name: os.listdir() returns entries in
# arbitrary order, so picking files by list position is not reliable.
d = r"C:\kaggle_data\credit_card"
lst = os.listdir(d)
print(lst)
train = pd.read_csv(os.path.join(d, 'train.csv'))
test = pd.read_csv(os.path.join(d, 'test.csv'))
ss = pd.read_csv(os.path.join(d, 'sample_submission.csv'))

['sample_submission.csv', 'test.csv', 'train.csv']


In [3]:
# Remove the 'index' column, which is not needed for training, and replace every
# missing value with the literal string 'NAN'.
train = train.drop(columns=['index']).fillna('NAN')
test = test.drop(columns=['index']).fillna('NAN')

In [4]:
# Rows whose occupation is the 'NAN' placeholder and whose DAYS_EMPLOYED is >= 1
# are relabelled with the occupation 'BS'. The same rule is applied to both frames.
for frame in (train, test):
    occupation_missing = frame['occyp_type'] == 'NAN'
    employed_days_positive = frame['DAYS_EMPLOYED'] >= 1
    frame.loc[occupation_missing & employed_days_positive, 'occyp_type'] = 'BS'

In [5]:
# Names of all columns of `train` stored with the object dtype.
object_col = [col for col in train.columns if train[col].dtype == 'object']

In [6]:
# Fit the one-hot encoder on the object-dtype columns of the training frame.
enc = OneHotEncoder()
enc.fit(train.loc[:, object_col])

# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(); call whichever this installation provides.
if hasattr(enc, 'get_feature_names_out'):
    train_onehot_names = enc.get_feature_names_out(object_col)
else:
    train_onehot_names = enc.get_feature_names(object_col)

# Give the encoded frame train's own index so the column-wise concat pairs rows
# correctly even when that index is not 0..n-1.
train_onehot_df = pd.DataFrame(enc.transform(train.loc[:, object_col]).toarray(),
                               columns=train_onehot_names,
                               index=train.index)

# Swap the raw object columns for their one-hot expansion.
train = pd.concat([train.drop(object_col, axis=1), train_onehot_df], axis=1)

In [7]:
# Encode the test frame with the encoder that was fitted on the training frame.
# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(); call whichever this installation provides.
if hasattr(enc, 'get_feature_names_out'):
    test_onehot_names = enc.get_feature_names_out(object_col)
else:
    test_onehot_names = enc.get_feature_names(object_col)

# Give the encoded frame test's own index so the column-wise concat pairs rows
# correctly even when that index is not 0..n-1.
test_onehot_df = pd.DataFrame(enc.transform(test.loc[:, object_col]).toarray(),
                              columns=test_onehot_names,
                              index=test.index)

# Swap the raw object columns for their one-hot expansion.
test = pd.concat([test.drop(object_col, axis=1), test_onehot_df], axis=1)

- DAYS_BIRTH : 출생일, 데이터 수집일이 0이고 역으로 계산, int
- DAYS_EMPLOYED: 업무 시작일, 데이터 수집일이 0이고 역으로 계산, int
- begin_month: 신용카드 발급 월, 수집 당시 0 역으로 계산,int형

# Convert the day-count columns to years.
# A year has 365 days; dividing by 12 (months per year) does not yield years.
DAYS_PER_YEAR = 365
for c in ['DAYS_BIRTH', 'DAYS_EMPLOYED']:
    train[c] = train[c] / DAYS_PER_YEAR
    test[c] = test[c] / DAYS_PER_YEAR

# Training
- 데이터 분리는 StratifiedKFold 를 사용하여 y값 분포를 비슷하게 분리시킴. -> 5-fold
- lightgbm의 default parameter로 훈련 (n_estimators=1000만 별도 지정).
- 30번 이상 개선 없을 경우 중단.
- 각 5개의 fold를 훈련하여 저장

In [8]:
# Stratified 5-fold split on the 'credit' column, shuffled with a fixed seed.
# Each element of `folds` is a (train_idx, valid_idx) pair as yielded by skf.split().
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds = list(skf.split(train, train['credit']))

In [9]:
# Train one LightGBM classifier per entry of `folds` (the (train_idx, valid_idx)
# pairs produced by the StratifiedKFold cell above) and keep each fitted model in
# `lgb_models`, keyed by fold number.
random.seed(42)

# Build the feature matrix and label vector once instead of on every fold.
# Both are plain arrays, so the fold indices select rows by position.
features = train.drop(['credit'], axis=1).values
labels = train['credit'].values

lgb_models = {}
for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f'===================================={fold+1}============================================')
    X_train, X_valid = features[train_idx], features[valid_idx]
    y_train, y_valid = labels[train_idx], labels[valid_idx]
    lgb = LGBMClassifier(n_estimators=1000)
    # LightGBM 4.0 removed the early_stopping_rounds / verbose keywords of fit();
    # the callbacks below stop after 30 rounds without improvement and log every 100 rounds.
    lgb.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            callbacks=[early_stopping(stopping_rounds=30), log_evaluation(period=100)])
    lgb_models[fold] = lgb
    print(f'================================================================================\n\n')

Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.652377	valid_1's multi_logloss: 0.752522
[200]	training's multi_logloss: 0.56559	valid_1's multi_logloss: 0.737156
[300]	training's multi_logloss: 0.501921	valid_1's multi_logloss: 0.731843
Early stopping, best iteration is:
[348]	training's multi_logloss: 0.475822	valid_1's multi_logloss: 0.729304


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.646476	valid_1's multi_logloss: 0.764955
[200]	training's multi_logloss: 0.560614	valid_1's multi_logloss: 0.751681
[300]	training's multi_logloss: 0.497532	valid_1's multi_logloss: 0.748007
Early stopping, best iteration is:
[281]	training's multi_logloss: 0.508566	valid_1's multi_logloss: 0.747717


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.652042	valid_1's multi_logloss: 0.758665
[200]	training's multi_logloss: 0.560771	valid_1's multi_logloss: 0.7

# Test inference
- 각 fold를 훈련시킨 lightgbm model로 predict.
- 해당 대회는 logloss score를 겨루는 것이기 때문에 각 class의 probability를 얻어야함.
- 대부분의 머신러닝 모델에서 predict, predict_proba를 구분하여 사용함.
- predict는 class 출력을 해주고 predict_proba는 class별 probability를 출력해줌.
- predict_proba를 사용하여 예측한 것을 5-fold 더하여 평균내어 앙상블.

In [10]:
# Average the class probabilities predicted by every fold model.
# The models were fitted on plain arrays, so they are given an array here too, and
# the divisor follows the number of models instead of a hard-coded 5.
fold_probas = [lgb_models[fold].predict_proba(test.values) for fold in lgb_models]
mean_proba = np.mean(fold_probas, axis=0)

# Replace the prediction columns (everything after the first column) wholesale, so
# the float probabilities are not written into columns of a different dtype.
ss[ss.columns[1:]] = mean_proba

In [11]:
from sklearn.model_selection import cross_val_score

# Cross-validate the first fold's estimator over the splits of `skf` and print the
# mean of the per-split scores.
# NOTE(review): no `scoring` is passed, so this is the estimator's default score —
# presumably accuracy rather than the competition's log-loss; confirm.
cv_features = train.drop(['credit'], axis=1)
cv_labels = train['credit']
cv_scores = cross_val_score(lgb_models[0], cv_features, cv_labels, cv=skf)
print(cv_scores.mean())

0.7082057939200797


In [12]:
# Write the submission frame to 'add_BS.csv' without the DataFrame index.
# NOTE(review): 0.7272812144 is presumably the score this submission obtained — confirm.
ss.to_csv('add_BS.csv', index=False)