### 가상환경 activate 후 lightgbm 설치
#### `conda install -c conda-forge lightgbm`

In [1]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import random

#### DATA LOAD

In [2]:
# Read each dataset, discard its 'index' column and replace missing values
# with the literal string 'NAN'.
train = pd.read_csv('./train.csv').drop(columns=['index']).fillna('NAN')

test = pd.read_csv('./test.csv').drop(columns=['index']).fillna('NAN')

# Submission template; its first column is kept and the rest is filled in later.
submit = pd.read_csv('./sample_submission.csv')

In [3]:
train.info()
# Check which variables hold categorical data (dtype object).
# The number in parentheses is presumably the count of unique values - TODO confirm.
# gender(2), car(2), reality(2), income_type(5), edu_type(5), family_type(5), house_type(6), occyp_type(18)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         26457 non-null  object 
 1   car            26457 non-null  object 
 2   reality        26457 non-null  object 
 3   child_num      26457 non-null  int64  
 4   income_total   26457 non-null  float64
 5   income_type    26457 non-null  object 
 6   edu_type       26457 non-null  object 
 7   family_type    26457 non-null  object 
 8   house_type     26457 non-null  object 
 9   DAYS_BIRTH     26457 non-null  int64  
 10  DAYS_EMPLOYED  26457 non-null  int64  
 11  FLAG_MOBIL     26457 non-null  int64  
 12  work_phone     26457 non-null  int64  
 13  phone          26457 non-null  int64  
 14  email          26457 non-null  int64  
 15  occyp_type     26457 non-null  object 
 16  family_size    26457 non-null  float64
 17  begin_month    26457 non-null  float64
 18  credit

In [3]:
# Collect the names of the columns whose dtype is object.
object_col = [name for name in train.columns if train[name].dtype == 'object']
display(object_col)

['gender',
 'car',
 'reality',
 'income_type',
 'edu_type',
 'family_type',
 'house_type',
 'occyp_type']

In [4]:
# occyp_type has 18 unique values, so one-hot encoding may not be the best fit;
# label encoding would probably suit that column better.
# For now, follow the guide code and one-hot encode every object column.
# One-hot encode the train data.

enc = OneHotEncoder()
enc.fit(train.loc[:, object_col])

# Newer scikit-learn releases expose get_feature_names_out in place of
# get_feature_names; use whichever one the installed version provides.
train_feature_names = (getattr(enc, 'get_feature_names_out', None)
                       or enc.get_feature_names)(object_col)

# Reuse train's index so the column-wise concat aligns row by row.
train_onehot_df = pd.DataFrame(enc.transform(train.loc[:, object_col]).toarray(),
                               columns=train_feature_names,
                               index=train.index)
# Build the new frame from a non-mutating drop instead of dropping in place.
train = pd.concat([train.drop(object_col, axis=1), train_onehot_df], axis=1)

In [5]:
# Inspect the training frame after the object columns were one-hot encoded.
display(train)

Unnamed: 0,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month,...,occyp_type_Low-skill Laborers,occyp_type_Managers,occyp_type_Medicine staff,occyp_type_NAN,occyp_type_Private service staff,occyp_type_Realty agents,occyp_type_Sales staff,occyp_type_Secretaries,occyp_type_Security staff,occyp_type_Waiters/barmen staff
0,0,202500.0,-13899,-4709,1,0,0,0,2.0,-6.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,247500.0,-11380,-1540,1,0,0,1,3.0,-5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,450000.0,-19087,-4434,1,0,1,0,2.0,-22.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,202500.0,-15088,-2092,1,0,1,0,2.0,-37.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0,157500.0,-15037,-2105,1,0,0,0,2.0,-26.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,2,225000.0,-12079,-1984,1,0,0,0,4.0,-2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26453,1,180000.0,-15291,-2475,1,0,0,0,2.0,-47.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
26454,0,292500.0,-10082,-2015,1,0,0,0,2.0,-25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26455,0,171000.0,-10145,-107,1,0,0,0,1.0,-59.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# One-hot encode the test data with the encoder that was fitted on train.

# Newer scikit-learn releases expose get_feature_names_out in place of
# get_feature_names; use whichever one the installed version provides.
test_feature_names = (getattr(enc, 'get_feature_names_out', None)
                      or enc.get_feature_names)(object_col)

# Reuse test's index so the column-wise concat aligns row by row.
test_onehot_df = pd.DataFrame(enc.transform(test.loc[:, object_col]).toarray(),
                              columns=test_feature_names,
                              index=test.index)
# Build the new frame from a non-mutating drop instead of dropping in place.
test = pd.concat([test.drop(object_col, axis=1), test_onehot_df], axis=1)

In [None]:
# Inspect the test frame after the object columns were one-hot encoded.
display(test)

In [7]:
# StratifiedKFold: a splitting scheme meant for labels with an imbalanced
# distribution, i.e. when some label values are far more (or less) frequent.
# 5-fold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Materialise the (train indices, validation indices) pair of every split.
folds = [(fit_rows, holdout_rows)
         for fit_rows, holdout_rows in skf.split(train, train['credit'])]

In [8]:
# Stop training when the validation score has not improved for 30 rounds.
# Train one model per fold and keep them all in lgb_models, keyed by fold number.
random.seed(42)
lgb_models = {}

# Features and labels are identical for every fold, so build them once
# instead of re-dropping the 'credit' column inside the loop.
X_all = train.drop(['credit'], axis=1)
y_all = train['credit']

for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f'===================================={fold+1}============================================')
    # The stored fold indices are used positionally, so select both the
    # features and the labels with .iloc.
    X_train, X_valid = X_all.iloc[train_idx].values, X_all.iloc[valid_idx].values
    y_train, y_valid = y_all.iloc[train_idx].values, y_all.iloc[valid_idx].values
    lgb = LGBMClassifier(n_estimators=1000)
    # NOTE(review): early_stopping_rounds / verbose are passed to fit() here;
    # newer LightGBM releases appear to expect callbacks instead - confirm
    # against the installed version.
    lgb.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            early_stopping_rounds=30,
            verbose=100)
    lgb_models[fold] = lgb
    print(f'================================================================================\n\n')

Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.649825	valid_1's multi_logloss: 0.754342
[200]	training's multi_logloss: 0.565196	valid_1's multi_logloss: 0.738368
[300]	training's multi_logloss: 0.503465	valid_1's multi_logloss: 0.731678
Early stopping, best iteration is:
[307]	training's multi_logloss: 0.499573	valid_1's multi_logloss: 0.731416


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.646021	valid_1's multi_logloss: 0.764365
[200]	training's multi_logloss: 0.560632	valid_1's multi_logloss: 0.751211
[300]	training's multi_logloss: 0.497513	valid_1's multi_logloss: 0.748437
Early stopping, best iteration is:
[330]	training's multi_logloss: 0.480923	valid_1's multi_logloss: 0.747248


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.650293	valid_1's multi_logloss: 0.758441
[200]	training's multi_logloss: 0.562092	valid_1's multi_logloss: 0.

In [9]:
# Average the class probabilities predicted by every stored fold model.
n_models = len(lgb_models)  # derived from the trained models, not hard-coded

# Accumulate in a float array so the submission columns are written once,
# with a float dtype, instead of being upcast through repeated in-place adds.
blended = np.zeros((len(test), submit.shape[1] - 1))
for model in lgb_models.values():
    blended += model.predict_proba(test) / n_models

# Every column after the first receives the averaged probabilities.
submit[submit.columns[1:]] = blended

In [10]:
# Write the submission frame to CSV without the DataFrame index.
submit.to_csv('./20210416_test_submit_ensemble.csv', index=False) # 0.7272812144 - presumably the score obtained by this submission; TODO confirm

In [11]:
# Preview the first 20 rows of the submission frame.
submit.head(20)

Unnamed: 0,index,0,1,2
0,26457,0.053656,0.111823,0.834521
1,26458,0.244308,0.136086,0.619607
2,26459,0.040271,0.108892,0.850838
3,26460,0.108074,0.133723,0.758203
4,26461,0.099427,0.172556,0.728017
5,26462,0.05321,0.136312,0.810478
6,26463,0.471949,0.527646,0.000406
7,26464,0.117717,0.148413,0.73387
8,26465,0.075306,0.132764,0.79193
9,26466,0.062579,0.257245,0.680176
