In [1]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import random

## Data Load & Preprocessing
+ 훈련에 필요없는 index 컬럼 삭제.
+ missing value를 모두 NAN 문자열로 대체
+ dtype object 인 컬럼들을 onehot encoding

In [2]:
# Training split: drop the 'index' column and replace every missing value
# (in whatever column it occurs) with the literal string 'NAN'.
train = (
    pd.read_csv('./open/train.csv')
    .drop(['index'], axis=1)
    .fillna('NAN')
)

# Test split: identical preparation.
test = (
    pd.read_csv('./open/test.csv')
    .drop(['index'], axis=1)
    .fillna('NAN')
)

# Submission template; its prediction columns are filled in at the end.
submit = pd.read_csv('./open/sample_submission.csv')

In [3]:
# Stack the train rows on top of the test rows (row-wise concatenation) so
# distributions can be inspected over both splits at once.
data = pd.concat([train, test])
data.shape

(36457, 19)

In [4]:
# Frequency of each distinct family_size value in the combined train+test frame.
data['family_size'].value_counts()

2.0     19463
1.0      6987
3.0      6421
4.0      3106
5.0       397
6.0        58
7.0        19
15.0        3
9.0         2
20.0        1
Name: family_size, dtype: int64

'phone',  'email', 'work_phone' 3가지 컬럼도 추가

In [5]:
# train=train.drop('occyp_type', axis=1)
# test=test.drop('occyp_type', axis=1)

In [6]:
# train=train.drop(['email'], axis=1)
# test=test.drop(['email'], axis=1)

In [7]:
# Names of the columns that pandas loaded with dtype object; these are the
# ones that get one-hot encoded below.
object_col = [name for name in train.columns if train[name].dtype == 'object']
# Disabled variant, kept for reference: also treat the numeric flag columns
# as categorical.
#     elif col in ['phone',  'email', 'work_phone', 'FLAG_MOBIL']:
#         object_col.append(col)

In [8]:
# Show which columns were collected for one-hot encoding.
object_col

['gender',
 'car',
 'reality',
 'income_type',
 'edu_type',
 'family_type',
 'house_type',
 'occyp_type']

In [9]:
# One-hot encoder for the object-dtype columns. The categories are learned
# from the training rows only; the same fitted encoder is reused for test.
enc = OneHotEncoder()
enc.fit(train[object_col])

OneHotEncoder()

In [10]:
# Dense one-hot matrix for the categorical columns of train, with column
# labels generated by the fitted encoder.
train_onehot_df = pd.DataFrame(
    enc.transform(train[object_col]).toarray(),
    columns=enc.get_feature_names(object_col),
)
# Replace the raw categorical columns with their one-hot expansion
# (column-wise concat, aligned on the row index).
train = pd.concat([train.drop(object_col, axis=1), train_onehot_df], axis=1)

In [11]:
# Dense one-hot matrix for the categorical columns of test, produced by the
# encoder that was fitted on train.
test_onehot_df = pd.DataFrame(
    enc.transform(test[object_col]).toarray(),
    columns=enc.get_feature_names(object_col),
)
# Replace the raw categorical columns with their one-hot expansion
# (column-wise concat, aligned on the row index).
test = pd.concat([test.drop(object_col, axis=1), test_onehot_df], axis=1)

## 수치형 데이터 feature engineering

In [12]:
# Rescale income by a factor of 1/10000.
train['income_total'] = train['income_total'].div(10000)

# Year-scaled copies of the day/month counters. These must be derived BEFORE
# the raw columns are overwritten with indicators below.
train['year_BIRTH'] = train['DAYS_BIRTH'].div(365)
train['year_EMPLOYED'] = train['DAYS_EMPLOYED'].div(365)
train['begin_year'] = train['begin_month'].div(12)

# Collapse the raw counters to indicators: negative -> 0, positive -> 1
# (exact zeros are left untouched).
# NOTE(review): the begin_month distribution printed later in this notebook
# only contains values <= 0, so this presumably turns begin_month into a
# constant 0 column before it is binned -- verify that this is intended.
for flag_col in ('DAYS_EMPLOYED', 'begin_month'):
    train.loc[train[flag_col].lt(0), flag_col] = 0
    train.loc[train[flag_col].gt(0), flag_col] = 1

# Cap child_num at 3.
train.loc[train['child_num'].ge(3), 'child_num'] = 3
# Rows with family_size >= 5 get child_num = 5, overriding the cap above.
# NOTE(review): family_size itself is never modified; confirm the assignment
# target was not meant to be 'family_size'. The test cell does the same, so
# any change has to be made to both frames together.
train.loc[train['family_size'].ge(5), 'child_num'] = 5

In [13]:
# yes_emp: year_EMPLOYED with every positive value replaced by 0; zero and
# negative values are kept unchanged.
# The printed year_EMPLOYED distribution shows the positive values clustered
# near 1000 -- presumably a placeholder for "not employed"; TODO confirm.
# Vectorised with Series.mask instead of a per-row Python loop: same values,
# but no reliance on the frame having a default 0..n-1 index for `series[i]`.
train['yes_emp'] = train['year_EMPLOYED'].mask(train['year_EMPLOYED'] > 0, 0)

In [14]:
# Rescale income by a factor of 1/10000.
test['income_total'] = test['income_total'].div(10000)

# Year-scaled copies of the day/month counters. These must be derived BEFORE
# the raw columns are overwritten with indicators below.
test['year_BIRTH'] = test['DAYS_BIRTH'].div(365)
test['year_EMPLOYED'] = test['DAYS_EMPLOYED'].div(365)
test['begin_year'] = test['begin_month'].div(12)

# Collapse the raw counters to indicators: negative -> 0, positive -> 1
# (exact zeros are left untouched).
# NOTE(review): the begin_month distribution printed later in this notebook
# only contains values <= 0, so this presumably turns begin_month into a
# constant 0 column before it is binned -- verify that this is intended.
for flag_col in ('DAYS_EMPLOYED', 'begin_month'):
    test.loc[test[flag_col].lt(0), flag_col] = 0
    test.loc[test[flag_col].gt(0), flag_col] = 1

# Cap child_num at 3.
test.loc[test['child_num'].ge(3), 'child_num'] = 3
# Rows with family_size >= 5 get child_num = 5, overriding the cap above.
# NOTE(review): family_size itself is never modified; confirm the assignment
# target was not meant to be 'family_size'. The train cell does the same, so
# any change has to be made to both frames together.
test.loc[test['family_size'].ge(5), 'child_num'] = 5

In [15]:
# yes_emp: year_EMPLOYED with every positive value replaced by 0; zero and
# negative values are kept unchanged.
# Vectorised with Series.mask instead of a per-row Python loop: same values,
# but no reliance on the frame having a default 0..n-1 index for `series[i]`.
test['yes_emp'] = test['year_EMPLOYED'].mask(test['year_EMPLOYED'] > 0, 0)

In [16]:
# Equal-width histograms (as value_counts with `bins`) of the numeric features.
# begin_month is read from `data`, the combined frame built before the numeric
# transforms were applied to `train`; the other three come from `train`.
for frame, column, n_bins in (
    (train, 'income_total', 6),
    (train, 'year_BIRTH', 10),
    (data, 'begin_month', 6),
    (train, 'year_EMPLOYED', 10),
):
    print(frame[column].value_counts(bins=n_bins, sort=False))
#data['DAYS_EMPLOYED'].plot(kind='hist',bins=2,density=True)

(2.544, 28.5]     23146
(28.5, 54.3]       3037
(54.3, 80.1]        216
(80.1, 105.9]        47
(105.9, 131.7]        2
(131.7, 157.5]        9
Name: income_total, dtype: int64
(-68.958, -64.13]     848
(-64.13, -59.35]     2340
(-59.35, -54.57]     2665
(-54.57, -49.79]     2773
(-49.79, -45.01]     2941
(-45.01, -40.23]     3497
(-40.23, -35.45]     3748
(-35.45, -30.67]     3498
(-30.67, -25.89]     3271
(-25.89, -21.11]      876
Name: year_BIRTH, dtype: int64
(-60.061, -50.0]    4099
(-50.0, -40.0]      4925
(-40.0, -30.0]      5865
(-30.0, -20.0]      6686
(-20.0, -10.0]      7689
(-10.0, 0.0]        7193
Name: begin_month, dtype: int64
(-44.094, 61.322]      22019
(61.322, 165.694]          0
(165.694, 270.065]         0
(270.065, 374.437]         0
(374.437, 478.808]         0
(478.808, 583.18]          0
(583.18, 687.551]          0
(687.551, 791.923]         0
(791.923, 896.294]         0
(896.294, 1000.666]     4438
Name: year_EMPLOYED, dtype: int64


In [17]:
# Flip the sign of a column, then discretise it into bins.
def make_bin(df, variable, n, bin_dividers=None):
    """Negate ``df[variable]`` in place and add a ``'<variable>_bin'`` column.

    The frame passed in is modified directly:

    * ``df[variable]`` is replaced by its negation. Calling the function a
      second time on the same column therefore flips the sign back, so it is
      not idempotent.
    * ``df['<variable>_bin']`` is created with ``pd.cut``; its labels are the
      strings ``'0'`` .. ``str(n - 1)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    variable : str
        Name of the numeric column to negate and bin.
    n : int
        Number of bins (and of labels).
    bin_dividers : array-like, optional
        ``n + 1`` bin edges to use. When omitted (the default, and the
        original behaviour) equal-width edges are computed from the negated
        column with ``np.histogram``. Passing the edges returned by an
        earlier call lets a second frame be binned with exactly the same
        boundaries; values falling outside the supplied edges become NaN in
        the bin column.

    Returns
    -------
    The bin edges that were used (the ``np.histogram`` edges, or
    ``bin_dividers`` as given).
    """
    # Sign flip (the original spelled this `=-`, i.e. assignment of the negation).
    df[variable] = -df[variable]
    if bin_dividers is None:
        _, bin_dividers = np.histogram(df[variable], bins=n)
    bin_names = [str(i) for i in range(n)]
    # include_lowest=True so the minimum value lands in the first bin.
    df['%s_bin' % variable] = pd.cut(
        x=df[variable], bins=bin_dividers, labels=bin_names, include_lowest=True
    )
    return bin_dividers

In [18]:
# Negate and bin each numeric feature of train: (column name, number of bins).
for column, n_bins in (
    ('income_total', 7),
    ('year_BIRTH', 10),
    ('begin_month', 6),
    ('yes_emp', 10),
):
    make_bin(train, column, n=n_bins)
#make_bin(train, 'DAYS_EMPLOYED', n=2)
#make_bin(train, 'child_num', n=2)

In [19]:
# Negate and bin each numeric feature of test: (column name, number of bins).
for column, n_bins in (
    ('income_total', 7),
    ('year_BIRTH', 10),
    ('begin_month', 6),
    ('yes_emp', 10),
):
    make_bin(test, column, n=n_bins)
#make_bin(test, 'DAYS_EMPLOYED', n=2)
#make_bin(test, 'child_num', n=2)

In [20]:
# Second one-hot pass, this time over the bin-label columns added by make_bin.
# `object_col` and `enc` are rebound here, replacing the earlier categorical ones.
object_col = ['income_total_bin', 'year_BIRTH_bin', 'begin_month_bin', 'yes_emp_bin']
enc = OneHotEncoder()
enc.fit(train[object_col])

OneHotEncoder()

In [21]:
# Dense one-hot matrix for the bin-label columns of train, with column labels
# generated by the fitted encoder.
train_onehot_df = pd.DataFrame(
    enc.transform(train[object_col]).toarray(),
    columns=enc.get_feature_names(object_col),
)
# Replace the bin-label columns with their one-hot expansion.
train = pd.concat([train.drop(object_col, axis=1), train_onehot_df], axis=1)
#train = train.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED', 'begin_month'], axis=1)

In [22]:
# Dense one-hot matrix for the bin-label columns of test, produced by the
# encoder that was fitted on train.
test_onehot_df = pd.DataFrame(
    enc.transform(test[object_col]).toarray(),
    columns=enc.get_feature_names(object_col),
)
# Replace the bin-label columns with their one-hot expansion.
test = pd.concat([test.drop(object_col, axis=1), test_onehot_df], axis=1)
#test = test.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED', 'begin_month'], axis=1)

In [23]:
# Sanity check of the final frame sizes; train still contains the 'credit'
# label column, which is dropped only when the model inputs are built.
train.shape, test.shape

((26457, 89), (10000, 88))

In [24]:
# Shuffled 5-fold split stratified on the 'credit' label, with a fixed seed.
# Each entry of `folds` is a (train_idx, valid_idx) pair of positional indices.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds = list(skf.split(train, train['credit']))

In [25]:
# Train one LightGBM classifier per cross-validation fold and keep every
# fitted model in `lgb_models`, keyed by fold number (0-based).
random.seed(42)
lgb_models = {}

# The feature matrix and label vector do not change between folds, so they
# are built once here instead of re-dropping 'credit' twice per iteration.
features = train.drop(['credit'], axis=1).values
labels = train['credit'].values

for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f'===================================={fold+1}============================================')
    X_train, X_valid = features[train_idx], features[valid_idx]
    y_train, y_valid = labels[train_idx], labels[valid_idx]

    # random.seed() above only seeds Python's own RNG; LightGBM takes its seed
    # through random_state, so it is passed explicitly for reproducibility.
    lgb = LGBMClassifier(n_estimators=1000, random_state=42)
    # Both sets are evaluated; training stops once the validation score has
    # not improved for 50 rounds. Metrics are logged every 100 rounds.
    lgb.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            early_stopping_rounds=50,
            verbose=100)
    lgb_models[fold] = lgb
    print('================================================================================\n\n')

Training until validation scores don't improve for 50 rounds
[100]	training's multi_logloss: 0.648953	valid_1's multi_logloss: 0.750248
[200]	training's multi_logloss: 0.562382	valid_1's multi_logloss: 0.733084
[300]	training's multi_logloss: 0.500723	valid_1's multi_logloss: 0.727882
[400]	training's multi_logloss: 0.447799	valid_1's multi_logloss: 0.726733
Early stopping, best iteration is:
[354]	training's multi_logloss: 0.471481	valid_1's multi_logloss: 0.726043


Training until validation scores don't improve for 50 rounds
[100]	training's multi_logloss: 0.64557	valid_1's multi_logloss: 0.762176
[200]	training's multi_logloss: 0.556183	valid_1's multi_logloss: 0.748587
[300]	training's multi_logloss: 0.492039	valid_1's multi_logloss: 0.744265
[400]	training's multi_logloss: 0.441873	valid_1's multi_logloss: 0.745668
Early stopping, best iteration is:
[350]	training's multi_logloss: 0.465535	valid_1's multi_logloss: 0.742643


Training until validation scores don't improve for 50 r

In [26]:
# Average the class probabilities predicted by every fold model into the
# submission frame (all columns after the first one).
# The divisor is the number of trained models rather than a hard-coded 5, so
# the average stays correct if the number of folds changes.
n_models = len(lgb_models)
submit.iloc[:, 1:] = 0
for model in lgb_models.values():
    submit.iloc[:, 1:] += model.predict_proba(test) / n_models

In [27]:
# Write the averaged predictions to disk without the DataFrame index.
# The trailing number is presumably the score obtained with this file -- TODO confirm.
submit.to_csv('./submit/5fold_lgb3.csv', index=False) # 0.7272812144

In [28]:
# Preview the first rows of the submission that was just written.
submit.head(20)

Unnamed: 0,index,0,1,2
0,26457,0.047761,0.09822,0.854018
1,26458,0.214729,0.155689,0.629582
2,26459,0.046273,0.103584,0.850143
3,26460,0.106974,0.118443,0.774583
4,26461,0.079316,0.16408,0.756604
5,26462,0.083584,0.148262,0.768154
6,26463,0.456232,0.543424,0.000344
7,26464,0.096812,0.141025,0.762164
8,26465,0.102001,0.148786,0.749213
9,26466,0.064506,0.263556,0.671939
