# Import Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
path = '/content/drive/MyDrive/credit/'

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from tqdm.notebook import tqdm

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss

from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import cross_val_score
# from sklearn.metrics import accuracy_score, mean_squared_error

In [5]:
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',20)

# Read in the data files

In [6]:
train = pd.read_csv(path+'train.csv')
train.fillna('NAN', inplace=True) 


test = pd.read_csv(path+'test.csv')
test.fillna('NAN', inplace=True)

submit = pd.read_csv(path+'sample_submission.csv')

# Feature Engineering

In [7]:
data = pd.concat([train.copy(),test.copy()],axis=0)

In [8]:
def preprocessing(df):
    
    # 컬럼명 소문자화
    df.columns = df.columns.str.lower()
    
    # 범주형 변수 type 변경
    df['work_phone'] = np.where(df['work_phone']==1,"Y","N")
    df['phone'] = np.where(df['phone']==1,"Y","N")
    df['email'] = np.where(df['email']==1,"Y","N")
    
    # 부호 변경
    df['days_birth'] = -1*df['days_birth']
    df['days_employed'] = -1*df['days_employed'] 
    df['begin_month'] = -1*df['begin_month']
    
    
    # 열 제거
    df = df.drop(['flag_mobil', 'index'],axis=1) 
    
    
    cat_features = df.dtypes[df.dtypes == 'object'].index.tolist()
    
    
    edu_dict = {'Academic degree' : 5,
            'Higher education' : 4,
            'Incomplete higher' : 3,
            'Secondary / secondary special' : 2,
            'Lower secondary' : 1
            }

    df['edu_type'] = df['edu_type'].apply(lambda x : edu_dict[x]).astype('int')
    
    
    df['age'] = df['days_birth']%365
    
    work = ['Commercial associate', 'Working', 'State servant']
    df['group_income_type_work'] = df['income_type'].apply(lambda x: np.where(x in work, 'work', 'non_work'))
    
    
    money = ['Commercial associate', 'Working', 'State servant', 'Pensioner'] 
    df['group_income_type_money'] = df['income_type'].apply(lambda x: np.where(x in money, 'money', 'non_money'))
    
    
    couple = ['Married', 'Civil marriage']
    df['group_family_type_solo'] = df['family_type'].apply(lambda x: np.where(x in couple, 'couple', 'solo'))
                                                              
    
    married1 = ['Married', 'Civil marriage', 'Separated']
    df['group_family_type_married1'] = df['family_type'].apply(lambda x: np.where(x in married1, 'married1', 'non_married1'))
    
                                                              
    married2 = ['Married', 'Civil marriage', 'Separated', 'Widow']
    df['group_family_type_married2'] = df['family_type'].apply(lambda x: np.where(x in married2, 'married2', 'non_married2'))                                                                          
    
                                                              
    return df

In [9]:
data = preprocessing(data)

### Feature engineering pipeline

In [10]:
def feature_pipeline(df):    
    df['income_child_ratio'] = df['income_total']/df['child_num']
    df['birth_child_ratio'] = df['days_birth']/df['child_num']
    df['employed_child_ratio'] = df['days_employed']/df['child_num']
    df['child_family_ratio'] = df['child_num']/df['family_size']
    df['begin_child_ratio'] = df['begin_month']/df['child_num']
    df['income_birth_ratio'] = df['income_total']/df['days_birth']
    df['income_emp_ratio'] = df['income_total']/df['days_employed']
    df['income_family_ratio'] = df['income_total']/df['family_size']
    df['income_begin_ratio'] = df['income_total']/df['begin_month']    
    df['employed_birth_ratio'] = df['days_employed']/df['days_birth']
    df['family_birth_ratio'] = df['family_size']/df['days_birth']
    df['begin_birth_ratio'] = df['begin_month']/df['days_birth']
    df['family_employed_ratio'] = df['family_size']/df['days_employed']
    df['begin_employed_ratio'] = df['begin_month']/df['days_employed']
    df['begin_family_ratio'] = df['begin_month']/df['family_size']
    df['child_age_ratio'] = df['child_num']/df['age']
    df['income_total_age_ratio'] = df['income_total']/df['age']
    df['birth_age_ratio'] = df['days_birth']/df['age']
    df['employed_age_ratio'] = df['days_employed']/df['age']
    df['family_age_ratio'] = df['family_size']/df['age']
    df['begin_age_ratio'] = df['begin_month']/df['age']

    

    df['family_child_diff'] = df['family_size'] - df['child_num']   
    df['birth_begin_diff'] = df['days_birth'] - (df['begin_month'] * 30)    
    df['employed_begin_diff'] = df['days_employed'] - df['begin_month']
    
    
    df['day_birth_month']=np.floor((df['days_birth'])/30)-((np.floor((df['days_birth'])/30)/12).astype(int)*12)
    df['DAYS_BIRTH_week']=np.floor((df['days_birth'])/7)-((np.floor((df['days_birth'])/7)/4).astype(int)*4)
    
    df['DAYS_EMPLOYED_month']=np.floor((df['days_birth'])/30)-((np.floor((df['days_birth'])/30)/12).astype(int)*12)
    df['DAYS_EMPLOYED_week']=np.floor((df['days_birth'])/7)-((np.floor((df['days_birth'])/7)/4).astype(int)*4)
    
    df['before_employed']=df['days_birth']-df['days_employed']
    df['before_employed_month']=np.floor((df['before_employed'])/30)-((np.floor((df['before_employed'])/30)/12).astype(int)*12)
    df['before_employed_week']=np.floor((df['before_employed'])/7)-((np.floor((df['before_employed'])/7)/4).astype(int)*4)
    
    df['days_birth_weekday'] = (df['days_birth'])%7   
    df['days_birth_days_in_month'] = (df['days_birth']) - (np.floor((-df['days_birth'])/30)*30)
    df['days_birth_day'] = (df['days_birth'])%365
    
    df['days_employed_weekday'] = (df['days_employed'])%7
    df['before_EMPLOYED_weekday'] = (df['days_employed'])%7
    df['before_employed_weekday_days_in_month'] = (df['before_employed']) - (np.floor((df['before_employed'])/30)*30)
    df['before_employed_year_week'] = ((df['before_employed'])%365)/7
    df['before_employed_year_month'] = ((df['before_employed'])%365)/30
    df['days_birth_year_num_week'] = ((df['days_birth'])%365)//7
    df['before_employed_year_num_month'] = ((df['before_employed'])%365)//30
    df['days_birth_10']=np.floor((df['days_birth'])/7)-((np.floor((df['days_birth'])/7)/3).astype(int)*3)
    df['days_employed_10']=np.floor((df['days_employed'])/7)-((np.floor((df['days_employed'])/7)/3).astype(int)*3)
    df['before_employed_10']=np.floor((df['before_employed'])/7)-((np.floor((df['before_employed'])/7)/3).astype(int)*3)
    df['days_birth_year_week'] = ((df['days_birth'])%365)/7

    
    return df

In [11]:
data= feature_pipeline(data)

In [12]:
cat_columns = ['gender', 'car', 'reality', 'income_type', 'edu_type', 'family_type', 'house_type', 'work_phone', 
               'phone', 'email', 'occyp_type', 'credit', 'group_income_type_work', 'group_income_type_money', 
              'group_family_type_solo', 'group_family_type_married1', 'group_family_type_married2'] 
num_columns = [c for c in data.columns if c not in cat_columns]
print('Categorical Columns: \n{}\n\n Numeric Columns: \n{}\n'.format(cat_columns, num_columns))

Categorical Columns: 
['gender', 'car', 'reality', 'income_type', 'edu_type', 'family_type', 'house_type', 'work_phone', 'phone', 'email', 'occyp_type', 'credit', 'group_income_type_work', 'group_income_type_money', 'group_family_type_solo', 'group_family_type_married1', 'group_family_type_married2']

 Numeric Columns: 
['child_num', 'income_total', 'days_birth', 'days_employed', 'family_size', 'begin_month', 'age', 'income_child_ratio', 'birth_child_ratio', 'employed_child_ratio', 'child_family_ratio', 'begin_child_ratio', 'income_birth_ratio', 'income_emp_ratio', 'income_family_ratio', 'income_begin_ratio', 'employed_birth_ratio', 'family_birth_ratio', 'begin_birth_ratio', 'family_employed_ratio', 'begin_employed_ratio', 'begin_family_ratio', 'child_age_ratio', 'income_total_age_ratio', 'birth_age_ratio', 'employed_age_ratio', 'family_age_ratio', 'begin_age_ratio', 'family_child_diff', 'birth_begin_diff', 'employed_begin_diff', 'day_birth_month', 'DAYS_BIRTH_week', 'DAYS_EMPLOYED_mon

In [25]:
def derived_variable(df):

    X_pivot_train = pd.pivot_table(data = df, 
                               values = num_columns,  
                               index = 'gender', 
                               aggfunc = ['sum','mean',       
                                          'median','min','max', 
                                          'std','var'   
                                         ]
                              )
    
    X_columns = [agg + '_' + 'gender' + '_' + column for agg,column in X_pivot_train.columns]
    X_pivot_train.columns = X_columns
    df = pd.merge(df, X_pivot_train, on='gender')
    
    X_pivot_train = pd.pivot_table(data = df, 
                               values = num_columns,  
                               index = 'car', 
                               aggfunc = ['sum','mean',       
                                          'median','min','max', 
                                          'std','var'   
                                         ]
                              )
    
    X_columns = [agg + '_' + 'car' + '_' + column for agg,column in X_pivot_train.columns]
    X_pivot_train.columns = X_columns
    df = pd.merge(df, X_pivot_train, on='car')
    
    X_pivot_train = pd.pivot_table(data = df, 
                               values = num_columns,  
                               index = 'reality', 
                               aggfunc = ['sum','mean',       
                                          'median','min','max', 
                                          'std','var'   
                                         ]
                              )
    
    X_columns = [agg + '_' + 'reality' + '_' + column for agg,column in X_pivot_train.columns]
    X_pivot_train.columns = X_columns
    df = pd.merge(df, X_pivot_train, on='reality')
    
    X_pivot_train = pd.pivot_table(data = df, 
                               values = num_columns,  
                               index = 'income_type', 
                               aggfunc = ['sum','mean',       
                                          'median','min','max', 
                                          'std','var'   
                                         ]
                              )
    
    X_columns = [agg + '_' + 'income_type' + '_' + column for agg,column in X_pivot_train.columns]
    X_pivot_train.columns = X_columns
    df = pd.merge(df, X_pivot_train, on='income_type')
    
    X_pivot_train = pd.pivot_table(data = df, 
                               values = num_columns,  
                               index = 'edu_type', 
                               aggfunc = ['sum','mean',       
                                          'median','min','max', 
                                          'std','var'   
                                         ]
                              )
    
    X_columns = [agg + '_' + 'edu_type' + '_' + column for agg,column in X_pivot_train.columns]
    X_pivot_train.columns = X_columns
    df = pd.merge(df, X_pivot_train, on='edu_type')
    
    X_pivot_train = pd.pivot_table(data = df, 
                               values = num_columns,  
                               index = 'family_type', 
                               aggfunc = ['sum','mean',       
                                          'median','min','max', 
                                          'std','var'   
                                         ]
                              )
    
    X_columns = [agg + '_' + 'family_type' + '_' + column for agg,column in X_pivot_train.columns]
    X_pivot_train.columns = X_columns
    df = pd.merge(df, X_pivot_train, on='family_type')

    X_pivot_train = pd.pivot_table(data = df, 
                               values = num_columns,  
                               index = 'house_type', 
                               aggfunc = ['sum','mean',       
                                          'median','min','max', 
                                          'std','var'   
                                         ]
                              )
    
    X_columns = [agg + '_' + 'house_type' + '_' + column for agg,column in X_pivot_train.columns]
    X_pivot_train.columns = X_columns
    df = pd.merge(df, X_pivot_train, on='house_type')

    X_pivot_train = pd.pivot_table(data = df, 
                               values = num_columns,  
                               index = 'work_phone', 
                               aggfunc = ['sum','mean',       
                                          'median','min','max', 
                                          'std','var'   
                                         ]
                              )
    
    X_columns = [agg + '_' + 'work_phone' + '_' + column for agg,column in X_pivot_train.columns]
    X_pivot_train.columns = X_columns
    df = pd.merge(df, X_pivot_train, on='work_phone')
    
    X_pivot_train = pd.pivot_table(data = df, 
                               values = num_columns,  
                               index = 'phone', 
                               aggfunc = ['sum','mean',       
                                          'median','min','max', 
                                          'std','var'   
                                         ]
                              )
    
    X_columns = [agg + '_' + 'phone' + '_' + column for agg,column in X_pivot_train.columns]
    X_pivot_train.columns = X_columns
    df = pd.merge(df, X_pivot_train, on='phone')
    
    X_pivot_train = pd.pivot_table(data = df, 
                               values = num_columns,  
                               index = 'email', 
                               aggfunc = ['sum','mean',       
                                          'median','min','max', 
                                          'std','var'   
                                         ]
                              )
    
    X_columns = [agg + '_' + 'email' + '_' + column for agg,column in X_pivot_train.columns]
    X_pivot_train.columns = X_columns
    df = pd.merge(df, X_pivot_train, on='email')
    
    X_pivot_train = pd.pivot_table(data = df, 
                               values = num_columns,  
                               index = 'occyp_type', 
                               aggfunc = ['sum','mean',       
                                          'median','min','max', 
                                          'std','var'   
                                         ]
                              )
    
    X_columns = [agg + '_' + 'occyp_type' + '_' + column for agg,column in X_pivot_train.columns]
    X_pivot_train.columns = X_columns
    df = pd.merge(df, X_pivot_train, on='occyp_type')

    """
    X_pivot_train = pd.pivot_table(data = df, 
                               values = num_columns,  
                               index = 'group_income_type_work', 
                               aggfunc = ['sum','mean',       
                                          'median','min','max', 
                                          'std','var'   
                                         ]
                              )
    
    X_columns = [agg + '_' + 'group_income_type_work' + '_' + column for agg,column in X_pivot_train.columns]
    X_pivot_train.columns = X_columns
    df = pd.merge(df, X_pivot_train, on='group_income_type_work')    
    
    
    X_pivot_train = pd.pivot_table(data = df, 
                               values = num_columns,  
                               index = 'group_income_type_money', 
                               aggfunc = ['sum','mean',       
                                          'median','min','max', 
                                          'std','var'   
                                         ]
                              )
    
    X_columns = [agg + '_' + 'group_income_type_money' + '_' + column for agg,column in X_pivot_train.columns]
    X_pivot_train.columns = X_columns
    df = pd.merge(df, X_pivot_train, on='group_income_type_money')    
    
    
    X_pivot_train = pd.pivot_table(data = df, 
                               values = num_columns,  
                               index = 'group_family_type_solo', 
                               aggfunc = ['sum','mean',       
                                          'median','min','max', 
                                          'std','var'   
                                         ]
                              )
    
    X_columns = [agg + '_' + 'group_family_type_solo' + '_' + column for agg,column in X_pivot_train.columns]
    X_pivot_train.columns = X_columns
    df = pd.merge(df, X_pivot_train, on='group_family_type_solo')    
    

    X_pivot_train = pd.pivot_table(data = df, 
                               values = num_columns,  
                               index = 'group_family_type_married1', 
                               aggfunc = ['sum','mean',       
                                          'median','min','max'  
                                         ]
                              )
    
    X_columns = [agg + '_' + 'group_family_type_married1' + '_' + column for agg,column in X_pivot_train.columns]
    X_pivot_train.columns = X_columns
    df = pd.merge(df, X_pivot_train, on='group_family_type_married1')    
    
    
    X_pivot_train = pd.pivot_table(data = df, 
                               values = num_columns,  
                               index = 'group_family_type_married2', 
                               aggfunc = ['sum','mean',       
                                          'median','min','max'   
                                         ]
                              )
    
    X_columns = [agg + '_' + 'group_family_type_married2' + '_' + column for agg,column in X_pivot_train.columns]
    X_pivot_train.columns = X_columns
    df = pd.merge(df, X_pivot_train, on='group_family_type_married2')    
    """
    # One-hot encoding
    df = pd.get_dummies(data=df,drop_first=True)
    
    return df

In [26]:
train = data[:len(data)-10000]
test = data[len(data)-10000:]


df_train = derived_variable(train)
df_test = derived_variable(test)

In [27]:
features = df_train.columns.difference(['credit'])

X = df_train[features]
y = df_train['credit']
X_test = df_test[features]

## LGBM  

: boosting_type(gbdt) 이외의 것으로 변경


: dart, goss, rf

* (전체모델) `KFold` 보다 `StratifiedKFold`의 성능이 더 낮게 나옴.

* (성능) One-hot encoding  >  Label encoding 

ver 4.

    * CV : 0.6931017417229036
    * PB : 0.6942892129


실험) One-hot encoding  v.s.  Label encoding 

    * 0.6931(One-hot) < 0.6937(Label)

    * edu_type -> object : 0.6937034671128072 
    * edu_type -> int : 0.6937034671128072
    * cat_feature option ~ X : 0.6937034671128072 


In [28]:
seed_num = 42
NUM_FOLDS = 10

In [29]:
lgbm_params = {
    "boosting" : 'gbdt', # gbdt, goss
    "random_state": seed_num,
    "n_jobs": -1,
    "early_stopping_round": 100,
    'num_iterations' : 1000,

    # "categorical_feature:name=" : cat_features,

    "metric": "multi_logloss",
    "objective" : "multiclass",

    "n_estimators": 10000,
    "learning_rate": 0.01,

    "max_depth": -1,   
    "num_leaves": 200,     
    "colsample_bytree": 0.8, 
    "max_bin": 100,
    "min_child_samples": 8,  # min_data_in_leaf

    "feature_fraction" : 0.2,
    'scale_pos_weight': 1.5
    # "lambda_l1" : 0.2,
    # "lambda_l2" : 0.1,

}

In [30]:
train_oof = np.zeros((len(df_train),3))
test_preds = 0

kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=seed_num)
# kf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=seed_num)

for f, (train_ind, val_ind) in tqdm(enumerate(kf.split(X, y))):
        tmp_train_df, tmp_val_df = X.iloc[train_ind], X.iloc[val_ind]
        train_target, val_target = y[train_ind], y[val_ind]
        
        model = LGBMClassifier(**lgbm_params)
        model.fit(
            tmp_train_df, 
            train_target, 
            eval_metric = 'multi_logloss',
            eval_set=[(tmp_val_df, val_target)], 
            early_stopping_rounds = 10, 
            verbose = False
        )
        
        temp_oof = model.predict_proba(tmp_val_df)
        temp_test = model.predict_proba(X_test)

        train_oof[val_ind,:] = temp_oof
        test_preds += temp_test/NUM_FOLDS
        
print(log_loss(y.astype('int'),train_oof))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


0.6913613434573483


In [31]:
print(log_loss(y.astype('int'),train_oof))

0.6913613434573483


In [33]:
lgb.plot_importance(model, figsize=(12, 300), color='black')

Output hidden; open in https://colab.research.google.com to view.