원본 커널 : https://www.kaggle.com/eliotbarr/stacking-test-sklearn-xgboost-catboost-lightgbm

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import skew

import xgboost as xgb
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression

from math import sqrt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import gc

In [3]:
NFOLDS = 3
SEED = 0
NROWS = None

path = 'C:\\Users\\user\\Downloads\\home-credit-default-risk\\'

data = pd.read_csv(path+'application_train.csv')
test = pd.read_csv(path+'application_test.csv')
prev = pd.read_csv(path+'previous_application.csv')


<br>

### 1.1 train 데이터 object 데이터 뽑기

In [4]:
# data type이 object
categorical_feats = [
    f for f in data.columns if data[f].dtype == 'object'
]

In [6]:
data.columns

Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY',
       ...
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object', length=122)

In [12]:
categorical_feats[:5]

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE']

<br>

### 1.2 pd.factorize() - 범주형 숫자 인코딩

In [8]:
for f_ in categorical_feats:
    data[f_], indexer = pd.factorize(data[f_]) # 
    # data[f_] 는 인코딩된 값, # indexer 는 범주
    test[f_] = indexer.get_indexer(test[f_])
    

In [11]:
indexer.get_indexer(test['EMERGENCYSTATE_MODE'])

array([-1, -1, -1, ..., -1, -1, -1], dtype=int64)

In [13]:
gc.enable()

y_train = data['TARGET']
del data['TARGET']

<br>

### 2.1 prev 데이터 object 데이터 뽑기

In [14]:
prev_cat_features = [
    f_ for f_ in prev.columns if prev[f_].dtype == 'object'
]

In [15]:
for f_ in prev_cat_features:
    prev[f_], _ = pd.factorize(prev[f_])

In [16]:
avg_prev = prev.groupby('SK_ID_CURR').mean()
cnt_prev = prev[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
avg_prev['nb_app'] = cnt_prev['SK_ID_PREV']

del avg_prev['SK_ID_PREV']


<br>

### 3. data merge

In [17]:
x_train = data.merge(right = avg_prev.reset_index(), how ='left', on='SK_ID_CURR')
x_test = test.merge(right = avg_prev.reset_index(), how ='left', on='SK_ID_CURR')

x_train = x_train.fillna(0)
x_test = x_test.fillna(0)

In [19]:
ntrain = x_train.shape[0]
ntest = x_test.shape[0]

excluded_feats = ['SK_ID_CURR']
features = [f_ for f_ in x_train.columns if f_ not in excluded_feats]

x_train = x_train[features]
x_test = x_test[features]

In [20]:
kf = KFold(n_splits=NFOLDS, shuffle = True, random_state = SEED)

In [21]:
class SklearnWrapper(object):
    def __init__(self, clf, seed = 0, params = None):
        params['random_state'] = seed
        self.clf = clf(**params)
        
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
        
    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]

In [22]:
class CatboostWrapper(object):
    def __init__(self, clf, seed = 0, params = None):
        params['random_seed'] =seed
        self.clf = clf(**params)
        
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
    
    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]

In [23]:
class LightGBMWrapper(object):
    def __init__(self, clf, seed=0, params = None):
        params['feature_fraction_seed'] = seed
        params['bagging_seed'] = seed
        self.clf = clf(**params)
        
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
        
    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]

In [24]:
class XgbWrapper(object):
    def __init__(self, seed= 0, params =None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)
        
    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label = y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)
        
    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [25]:
def get_oof(clf):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest, ))
    oof_test_skf = np.empty((NFOLDS, ntest))
    
    for i , (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train.loc[train_index]
        y_tr = y_train.loc[train_index]
        x_te = x_train.loc[test_index]
        
        clf.train(x_tr, y_tr)
        
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)
        
    oof_test[:] = oof_test_skf.mean(axis = 0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [26]:
et_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

rf_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'nrounds': 200
}

catboost_params = {
    'iterations': 200,
    'learning_rate': 0.5,
    'depth': 3,
    'l2_leaf_reg': 40,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.7,
    'scale_pos_weight': 5,
    'eval_metric': 'AUC',
    'od_type': 'Iter',
    'allow_writing_files': False
}

lightgbm_params = {
    'n_estimators':200,
    'learning_rate':0.1,
    'num_leaves':123,
    'colsample_bytree':0.8,
    'subsample':0.9,
    'max_depth':15,
    'reg_alpha':0.1,
    'reg_lambda':0.1,
    'min_split_gain':0.01,
    'min_child_weight':2    
}

In [29]:
xg = XgbWrapper(seed = SEED, params = xgb_params)
et = SklearnWrapper(ExtraTreesClassifier, SEED, et_params)
rf = SklearnWrapper(RandomForestClassifier, SEED, rf_params)
cb = CatboostWrapper(CatBoostClassifier, SEED, catboost_params)
lg = LightGBMWrapper(LGBMClassifier, SEED, lightgbm_params)

In [30]:
xg_oof_train, xg_oof_test = get_oof(xg)
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
cb_oof_train, cb_oof_test = get_oof(cb)

  if getattr(data, 'base', None) is not None and \


0:	total: 218ms	remaining: 43.5s
1:	total: 296ms	remaining: 29.3s
2:	total: 378ms	remaining: 24.8s
3:	total: 453ms	remaining: 22.2s
4:	total: 536ms	remaining: 20.9s
5:	total: 628ms	remaining: 20.3s
6:	total: 719ms	remaining: 19.8s
7:	total: 800ms	remaining: 19.2s
8:	total: 869ms	remaining: 18.4s
9:	total: 947ms	remaining: 18s
10:	total: 1.03s	remaining: 17.6s
11:	total: 1.12s	remaining: 17.5s
12:	total: 1.19s	remaining: 17.2s
13:	total: 1.27s	remaining: 16.9s
14:	total: 1.39s	remaining: 17.2s
15:	total: 1.47s	remaining: 16.9s
16:	total: 1.55s	remaining: 16.6s
17:	total: 1.62s	remaining: 16.4s
18:	total: 1.71s	remaining: 16.2s
19:	total: 1.79s	remaining: 16.2s
20:	total: 1.95s	remaining: 16.6s
21:	total: 2.04s	remaining: 16.5s
22:	total: 2.12s	remaining: 16.3s
23:	total: 2.19s	remaining: 16.1s
24:	total: 2.27s	remaining: 15.9s
25:	total: 2.34s	remaining: 15.7s
26:	total: 2.51s	remaining: 16.1s
27:	total: 2.59s	remaining: 15.9s
28:	total: 2.66s	remaining: 15.7s
29:	total: 2.74s	remaining

43:	total: 3.67s	remaining: 13s
44:	total: 3.74s	remaining: 12.9s
45:	total: 3.82s	remaining: 12.8s
46:	total: 3.92s	remaining: 12.7s
47:	total: 3.99s	remaining: 12.6s
48:	total: 4.07s	remaining: 12.5s
49:	total: 4.14s	remaining: 12.4s
50:	total: 4.22s	remaining: 12.3s
51:	total: 4.32s	remaining: 12.3s
52:	total: 4.42s	remaining: 12.3s
53:	total: 4.5s	remaining: 12.2s
54:	total: 4.57s	remaining: 12s
55:	total: 4.65s	remaining: 12s
56:	total: 4.73s	remaining: 11.9s
57:	total: 4.82s	remaining: 11.8s
58:	total: 4.9s	remaining: 11.7s
59:	total: 4.99s	remaining: 11.7s
60:	total: 5.08s	remaining: 11.6s
61:	total: 5.16s	remaining: 11.5s
62:	total: 5.24s	remaining: 11.4s
63:	total: 5.32s	remaining: 11.3s
64:	total: 5.39s	remaining: 11.2s
65:	total: 5.47s	remaining: 11.1s
66:	total: 5.55s	remaining: 11s
67:	total: 5.66s	remaining: 11s
68:	total: 5.74s	remaining: 10.9s
69:	total: 5.82s	remaining: 10.8s
70:	total: 5.9s	remaining: 10.7s
71:	total: 5.98s	remaining: 10.6s
72:	total: 6.07s	remaining:

84:	total: 6.59s	remaining: 8.92s
85:	total: 6.67s	remaining: 8.84s
86:	total: 6.75s	remaining: 8.76s
87:	total: 6.81s	remaining: 8.67s
88:	total: 6.87s	remaining: 8.57s
89:	total: 6.95s	remaining: 8.49s
90:	total: 7.03s	remaining: 8.42s
91:	total: 7.11s	remaining: 8.35s
92:	total: 7.2s	remaining: 8.28s
93:	total: 7.28s	remaining: 8.21s
94:	total: 7.36s	remaining: 8.13s
95:	total: 7.44s	remaining: 8.06s
96:	total: 7.52s	remaining: 7.99s
97:	total: 7.61s	remaining: 7.92s
98:	total: 7.7s	remaining: 7.85s
99:	total: 7.78s	remaining: 7.78s
100:	total: 7.84s	remaining: 7.69s
101:	total: 7.92s	remaining: 7.61s
102:	total: 8s	remaining: 7.54s
103:	total: 8.07s	remaining: 7.45s
104:	total: 8.15s	remaining: 7.38s
105:	total: 8.24s	remaining: 7.3s
106:	total: 8.32s	remaining: 7.23s
107:	total: 8.39s	remaining: 7.15s
108:	total: 8.47s	remaining: 7.07s
109:	total: 8.56s	remaining: 7s
110:	total: 8.63s	remaining: 6.92s
111:	total: 8.71s	remaining: 6.84s
112:	total: 8.79s	remaining: 6.77s
113:	total

In [31]:
print('XG-CV: {}'.format(sqrt(mean_squared_error(y_train, xg_oof_train))))
print('ET-CV: {}'.format(sqrt(mean_squared_error(y_train, et_oof_train))))
print('RF-CV: {}'.format(sqrt(mean_squared_error(y_train, rf_oof_train))))
print('CB-CV: {}'.format(sqrt(mean_squared_error(y_train, cb_oof_train))))

XG-CV: 0.2594740573387384
ET-CV: 0.26296502347137407
RF-CV: 0.2629856213006646
CB-CV: 0.3308268881697429


In [32]:
x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, cb_oof_train), axis = 1)
x_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, cb_oof_test), axis =1)

In [33]:
print('{}, {}'.format(x_train.shape, x_test.shape))

(307511, 4), (48744, 4)


In [34]:
logistic_regression = LogisticRegression()
logistic_regression.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [36]:
test['TARGET'] = logistic_regression.predict_proba(x_test)[:,1]
test[['SK_ID_CURR', 'TARGET']].to_csv(path+'first_submission.csv', index = False, float_format ='%.8f' )