In [148]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import torch
from sklearn.preprocessing import LabelEncoder, PowerTransformer, StandardScaler, \
                                    MinMaxScaler, FunctionTransformer, OneHotEncoder
from sklearn.metrics import roc_auc_score, log_loss, classification_report
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.cluster import KMeans
from function_dt_check import time_checker
from category_encoders.ordinal import OrdinalEncoder
import json
import os
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier, Pool
%matplotlib inline

In [149]:
# Global matplotlib defaults applied to every figure drawn in this notebook.
plt.rcParams['axes.unicode_minus'] = False
# NOTE(review): 'Hancom Gothic' must be installed on the machine running the
# notebook — presumably chosen so that Hangul labels render; confirm.
plt.rcParams['font.family'] = 'Hancom Gothic'
plt.style.use('bmh')
plt.rc('font',size=15)

In [150]:
# Root of the project checkout; every data file below is resolved against it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
BASE_DIR = '/Users/HwaLang/Desktop/python/T academy/Kaggle_camp/'
train_path = os.path.join(BASE_DIR, 'data', 'MDC14', 'train.csv')
test_path  = os.path.join(BASE_DIR, 'data', 'MDC14', 'test.csv')

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Built with os.path.join like the two paths above; the previous f-string
# produced a doubled separator because BASE_DIR already ends with '/'.
submission = pd.read_csv(os.path.join(BASE_DIR, 'data', 'MDC14', 'sample_submission.csv'))

In [151]:
# Replace every missing value with the literal string 'NaN' so that missing
# entries become their own category instead of a float NaN.
train = train.fillna('NaN')
test = test.fillna('NaN')

In [152]:
# Keep only training rows with family_size of at most 7 and renumber the rows.
train = train.loc[train['family_size'] <= 7].reset_index(drop=True)

In [153]:
# Remove the FLAG_MOBIL column from both frames.
train = train.drop(columns=['FLAG_MOBIL'])
test = test.drop(columns=['FLAG_MOBIL'])

In [154]:
# Positive DAYS_EMPLOYED values are replaced by 0; non-positive values are kept.
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].clip(upper=0)
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].clip(upper=0)

In [155]:
# Turn the day/month offsets into magnitudes (absolute values).
feats = ['DAYS_BIRTH', 'begin_month', 'DAYS_EMPLOYED']
for feat in feats:
    train[feat] = train[feat].abs()
    test[feat] = test[feat].abs()

In [156]:
def _cycle_position(days, unit_days, period):
    """Return ``floor(days / unit_days)`` reduced to its position inside a cycle of ``period`` units."""
    units = np.floor(days / unit_days)
    return units - (units / period).astype(int) * period


# Columns whose string forms are joined with '_' (in this order) to build ``ID``.
# begin_month is left out on purpose: one person may have opened several cards.
id_parts = ['child_num', 'income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
            'work_phone', 'phone', 'email', 'family_size',
            'gender', 'car', 'reality', 'income_type',
            'edu_type', 'family_type', 'house_type', 'occyp_type']

for df in [train, test]:
    # before_EMPLOYED: number of days lived before the employment started.
    df['before_EMPLOYED'] = df['DAYS_BIRTH'] - df['DAYS_EMPLOYED']
    df['income_total_befofeEMP_ratio'] = df['income_total'] / df['before_EMPLOYED']
    df['before_EMPLOYED_m'] = _cycle_position(df['before_EMPLOYED'], 30, 12)
    df['before_EMPLOYED_w'] = _cycle_position(df['before_EMPLOYED'], 7, 4)

    # DAYS_BIRTH derivatives: Age in whole 365-day years, plus the 30-day month
    # count modulo 12 and the 7-day week count modulo 4.
    df['Age'] = df['DAYS_BIRTH'] // 365
    df['DAYS_BIRTH_m'] = _cycle_position(df['DAYS_BIRTH'], 30, 12)
    df['DAYS_BIRTH_w'] = _cycle_position(df['DAYS_BIRTH'], 7, 4)

    # DAYS_EMPLOYED derivatives: EMPLOYED (years of service) and the same
    # month / week cycle positions as above.
    df['EMPLOYED'] = df['DAYS_EMPLOYED'] // 365
    df['DAYS_EMPLOYED_m'] = _cycle_position(df['DAYS_EMPLOYED'], 30, 12)
    df['DAYS_EMPLOYED_w'] = _cycle_position(df['DAYS_EMPLOYED'], 7, 4)

    # ability: income / (days lived + days employed)
    df['ability'] = df['income_total'] / (df['DAYS_BIRTH'] + df['DAYS_EMPLOYED'])

    # income_mean: income / family size
    df['income_mean'] = df['income_total'] / df['family_size']

    # ID: concatenate the listed columns to identify a unique person.
    id_series = df[id_parts[0]].astype(str)
    for part in id_parts[1:]:
        id_series = id_series + '_' + df[part].astype(str)
    df['ID'] = id_series

In [157]:
# Raw columns that are dropped now that the derived features and ID exist.
cols = ['child_num', 'DAYS_BIRTH', 'DAYS_EMPLOYED']
train = train.drop(columns=cols)
test = test.drop(columns=cols)

In [158]:
# Partition the training columns by dtype: object columns are categorical,
# everything else is numerical (the 'credit' target is excluded).
numerical_feats = train.select_dtypes(exclude='object').columns.tolist()
numerical_feats.remove('credit')
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = train.select_dtypes(include='object').columns.tolist()
print("Number of Categorical features: ", len(categorical_feats))

Number of Numerical features:  19
Number of Categorical features:  9


In [159]:
# Log-transform the income column. np.log1p already computes log(1 + x), so
# the extra "1 +" that used to be here (giving log(2 + x)) has been removed.
for df in [train, test]:
    df['income_total'] = np.log1p(df['income_total'])

In [160]:
# Ordinal-encode the object columns. The column list must be passed as
# ``cols=``: the first positional parameter of category_encoders'
# OrdinalEncoder is ``verbose``, so the previous positional call never set
# ``cols`` at all.
encoder = OrdinalEncoder(cols=categorical_feats)
train[categorical_feats] = encoder.fit_transform(train[categorical_feats], train['credit'])
test[categorical_feats] = encoder.transform(test[categorical_feats])

# Store the encoded ID as int64 in both frames.
train['ID'] = train['ID'].astype('int64')
test['ID'] = test['ID'].astype('int64')

In [161]:
# Fit a 36-cluster KMeans on every training column except the target and add
# the assigned cluster id as a new feature on both frames.
kmeans_train = train.drop(['credit'], axis=1)
kmeans = KMeans(n_clusters=36, random_state=42).fit(kmeans_train)
train['cluster'] = kmeans.predict(kmeans_train)
# Select the test columns explicitly so the features reach KMeans in exactly
# the order it was fitted on, instead of relying on the two frames happening
# to share a column order.
test['cluster'] = kmeans.predict(test[kmeans_train.columns])

In [162]:
# Standardise every numerical feature except income_total (already
# log-transformed). The list is rebuilt rather than mutated with .remove(),
# so re-running this cell no longer raises ValueError.
numerical_feats = [feat for feat in numerical_feats if feat != 'income_total']
scaler = StandardScaler()
train[numerical_feats] = scaler.fit_transform(train[numerical_feats])
test[numerical_feats] = scaler.transform(test[numerical_feats])

In [163]:
# Feature column names: every training column except the 'credit' target.
trainkeys = list(train.columns)
del trainkeys[trainkeys.index('credit')]

In [164]:
# Preview the first rows of the training frame after encoding, clustering and scaling.
train.head()

Unnamed: 0,index,gender,car,reality,income_total,income_type,edu_type,family_type,house_type,work_phone,...,Age,DAYS_BIRTH_m,DAYS_BIRTH_w,EMPLOYED,DAYS_EMPLOYED_m,DAYS_EMPLOYED_w,ability,income_mean,ID,cluster
0,-1.731942,1,1,1,12.218505,1,1,1,1,-0.538321,...,-0.452826,0.442795,-0.443485,0.994253,-1.230046,-1.077087,-0.032496,0.002062,1,15
1,-1.731811,1,1,2,12.419174,1,2,2,2,-0.538321,...,-1.060773,0.442795,-0.443485,-0.250471,-0.424295,-1.077087,1.190137,-0.254157,2,22
2,-1.73168,2,2,2,13.017007,2,1,1,2,-0.538321,...,0.763069,-1.582567,0.451504,0.994253,-0.424295,-0.223607,1.186515,1.693108,3,12
3,-1.731549,1,1,2,12.218505,1,2,1,2,-0.538321,...,-0.192277,1.310808,1.346494,-0.09488,1.187206,0.629874,0.101168,0.002062,4,15
4,-1.731418,1,2,2,11.967193,3,1,1,2,-0.538321,...,-0.192277,1.021471,-1.338475,-0.09488,1.45579,-1.077087,-0.282885,-0.305401,5,22


In [165]:
# Drop the 'index' column from the training frame.
train = train.drop(columns=['index'])

In [186]:
# Drop the 'index' column from the test frame as well.
test = test.drop(columns=['index'])

In [166]:
# 'index' was dropped from the frames, so remove it from the feature list too.
del trainkeys[trainkeys.index('index')]

In [167]:
# Feature matrix and target vector used for every model below.
data = train[trainkeys]
label = train['credit']

In [168]:
# Hold out 20% of the labelled data. The split is seeded so the hold-out rows,
# and every score computed on them, are the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    data, label, test_size=0.2, random_state=123
)

In [169]:
@time_checker
def train_model(x_data, y_data, params, k=5, num_boost_round = 200, verbose_eval = 100, early_stopping_rounds = 100, stratified = False, return_models = False):
    """Train one native XGBoost booster per cross-validation fold.

    Parameters
    ----------
    x_data, y_data : features and labels; both are indexed with ``.iloc``.
    params : dict passed unchanged to ``xgb.train``.
    k : number of folds.
    num_boost_round, verbose_eval, early_stopping_rounds : forwarded to ``xgb.train``.
    stratified : use ``StratifiedKFold`` (split on ``y_data``) instead of ``KFold``.
    return_models : when true, return the list of fitted boosters.

    Returns
    -------
    list of boosters when ``return_models`` is true, otherwise ``None``.
    The mean of the boosters' ``best_score`` is always printed.
    """
    # Only ``XGBClassifier`` is imported at the top of the notebook, so the
    # ``xgb`` module alias used below has to be bound here; without it every
    # call failed with NameError.
    import xgboost as xgb

    if stratified:
        k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=123)
        split_args = [x_data, y_data]
    else:
        k_fold = KFold(n_splits=k, shuffle=True, random_state=123)
        split_args = [x_data]

    models = []
    for train_idx, val_idx in k_fold.split(*split_args):
        x_train, y_train = x_data.iloc[train_idx], y_data.iloc[train_idx]
        x_val, y_val = x_data.iloc[val_idx], y_data.iloc[val_idx]

        d_train = xgb.DMatrix(data = x_train, label = y_train)
        d_val = xgb.DMatrix(data = x_val, label = y_val)

        # The last entry of ``evals`` is the one early stopping monitors.
        wlist = [(d_train, 'train'), (d_val, 'eval')]

        model = xgb.train(params=params, dtrain=d_train, num_boost_round = num_boost_round, evals=wlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose_eval)
        models.append(model)

    print(f"{k} fold mean score:", np.mean([i.best_score for i in models]))

    if return_models:
        return models

@time_checker
def last_train(X_test, y_test, params, num_boost_round = 200):
    """Fit a single native XGBoost booster on the given data, with no validation set.

    Parameters
    ----------
    X_test, y_test : features and labels the booster is trained on (despite
        the names, these are used as the training ``DMatrix``).
    params : dict passed unchanged to ``xgb.train``.
    num_boost_round : number of boosting rounds.

    Returns
    -------
    The trained booster.
    """
    # Only ``XGBClassifier`` is imported at the top of the notebook, so the
    # ``xgb`` module alias has to be bound here to avoid a NameError.
    import xgboost as xgb

    print("***최종 학습 전 하이퍼 파라미터 다시한번 확인!!***")

    d_test = xgb.DMatrix(data = X_test, label = y_test)
    model = xgb.train(params = params, dtrain = d_test, num_boost_round = num_boost_round)

    return model

def get_XGBparams(booster):
    """Flatten every ``*_param`` section of a booster's saved config into one dict.

    ``booster.save_config()`` must return a JSON object. The object is walked
    depth-first: the contents of any key ending in ``_param`` are merged into
    the result, and any other dict value is descended into. When the same
    parameter name occurs more than once, the value visited last wins.
    """
    flat_params = {}
    pending = [json.loads(booster.save_config())]
    while pending:
        node = pending.pop()
        for key, value in node.items():
            if key.endswith('_param'):
                flat_params.update(value.items())
            elif isinstance(value, dict):
                pending.append(value)
    return flat_params

In [170]:
def train_cat_model(x_data, y_data, cat_cols, x_test = None, k=5,
                    num_boost_round = 200, verbose_eval = 100,
                    early_stopping_rounds = 100, stratified = False,
                    return_models = False, return_pred_data = False):
    """Cross-validate a default ``CatBoostClassifier`` and optionally collect stacking predictions.

    Parameters
    ----------
    x_data, y_data : features and labels; both are indexed with ``.iloc``.
    cat_cols : categorical feature spec forwarded to ``Pool(cat_features=...)``.
    x_test : frame predicted by every fold model; required when
        ``return_pred_data`` is true, otherwise unused.
    k : number of folds.
    num_boost_round : accepted for signature parity with the other trainers; not used.
    verbose_eval : logging period forwarded to ``fit(verbose=...)``.
    early_stopping_rounds : patience forwarded to ``fit(early_stopping_rounds=...)``.
    stratified : use ``StratifiedKFold`` (split on ``y_data``) instead of ``KFold``.
    return_models : return the list of fitted models (takes precedence).
    return_pred_data : return ``(oof_train, oof_test)`` — out-of-fold
        probabilities for ``x_data`` and fold-averaged probabilities for ``x_test``.

    Returns
    -------
    ``models`` if ``return_models``; else ``(oof_train, oof_test)`` if
    ``return_pred_data``; else ``None``.
    """
    if return_pred_data:
        assert x_test is not None, "If return_pred_data is True, X_test data must be passed"
        n_classes = len(np.unique(y_data))
        oof_train = np.zeros([x_data.shape[0], n_classes])
        oof_test  = np.zeros([x_test.shape[0], n_classes])

    if stratified:
        k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=123)
        split_args = [x_data, y_data]
    else:
        k_fold = KFold(n_splits=k, shuffle=True, random_state=123)
        split_args = [x_data]

    models = []
    for train_idx, val_idx in k_fold.split(*split_args):
        x_train, y_train = x_data.iloc[train_idx], y_data.iloc[train_idx]
        x_val, y_val = x_data.iloc[val_idx], y_data.iloc[val_idx]

        model = CatBoostClassifier()
        train_data = Pool(data=x_train, label=y_train, cat_features=cat_cols)
        valid_data = Pool(data=x_val, label=y_val, cat_features=cat_cols)
        # Honour the caller's early-stopping / logging settings; these were
        # previously hard-coded to 100, which made the two parameters dead.
        model.fit(train_data,
                  eval_set=valid_data,
                  use_best_model=True,
                  early_stopping_rounds=early_stopping_rounds,
                  verbose=verbose_eval)
        models.append(model)

        if return_pred_data:
            oof_train[val_idx] += model.predict_proba(x_val)
            # Each fold contributes 1/k, so the sum is the fold average.
            oof_test           += model.predict_proba(x_test)/k

    print(f"{k} fold mean score:", np.mean([i.best_score_['validation']['MultiClass'] for i in models]))

    if return_models:
        return models

    if return_pred_data:
        return oof_train, oof_test

In [171]:
# Columns handed to CatBoost as categorical features.
cat_cols = [
    'income_type',
    'edu_type',
    'family_type',
    'house_type',
    'occyp_type',
    'ID',
]

In [25]:
# Stratified 5-fold CatBoost run on the training split; keep the fold models.
models = train_cat_model(
    X_train,
    y_train,
    cat_cols,
    stratified=True,
    return_models=True,
)

Learning rate set to 0.114262
0:	learn: 1.0350538	test: 1.0338581	best: 1.0338581 (0)	total: 171ms	remaining: 2m 50s
100:	learn: 0.7161785	test: 0.6776288	best: 0.6776288 (100)	total: 2.58s	remaining: 22.9s
200:	learn: 0.6895159	test: 0.6752061	best: 0.6750262 (197)	total: 5.17s	remaining: 20.6s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6750261896
bestIteration = 197

Shrink model to first 198 iterations.
Learning rate set to 0.114262
0:	learn: 1.0344780	test: 1.0349782	best: 1.0349782 (0)	total: 22.7ms	remaining: 22.7s
100:	learn: 0.7125680	test: 0.6979928	best: 0.6979928 (100)	total: 2.81s	remaining: 25s
200:	learn: 0.6864949	test: 0.6974120	best: 0.6966422 (147)	total: 5.89s	remaining: 23.4s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6966422327
bestIteration = 147

Shrink model to first 148 iterations.
Learning rate set to 0.114262
0:	learn: 1.0344759	test: 1.0350522	best: 1.0350522 (0)	total: 11.4ms	remaining: 11.4s
100:	learn: 0

In [26]:
# Single CatBoost model on the whole training split (no cross-validation).
train_data = Pool(
    data=X_train,
    label=y_train,
    cat_features=cat_cols,
)
cat_model = CatBoostClassifier()

In [27]:
# Fit on the full training Pool, logging every 100 iterations.
# NOTE(review): no eval_set is passed, so early_stopping_rounds presumably has
# no validation metric to monitor — the recorded log runs all 1000 iterations.
# Confirm, then either supply a validation Pool or drop the argument.
cat_model.fit(train_data, early_stopping_rounds=100, verbose=100)

Learning rate set to 0.092454
0:	learn: 1.0467848	total: 18.7ms	remaining: 18.7s
100:	learn: 0.7128433	total: 2.47s	remaining: 22s
200:	learn: 0.6933421	total: 5.25s	remaining: 20.9s
300:	learn: 0.6769266	total: 8.32s	remaining: 19.3s
400:	learn: 0.6608961	total: 11.2s	remaining: 16.8s
500:	learn: 0.6446641	total: 14.2s	remaining: 14.1s
600:	learn: 0.6281730	total: 17.2s	remaining: 11.4s
700:	learn: 0.6124723	total: 20.3s	remaining: 8.64s
800:	learn: 0.5973685	total: 23.3s	remaining: 5.8s
900:	learn: 0.5836272	total: 26.4s	remaining: 2.9s
999:	learn: 0.5678838	total: 29.5s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x17af16e4e88>

In [28]:
# CatBoost stacking features: out-of-fold probabilities for the training
# split and fold-averaged probabilities for the hold-out split.
oof_train, oof_test = train_cat_model(
    X_train,
    y_train,
    cat_cols,
    x_test=X_test,
    return_pred_data=True,
)

Learning rate set to 0.114262
0:	learn: 1.0339501	test: 1.0359612	best: 1.0359612 (0)	total: 26.8ms	remaining: 26.8s
100:	learn: 0.7102960	test: 0.7124339	best: 0.7122634 (98)	total: 2.46s	remaining: 21.9s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7122634453
bestIteration = 98

Shrink model to first 99 iterations.
Learning rate set to 0.114262
0:	learn: 1.0351155	test: 1.0338431	best: 1.0338431 (0)	total: 14.2ms	remaining: 14.2s
100:	learn: 0.7144005	test: 0.6869832	best: 0.6869832 (100)	total: 2.48s	remaining: 22.1s
200:	learn: 0.6874009	test: 0.6860406	best: 0.6859865 (151)	total: 5.47s	remaining: 21.8s
300:	learn: 0.6631787	test: 0.6862291	best: 0.6854076 (278)	total: 8.12s	remaining: 18.8s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6854075845
bestIteration = 278

Shrink model to first 279 iterations.
Learning rate set to 0.114262
0:	learn: 1.0349601	test: 1.0340840	best: 1.0340840 (0)	total: 17ms	remaining: 17s
100:	learn: 0.7176

In [29]:
# Sanity check: one row per sample and one probability column per class.
oof_train.shape, oof_test.shape

((21160, 3), (5291, 3))

In [30]:
# Log loss of the CatBoost out-of-fold predictions on the training split.
log_loss(y_train, oof_train)

0.6948103481657713

In [31]:
# Log loss of the fold-averaged CatBoost predictions on the hold-out split.
log_loss(y_test, oof_test)

0.6947316289825596

## Random Forest

In [172]:
def train_RF_model(x_data, y_data, params, x_test = None, k=5,
                    num_boost_round = 200, verbose_eval = 100,
                    early_stopping_rounds = 100, stratified = False,
                    return_models = False, return_pred_data = False):
    """Cross-validate a ``RandomForestClassifier`` and optionally collect stacking predictions.

    Parameters
    ----------
    x_data, y_data : features and labels; both are indexed with ``.iloc``.
    params : dict with the required keys ``n_estimators``, ``max_depth``,
        ``criterion``, ``min_samples_leaf`` and ``min_samples_split``.
        Optional keys ``random_state`` (default ``None``) and ``n_jobs``
        (default 4) are honoured when present; other keys are ignored.
    x_test : frame predicted by every fold model; required when
        ``return_pred_data`` is true, otherwise unused.
    k : number of folds.
    num_boost_round, verbose_eval, early_stopping_rounds : accepted for
        signature parity with the boosted-tree trainers; not used.
    stratified : use ``StratifiedKFold`` (split on ``y_data``) instead of ``KFold``.
    return_models : return the list of fitted models (takes precedence).
    return_pred_data : return ``(oof_train, oof_test)``.

    Returns
    -------
    ``models`` if ``return_models``; else ``(oof_train, oof_test)`` if
    ``return_pred_data``; else ``None``.
    """
    if return_pred_data:
        assert x_test is not None, "If return_pred_data is True, X_test data must be passed"
        n_classes = len(np.unique(y_data))
        oof_train = np.zeros([x_data.shape[0], n_classes])
        oof_test  = np.zeros([x_test.shape[0], n_classes])

    if stratified:
        k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=123)
        split_args = [x_data, y_data]
    else:
        k_fold = KFold(n_splits=k, shuffle=True, random_state=123)
        split_args = [x_data]

    models = []
    for fold_no, (train_idx, val_idx) in enumerate(k_fold.split(*split_args), start=1):
        x_train, y_train = x_data.iloc[train_idx], y_data.iloc[train_idx]
        x_val, y_val = x_data.iloc[val_idx], y_data.iloc[val_idx]

        model = RandomForestClassifier(n_estimators      = params['n_estimators'],
                                       max_depth         = params['max_depth'],
                                       criterion         = params['criterion'],
                                       min_samples_leaf  = params['min_samples_leaf'],
                                       min_samples_split = params['min_samples_split'],
                                       # Optional seed; None keeps the previous unseeded behaviour.
                                       random_state      = params.get('random_state'),
                                       n_jobs            = params.get('n_jobs', 4))

        model.fit(x_train, y_train)
        models.append(model)

        val_proba = model.predict_proba(x_val)
        # Pass the fitted class order so the score is still defined when a
        # class is absent from this validation fold. The message now reports
        # the fold number; it used to print the fold count ``k`` every time.
        print(f"Fold {fold_no}/{k} log loss:", log_loss(y_val, val_proba, labels=model.classes_))

        if return_pred_data:
            oof_train[val_idx] += val_proba
            # Each fold contributes 1/k, so the sum is the fold average.
            oof_test           += model.predict_proba(x_test)/k

    if return_models:
        return models

    if return_pred_data:
        return oof_train, oof_test

In [173]:
# Tuned random-forest hyper-parameters saved by an earlier search.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
with open(
    r"C:\Users\HwaLang\Desktop\python\T academy\Kaggle_camp\credit_card_dacon\NNI\RF\RF_params_0.704.json",
    mode="r",
) as f:
    RF_params = json.load(f)

In [34]:
# Display the loaded random-forest hyper-parameters.
RF_params

{'n_estimators': 7271,
 'max_depth': 35,
 'criterion': 'entropy',
 'min_samples_leaf': 6,
 'min_samples_split': 12,
 'scaler': 'minmax'}

## XGBoost

In [174]:
def train_XGB_model(x_data, y_data, params, x_test = None, k=5,
                    num_boost_round = 200, verbose_eval = 100,
                    early_stopping_rounds = 100, stratified = False,
                    return_models = False, return_pred_data = False):
    """Cross-validate an ``XGBClassifier`` and optionally collect stacking predictions.

    Parameters
    ----------
    x_data, y_data : features and labels; both are indexed with ``.iloc``.
    params : dict with the keys ``subsample``, ``max_depth``,
        ``colsample_bytree`` and ``eta``.
    x_test : frame predicted by every fold model; required when
        ``return_pred_data`` is true, otherwise unused.
    k : number of folds.
    num_boost_round : accepted for signature parity; not used (the tree
        budget is fixed at 1,000,000 and bounded by early stopping).
    verbose_eval : logging period forwarded to ``fit(verbose=...)``.
    early_stopping_rounds : patience forwarded to ``fit(early_stopping_rounds=...)``.
    stratified : use ``StratifiedKFold`` (split on ``y_data``) instead of ``KFold``.
    return_models : return the list of fitted models (takes precedence).
    return_pred_data : return ``(oof_train, oof_test)``.

    Returns
    -------
    ``models`` if ``return_models``; else ``(oof_train, oof_test)`` if
    ``return_pred_data``; else ``None``.
    """
    if return_pred_data:
        assert x_test is not None, "If return_pred_data is True, X_test data must be passed"
        n_classes = len(np.unique(y_data))
        oof_train = np.zeros([x_data.shape[0], n_classes])
        oof_test  = np.zeros([x_test.shape[0], n_classes])

    if stratified:
        k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=123)
        split_args = [x_data, y_data]
    else:
        k_fold = KFold(n_splits=k, shuffle=True, random_state=123)
        split_args = [x_data]

    models = []
    for train_idx, val_idx in k_fold.split(*split_args):
        x_train, y_train = x_data.iloc[train_idx], y_data.iloc[train_idx]
        x_val, y_val = x_data.iloc[val_idx], y_data.iloc[val_idx]

        model = XGBClassifier(n_estimators     = 1000000,
                              subsample        = params['subsample'],
                              max_depth        = params['max_depth'],
                              colsample_bytree = params['colsample_bytree'],
                              eta              = params['eta'],
                              n_jobs           = 4)

        # The validation fold is listed last in eval_set. The caller's
        # early-stopping / logging settings are now forwarded; they were
        # previously hard-coded to 100.
        model.fit(x_train, y_train,
                  eval_set=[[x_train, y_train], [x_val, y_val]],
                  eval_metric='mlogloss',
                  early_stopping_rounds=early_stopping_rounds,
                  verbose=verbose_eval)

        models.append(model)

        if return_pred_data:
            oof_train[val_idx] += model.predict_proba(x_val)
            # Each fold contributes 1/k, so the sum is the fold average.
            oof_test           += model.predict_proba(x_test)/k

    if return_models:
        return models

    if return_pred_data:
        return oof_train, oof_test

In [175]:
# Tuned XGBoost hyper-parameters saved by an earlier search.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
with open(
    r"C:\Users\HwaLang\Desktop\python\T academy\Kaggle_camp\credit_card_dacon\NNI\XGB\1st_sol_params_0.696406.json",
    mode="r",
) as f:
    XGB_params = json.load(f)

In [37]:
# XGBoost stacking features for the training and hold-out splits.
oof_train_XGB, oof_test_XGB = train_XGB_model(
    X_train,
    y_train,
    XGB_params,
    x_test=X_test,
    return_pred_data=True,
)

[0]	validation_0-mlogloss:1.04068	validation_1-mlogloss:1.05431




[100]	validation_0-mlogloss:0.21427	validation_1-mlogloss:0.76280
[150]	validation_0-mlogloss:0.14149	validation_1-mlogloss:0.81855
[0]	validation_0-mlogloss:1.04012	validation_1-mlogloss:1.05245
[100]	validation_0-mlogloss:0.21568	validation_1-mlogloss:0.73590
[150]	validation_0-mlogloss:0.14042	validation_1-mlogloss:0.78780
[0]	validation_0-mlogloss:1.03877	validation_1-mlogloss:1.05193
[100]	validation_0-mlogloss:0.21618	validation_1-mlogloss:0.73934
[144]	validation_0-mlogloss:0.15068	validation_1-mlogloss:0.78246
[0]	validation_0-mlogloss:1.03916	validation_1-mlogloss:1.05371
[100]	validation_0-mlogloss:0.21538	validation_1-mlogloss:0.72174
[151]	validation_0-mlogloss:0.14337	validation_1-mlogloss:0.76898
[0]	validation_0-mlogloss:1.04098	validation_1-mlogloss:1.05398
[100]	validation_0-mlogloss:0.21903	validation_1-mlogloss:0.77573
[142]	validation_0-mlogloss:0.15243	validation_1-mlogloss:0.82274


In [38]:
# Log loss of the XGBoost out-of-fold predictions on the training split.
log_loss(y_train, oof_train_XGB)

0.7154997320541561

In [39]:
# Log loss of the fold-averaged XGBoost predictions on the hold-out split.
log_loss(y_test, oof_test_XGB)

0.7136794378540435

## LightGBM

In [176]:
def train_LGB_model(x_data, y_data, params, x_test = None, k=5,
                    num_boost_round = 200, verbose_eval = 100,
                    early_stopping_rounds = 100, stratified = False,
                    return_models = False, return_pred_data = False):
    """Cross-validate an ``LGBMClassifier`` and optionally collect stacking predictions.

    Parameters
    ----------
    x_data, y_data : features and labels; both are indexed with ``.iloc``.
    params : dict with the keys ``subsample``, ``max_depth``,
        ``colsample_bytree`` and ``lr`` (used as ``learning_rate``).
    x_test : frame predicted by every fold model; required when
        ``return_pred_data`` is true, otherwise unused.
    k : number of folds.
    num_boost_round : accepted for signature parity; not used (the tree
        budget is fixed at 1,000,000 and bounded by early stopping).
    verbose_eval : logging period forwarded to ``fit(verbose=...)``.
    early_stopping_rounds : patience forwarded to ``fit(early_stopping_rounds=...)``.
    stratified : use ``StratifiedKFold`` (split on ``y_data``) instead of ``KFold``.
    return_models : return the list of fitted models (takes precedence).
    return_pred_data : return ``(oof_train, oof_test)``.

    Returns
    -------
    ``models`` if ``return_models``; else ``(oof_train, oof_test)`` if
    ``return_pred_data``; else ``None``.
    """
    if return_pred_data:
        assert x_test is not None, "If return_pred_data is True, X_test data must be passed"
        n_classes = len(np.unique(y_data))
        oof_train = np.zeros([x_data.shape[0], n_classes])
        oof_test  = np.zeros([x_test.shape[0], n_classes])

    if stratified:
        k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=123)
        split_args = [x_data, y_data]
    else:
        k_fold = KFold(n_splits=k, shuffle=True, random_state=123)
        split_args = [x_data]

    models = []
    for train_idx, val_idx in k_fold.split(*split_args):
        x_train, y_train = x_data.iloc[train_idx], y_data.iloc[train_idx]
        x_val, y_val = x_data.iloc[val_idx], y_data.iloc[val_idx]

        model = LGBMClassifier(n_estimators     = 1000000,
                               subsample        = params['subsample'],
                               max_depth        = params['max_depth'],
                               colsample_bytree = params['colsample_bytree'],
                               learning_rate    = params['lr'],
                               n_jobs           = 4)

        # The validation fold is listed last in eval_set. The caller's
        # early-stopping / logging settings are now forwarded; they were
        # previously hard-coded to 100.
        model.fit(x_train, y_train,
                  eval_set=[[x_train, y_train], [x_val, y_val]],
                  eval_metric='multi_logloss',
                  early_stopping_rounds=early_stopping_rounds,
                  verbose=verbose_eval)

        models.append(model)

        if return_pred_data:
            oof_train[val_idx] += model.predict_proba(x_val)
            # Each fold contributes 1/k, so the sum is the fold average.
            oof_test           += model.predict_proba(x_test)/k

    if return_models:
        return models

    if return_pred_data:
        return oof_train, oof_test

In [177]:
# Tuned LightGBM hyper-parameters saved by an earlier search.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
with open(
    r"C:\Users\HwaLang\Desktop\python\T academy\Kaggle_camp\credit_card_dacon\NNI\LGBM\LGBM_parmas_7107.json",
    mode="r",
) as f:
    LGB_params = json.load(f)

In [42]:
# LightGBM stacking features for the training and hold-out splits.
oof_train_LGB, oof_test_LGB = train_LGB_model(
    X_train,
    y_train,
    LGB_params,
    x_test=X_test,
    return_pred_data=True,
)

Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.587332	valid_1's multi_logloss: 0.756191
[200]	training's multi_logloss: 0.476508	valid_1's multi_logloss: 0.744994
[300]	training's multi_logloss: 0.39985	valid_1's multi_logloss: 0.743663
Early stopping, best iteration is:
[250]	training's multi_logloss: 0.434875	valid_1's multi_logloss: 0.742413
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.594476	valid_1's multi_logloss: 0.732487
[200]	training's multi_logloss: 0.481488	valid_1's multi_logloss: 0.71885
[300]	training's multi_logloss: 0.403398	valid_1's multi_logloss: 0.717737
Early stopping, best iteration is:
[236]	training's multi_logloss: 0.451092	valid_1's multi_logloss: 0.716585
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.594421	valid_1's multi_logloss: 0.733862
[200]	training's multi_logloss: 0.484274	valid_1's multi_logloss: 0.720

In [43]:
# Log loss of the LightGBM out-of-fold predictions on the training split.
log_loss(y_train, oof_train_LGB)

0.7285805737517002

In [44]:
# Log loss of the fold-averaged LightGBM predictions on the hold-out split.
log_loss(y_test, oof_test_LGB)

0.7282079180196158

In [113]:
# Meta-features for the training split: class probabilities from CatBoost,
# LightGBM and XGBoost side by side (in that column order).
new_X_train = pd.DataFrame(
    np.hstack([oof_train, oof_train_LGB, oof_train_XGB]),
    index=X_train.index,
)

In [112]:
# Meta-features for the hold-out split, in the same CatBoost / LightGBM /
# XGBoost column order as the training meta-features.
new_X_test = pd.DataFrame(
    np.hstack([oof_test, oof_test_LGB, oof_test_XGB]),
    index=X_test.index,
)

In [125]:
# Second-level XGBoost models trained on the stacked probabilities.
models = train_XGB_model(
    new_X_train,
    y_train,
    XGB_params,
    return_models=True,
)

[0]	validation_0-mlogloss:1.02149	validation_1-mlogloss:1.03851




[100]	validation_0-mlogloss:0.23147	validation_1-mlogloss:0.75980
[128]	validation_0-mlogloss:0.19038	validation_1-mlogloss:0.77364
[0]	validation_0-mlogloss:1.02326	validation_1-mlogloss:1.03616
[100]	validation_0-mlogloss:0.23425	validation_1-mlogloss:0.72719
[129]	validation_0-mlogloss:0.19270	validation_1-mlogloss:0.74178
[0]	validation_0-mlogloss:1.02231	validation_1-mlogloss:1.03653
[100]	validation_0-mlogloss:0.23481	validation_1-mlogloss:0.72438
[132]	validation_0-mlogloss:0.19118	validation_1-mlogloss:0.73832
[0]	validation_0-mlogloss:1.02149	validation_1-mlogloss:1.03821
[100]	validation_0-mlogloss:0.23459	validation_1-mlogloss:0.71995
[133]	validation_0-mlogloss:0.18644	validation_1-mlogloss:0.73332
[0]	validation_0-mlogloss:1.02238	validation_1-mlogloss:1.03840
[100]	validation_0-mlogloss:0.22836	validation_1-mlogloss:0.75187
[127]	validation_0-mlogloss:0.18993	validation_1-mlogloss:0.76495


In [126]:
# Hold-out log loss of each second-level fold model.
for model in models:
    holdout_proba = model.predict_proba(new_X_test)
    print(log_loss(y_test, holdout_proba))

0.7143310813056885
0.7132537063218948
0.7138759394791847
0.7145480732196751
0.709304477049957


In [136]:
# Recombine training and hold-out meta-features, ordered by row index.
stacked_X = pd.concat([new_X_train, new_X_test], axis=0).sort_index()

In [137]:
# Matching labels for the recombined meta-features, ordered by row index.
stacked_y = pd.concat([y_train, y_test], axis=0).sort_index()

In [138]:
# Second-level CatBoost models on all stacked rows; the meta-features are
# numeric probabilities, so no categorical columns are declared.
models = train_cat_model(
    stacked_X,
    stacked_y,
    cat_cols=None,
    return_models=True,
)

Learning rate set to 0.114773
0:	learn: 1.0147814	test: 1.0125947	best: 1.0125947 (0)	total: 4.7ms	remaining: 4.7s
100:	learn: 0.6747220	test: 0.6810895	best: 0.6809687 (86)	total: 394ms	remaining: 3.51s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6809687295
bestIteration = 86

Shrink model to first 87 iterations.
Learning rate set to 0.114773
0:	learn: 1.0126813	test: 1.0160999	best: 1.0160999 (0)	total: 4.57ms	remaining: 4.56s
100:	learn: 0.6655921	test: 0.7131194	best: 0.7126164 (73)	total: 407ms	remaining: 3.62s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7126163636
bestIteration = 73

Shrink model to first 74 iterations.
Learning rate set to 0.114773
0:	learn: 1.0134394	test: 1.0134492	best: 1.0134492 (0)	total: 4.42ms	remaining: 4.41s
100:	learn: 0.6700063	test: 0.6968974	best: 0.6963565 (43)	total: 396ms	remaining: 3.52s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6963564752
bestIteration = 43

Shrink mod

In [139]:
# Log loss of each second-level CatBoost fold model on the hold-out rows.
# NOTE(review): these models were fit on stacked_X, which includes the
# new_X_test rows scored here, so each model has seen most of these rows
# during training — the numbers are optimistic and not a clean hold-out
# estimate. The per-fold "bestTest" values in the training log are the
# unleaked figures.
for model in models:
    holdout_proba = model.predict_proba(new_X_test)
    print(log_loss(y_test, holdout_proba))

0.6825198912930569
0.6824337396315
0.6884385662152506
0.6792339737317
0.6827488142110296


In [178]:
# Final CatBoost fold models. x_test is only consulted by train_cat_model
# when return_pred_data is true, so passing X_test has no effect here.
cat_models = train_cat_model(
    X_train,
    y_train,
    cat_cols,
    x_test=X_test,
    return_models=True,
)

Learning rate set to 0.114262
0:	learn: 1.0330658	test: 1.0356520	best: 1.0356520 (0)	total: 11.4ms	remaining: 11.4s
100:	learn: 0.7085298	test: 0.7049034	best: 0.7048776 (98)	total: 2.3s	remaining: 20.5s
200:	learn: 0.6804322	test: 0.7054955	best: 0.7043537 (131)	total: 4.84s	remaining: 19.2s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7043537203
bestIteration = 131

Shrink model to first 132 iterations.
Learning rate set to 0.114262
0:	learn: 1.0337235	test: 1.0346017	best: 1.0346017 (0)	total: 22.1ms	remaining: 22.1s
100:	learn: 0.7097573	test: 0.6973224	best: 0.6967707 (94)	total: 2.29s	remaining: 20.4s
200:	learn: 0.6828045	test: 0.6962259	best: 0.6956834 (157)	total: 4.7s	remaining: 18.7s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.695683397
bestIteration = 157

Shrink model to first 158 iterations.
Learning rate set to 0.114262
0:	learn: 1.0346992	test: 1.0327935	best: 1.0327935 (0)	total: 19.3ms	remaining: 19.3s
100:	learn: 0.71

In [179]:
# Final LightGBM fold models. x_test is only consulted by train_LGB_model
# when return_pred_data is true, so passing X_test has no effect here.
LGB_models = train_LGB_model(
    X_train,
    y_train,
    LGB_params,
    x_test=X_test,
    return_models=True,
)

Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.583492	valid_1's multi_logloss: 0.756717
[200]	training's multi_logloss: 0.473812	valid_1's multi_logloss: 0.745684
[300]	training's multi_logloss: 0.397463	valid_1's multi_logloss: 0.746904
Early stopping, best iteration is:
[242]	training's multi_logloss: 0.439385	valid_1's multi_logloss: 0.743947
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.588132	valid_1's multi_logloss: 0.742758
[200]	training's multi_logloss: 0.477986	valid_1's multi_logloss: 0.732046
[300]	training's multi_logloss: 0.401109	valid_1's multi_logloss: 0.732171
Early stopping, best iteration is:
[225]	training's multi_logloss: 0.457331	valid_1's multi_logloss: 0.730536
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.595899	valid_1's multi_logloss: 0.734618
[200]	training's multi_logloss: 0.484215	valid_1's multi_logloss: 0.7

In [206]:
# Final XGBoost fold models. x_test is only consulted by train_XGB_model
# when return_pred_data is true, so passing X_test has no effect here.
XGB_models = train_XGB_model(
    X_train,
    y_train,
    XGB_params,
    x_test=X_test,
    return_models=True,
)

[0]	validation_0-mlogloss:1.03734	validation_1-mlogloss:1.05447
[100]	validation_0-mlogloss:0.21516	validation_1-mlogloss:0.75752
[149]	validation_0-mlogloss:0.14296	validation_1-mlogloss:0.81071
[0]	validation_0-mlogloss:1.03944	validation_1-mlogloss:1.05292
[100]	validation_0-mlogloss:0.21287	validation_1-mlogloss:0.75374
[148]	validation_0-mlogloss:0.14359	validation_1-mlogloss:0.80524
[0]	validation_0-mlogloss:1.03922	validation_1-mlogloss:1.05281
[100]	validation_0-mlogloss:0.22385	validation_1-mlogloss:0.72381
[152]	validation_0-mlogloss:0.14457	validation_1-mlogloss:0.77728
[0]	validation_0-mlogloss:1.03849	validation_1-mlogloss:1.05218
[100]	validation_0-mlogloss:0.21510	validation_1-mlogloss:0.73484
[151]	validation_0-mlogloss:0.14046	validation_1-mlogloss:0.78851
[0]	validation_0-mlogloss:1.04015	validation_1-mlogloss:1.05324
[100]	validation_0-mlogloss:0.21445	validation_1-mlogloss:0.73531
[152]	validation_0-mlogloss:0.14001	validation_1-mlogloss:0.78708


In [None]:
# Display the three lists of fitted fold models.
cat_models, XGB_models, LGB_models

In [191]:
# Scratch cell: builds a zero array with the same shape as the test frame.
# NOTE(review): the recorded output is a shape tuple, not a zeros array, so it
# looks stale (presumably left over from `test.shape`) — consider replacing
# this cell with `test.shape` or deleting it.
np.zeros(test.shape)

(10000, 28)

In [194]:
# Average the CatBoost fold models' class probabilities on the competition test set.
cat_test = np.zeros((test.shape[0], 3))
for cm in cat_models:
    cat_test = cat_test + cm.predict_proba(test) / len(cat_models)

In [207]:
# Average the XGBoost fold models' class probabilities on the competition test set.
XGB_test = np.zeros((test.shape[0], 3))
for cm in XGB_models:
    XGB_test = XGB_test + cm.predict_proba(test) / len(XGB_models)

In [208]:
# Average the LightGBM fold models' class probabilities on the competition test set.
LGB_test = np.zeros((test.shape[0], 3))
for cm in LGB_models:
    LGB_test = LGB_test + cm.predict_proba(test) / len(LGB_models)

In [214]:
# Stack the base-model probabilities for the competition test set. The block
# order must match the meta-features the second-level models were trained on
# (new_X_train / new_X_test: CatBoost, LightGBM, XGBoost); the previous order
# had the XGBoost and LightGBM blocks swapped.
test_prob = np.concatenate([cat_test, LGB_test, XGB_test], axis = 1)