In [218]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder, PowerTransformer, StandardScaler, \
                                    MinMaxScaler, FunctionTransformer, OneHotEncoder
from sklearn.metrics import roc_auc_score, log_loss, classification_report
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.cluster import KMeans
from function_dt_check import time_checker
from category_encoders.ordinal import OrdinalEncoder
import json
import os
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier, Pool
%matplotlib inline

In [12]:
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.family'] = 'Hancom Gothic'
plt.style.use('bmh')
plt.rc('font',size=15)

In [18]:
BASE_DIR = '/Users/HwaLang/Desktop/python/T academy/Kaggle_camp/'
train_path = os.path.join(BASE_DIR, 'data', 'MDC14', 'train.csv')
test_path  = os.path.join(BASE_DIR, 'data', 'MDC14', 'test.csv')

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

submission = pd.read_csv(f'{BASE_DIR}/data/MDC14/sample_submission.csv')

In [19]:
train.fillna('NaN', inplace=True) 
test.fillna('NaN', inplace=True)

In [20]:
train = train[(train['family_size'] <= 7)]
train = train.reset_index(drop=True)

In [21]:
train.drop(['FLAG_MOBIL'], axis=1, inplace=True)
test.drop(['FLAG_MOBIL'], axis=1, inplace=True)

In [22]:
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].map(lambda x: 0 if x > 0 else x)
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].map(lambda x: 0 if x > 0 else x)

In [23]:
feats = ['DAYS_BIRTH', 'begin_month', 'DAYS_EMPLOYED']
for feat in feats:
    train[feat]=np.abs(train[feat])
    test[feat]=np.abs(test[feat])

In [24]:
for df in [train,test]:
    # before_EMPLOYED: 고용되기 전까지의 일수
    df['before_EMPLOYED'] = df['DAYS_BIRTH'] - df['DAYS_EMPLOYED']
    df['income_total_befofeEMP_ratio'] = df['income_total'] / df['before_EMPLOYED']
    df['before_EMPLOYED_m'] = np.floor(df['before_EMPLOYED'] / 30) - ((np.floor(df['before_EMPLOYED'] / 30) / 12).astype(int) * 12)
    df['before_EMPLOYED_w'] = np.floor(df['before_EMPLOYED'] / 7) - ((np.floor(df['before_EMPLOYED'] / 7) / 4).astype(int) * 4)
    
    #DAYS_BIRTH 파생변수- Age(나이), 태어난 월, 태어난 주(출생연도의 n주차)
    df['Age'] = df['DAYS_BIRTH'] // 365
    df['DAYS_BIRTH_m'] = np.floor(df['DAYS_BIRTH'] / 30) - ((np.floor(df['DAYS_BIRTH'] / 30) / 12).astype(int) * 12)
    df['DAYS_BIRTH_w'] = np.floor(df['DAYS_BIRTH'] / 7) - ((np.floor(df['DAYS_BIRTH'] / 7) / 4).astype(int) * 4)

    
    #DAYS_EMPLOYED_m 파생변수- EMPLOYED(근속연수), DAYS_EMPLOYED_m(고용된 달) ,DAYS_EMPLOYED_w(고용된 주(고용연도의 n주차))  
    df['EMPLOYED'] = df['DAYS_EMPLOYED'] // 365
    df['DAYS_EMPLOYED_m'] = np.floor(df['DAYS_EMPLOYED'] / 30) - ((np.floor(df['DAYS_EMPLOYED'] / 30) / 12).astype(int) * 12)
    df['DAYS_EMPLOYED_w'] = np.floor(df['DAYS_EMPLOYED'] / 7) - ((np.floor(df['DAYS_EMPLOYED'] / 7) / 4).astype(int) * 4)

    #ability: 소득/(살아온 일수+ 근무일수)
    df['ability'] = df['income_total'] / (df['DAYS_BIRTH'] + df['DAYS_EMPLOYED'])
    
    #income_mean: 소득/ 가족 수
    df['income_mean'] = df['income_total'] / df['family_size']
    
    #ID 생성: 각 컬럼의 값들을 더해서 고유한 사람을 파악(*한 사람이 여러 개 카드를 만들 가능성을 고려해 begin_month는 제외함)
    df['ID'] = \
    df['child_num'].astype(str) + '_' + df['income_total'].astype(str) + '_' +\
    df['DAYS_BIRTH'].astype(str) + '_' + df['DAYS_EMPLOYED'].astype(str) + '_' +\
    df['work_phone'].astype(str) + '_' + df['phone'].astype(str) + '_' +\
    df['email'].astype(str) + '_' + df['family_size'].astype(str) + '_' +\
    df['gender'].astype(str) + '_' + df['car'].astype(str) + '_' +\
    df['reality'].astype(str) + '_' + df['income_type'].astype(str) + '_' +\
    df['edu_type'].astype(str) + '_' + df['family_type'].astype(str) + '_' +\
    df['house_type'].astype(str) + '_' + df['occyp_type'].astype(str)

In [25]:
cols = ['child_num', 'DAYS_BIRTH', 'DAYS_EMPLOYED',]
train.drop(cols, axis=1, inplace=True)
test.drop(cols, axis=1, inplace=True)

In [26]:
numerical_feats = train.dtypes[train.dtypes != "object"].index.tolist()
numerical_feats.remove('credit')
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = train.dtypes[train.dtypes == "object"].index.tolist()
print("Number of Categorical features: ", len(categorical_feats))

Number of Numerical features:  19
Number of Categorical features:  9


In [27]:
for df in [train,test]:
    df['income_total'] = np.log1p(1+df['income_total'])

In [28]:
encoder = OrdinalEncoder(categorical_feats)
train[categorical_feats] = encoder.fit_transform(train[categorical_feats], train['credit'])
test[categorical_feats] = encoder.transform(test[categorical_feats])

train['ID'] = train['ID'].astype('int64')
test['ID'] = test['ID'].astype('int64')

In [29]:
kmeans_train = train.drop(['credit'], axis=1)
kmeans = KMeans(n_clusters=36, random_state=42).fit(kmeans_train)
train['cluster'] = kmeans.predict(kmeans_train)
test['cluster'] = kmeans.predict(test)

In [30]:
numerical_feats.remove('income_total')
scaler = StandardScaler()
train[numerical_feats] = scaler.fit_transform(train[numerical_feats])
test[numerical_feats] = scaler.transform(test[numerical_feats])

In [31]:
trainkeys = train.keys().to_list()
trainkeys.remove('credit')

In [32]:
train.head()

Unnamed: 0,index,gender,car,reality,income_total,income_type,edu_type,family_type,house_type,work_phone,...,Age,DAYS_BIRTH_m,DAYS_BIRTH_w,EMPLOYED,DAYS_EMPLOYED_m,DAYS_EMPLOYED_w,ability,income_mean,ID,cluster
0,-1.731942,1,1,1,12.218505,1,1,1,1,-0.538321,...,-0.452826,0.442795,-0.443485,0.994253,-1.230046,-1.077087,-0.032496,0.002062,1,15
1,-1.731811,1,1,2,12.419174,1,2,2,2,-0.538321,...,-1.060773,0.442795,-0.443485,-0.250471,-0.424295,-1.077087,1.190137,-0.254157,2,22
2,-1.73168,2,2,2,13.017007,2,1,1,2,-0.538321,...,0.763069,-1.582567,0.451504,0.994253,-0.424295,-0.223607,1.186515,1.693108,3,12
3,-1.731549,1,1,2,12.218505,1,2,1,2,-0.538321,...,-0.192277,1.310808,1.346494,-0.09488,1.187206,0.629874,0.101168,0.002062,4,15
4,-1.731418,1,2,2,11.967193,3,1,1,2,-0.538321,...,-0.192277,1.021471,-1.338475,-0.09488,1.45579,-1.077087,-0.282885,-0.305401,5,22


In [265]:
train.drop(columns = ['index'], inplace=True)

In [268]:
trainkeys.remove('index')

In [269]:
data, label = train[trainkeys], train['credit']

In [270]:
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size = 0.2)

In [271]:
@time_checker
def train_model(x_data, y_data, params, k=5, num_boost_round = 200, verbose_eval = 100, early_stopping_rounds = 100, stratified = False, return_models = False):
    models = []
    
#     k_fold = KFold(n_splits=k, shuffle=True, random_state=123)
    if stratified:
        k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=123)
        data = [x_data, y_data]
    else:
        k_fold = KFold(n_splits=k, shuffle=True, random_state=123)
        data = [x_data]
#     k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=123) if stratified else KFold(n_splits=k, shuffle=True, random_state=123)
    
    
    for train_idx, val_idx in k_fold.split(*data):
        x_train, y_train = x_data.iloc[train_idx], y_data.iloc[train_idx]
        x_val, y_val = x_data.iloc[val_idx], y_data.iloc[val_idx]
    
        d_train = xgb.DMatrix(data = x_train, label = y_train)
        d_val = xgb.DMatrix(data = x_val, label = y_val)
        
        wlist = [(d_train, 'train'), (d_val, 'eval')]
        
        model = xgb.train(params=params, dtrain=d_train, num_boost_round = num_boost_round, evals=wlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose_eval)
        models.append(model)
    
    print(f"{k} fold mean score:", np.mean([i.best_score for i in models]))
    
    if return_models:
        return models

@time_checker
def last_train(X_test, y_test, params, num_boost_round = 200):
    print("***최종 학습 전 하이퍼 파라미터 다시한번 확인!!***")
    
    d_test = xgb.DMatrix(data = X_test, label = y_test)
    model = xgb.train(params = params, dtrain = d_test, num_boost_round = num_boost_round)
    
    return model

def get_XGBparams(booster):
    config = json.loads(booster.save_config()) # your xgb booster object
    stack = [config]
    internal = {}
    while stack:
        obj = stack.pop()
        for k, v in obj.items():
            if k.endswith('_param'):
                for p_k, p_v in v.items():
                    internal[p_k] = p_v
            elif isinstance(v, dict):
                stack.append(v)
    return internal

In [272]:
def train_cat_model(x_data, y_data, cat_cols, x_test = None, k=5, 
                    num_boost_round = 200, verbose_eval = 100, 
                    early_stopping_rounds = 100, stratified = False, 
                    return_models = False, return_pred_data = False):
    models = []
    if return_pred_data:
        assert type(x_test) != type(None), "If return_pred_data is True, X_test data must be passed"
        oof_train = np.zeros([x_data.shape[0], len(np.unique(y_data))])
        oof_test  = np.zeros([x_test.shape[0], len(np.unique(y_data))])
    
    if stratified:
        k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=123)
        data = [x_data, y_data]
    else:
        k_fold = KFold(n_splits=k, shuffle=True, random_state=123)
        data = [x_data]



    for train_idx, val_idx in k_fold.split(*data):
        x_train, y_train = x_data.iloc[train_idx], y_data.iloc[train_idx]
        x_val, y_val = x_data.iloc[val_idx], y_data.iloc[val_idx]

        model = CatBoostClassifier()
        train_data = Pool(data=x_train, label=y_train, cat_features=cat_cols)
        valid_data = Pool(data=x_val, label=y_val, cat_features=cat_cols)
        model.fit(train_data, 
                  eval_set=valid_data, 
                  use_best_model=True, 
                  early_stopping_rounds=100, 
                  verbose=100)
        models.append(model)
        
        if return_pred_data:
            oof_train[val_idx] += model.predict_proba(x_val)
            oof_test           += model.predict_proba(x_test)/k
        
        
    print(f"{k} fold mean score:", np.mean([i.best_score_['validation']['MultiClass'] for i in models]))
    
    if return_models:
        return models
    
    if return_pred_data:
        return oof_train, oof_test

In [273]:
cat_cols = ['income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type', 'ID']

In [None]:
models = train_cat_model(X_train, y_train, 
                         cat_cols, 
                         return_models = True, 
                         stratified = True)

In [188]:
cat_model = CatBoostClassifier()
train_data = Pool(data=X_train, label=y_train, cat_features=cat_cols)

In [62]:
cat_model.fit(train_data, early_stopping_rounds=100, verbose=100)

Learning rate set to 0.093512
0:	learn: 1.0460853	total: 35ms	remaining: 35s
100:	learn: 0.7049386	total: 2.75s	remaining: 24.4s
200:	learn: 0.6888987	total: 5.69s	remaining: 22.6s
300:	learn: 0.6740002	total: 8.72s	remaining: 20.3s
400:	learn: 0.6604176	total: 11.9s	remaining: 17.7s
500:	learn: 0.6456102	total: 14.9s	remaining: 14.9s
600:	learn: 0.6314189	total: 18s	remaining: 12s
700:	learn: 0.6177031	total: 21.1s	remaining: 9.01s
800:	learn: 0.6038913	total: 24.2s	remaining: 6.01s
900:	learn: 0.5896405	total: 27.2s	remaining: 2.99s
999:	learn: 0.5775733	total: 30.3s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x21910a8e6c8>

In [275]:
oof_train, oof_test = \
train_cat_model(X_train, 
                y_train,
                cat_cols, 
                X_test,
                return_pred_data = True)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.114262
0:	learn: 1.0345192	test: 1.0319239	best: 1.0319239 (0)	total: 115ms	remaining: 1m 54s
100:	learn: 0.7172598	test: 0.6745758	best: 0.6745758 (100)	total: 16.5s	remaining: 2m 27s
200:	learn: 0.6915700	test: 0.6730706	best: 0.6729763 (191)	total: 33.8s	remaining: 2m 14s
300:	learn: 0.6675533	test: 0.6723003	best: 0.6722588 (298)	total: 51.7s	remaining: 2m
400:	learn: 0.6432754	test: 0.6732031	best: 0.6722277 (316)	total: 1m 10s	remaining: 1m 45s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6722276719
bestIteration = 316

Shrink model to first 317 iterations.
Learning rate set to 0.114262
0:	learn: 1.0333498	test: 1.0340394	best: 1.0340394 (0)	total: 106ms	remaining: 1m 45s
100:	learn: 0.7093765	test: 0.6999286	best: 0.6999286 (100)	total: 16.6s	remaining: 2m 27s
200:	learn: 0.6822070	test: 0.6983957	best: 0.6981992 (168)	total: 32.9s	remaining: 2m 10s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6981991587
best

In [276]:
oof_train.shape, oof_test.shape

((21160, 3), (5291, 3))

In [277]:
log_loss(y_train, oof_train)

0.686316629894177

In [278]:
log_loss(y_test, oof_test)

0.703847516022256

## Random Forest

In [249]:
def train_RF_model(x_data, y_data, params, x_test = None, k=5, 
                    num_boost_round = 200, verbose_eval = 100, 
                    early_stopping_rounds = 100, stratified = False, 
                    return_models = False, return_pred_data = False):
    models = []
    if return_pred_data:
        assert type(x_test) != type(None), "If return_pred_data is True, X_test data must be passed"
        oof_train = np.zeros([x_data.shape[0], len(np.unique(y_data))])
        oof_test  = np.zeros([x_test.shape[0], len(np.unique(y_data))])
    
    if stratified:
        k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=123)
        data = [x_data, y_data]
    else:
        k_fold = KFold(n_splits=k, shuffle=True, random_state=123)
        data = [x_data]
    


    for train_idx, val_idx in k_fold.split(*data):
        x_train, y_train = x_data.iloc[train_idx], y_data.iloc[train_idx]
        x_val, y_val = x_data.iloc[val_idx], y_data.iloc[val_idx]

        model = RandomForestClassifier(n_estimators      = params['n_estimators'],
                                       max_depth         = params['max_depth'],
                                       criterion         = params['criterion'],
                                       min_samples_leaf  = params['min_samples_leaf'],
                                       min_samples_split = params['min_samples_split'],
                                       n_jobs            = 4)
        
        model.fit(x_train, y_train)
        models.append(model)
        
        print(f"{k} Fold log loss:", log_loss(y_val, model.predict_proba(x_val)))
        
        if return_pred_data:
            oof_train[val_idx] += model.predict_proba(x_val)
            oof_test           += model.predict_proba(x_test)/k
        
        
#     print(f"{k} fold mean score:", np.mean([i.best_score_['validation']['MultiClass'] for i in models]))
    
    if return_models:
        return models
    
    if return_pred_data:
        return oof_train, oof_test

In [250]:
with open(r"C:\Users\HwaLang\Desktop\python\T academy\Kaggle_camp\credit_card_dacon\NNI\RF\RF_params_0.704.json") as f:
    RF_params = json.load(f)

In [257]:
with open(r"C:\Users\HwaLang\Desktop\python\T academy\Kaggle_camp\credit_card_dacon\NNI\RF\model\RF_model0.7194.pkl", "rb") as f:
    RF_model = pickle.load(f)

In [251]:
RF_params

{'n_estimators': 7271,
 'max_depth': 35,
 'criterion': 'entropy',
 'min_samples_leaf': 6,
 'min_samples_split': 12,
 'scaler': 'minmax'}

In [252]:
oof_train_RF, oof_test_RF = \
train_RF_model(X_train, 
               y_train,
               RF_params, 
               X_test,
               return_pred_data = True)

5 Fold log loss: 0.742402665476749


KeyboardInterrupt: 

## XGBoost

In [None]:
def train_XGB_model(x_data, y_data, params, x_test = None, k=5, 
                    num_boost_round = 200, verbose_eval = 100, 
                    early_stopping_rounds = 100, stratified = False, 
                    return_models = False, return_pred_data = False):
    models = []
    if return_pred_data:
        assert type(x_test) != type(None), "If return_pred_data is True, X_test data must be passed"
        oof_train = np.zeros([x_data.shape[0], len(np.unique(y_data))])
        oof_test  = np.zeros([x_test.shape[0], len(np.unique(y_data))])
    
    if stratified:
        k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=123)
        data = [x_data, y_data]
    else:
        k_fold = KFold(n_splits=k, shuffle=True, random_state=123)
        data = [x_data]
    
    cv_scores = list()

    for train_idx, val_idx in k_fold.split(*data):
        x_train, y_train = x_data.iloc[train_idx], y_data.iloc[train_idx]
        x_val, y_val = x_data.iloc[val_idx], y_data.iloc[val_idx]

        model = XGBClassifier(n_estimators     = 1000000,
                              subsample        = params['subsample'],
                              max_depth        = params['max_depth'],
                              colsample_bytree = params['colsample_bytree'],
                              eta              = params['eta'],
                              n_jobs           = 4)
        
        model.fit(x_train, y_train,
                  eval_set=[[x_train, y_train], [x_valid, y_valid]],
                  eval_metric='mlogloss',
                  early_stopping_rounds=100,
                  verbose=100)
        
        models.append(model)
        
        if return_pred_data:
            oof_train[val_idx] += model.predict_proba(x_val)
            oof_test           += model.predict_proba(x_test)/k
        
        
#     print(f"{k} fold mean score:", np.mean([i.best_score_['validation']['MultiClass'] for i in models]))
    
    if return_models:
        return models
    
    if return_pred_data:
        return oof_train, oof_test

In [None]:
with open(r"C:\Users\HwaLang\Desktop\python\T academy\Kaggle_camp\credit_card_dacon\NNI\XGB\1st_sol_params_0.696406.json") as f:
    XGB_params = json.load(f)

In [None]:
oof_train_XGB, oof_test_XGB = \
train_XGB_model(X_train, 
                y_train,
                XGB_params, 
                X_test,
                return_pred_data = True)