In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder, PowerTransformer, StandardScaler, \
                                    MinMaxScaler, FunctionTransformer, OneHotEncoder
from sklearn.metrics import roc_auc_score, log_loss, classification_report
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import Pipeline, make_union, make_pipeline
from sklearn.decomposition import PCA
from sklearn.utils import resample
from sklearn.cluster import KMeans
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from bayes_opt import BayesianOptimization
from function_dt_check import time_checker
from category_encoders.ordinal import OrdinalEncoder
import json
from catboost import CatBoostClassifier, Pool
%matplotlib inline

In [2]:
# Global matplotlib look-and-feel. The statement order is kept on purpose:
# the style sheet is applied after the two rcParams and before the font size.
plt.rcParams.update({
    'axes.unicode_minus': False,
    'font.family': 'Hancom Gothic',
})
plt.style.use('bmh')
plt.rcParams['font.size'] = 15

In [3]:
# Read the train / test tables and the submission template; the `index`
# column is discarded from both feature tables right after loading.
path = './data/'
train = pd.read_csv(path + 'train.csv').drop(columns=['index'])
test = pd.read_csv(path + 'test.csv').drop(columns=['index'])
submission = pd.read_csv(path + 'sample_submission.csv')

In [4]:
# Replace every missing value with the literal string 'NaN' in both tables.
train = train.fillna('NaN')
test = test.fillna('NaN')

In [5]:
# Keep only training rows with family_size of at most 7, then renumber the rows.
train = train.loc[train['family_size'] <= 7].reset_index(drop=True)

In [6]:
# FLAG_MOBIL is removed from both tables.
train = train.drop(columns=['FLAG_MOBIL'])
test = test.drop(columns=['FLAG_MOBIL'])

In [7]:
# Positive DAYS_EMPLOYED values are replaced by 0; non-positive values are kept.
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].clip(upper=0)
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].clip(upper=0)

In [8]:
# Convert the listed counters to their absolute values in both tables.
feats = ['DAYS_BIRTH', 'begin_month', 'DAYS_EMPLOYED']
for feat in feats:
    train[feat] = train[feat].abs()
    test[feat] = test[feat].abs()

In [9]:
def _cyclic_bucket(days, unit, period):
    """Count whole `unit`-day spans in `days`, then subtract whole groups of `period` spans."""
    spans = np.floor(days / unit)
    return spans - ((spans / period).astype(int) * period)


def _add_derived_features(df):
    """Add the derived columns to `df` in place (column creation order matters downstream)."""
    # before_EMPLOYED: number of days lived before being employed
    pre_job_days = df['DAYS_BIRTH'] - df['DAYS_EMPLOYED']
    df['before_EMPLOYED'] = pre_job_days
    df['income_total_befofeEMP_ratio'] = df['income_total'] / pre_job_days
    df['before_EMPLOYED_m'] = _cyclic_bucket(pre_job_days, 30, 12)
    df['before_EMPLOYED_w'] = _cyclic_bucket(pre_job_days, 7, 4)

    # Derived from DAYS_BIRTH: Age in whole 365-day years, plus the 30-day
    # ("month") and 7-day ("week") cyclic buckets.
    df['Age'] = df['DAYS_BIRTH'] // 365
    df['DAYS_BIRTH_m'] = _cyclic_bucket(df['DAYS_BIRTH'], 30, 12)
    df['DAYS_BIRTH_w'] = _cyclic_bucket(df['DAYS_BIRTH'], 7, 4)

    # Derived from DAYS_EMPLOYED: EMPLOYED (years of service), plus the same
    # "month" and "week" buckets.
    df['EMPLOYED'] = df['DAYS_EMPLOYED'] // 365
    df['DAYS_EMPLOYED_m'] = _cyclic_bucket(df['DAYS_EMPLOYED'], 30, 12)
    df['DAYS_EMPLOYED_w'] = _cyclic_bucket(df['DAYS_EMPLOYED'], 7, 4)

    # ability: income / (days lived + days employed)
    df['ability'] = df['income_total'] / (df['DAYS_BIRTH'] + df['DAYS_EMPLOYED'])

    # income_mean: income / family size
    df['income_mean'] = df['income_total'] / df['family_size']

    # ID: join the string form of these columns with '_' to identify a unique
    # person. begin_month is excluded because one person may hold several cards.
    id_columns = [
        'child_num', 'income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
        'work_phone', 'phone', 'email', 'family_size',
        'gender', 'car', 'reality', 'income_type',
        'edu_type', 'family_type', 'house_type', 'occyp_type',
    ]
    id_pieces = [df[column].astype(str) for column in id_columns]
    df['ID'] = id_pieces[0].str.cat(id_pieces[1:], sep='_')


for df in [train, test]:
    _add_derived_features(df)

In [10]:
# Remove these raw columns now that the derived features have been built.
cols = ['child_num', 'DAYS_BIRTH', 'DAYS_EMPLOYED']
train = train.drop(columns=cols)
test = test.drop(columns=cols)

In [11]:
# Partition column names by dtype: object columns are treated as categorical,
# everything else as numerical. The target `credit` is not a feature.
numerical_feats = [name for name, dtype in train.dtypes.items() if dtype != "object"]
numerical_feats.remove('credit')
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = [name for name, dtype in train.dtypes.items() if dtype == "object"]
print("Number of Categorical features: ", len(categorical_feats))

Number of Numerical features:  18
Number of Categorical features:  9


In [12]:
# Log-transform income. np.log1p(x) already evaluates log(1 + x); the previous
# form np.log1p(1 + x) therefore computed log(2 + x).
for df in [train,test]:
    df['income_total'] = np.log1p(df['income_total'])

In [13]:
# Ordinal-encode the object-typed columns. The column list is passed by
# keyword: the first positional parameter of category_encoders' OrdinalEncoder
# is `verbose`, not `cols`, so a positional list would not select any columns.
encoder = OrdinalEncoder(cols=categorical_feats)
train[categorical_feats] = encoder.fit_transform(train[categorical_feats], train['credit'])
test[categorical_feats] = encoder.transform(test[categorical_feats])

# Store the encoded ID as a 64-bit integer in both tables.
train['ID'] = train['ID'].astype('int64')
test['ID'] = test['ID'].astype('int64')

In [14]:
# Add a `cluster` feature: fit a 36-cluster KMeans on every training column
# except the target, then label both tables with the fitted model.
# NOTE(review): this runs before the StandardScaler cell below, so the
# clustering sees the features on their pre-scaling ranges — confirm intended.
kmeans_train = train.drop(['credit'], axis=1)
kmeans = KMeans(n_clusters=36, random_state=42).fit(kmeans_train)
train['cluster'] = kmeans.predict(kmeans_train)
test['cluster'] = kmeans.predict(test)

In [15]:
# income_total is left out of standardization. The membership guard keeps this
# cell re-runnable: an unconditional list.remove raises ValueError the second
# time, once the name is already gone.
if 'income_total' in numerical_feats:
    numerical_feats.remove('income_total')

# Fit the scaler on train only and apply the same transform to test.
scaler = StandardScaler()
train[numerical_feats] = scaler.fit_transform(train[numerical_feats])
test[numerical_feats] = scaler.transform(test[numerical_feats])

In [16]:
# Feature column names: every training column except the target.
trainkeys = train.columns.drop('credit').to_list()

In [17]:
# Preview the first rows of the processed training table.
train.head()

Unnamed: 0,gender,car,reality,income_total,income_type,edu_type,family_type,house_type,work_phone,phone,...,Age,DAYS_BIRTH_m,DAYS_BIRTH_w,EMPLOYED,DAYS_EMPLOYED_m,DAYS_EMPLOYED_w,ability,income_mean,ID,cluster
0,1,1,1,12.218505,1,1,1,1,-0.538321,-0.645632,...,-0.452826,0.442795,-0.443485,0.994253,-1.230046,-1.077087,-0.032496,0.002062,1,35
1,1,1,2,12.419174,1,2,2,2,-0.538321,-0.645632,...,-1.060773,0.442795,-0.443485,-0.250471,-0.424295,-1.077087,1.190137,-0.254157,2,7
2,2,2,2,13.017007,2,1,1,2,-0.538321,1.54887,...,0.763069,-1.582567,0.451504,0.994253,-0.424295,-0.223607,1.186515,1.693108,3,18
3,1,1,2,12.218505,1,2,1,2,-0.538321,1.54887,...,-0.192277,1.310808,1.346494,-0.09488,1.187206,0.629874,0.101168,0.002062,4,35
4,1,2,2,11.967193,3,1,1,2,-0.538321,-0.645632,...,-0.192277,1.021471,-1.338475,-0.09488,1.45579,-1.077087,-0.282885,-0.305401,5,7


In [18]:
# Feature matrix and target vector used by all the training helpers below.
X_train = train[trainkeys]
y_train = train['credit']

In [19]:
@time_checker
def train_model(x_data, y_data, params, k=5, num_boost_round = 200, verbose_eval = 100, early_stopping_rounds = 100, stratified = False, return_models = False):
    """Train one XGBoost booster per cross-validation fold and print the mean best score.

    Args:
        x_data: Feature table, indexed positionally with ``.iloc``.
        y_data: Labels aligned with ``x_data``, indexed positionally with ``.iloc``.
        params: Parameter dict forwarded to ``xgb.train``.
        k: Number of folds.
        num_boost_round: Maximum number of boosting rounds per fold.
        verbose_eval: Forwarded to ``xgb.train``.
        early_stopping_rounds: Forwarded to ``xgb.train``.
        stratified: Use ``StratifiedKFold`` on ``y_data`` when True, plain ``KFold`` otherwise.
        return_models: Return the list of fitted boosters when True.

    Returns:
        The list of boosters (one per fold) if ``return_models`` is True, else None.
    """
    # Both splitters shuffle with the same fixed seed; only the stratified one
    # needs the labels to build its folds.
    splitter_cls = StratifiedKFold if stratified else KFold
    splitter = splitter_cls(n_splits=k, shuffle=True, random_state=123)
    split_args = (x_data, y_data) if stratified else (x_data,)

    boosters = []
    for fit_rows, held_rows in splitter.split(*split_args):
        fit_matrix = xgb.DMatrix(data=x_data.iloc[fit_rows], label=y_data.iloc[fit_rows])
        held_matrix = xgb.DMatrix(data=x_data.iloc[held_rows], label=y_data.iloc[held_rows])

        # Watch both sets during training; the held-out fold is listed last.
        watchlist = [(fit_matrix, 'train'), (held_matrix, 'eval')]
        booster = xgb.train(
            params=params,
            dtrain=fit_matrix,
            num_boost_round=num_boost_round,
            evals=watchlist,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=verbose_eval,
        )
        boosters.append(booster)

    fold_scores = [booster.best_score for booster in boosters]
    print(f"{k} fold mean score:", np.mean(fold_scores))

    return boosters if return_models else None

@time_checker
def last_train(X_test, y_test, params, num_boost_round = 200):
    """Train a single booster on all supplied rows, with no validation set.

    Args:
        X_test: Features the booster is fitted on (despite the parameter name).
        y_test: Labels the booster is fitted on (despite the parameter name).
        params: Parameter dict forwarded to ``xgb.train``.
        num_boost_round: Number of boosting rounds.

    Returns:
        The fitted booster.
    """
    # Console reminder (in Korean) to re-check the hyperparameters before the final fit.
    print("***최종 학습 전 하이퍼 파라미터 다시한번 확인!!***")

    full_matrix = xgb.DMatrix(data=X_test, label=y_test)
    return xgb.train(params=params, dtrain=full_matrix, num_boost_round=num_boost_round)

def get_XGBparams(booster):
    """Flatten every ``*_param`` section of a booster's saved config into one dict.

    The JSON from ``booster.save_config()`` is walked depth-first using a
    last-in-first-out work list. Entries of each dict stored under a key ending
    in ``_param`` are copied into the result; other dict values are queued for
    traversal; anything else is ignored. When the same parameter name occurs in
    several sections, the section visited last wins.

    Args:
        booster: Object exposing ``save_config()`` that returns a JSON string.

    Returns:
        dict mapping parameter name to the value found in the config.
    """
    flat_params = {}
    pending = [json.loads(booster.save_config())]
    while pending:
        node = pending.pop()
        for key, value in node.items():
            if not key.endswith('_param'):
                # Only nested dicts are worth descending into.
                if isinstance(value, dict):
                    pending.append(value)
                continue
            flat_params.update(value.items())
    return flat_params

In [36]:
# XGBoost configuration with fractional learning_rate / subsample /
# colsample_bytree values. Three-class probability output (multi:softprob),
# scored with multi-class log loss, trained on GPU device 0. No seed is set in
# this variant (the entry is commented out).
xgb_params = {
    'booster': 'gbtree',
    'learning_rate': 0.12133255671935729,
#     'gamma': 1,
    'max_depth': 13,
#     'objective': 'multi:softmax',
    'objective': 'multi:softprob',
    'num_class': 3,
    'eval_metric': 'mlogloss',
    'subsample': 0.8056400011874829,
    'colsample_bytree': 0.6187614912442929,
    'gpu_id': 0, # GPU
    'tree_method': 'gpu_hist',
#     'seed':1324
    }

# Stratified cross-validation with up to 700 rounds per fold; the fold boosters are kept in `models`.
models = train_model(X_train, y_train, xgb_params, num_boost_round = 700, stratified = True, return_models=True)

[0]	train-mlogloss:1.02675	eval-mlogloss:1.04111
[100]	train-mlogloss:0.18824	eval-mlogloss:0.76988
[138]	train-mlogloss:0.13797	eval-mlogloss:0.82009
[0]	train-mlogloss:1.03021	eval-mlogloss:1.04243
[100]	train-mlogloss:0.20053	eval-mlogloss:0.73750
[140]	train-mlogloss:0.14516	eval-mlogloss:0.78243
[0]	train-mlogloss:1.02589	eval-mlogloss:1.03988
[100]	train-mlogloss:0.19568	eval-mlogloss:0.74771
[141]	train-mlogloss:0.14143	eval-mlogloss:0.79897
[0]	train-mlogloss:1.02723	eval-mlogloss:1.04102
[100]	train-mlogloss:0.20005	eval-mlogloss:0.74642
[141]	train-mlogloss:0.14227	eval-mlogloss:0.79765
[0]	train-mlogloss:1.02693	eval-mlogloss:1.04087
[100]	train-mlogloss:0.19791	eval-mlogloss:0.74555
[144]	train-mlogloss:0.13685	eval-mlogloss:0.80734
5 fold mean score: 0.6978488
train_model learning time: 140.05625772476196


In [35]:
# Alternative XGBoost configuration: learning rate 0.3, gamma 1, depth 15 and
# a fixed seed. Same objective, metric and GPU settings as the run above.
# This rebinds both `xgb_params` and `models`.
xgb_params = {
    'booster': 'gbtree',
    'learning_rate': 0.3,
    'gamma': 1,
    'max_depth': 15,
#     'objective': 'multi:softmax',
    'objective': 'multi:softprob',
    'num_class': 3,
    'eval_metric': 'mlogloss',
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'gpu_id': 0, # GPU
    'tree_method': 'gpu_hist',
    'seed':1324
    }

# Stratified cross-validation with up to 700 rounds per fold.
models = train_model(X_train, y_train, xgb_params, num_boost_round = 700, stratified = True, return_models=True)

[0]	train-mlogloss:0.93910	eval-mlogloss:0.97467
[100]	train-mlogloss:0.38340	eval-mlogloss:0.72906
[148]	train-mlogloss:0.37462	eval-mlogloss:0.73097
[0]	train-mlogloss:0.93899	eval-mlogloss:0.97411
[100]	train-mlogloss:0.38520	eval-mlogloss:0.70993
[133]	train-mlogloss:0.37835	eval-mlogloss:0.71066
[0]	train-mlogloss:0.93983	eval-mlogloss:0.97263
[100]	train-mlogloss:0.38274	eval-mlogloss:0.71234
[133]	train-mlogloss:0.37817	eval-mlogloss:0.71369
[0]	train-mlogloss:0.93787	eval-mlogloss:0.97342
[100]	train-mlogloss:0.38411	eval-mlogloss:0.71011
[132]	train-mlogloss:0.38026	eval-mlogloss:0.71040
[0]	train-mlogloss:0.94069	eval-mlogloss:0.97341
[100]	train-mlogloss:0.38702	eval-mlogloss:0.71731
[139]	train-mlogloss:0.37931	eval-mlogloss:0.71858
5 fold mean score: 0.7130948
train_model learning time: 17.51230001449585


In [21]:
def train_cat_model(x_data, y_data, cat_cols, k=5, 
                    num_boost_round = 200, verbose_eval = 100, 
                    early_stopping_rounds = 100, stratified = False, 
                    return_models = False):
    """Fit one default-configured CatBoostClassifier per cross-validation fold.

    Each fold's model is validated on the held-out rows and, via
    ``use_best_model=True``, trimmed to its best validation iteration.

    Args:
        x_data: Feature table, indexed positionally with ``.iloc``.
        y_data: Labels aligned with ``x_data``, indexed positionally with ``.iloc``.
        cat_cols: Columns passed to CatBoost as ``cat_features``.
        k: Number of folds.
        num_boost_round: Not used. The classifier is built with
            ``CatBoostClassifier()`` and its own defaults; wiring this value in
            would change the behaviour of existing default calls, so it is only
            kept in the signature.
        verbose_eval: Logging period forwarded to ``fit`` as ``verbose``.
        early_stopping_rounds: Patience forwarded to ``fit``.
        stratified: Use ``StratifiedKFold`` on ``y_data`` when True, plain ``KFold`` otherwise.
        return_models: Return the list of fitted models when True.

    Returns:
        The list of fitted models (one per fold) if ``return_models`` is True, else None.
    """
    # Both splitters shuffle with the same fixed seed; only the stratified one
    # needs the labels to build its folds.
    splitter_cls = StratifiedKFold if stratified else KFold
    k_fold = splitter_cls(n_splits=k, shuffle=True, random_state=123)
    data = [x_data, y_data] if stratified else [x_data]

    models = []
    for train_idx, val_idx in k_fold.split(*data):
        x_train, y_train = x_data.iloc[train_idx], y_data.iloc[train_idx]
        x_val, y_val = x_data.iloc[val_idx], y_data.iloc[val_idx]

        model = CatBoostClassifier()
        train_data = Pool(data=x_train, label=y_train, cat_features=cat_cols)
        valid_data = Pool(data=x_val, label=y_val, cat_features=cat_cols)
        # Honour the caller's early-stopping and logging settings. These were
        # previously hard-coded to 100, equal to the defaults, so default calls
        # behave exactly as before.
        model.fit(train_data, eval_set=valid_data, use_best_model=True,
                  early_stopping_rounds=early_stopping_rounds, verbose=verbose_eval)
        models.append(model)

    if return_models:
        return models

In [22]:
# Columns handed to CatBoost as categorical features.
cat_cols = ['income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type', 'ID']

In [23]:
# 5-fold CatBoost cross-validation. `stratified` is left at its default
# (False), so the folds come from plain KFold. This rebinds `models`, which
# held the XGBoost fold boosters from the cells above.
models = train_cat_model(X_train, y_train, cat_cols, return_models = True)

Learning rate set to 0.114773
0:	learn: 1.0355876	test: 1.0357518	best: 1.0357518 (0)	total: 216ms	remaining: 3m 35s
100:	learn: 0.7101622	test: 0.6809943	best: 0.6809943 (100)	total: 3.99s	remaining: 35.5s
200:	learn: 0.6870712	test: 0.6794566	best: 0.6789316 (161)	total: 8.31s	remaining: 33s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6789316413
bestIteration = 161

Shrink model to first 162 iterations.
Learning rate set to 0.114773
0:	learn: 1.0346349	test: 1.0359460	best: 1.0359460 (0)	total: 36.1ms	remaining: 36.1s
100:	learn: 0.7068730	test: 0.6946013	best: 0.6946013 (100)	total: 3.75s	remaining: 33.4s
200:	learn: 0.6860307	test: 0.6940101	best: 0.6938711 (193)	total: 8.06s	remaining: 32.1s
300:	learn: 0.6656596	test: 0.6950971	best: 0.6938585 (205)	total: 12.5s	remaining: 29.1s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6938584617
bestIteration = 205

Shrink model to first 206 iterations.
Learning rate set to 0.114773
0:	learn: 

In [24]:
# Single CatBoost model with an explicit learning rate, plus a Pool built from the full training set.
cat_model = CatBoostClassifier(learning_rate = 0.1)
train_data = Pool(data=X_train, label=y_train, cat_features=cat_cols)

In [25]:
# Fit on the full training Pool. No eval_set is passed, so there is no
# validation metric for early_stopping_rounds to monitor; the recorded log
# runs through iteration 999.
# NOTE(review): confirm whether a hold-out set was intended here.
cat_model.fit(train_data, early_stopping_rounds=100, verbose=100)

0:	learn: 1.0433443	total: 37.3ms	remaining: 37.3s
100:	learn: 0.7030099	total: 3.44s	remaining: 30.6s
200:	learn: 0.6870719	total: 7.71s	remaining: 30.6s
300:	learn: 0.6728285	total: 12.1s	remaining: 28s
400:	learn: 0.6573662	total: 17s	remaining: 25.4s
500:	learn: 0.6441099	total: 21.4s	remaining: 21.4s
600:	learn: 0.6304447	total: 25.8s	remaining: 17.2s
700:	learn: 0.6172613	total: 30s	remaining: 12.8s
800:	learn: 0.6042767	total: 34.5s	remaining: 8.56s
900:	learn: 0.5917884	total: 39.2s	remaining: 4.3s
999:	learn: 0.5783965	total: 43.2s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x204d65f8908>

In [26]:
# Class probabilities for the test table from the single full-data model.
pred = cat_model.predict_proba(test)

In [27]:
# Sanity check: one row per test sample, one column per class.
pred.shape

(10000, 3)

In [28]:
# Write the probabilities into every submission column after the first one.
submission.iloc[:, 1:] = pred

In [29]:
# Preview the submission filled from the single model.
submission.head()

Unnamed: 0,index,0,1,2
0,26457,0.13381,0.197049,0.669141
1,26458,0.334796,0.261194,0.404009
2,26459,0.028673,0.063094,0.908234
3,26460,0.056031,0.065618,0.878351
4,26461,0.101319,0.228913,0.669768


In [30]:
# Clear the single-model probabilities so the columns can be refilled from the cross-validation models.
submission.iloc[:, 1:] = 0

In [31]:
# Confirm that the probability columns were reset.
submission.head()

Unnamed: 0,index,0,1,2
0,26457,0,0,0
1,26458,0,0,0
2,26459,0,0,0
3,26460,0,0,0
4,26461,0,0,0


In [32]:
# Average the class probabilities of all fold models. The weight is derived
# from len(models) instead of a hard-coded 5, and the sum is built in a fresh
# array and then assigned, so re-running this cell no longer adds the
# predictions onto whatever is already in `submission`.
ensemble_proba = np.zeros((len(test), submission.shape[1] - 1))
for model in models:
    ensemble_proba += model.predict_proba(test) / len(models)
submission.iloc[:, 1:] = ensemble_proba

In [33]:
# Preview the fold-averaged submission.
submission.head()

Unnamed: 0,index,0,1,2
0,26457,0.101221,0.159938,0.738841
1,26458,0.313959,0.215667,0.470374
2,26459,0.038043,0.083354,0.878603
3,26460,0.051991,0.084526,0.863483
4,26461,0.08626,0.252289,0.661451
