In [29]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder, PowerTransformer, StandardScaler, \
                                    MinMaxScaler, FunctionTransformer, OneHotEncoder
from sklearn.metrics import roc_auc_score, log_loss, classification_report
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import Pipeline, make_union, make_pipeline
from sklearn.decomposition import PCA
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from bayes_opt import BayesianOptimization
from function_dt_check import time_checker
import json

In [30]:
# Global matplotlib settings for every figure in this notebook.
# NOTE(review): unicode_minus is turned off presumably because the chosen font
# lacks the Unicode minus glyph - confirm on the target machine.
plt.rcParams['axes.unicode_minus'] = False
# This font must be installed locally for the setting to take effect.
plt.rcParams['font.family'] = 'Hancom Gothic'
plt.style.use('bmh')
plt.rc('font',size=15)

In [31]:
# Read the three CSV files; the 'index' column is dropped from train and test
# right after loading.
path = './data/'

train = pd.read_csv(path+'train.csv').drop(['index'], axis=1)
test = pd.read_csv(path+'test.csv').drop(['index'], axis=1)
submission = pd.read_csv(path+'sample_submission.csv')

In [32]:
# Every column except the last is a feature; the last column is the target.
X_train, y_train = train.iloc[:,:-1], train.iloc[:,-1]

In [33]:
# Object-dtype columns are treated as categorical, all remaining ones as numeric.
cat_columns = [name for name, dtype in X_train.dtypes.items() if dtype == 'O']
num_columns = [name for name in X_train.columns if name not in cat_columns]

In [34]:
train.head()

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0


In [35]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         26457 non-null  object 
 1   car            26457 non-null  object 
 2   reality        26457 non-null  object 
 3   child_num      26457 non-null  int64  
 4   income_total   26457 non-null  float64
 5   income_type    26457 non-null  object 
 6   edu_type       26457 non-null  object 
 7   family_type    26457 non-null  object 
 8   house_type     26457 non-null  object 
 9   DAYS_BIRTH     26457 non-null  int64  
 10  DAYS_EMPLOYED  26457 non-null  int64  
 11  FLAG_MOBIL     26457 non-null  int64  
 12  work_phone     26457 non-null  int64  
 13  phone          26457 non-null  int64  
 14  email          26457 non-null  int64  
 15  occyp_type     18286 non-null  object 
 16  family_size    26457 non-null  float64
 17  begin_month    26457 non-null  float64
dtypes: flo

In [36]:
train.isna().sum()

gender              0
car                 0
reality             0
child_num           0
income_total        0
income_type         0
edu_type            0
family_type         0
house_type          0
DAYS_BIRTH          0
DAYS_EMPLOYED       0
FLAG_MOBIL          0
work_phone          0
phone               0
email               0
occyp_type       8171
family_size         0
begin_month         0
credit              0
dtype: int64

### occyp_type만 결측치 존재

In [37]:
y_train.value_counts()

2.0    16968
1.0     6267
0.0     3222
Name: credit, dtype: int64

### 클래스 불균형 존재

In [38]:
@time_checker
def train_model(x_data, y_data, params, k=5, num_boost_round = 200, verbose_eval = 100, early_stopping_rounds = 100, stratified = False, return_models = False):
    """Train one XGBoost booster per cross-validation fold.

    Parameters
    ----------
    x_data, y_data : pandas objects
        Features and labels; rows are selected positionally with ``.iloc``.
    params : dict
        Passed unchanged to ``xgb.train``.
    k : int
        Number of folds.
    num_boost_round, verbose_eval, early_stopping_rounds
        Forwarded to ``xgb.train``.
    stratified : bool
        Use ``StratifiedKFold`` (split on the labels) instead of ``KFold``.
    return_models : bool
        When true, the list of fitted boosters is returned.

    Returns
    -------
    list or None
        The per-fold boosters if ``return_models`` is true, otherwise None.

    Prints the mean of every booster's ``best_score``.
    """
    splitter_cls = StratifiedKFold if stratified else KFold
    k_fold = splitter_cls(n_splits=k, shuffle=True, random_state=123)
    # StratifiedKFold.split gets the labels as well; KFold.split only the features.
    split_args = (x_data, y_data) if stratified else (x_data,)

    models = []
    for fit_rows, holdout_rows in k_fold.split(*split_args):
        d_train = xgb.DMatrix(data = x_data.iloc[fit_rows], label = y_data.iloc[fit_rows])
        d_val = xgb.DMatrix(data = x_data.iloc[holdout_rows], label = y_data.iloc[holdout_rows])

        booster = xgb.train(
            params=params,
            dtrain=d_train,
            num_boost_round = num_boost_round,
            evals=[(d_train, 'train'), (d_val, 'eval')],
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=verbose_eval,
        )
        models.append(booster)

    print(f"{k} fold mean score:", np.mean([fold_model.best_score for fold_model in models]))

    if return_models:
        return models

@time_checker
def last_train(X_test, y_test, params, num_boost_round = 200):
    """Fit a single booster on the given data without any validation set.

    Despite their names, ``X_test`` / ``y_test`` are the features and labels
    the booster is trained on. ``params`` and ``num_boost_round`` are passed
    straight to ``xgb.train``. Returns the trained booster.
    """
    print("***최종 학습 전 하이퍼 파라미터 다시한번 확인!!***")

    full_matrix = xgb.DMatrix(data = X_test, label = y_test)
    return xgb.train(params = params, dtrain = full_matrix, num_boost_round = num_boost_round)

def get_XGBparams(booster):
    """Flatten every ``*_param`` section of a booster's JSON config into one dict.

    Parameters
    ----------
    booster : object exposing ``save_config()`` that returns a JSON string.

    Returns
    -------
    dict
        Key/value pairs collected from every mapping whose key ends with
        ``_param``. If the same parameter name occurs in several sections,
        the section visited last wins.

    The walk descends through nested dicts and also through lists, so
    sections stored inside JSON arrays are picked up as well. A ``*_param``
    entry that is not a mapping is skipped.
    """
    pending = [json.loads(booster.save_config())]
    flat = {}
    while pending:
        node = pending.pop()
        if isinstance(node, list):
            # Only containers can hold further *_param sections.
            pending.extend(child for child in node if isinstance(child, (dict, list)))
            continue
        for key, value in node.items():
            if key.endswith('_param') and isinstance(value, dict):
                flat.update(value)
            elif isinstance(value, (dict, list)):
                pending.append(value)
    return flat

In [39]:
plt.rc('font', size = 10)
X_train[num_columns].hist(figsize = (9, 10), bins=30);

In [40]:
X_train['FLAG_MOBIL'].value_counts()

1    26457
Name: FLAG_MOBIL, dtype: int64

In [41]:
test['FLAG_MOBIL'].value_counts()

1    10000
Name: FLAG_MOBIL, dtype: int64

FLAG_MOBIL은 train과 test 모두에서 모든 값이 1이므로 삭제

In [42]:
# FLAG_MOBIL has a single value in both train and test, so it is removed.
# errors='ignore' and rebuilding the list keep this cell safe to re-run
# (a second run no longer raises KeyError / ValueError).
X_train = X_train.drop(columns='FLAG_MOBIL', errors='ignore')
test = test.drop(columns='FLAG_MOBIL', errors='ignore')
num_columns = [c for c in num_columns if c != 'FLAG_MOBIL']

In [43]:
plt.figure(figsize = (12, 4))
plt.rc('font', size = 15)

# One box plot per feature, laid out side by side in a 1x3 grid.
for position, feature in enumerate(['child_num', 'income_total', 'family_size'], start=1):
    plt.subplot(1, 3, position)
    plt.boxplot(X_train[feature])
    plt.xticks([1], [feature])

In [44]:
# Columns that receive the Yeo-Johnson power transform in data_preprocessing.
skew_col = ['child_num', 'income_total', 'family_size']
# Categorical columns that data_preprocessing one-hot encodes with pd.get_dummies.
OH_col = ['income_type', 'family_type', 'house_type', 'occyp_type']

In [45]:
# Stand-alone pipeline (standardize, then Yeo-Johnson) used in the next cell to
# preview the transform on skew_col. data_preprocessing builds its own scaler
# and transformer and does not use this object.
pipline = Pipeline([
        ("stdard_scaler", StandardScaler()),
        ("Yeo-Johnson", PowerTransformer(method='yeo-johnson'))
    ])

In [46]:
pipline.fit_transform(X_train[skew_col])

array([[-0.6630603 ,  0.46990897, -0.06168216],
       [ 1.39481456,  0.87925267,  0.96173668],
       [-0.6630603 ,  1.96426356, -0.06168216],
       ...,
       [-0.6630603 ,  1.19725036, -0.06168216],
       [-0.6630603 ,  0.09046305, -1.62051665],
       [-0.6630603 , -1.54591439, -0.06168216]])

In [47]:
def data_preprocessing(data, fit_transform = False, piplines = None):
    """Encode, scale and power-transform a raw feature frame.

    Steps, in order:
      1. Categorical columns with exactly two levels become 0/1.
      2. Remaining missing values are replaced by the string "missing".
      3. ``OH_col`` columns are one-hot encoded with ``pd.get_dummies``.
      4. ``edu_type`` is mapped to an ordinal 0..4 scale.
      5. Every column is standardized, then ``skew_col`` gets Yeo-Johnson.

    Relies on the notebook-level lists ``cat_columns``, ``OH_col`` and
    ``skew_col``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw features; not modified (a copy is processed).
    fit_transform : bool
        True  -> fit a new scaler / power transformer on ``data``.
        False -> reuse the fitted objects given in ``piplines``.
    piplines : list, optional
        ``[StandardScaler, PowerTransformer]`` as returned by a previous call
        with ``fit_transform=True``. Required when ``fit_transform`` is false.

    Returns
    -------
    (DataFrame, [StandardScaler, PowerTransformer]) when ``fit_transform`` is
    true, otherwise just the transformed DataFrame. The returned frame has a
    fresh 0..n-1 index.

    Raises
    ------
    AssertionError
        If ``fit_transform`` is false and no ``piplines`` were passed.
    """
    if not fit_transform:
        assert piplines, 'If fit_transform is false, pipline must be passed'

    proc_data = data.copy()

    for col in cat_columns:
        # Sort the levels so the 0/1 assignment depends only on the values
        # themselves, not on which one happens to appear first in this frame;
        # otherwise train and test could be encoded with opposite polarity.
        levels = sorted(proc_data[col].dropna().unique())
        if len(levels) == 2:
            proc_data[col] = (proc_data[col] == levels[0]).astype(int)

    proc_data = proc_data.fillna("missing")

    proc_data = pd.get_dummies(proc_data, columns = OH_col)

    proc_data['edu_type'] = proc_data['edu_type'].replace({'Lower secondary':0,
                                                           'Secondary / secondary special':1,
                                                           'Incomplete higher':2,
                                                           'Higher education':3,
                                                           'Academic degree':4})

    if fit_transform:
        cols = proc_data.columns
        std_scaler = StandardScaler()
        YJ_transf = PowerTransformer(method='yeo-johnson')

        scaled = pd.DataFrame(std_scaler.fit_transform(proc_data), columns=cols)
        scaled[skew_col] = YJ_transf.fit_transform(scaled[skew_col])

        return scaled, [std_scaler, YJ_transf]

    std_scaler, YJ_transf = piplines[0], piplines[1]

    # get_dummies only creates columns for categories present in THIS frame.
    # Align to the columns the scaler was fitted on: categories missing here
    # become all-zero columns, categories unseen at fit time are dropped.
    # feature_names_in_ is only set by scikit-learn versions that record
    # feature names; without it the frame is passed through unchanged.
    fitted_cols = getattr(std_scaler, 'feature_names_in_', None)
    if fitted_cols is not None:
        proc_data = proc_data.reindex(columns=fitted_cols, fill_value=0)
    cols = proc_data.columns

    scaled = pd.DataFrame(std_scaler.transform(proc_data), columns=cols)
    scaled[skew_col] = YJ_transf.transform(scaled[skew_col])

    return scaled
        

In [48]:
# Fit the scaler / power transformer on the training features, then reuse the
# fitted objects to transform the test features.
x_train, piplines = data_preprocessing(X_train, fit_transform = True)
x_test = data_preprocessing(test, piplines = piplines)

In [49]:
piplines

[StandardScaler(), PowerTransformer()]

In [50]:
x_train.isna().sum()

gender                              0
car                                 0
reality                             0
child_num                           0
income_total                        0
edu_type                            0
DAYS_BIRTH                          0
DAYS_EMPLOYED                       0
work_phone                          0
phone                               0
email                               0
family_size                         0
begin_month                         0
income_type_Commercial associate    0
income_type_Pensioner               0
income_type_State servant           0
income_type_Student                 0
income_type_Working                 0
family_type_Civil marriage          0
family_type_Married                 0
family_type_Separated               0
family_type_Single / not married    0
family_type_Widow                   0
house_type_Co-op apartment          0
house_type_House / apartment        0
house_type_Municipal apartment      0
house_type_O

In [51]:
len(X_train['occyp_type'].unique()) == len([col for col in x_train.keys() if col.startswith("occyp")])

True

### 생각해 볼점
유일하게 결측치가 존재하는 컬럼인 occyp_type은 직업 유형을 나타낸 컬럼임. 생각해 볼 2가지 처리 방법이 있음.
1. occyp_type은 카테고리형 데이터임 -> 결측되었다는 정보 자체를 하나의 컬럼으로 one-hot encoding한 후 모델에 넣어서 돌리기
2. MICE로 imputation한 후 돌리기

우선 1안으로 실험

In [52]:
x_train.shape, y_train.shape

((26457, 48), (26457,))

In [53]:
# Baseline XGBoost settings for the 3-class credit target, scored with
# multi-class log loss.
xgb_params = {
    'booster': 'gbtree',
    'learning_rate': 0.3,
    'gamma': 0.3,
    'max_depth': 3,
#     'objective': 'multi:softmax',
    'objective': 'multi:softprob',
    'num_class': 3,
    'eval_metric': 'mlogloss',
    'seed':1324
    }

In [54]:
# Exploratory check of how many principal components a 0.8 setting keeps.
# NOTE(review): x_train_pca is not used by any later cell visible here.
pca = PCA(n_components=0.8)
x_train_pca = pca.fit_transform(x_train)
x_train_pca.shape

(26457, 29)

In [55]:
models = train_model(x_train, y_train, xgb_params, num_boost_round = 700, stratified = True, return_models=True)

[0]	train-mlogloss:0.97811	eval-mlogloss:0.97988
[100]	train-mlogloss:0.73846	eval-mlogloss:0.78894
[200]	train-mlogloss:0.69479	eval-mlogloss:0.77848
[300]	train-mlogloss:0.66432	eval-mlogloss:0.77257
[400]	train-mlogloss:0.64023	eval-mlogloss:0.76889
[500]	train-mlogloss:0.63552	eval-mlogloss:0.76789
[521]	train-mlogloss:0.63552	eval-mlogloss:0.76789
[0]	train-mlogloss:0.97843	eval-mlogloss:0.97925
[100]	train-mlogloss:0.73607	eval-mlogloss:0.78227
[200]	train-mlogloss:0.69351	eval-mlogloss:0.76772
[300]	train-mlogloss:0.66260	eval-mlogloss:0.75878
[400]	train-mlogloss:0.64080	eval-mlogloss:0.75387
[500]	train-mlogloss:0.63021	eval-mlogloss:0.75162
[548]	train-mlogloss:0.63021	eval-mlogloss:0.75162
[0]	train-mlogloss:0.97910	eval-mlogloss:0.97764
[100]	train-mlogloss:0.73998	eval-mlogloss:0.78022
[200]	train-mlogloss:0.69456	eval-mlogloss:0.76748
[300]	train-mlogloss:0.65817	eval-mlogloss:0.76010
[400]	train-mlogloss:0.63968	eval-mlogloss:0.75698
[500]	train-mlogloss:0.63488	eval-mlo

In [56]:
model = last_train(x_train, y_train, xgb_params, 700)

***최종 학습 전 하이퍼 파라미터 다시한번 확인!!***
last_train learning time: 5.004458904266357


In [57]:
result = model.predict(xgb.DMatrix(x_test))

In [58]:
result

array([[0.07346022, 0.12216133, 0.80437845],
       [0.12354086, 0.12117603, 0.75528306],
       [0.13675895, 0.22094147, 0.6422996 ],
       ...,
       [0.07561718, 0.20075288, 0.7236299 ],
       [0.0594825 , 0.18499026, 0.75552726],
       [0.08669662, 0.16240537, 0.750898  ]], dtype=float32)

In [59]:
submission[['0', '1', '2']] = result

In [60]:
submission.head()

Unnamed: 0,index,0,1,2
0,26457,0.07346,0.122161,0.804378
1,26458,0.123541,0.121176,0.755283
2,26459,0.136759,0.220941,0.6423
3,26460,0.16568,0.122165,0.712154
4,26461,0.059492,0.074284,0.866224


### oversampling

In [61]:
# Fixed seeds so the synthetic samples, and every score computed from them,
# are the same on each run.
x_smote_resampled, y_smote_resampled = SMOTE(random_state=123).fit_resample(x_train, y_train)
x_adasyn_resampled, y_adasyn_resampled = ADASYN(random_state=123).fit_resample(x_train, y_train)
x_Bsmote_resampled, y_Bsmote_resampled = BorderlineSMOTE(random_state=123).fit_resample(x_train, y_train)

In [62]:
# Seeded so the split is reproducible. This split is made AFTER oversampling,
# so the held-out part also contains synthetic rows.
xr, xt, yr, yt = train_test_split(x_smote_resampled, y_smote_resampled, test_size = 0.3, random_state = 123)

In [63]:
train_model(x_smote_resampled, y_smote_resampled, xgb_params, num_boost_round = 700, verbose_eval = 140)

[0]	train-mlogloss:1.07892	eval-mlogloss:1.08069
[140]	train-mlogloss:0.78262	eval-mlogloss:0.80900
[280]	train-mlogloss:0.67397	eval-mlogloss:0.72050
[420]	train-mlogloss:0.61291	eval-mlogloss:0.67763
[560]	train-mlogloss:0.57462	eval-mlogloss:0.65468
[699]	train-mlogloss:0.54518	eval-mlogloss:0.63862
[0]	train-mlogloss:1.07908	eval-mlogloss:1.07844
[140]	train-mlogloss:0.78775	eval-mlogloss:0.81352
[280]	train-mlogloss:0.67122	eval-mlogloss:0.71981
[420]	train-mlogloss:0.61331	eval-mlogloss:0.68101
[560]	train-mlogloss:0.57257	eval-mlogloss:0.65661
[699]	train-mlogloss:0.53860	eval-mlogloss:0.63996
[0]	train-mlogloss:1.07906	eval-mlogloss:1.07915
[140]	train-mlogloss:0.77899	eval-mlogloss:0.80681
[280]	train-mlogloss:0.67433	eval-mlogloss:0.72239
[420]	train-mlogloss:0.61264	eval-mlogloss:0.67937
[560]	train-mlogloss:0.57242	eval-mlogloss:0.65643
[699]	train-mlogloss:0.54432	eval-mlogloss:0.64135
[0]	train-mlogloss:1.07853	eval-mlogloss:1.08015
[140]	train-mlogloss:0.77873	eval-mlogl

In [64]:
train_model(x_adasyn_resampled, y_adasyn_resampled, xgb_params, num_boost_round = 700, verbose_eval = 140)

[0]	train-mlogloss:1.07860	eval-mlogloss:1.07893
[140]	train-mlogloss:0.79683	eval-mlogloss:0.82362
[280]	train-mlogloss:0.69555	eval-mlogloss:0.74272
[420]	train-mlogloss:0.63524	eval-mlogloss:0.70151
[560]	train-mlogloss:0.59837	eval-mlogloss:0.67983
[699]	train-mlogloss:0.57585	eval-mlogloss:0.66715
[0]	train-mlogloss:1.07908	eval-mlogloss:1.07957
[140]	train-mlogloss:0.79316	eval-mlogloss:0.82297
[280]	train-mlogloss:0.69192	eval-mlogloss:0.74502
[420]	train-mlogloss:0.63492	eval-mlogloss:0.70659
[560]	train-mlogloss:0.59526	eval-mlogloss:0.68487
[699]	train-mlogloss:0.56710	eval-mlogloss:0.66960
[0]	train-mlogloss:1.07668	eval-mlogloss:1.07709
[140]	train-mlogloss:0.78946	eval-mlogloss:0.82068
[280]	train-mlogloss:0.69373	eval-mlogloss:0.74607
[420]	train-mlogloss:0.63530	eval-mlogloss:0.70636
[560]	train-mlogloss:0.59795	eval-mlogloss:0.68550
[699]	train-mlogloss:0.57061	eval-mlogloss:0.67068
[0]	train-mlogloss:1.07813	eval-mlogloss:1.07905
[140]	train-mlogloss:0.79380	eval-mlogl

In [65]:
train_model(x_Bsmote_resampled, y_Bsmote_resampled, xgb_params, num_boost_round = 700, verbose_eval = 140)

[0]	train-mlogloss:1.07903	eval-mlogloss:1.08074
[140]	train-mlogloss:0.79345	eval-mlogloss:0.81927
[280]	train-mlogloss:0.68004	eval-mlogloss:0.72203
[420]	train-mlogloss:0.61911	eval-mlogloss:0.68144
[560]	train-mlogloss:0.57910	eval-mlogloss:0.65714
[699]	train-mlogloss:0.55839	eval-mlogloss:0.64663
[0]	train-mlogloss:1.07874	eval-mlogloss:1.07869
[140]	train-mlogloss:0.79322	eval-mlogloss:0.82154
[280]	train-mlogloss:0.68027	eval-mlogloss:0.72702
[420]	train-mlogloss:0.61733	eval-mlogloss:0.67991
[560]	train-mlogloss:0.58121	eval-mlogloss:0.65733
[699]	train-mlogloss:0.55730	eval-mlogloss:0.64355
[0]	train-mlogloss:1.07920	eval-mlogloss:1.07885
[140]	train-mlogloss:0.79461	eval-mlogloss:0.82221
[280]	train-mlogloss:0.67903	eval-mlogloss:0.72833
[420]	train-mlogloss:0.61711	eval-mlogloss:0.68495
[560]	train-mlogloss:0.57260	eval-mlogloss:0.65723
[699]	train-mlogloss:0.54139	eval-mlogloss:0.64066
[0]	train-mlogloss:1.07887	eval-mlogloss:1.08011
[140]	train-mlogloss:0.79239	eval-mlogl

In [66]:
xr = xr.reset_index(drop = True)
yr = yr.reset_index(drop = True)

In [67]:
# Same model with a larger gamma plus row/column subsampling, trained on the
# ADASYN-resampled data.
# NOTE(review): 'gpu_id' / 'gpu_hist' presumably require a CUDA-enabled
# xgboost build - confirm before running on another machine.
xgb_params = {
    'booster': 'gbtree',
    'learning_rate': 0.3,
    'gamma': 1,
    'max_depth': 3,
#     'objective': 'multi:softmax',
    'objective': 'multi:softprob',
    'num_class': 3,
    'eval_metric': 'mlogloss',
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'gpu_id': 0, # GPU
    'tree_method': 'gpu_hist',
    'seed':1324
    }
models = train_model(x_adasyn_resampled, y_adasyn_resampled, xgb_params, num_boost_round = 1000, verbose_eval = 200, return_models = True)

[0]	train-mlogloss:1.08333	eval-mlogloss:1.08329
[200]	train-mlogloss:0.75208	eval-mlogloss:0.78735
[400]	train-mlogloss:0.66718	eval-mlogloss:0.72133
[600]	train-mlogloss:0.62743	eval-mlogloss:0.69439
[800]	train-mlogloss:0.60132	eval-mlogloss:0.67590
[999]	train-mlogloss:0.58012	eval-mlogloss:0.66181
[0]	train-mlogloss:1.08061	eval-mlogloss:1.08109
[200]	train-mlogloss:0.75166	eval-mlogloss:0.78732
[400]	train-mlogloss:0.66758	eval-mlogloss:0.72222
[600]	train-mlogloss:0.62678	eval-mlogloss:0.69502
[800]	train-mlogloss:0.59806	eval-mlogloss:0.67540
[999]	train-mlogloss:0.57867	eval-mlogloss:0.66547
[0]	train-mlogloss:1.08183	eval-mlogloss:1.08207
[200]	train-mlogloss:0.75054	eval-mlogloss:0.78533
[400]	train-mlogloss:0.66786	eval-mlogloss:0.72168
[600]	train-mlogloss:0.62623	eval-mlogloss:0.69240
[800]	train-mlogloss:0.59983	eval-mlogloss:0.67558
[999]	train-mlogloss:0.58001	eval-mlogloss:0.66421
[0]	train-mlogloss:1.08054	eval-mlogloss:1.08074
[200]	train-mlogloss:0.74799	eval-mlogl

SMOTE를 사용하여 oversampling 후 학습을 진행하였음.
학습 시 train, valid, test score가 모두 올랐으나 제출하였을 때 점수는 그대로였음.
1. oversampling 이전에 먼저 test set을 분리 후 다시 실험
2. oversampling 폐기

우선 1안 수행

In [68]:
# Hold out a test split BEFORE resampling so no synthetic row ends up in it.
# stratify keeps the class ratio in both parts (as the downsampling experiment
# does); the seeds make the split and the ADASYN samples reproducible.
xr, xt, yr, yt = train_test_split(x_train, y_train, test_size = 0.3,
                                  stratify = y_train, random_state = 123)
xr = xr.reset_index(drop = True)
yr = yr.reset_index(drop = True)
xt = xt.reset_index(drop = True)
yt = yt.reset_index(drop = True)

# Oversample the training part only.
xr_adasyn, yr_adasyn = ADASYN(random_state = 123).fit_resample(xr, yr)

xgb_params = {
    'booster': 'gbtree',
    'learning_rate': 0.3,
    'gamma': 1,
    'max_depth': 3,
    'objective': 'multi:softprob',
    'num_class': 3,
    'eval_metric': 'mlogloss',
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'gpu_id': 0, # GPU
    'tree_method': 'gpu_hist',
    'seed':1324
    }

models = train_model(xr_adasyn, yr_adasyn,
                     xgb_params, num_boost_round = 300,
                     verbose_eval = 200, return_models = True)

[0]	train-mlogloss:1.07948	eval-mlogloss:1.08008
[200]	train-mlogloss:0.72136	eval-mlogloss:0.76614
[299]	train-mlogloss:0.67292	eval-mlogloss:0.72974
[0]	train-mlogloss:1.07713	eval-mlogloss:1.07648
[200]	train-mlogloss:0.72176	eval-mlogloss:0.77203
[299]	train-mlogloss:0.66970	eval-mlogloss:0.73351
[0]	train-mlogloss:1.07830	eval-mlogloss:1.07923
[200]	train-mlogloss:0.72129	eval-mlogloss:0.77159
[299]	train-mlogloss:0.67291	eval-mlogloss:0.73508
[0]	train-mlogloss:1.07668	eval-mlogloss:1.07748
[200]	train-mlogloss:0.72106	eval-mlogloss:0.77851
[299]	train-mlogloss:0.67449	eval-mlogloss:0.74620
[0]	train-mlogloss:1.07634	eval-mlogloss:1.07688
[200]	train-mlogloss:0.71777	eval-mlogloss:0.76500
[299]	train-mlogloss:0.67569	eval-mlogloss:0.73623
5 fold mean score: 0.7361525999999999
train_model learning time: 12.819717645645142


In [69]:
# Score every fold model on the hold-out split; the labels are one-hot encoded
# before being compared with the per-class probability output.
for mod in models:
    print(log_loss(pd.get_dummies(yt), mod.predict(xgb.DMatrix(xt))))

0.8295004104190635
0.8291808436914497
0.8341620832535704
0.8306998954941565
0.836213702944399


train-test split --> train data oversampling --> training & test 순서로 실험결과 성능 향상 없이 과적합 발생
- [ ] oversampling 이전에 먼저 test set을 분리 후 다시 실험
- [x] **oversampling 폐기**

### downsampling

In [70]:
from imblearn.under_sampling import OneSidedSelection, TomekLinks, RandomUnderSampler

In [71]:
# Stratified hold-out split; seeded so the reported scores can be reproduced.
xr, xt, yr, yt = train_test_split(x_train, y_train, test_size = 0.3,
                                  stratify = y_train, random_state = 123)
xr = xr.reset_index(drop = True)
yr = yr.reset_index(drop = True)
xt = xt.reset_index(drop = True)
yt = yt.reset_index(drop = True)

# Undersample the training part only.
xr_OSS, yr_OSS = OneSidedSelection(random_state = 756).fit_resample(xr, yr)

xgb_params = {
    'booster': 'gbtree',
    'learning_rate': 0.3,
    'gamma': 1,
    'max_depth': 3,
    'objective': 'multi:softprob',
    'num_class': 3,
    'eval_metric': 'mlogloss',
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'gpu_id': 0, # GPU
    'tree_method': 'gpu_hist',
    'seed':1324
    }

models = train_model(xr_OSS, yr_OSS,
                     xgb_params, num_boost_round = 1000,
                     verbose_eval = 200, return_models = True)

[0]	train-mlogloss:0.99180	eval-mlogloss:0.98848
[200]	train-mlogloss:0.67510	eval-mlogloss:0.75975
[400]	train-mlogloss:0.62518	eval-mlogloss:0.75520
[600]	train-mlogloss:0.59597	eval-mlogloss:0.75316
[800]	train-mlogloss:0.57195	eval-mlogloss:0.75212
[990]	train-mlogloss:0.55711	eval-mlogloss:0.75014
[0]	train-mlogloss:0.99027	eval-mlogloss:0.99416
[200]	train-mlogloss:0.66709	eval-mlogloss:0.79021
[400]	train-mlogloss:0.61624	eval-mlogloss:0.78523
[600]	train-mlogloss:0.58807	eval-mlogloss:0.77944
[706]	train-mlogloss:0.57673	eval-mlogloss:0.78019
[0]	train-mlogloss:0.99025	eval-mlogloss:0.99209
[200]	train-mlogloss:0.67178	eval-mlogloss:0.77333
[400]	train-mlogloss:0.62182	eval-mlogloss:0.76192
[600]	train-mlogloss:0.59257	eval-mlogloss:0.75757
[800]	train-mlogloss:0.57120	eval-mlogloss:0.75397
[990]	train-mlogloss:0.55379	eval-mlogloss:0.75366
[0]	train-mlogloss:0.99093	eval-mlogloss:0.99103
[200]	train-mlogloss:0.67990	eval-mlogloss:0.75254
[400]	train-mlogloss:0.63154	eval-mlogl

In [72]:
for mod in models:
    print(log_loss(pd.get_dummies(yt), mod.predict(xgb.DMatrix(xt))))

0.8042205899096668
0.799852478577321
0.8049384882328628
0.8058791990977594
0.8010758388838897


In [73]:
# Report for the last fold model. It is referenced explicitly so this cell does
# not depend on a loop variable left over from an earlier cell.
last_fold_model = models[-1]
print(classification_report(yt.values, np.argmax(last_fold_model.predict(xgb.DMatrix(xt)), axis=1)))

              precision    recall  f1-score   support

         0.0       0.40      0.09      0.15       967
         1.0       0.65      0.29      0.40      1880
         2.0       0.71      0.95      0.81      5091

    accuracy                           0.69      7938
   macro avg       0.59      0.44      0.45      7938
weighted avg       0.66      0.69      0.63      7938



In [74]:
yt.value_counts()

2.0    5091
1.0    1880
0.0     967
Name: credit, dtype: int64

In [81]:
yt.values

array([1., 2., 2., ..., 2., 1., 2.])

In [80]:
pd.get_dummies(yt)

Unnamed: 0,0.0,1.0,2.0
0,0,1,0
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
7933,0,1,0
7934,0,0,1
7935,0,0,1
7936,0,1,0


In [77]:
mod.predict(xgb.DMatrix(xt))

array([[0.09064728, 0.22937799, 0.6799748 ],
       [0.16900523, 0.15119497, 0.67979974],
       [0.0894729 , 0.18212627, 0.72840077],
       ...,
       [0.0663086 , 0.1478953 , 0.78579605],
       [0.10580068, 0.1710029 , 0.72319645],
       [0.27454805, 0.17520015, 0.5502518 ]], dtype=float32)

In [78]:
np.argmax(mod.predict(xgb.DMatrix(xt)), axis=1)

array([2, 2, 2, ..., 2, 2, 2], dtype=int64)

In [79]:
yt.values

array([1., 2., 2., ..., 2., 1., 2.])