In [3]:
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier, Pool
import torch
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.metrics import log_loss
from category_encoders.ordinal import OrdinalEncoder
import glob
import pickle
import multiprocessing
# Number of CPUs reported by multiprocessing for this machine.
n_cpus = multiprocessing.cpu_count()

In [39]:
# Root folder of the project. This is a machine-specific absolute path and
# has to be adjusted when the notebook is run anywhere else.
BASE_DIR = '/Users/HwaLang/Desktop/python/T academy/Kaggle_camp/'

# Registry of available scalers: short name -> scaler class (not an instance).
scaler_dict = dict(
    minmax=MinMaxScaler,
    standard=StandardScaler,
)

def select_scaler(name):
    """Announce and build a fresh scaler for the given registry key.

    Args:
        name: Key into ``scaler_dict`` (e.g. ``"minmax"`` or ``"standard"``).

    Returns:
        A new, unfitted instance of the scaler class registered under ``name``.

    Raises:
        KeyError: If ``name`` is not a key of ``scaler_dict``.
    """
    print(f"Select {name} Scaler")
    scaler_cls = scaler_dict[name]
    return scaler_cls()

def _cycle_position(days, unit, cycle):
    """Convert ``days`` to whole ``unit``-day steps and wrap them into ``cycle`` steps.

    Computes ``floor(days / unit)`` and subtracts the largest multiple of
    ``cycle`` not exceeding it (e.g. unit=30, cycle=12 gives a month-in-year
    style value in 0..11 for non-negative inputs).
    """
    steps = np.floor(days / unit)
    return steps - (steps / cycle).astype(int) * cycle


def _add_derived_features(df):
    """Add the engineered columns to ``df`` in place (column order matters downstream)."""
    # before_EMPLOYED: number of days lived before getting employed.
    df['before_EMPLOYED'] = df['DAYS_BIRTH'] - df['DAYS_EMPLOYED']
    df['income_total_befofeEMP_ratio'] = df['income_total'] / df['before_EMPLOYED']
    df['before_EMPLOYED_m'] = _cycle_position(df['before_EMPLOYED'], 30, 12)
    df['before_EMPLOYED_w'] = _cycle_position(df['before_EMPLOYED'], 7, 4)

    # DAYS_BIRTH derivatives: Age (years), month-style and week-style remainders.
    df['Age'] = df['DAYS_BIRTH'] // 365
    df['DAYS_BIRTH_m'] = _cycle_position(df['DAYS_BIRTH'], 30, 12)
    df['DAYS_BIRTH_w'] = _cycle_position(df['DAYS_BIRTH'], 7, 4)

    # DAYS_EMPLOYED derivatives: EMPLOYED (years of service), month-style and
    # week-style remainders.
    df['EMPLOYED'] = df['DAYS_EMPLOYED'] // 365
    df['DAYS_EMPLOYED_m'] = _cycle_position(df['DAYS_EMPLOYED'], 30, 12)
    df['DAYS_EMPLOYED_w'] = _cycle_position(df['DAYS_EMPLOYED'], 7, 4)

    # ability: income / (days lived + days employed).
    df['ability'] = df['income_total'] / (df['DAYS_BIRTH'] + df['DAYS_EMPLOYED'])

    # income_mean: income per family member.
    df['income_mean'] = df['income_total'] / df['family_size']

    # ID: concatenate the column values to identify a unique person.
    # begin_month is left out on purpose, since one person may hold several cards.
    id_parts = [
        'child_num', 'income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
        'work_phone', 'phone', 'email', 'family_size',
        'gender', 'car', 'reality', 'income_type',
        'edu_type', 'family_type', 'house_type', 'occyp_type',
    ]
    person_id = df[id_parts[0]].astype(str)
    for col in id_parts[1:]:
        person_id = person_id + '_' + df[col].astype(str)
    df['ID'] = person_id


def preprocess(x_train, x_valid, x_test, target):
    """Engineer features, then scale and encode the three splits consistently.

    All transformers (Yeo-Johnson power transform on ``income_total``, the
    standard scaler on the remaining numeric columns, the ordinal encoder on
    the object-dtype columns) are fitted on the training split only and then
    applied to the validation and test splits. The inputs are copied and not
    modified.

    Args:
        x_train: Training features (DataFrame).
        x_valid: Validation features (DataFrame) with the same columns.
        x_test: Test features (DataFrame) with the same columns.
        target: Passed as ``y`` to the ordinal encoder's ``fit_transform``
            together with the training split, so it should match
            ``x_train`` row for row.

    Returns:
        Tuple ``(train, valid, test, cat_columns, num_columns)`` where the
        first three are the processed frames, ``cat_columns`` lists the
        ordinal-encoded columns and ``num_columns`` lists the standard-scaled
        columns (``income_total`` excluded, it is power-transformed instead).
    """
    tmp_x_train = x_train.copy()
    tmp_x_valid = x_valid.copy()
    tmp_x_test = x_test.copy()
    frames = (tmp_x_train, tmp_x_valid, tmp_x_test)

    abs_feats = ['DAYS_BIRTH', 'begin_month', 'DAYS_EMPLOYED']
    consumed_cols = ['child_num', 'DAYS_BIRTH', 'DAYS_EMPLOYED']
    for df in frames:
        df.drop(columns=['FLAG_MOBIL'], inplace=True)

        # Positive DAYS_EMPLOYED values are replaced by 0.
        df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].clip(upper=0)

        # The day/month counters are stored as negatives; work with magnitudes.
        for feat in abs_feats:
            df[feat] = np.abs(df[feat])

        _add_derived_features(df)

        # These raw columns are dropped once the derived features exist.
        df.drop(columns=consumed_cols, inplace=True)

    cat_columns = [c for c, t in tmp_x_train.dtypes.items() if t == 'O']
    num_columns = [c for c in tmp_x_train.columns if c not in cat_columns]

    # income_total gets a power transform instead of standard scaling.
    YJ_transform = PowerTransformer(method='yeo-johnson')
    tmp_x_train['income_total'] = YJ_transform.fit_transform(
        tmp_x_train['income_total'].values.reshape(-1, 1)).ravel()
    for df in (tmp_x_valid, tmp_x_test):
        df['income_total'] = YJ_transform.transform(
            df['income_total'].values.reshape(-1, 1)).ravel()

    # Only train/valid get a fresh index; the test frame keeps its own index,
    # exactly as before.
    tmp_x_train.reset_index(drop=True, inplace=True)
    tmp_x_valid.reset_index(drop=True, inplace=True)

    num_columns.remove("income_total")

    scaler = select_scaler("standard")
    tmp_x_train[num_columns] = scaler.fit_transform(tmp_x_train[num_columns])
    for df in (tmp_x_valid, tmp_x_test):
        df[num_columns] = scaler.transform(df[num_columns])

    # `cols` must be passed by keyword: the first positional parameter of
    # category_encoders' OrdinalEncoder is `verbose`, not the column list.
    ode = OrdinalEncoder(cols=cat_columns)
    tmp_x_train[cat_columns] = ode.fit_transform(tmp_x_train[cat_columns], target)
    for df in (tmp_x_valid, tmp_x_test):
        df[cat_columns] = ode.transform(df[cat_columns])

    for df in frames:
        df['ID'] = df['ID'].astype('int64')

    return tmp_x_train, tmp_x_valid, tmp_x_test, cat_columns, num_columns


In [40]:
train_path = os.path.join(BASE_DIR, 'data', 'MDC14', 'train.csv')
test_path  = os.path.join(BASE_DIR, 'data', 'MDC14', 'test.csv')

data = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Missing values are replaced by the literal string "NaN" so that they are
# treated as a category of their own by the encoder.
data.fillna("NaN", inplace = True)
test.fillna("NaN", inplace = True)

label = data['credit']

data.drop(columns=['index', 'credit'], inplace=True)
test.drop(columns=['index'], inplace=True)

le = LabelEncoder()
label = le.fit_transform(label)

# Fixed seed makes the hold-out split reproducible across kernel restarts;
# stratifying keeps the class proportions equal in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    data, label, test_size=0.2, random_state=42, stratify=label
)

print(X_train.shape, X_test.shape)

# The target handed to preprocess must line up row-for-row with X_train,
# so pass Y_train (the training part of the split), not the full label array.
X_train, X_test, test, cat_cols, num_cols = preprocess(X_train, X_test, test, Y_train)

(21165, 18) (5292, 18)
Select standard Scaler


  loglike = -n_samples / 2 * np.log(x_trans.var())


In [48]:
# Prediction buffers indexed as (model slot, row, class):
# 5 slots and 3 probability columns per row.
oof_train = np.zeros(shape=(5, len(X_train), 3))
oof_test  = np.zeros(shape=(5, len(X_test), 3))

In [49]:
# Load a previously pickled model from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load model files that you created yourself or otherwise trust.
# NOTE(review): hard-coded absolute Windows path; this cell will not run on
# another machine without editing it.
with open(r"C:\Users\HwaLang\Desktop\python\T academy\Kaggle_camp\credit_card_dacon\NNI\catboost\model\cat_model0.6756.pkl", "rb") as f:
    cat_model = pickle.load(f)
# Parameters reported by the loaded model's get_params().
params = cat_model.get_params()

In [50]:
# Choose KFold or StratifiedKFold depending on the kind of target being predicted.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Columns handed to CatBoost as categorical features; identical for every fold.
cat_cols = ['income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type', 'ID']

for train_index, valid_index in skf.split(X_train, Y_train):
    # Per-fold preprocessing is not performed here; the folds are sliced from
    # X_train as it stands when this cell runs.
    x_train, x_valid = X_train.iloc[train_index, :], X_train.iloc[valid_index, :]
    y_train, y_valid = Y_train[train_index], Y_train[valid_index]

    # NOTE(review): the model is built with default settings; the `params`
    # read from the pickled model are not passed in here. Confirm this is intended.
    model = CatBoostClassifier()

    train_data = Pool(data=x_train, label=y_train, cat_features=cat_cols)
    valid_data = Pool(data=x_valid, label=y_valid, cat_features=cat_cols)

    # The validation fold doubles as the early-stopping set.
    model.fit(
        train_data,
        eval_set=valid_data,
        use_best_model=True,
        early_stopping_rounds=100,
        verbose=100,
    )

    # Slot 0 of the buffers: out-of-fold probabilities for the validation rows,
    # and the fold-averaged probabilities for the hold-out set.
    oof_train[0, valid_index] += model.predict_proba(x_valid)
    oof_test[0] += model.predict_proba(X_test) / n_splits

Learning rate set to 0.114262
0:	learn: 1.0369531	test: 1.0367023	best: 1.0367023 (0)	total: 31.5ms	remaining: 31.4s
100:	learn: 0.7225396	test: 0.6937173	best: 0.6937173 (100)	total: 2.08s	remaining: 18.5s
200:	learn: 0.6944234	test: 0.6914847	best: 0.6913701 (198)	total: 4.35s	remaining: 17.3s
300:	learn: 0.6708604	test: 0.6925205	best: 0.6911628 (227)	total: 6.67s	remaining: 15.5s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.691162758
bestIteration = 227

Shrink model to first 228 iterations.
Learning rate set to 0.114262
0:	learn: 1.0370417	test: 1.0365240	best: 1.0365240 (0)	total: 20.3ms	remaining: 20.3s
100:	learn: 0.7235900	test: 0.6991136	best: 0.6990485 (98)	total: 2.09s	remaining: 18.6s
200:	learn: 0.6978491	test: 0.6978453	best: 0.6978063 (196)	total: 4.36s	remaining: 17.3s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6978063178
bestIteration = 196

Shrink model to first 197 iterations.
Learning rate set to 0.114262
0:	learn: 