In [1]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import random
import os
from sklearn.metrics import log_loss
import torch

# Directory holding the competition CSV files. A raw string keeps the Windows
# backslashes literal instead of relying on unrecognised escape sequences
# such as "\k" and "\c".
d = r"C:\kaggle_data\credit_card"

train = pd.read_csv(os.path.join(d, 'train.csv'))
test = pd.read_csv(os.path.join(d, 'test.csv'))
ss = pd.read_csv(os.path.join(d, 'sample_submission.csv'))

# Drop the row-id column and replace every missing value with the literal
# string 'NAN' so that it is treated as an ordinary category later on.
train = train.drop(['index'], axis=1).fillna('NAN')
test = test.drop(['index'], axis=1).fillna('NAN')

# Separate the target from the features; `train` no longer contains 'credit'.
y = train.pop('credit')

def simple_pp(train, test):
    """Build the baseline feature set for the train and test frames.

    The same steps are applied to both frames:

    * ``income_per_size`` = log(income_total / family_size); the value is
      doubled for rows whose ``family_type`` is 'Married' or
      'Civil marriage' and squared at the very end.
    * ``gender`` becomes 1 for 'F' and 0 otherwise.
    * ``family_bins`` groups ``family_type`` into the strings '0'/'1'/'2'.
    * ``careality`` = (car == 'Y') + (reality == 'Y'); ``car`` and
      ``reality`` are then dropped.
    * every remaining object-dtype column is one-hot encoded with an encoder
      fitted on the train frame only.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Raw feature frames. They are copied first, so the caller's objects
        are never modified and the function can be called repeatedly on the
        same inputs.

    Returns
    -------
    tuple of pandas.DataFrame
        The transformed ``(train, test)`` frames.
    """
    # Work on private copies; otherwise the recoded columns would leak back
    # into the caller's frames and corrupt any later preprocessing call.
    train, test = train.copy(), test.copy()

    def simple_marry(x):
        # Coarse marital-status bucket: couple / formerly married / other.
        if x == 'Married' or x == 'Civil marriage':
            return '0'
        elif x == 'Separated' or x == 'Widow':
            return '1'
        else:
            return '2'

    for df in (train, test):
        # Married, Civil marriage: double the log income per family member.
        df['income_per_size'] = np.log(df['income_total'] / df['family_size'])
        is_couple = df['family_type'].isin(['Married', 'Civil marriage'])
        df.loc[is_couple, 'income_per_size'] = df['income_per_size'] * 2

        df['gender'] = df['gender'].eq('F').astype('int64')
        df['family_bins'] = df['family_type'].apply(simple_marry)

        # New column `careality` combining car and reality ownership.
        df['careality'] = (df['car'].eq('Y').astype('int64')
                           + df['reality'].eq('Y').astype('int64'))

    # `columns=` instead of a positional axis, which pandas 2.0 removed.
    train = train.drop(columns=['car', 'reality'])
    test = test.drop(columns=['car', 'reality'])

    object_col = [col for col in train.columns if train[col].dtype == 'object']

    enc = OneHotEncoder()
    enc.fit(train.loc[:, object_col])
    # scikit-learn >= 1.0 exposes get_feature_names_out; older releases only
    # have get_feature_names. The `or` short-circuits, so the old attribute
    # is looked up only when the new one is missing.
    name_fn = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names
    onehot_cols = name_fn(object_col)

    def encode(df):
        # Build the one-hot frame on the source index so concat aligns rows.
        onehot_df = pd.DataFrame(enc.transform(df.loc[:, object_col]).toarray(),
                                 columns=onehot_cols, index=df.index)
        return pd.concat([df.drop(columns=object_col), onehot_df], axis=1)

    train, test = encode(train), encode(test)

    ## Square
    for df in (train, test):
        df['income_per_size'] = df['income_per_size'] ** 2

    return train, test

def log_pp(train, test):
    """Build the log-transformed feature set for the train and test frames.

    The same steps are applied to both frames:

    * ``income_per_size`` = log(income_total / family_size); doubled for
      rows whose ``family_type`` is 'Married' or 'Civil marriage', then
      squared after encoding.
    * ``family_bins`` groups ``family_type`` into the strings '0'/'1'/'2'.
    * ``careality`` = (car == 'Y') + (reality == 'Y'); ``car`` and
      ``reality`` are then dropped.
    * every remaining object-dtype column is one-hot encoded with an encoder
      fitted on the train frame only (``gender`` is not recoded here, so it
      is encoded this way whenever it is still an object column).
    * ``income_per_size``, ``income_total``, ``begin_month`` and
      ``DAYS_BIRTH`` are replaced by log1p(|value|).
    * ``DAYS_EMPLOYED`` is replaced by its absolute value, standardised with
      the train mean and standard deviation.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Raw feature frames. They are copied first, so the caller's objects
        are never modified.

    Returns
    -------
    tuple of pandas.DataFrame
        The transformed ``(train, test)`` frames.
    """
    # Work on private copies so nothing leaks back into the caller's frames.
    train, test = train.copy(), test.copy()

    def simple_marry(x):
        # Coarse marital-status bucket: couple / formerly married / other.
        if x == 'Married' or x == 'Civil marriage':
            return '0'
        elif x == 'Separated' or x == 'Widow':
            return '1'
        else:
            return '2'

    for df in (train, test):
        # Married, Civil marriage: double the log income per family member.
        df['income_per_size'] = np.log(df['income_total'] / df['family_size'])
        is_couple = df['family_type'].isin(['Married', 'Civil marriage'])
        df.loc[is_couple, 'income_per_size'] = df['income_per_size'] * 2

        df['family_bins'] = df['family_type'].apply(simple_marry)

        # New column `careality` combining car and reality ownership.
        df['careality'] = (df['car'].eq('Y').astype('int64')
                           + df['reality'].eq('Y').astype('int64'))

    # `columns=` instead of a positional axis, which pandas 2.0 removed.
    train = train.drop(columns=['car', 'reality'])
    test = test.drop(columns=['car', 'reality'])

    object_col = [col for col in train.columns if train[col].dtype == 'object']

    enc = OneHotEncoder()
    enc.fit(train.loc[:, object_col])
    # scikit-learn >= 1.0 exposes get_feature_names_out; older releases only
    # have get_feature_names. The `or` short-circuits, so the old attribute
    # is looked up only when the new one is missing.
    name_fn = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names
    onehot_cols = name_fn(object_col)

    def encode(df):
        # Build the one-hot frame on the source index so concat aligns rows.
        onehot_df = pd.DataFrame(enc.transform(df.loc[:, object_col]).toarray(),
                                 columns=onehot_cols, index=df.index)
        return pd.concat([df.drop(columns=object_col), onehot_df], axis=1)

    train, test = encode(train), encode(test)

    ## Square
    for df in (train, test):
        df['income_per_size'] = df['income_per_size'] ** 2

    log_cols = ['income_per_size', 'income_total', 'begin_month', 'DAYS_BIRTH']

    for col in log_cols:
        # abs() first: log1p is applied to the magnitude of each value.
        train[col] = np.log1p(train[col].abs())
        test[col] = np.log1p(test[col].abs())

    train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].abs()
    test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].abs()

    # Statistics come from train only and are reused for test.
    m = train['DAYS_EMPLOYED'].mean()
    s = train['DAYS_EMPLOYED'].std()

    for df in (train, test):
        df['DAYS_EMPLOYED'] = (df['DAYS_EMPLOYED'] - m) / s

    return train, test

def scale_pp(train, test):
    """Build the standardised feature set for the train and test frames.

    The same steps are applied to both frames:

    * ``income_per_size`` = log(income_total / family_size), computed from
      the untransformed income; doubled for rows whose ``family_type`` is
      'Married' or 'Civil marriage', then squared after encoding.
    * ``family_bins`` groups ``family_type`` into the strings '0'/'1'/'2'.
    * ``income_total`` is replaced by its natural log.
    * ``careality`` = (car == 'Y') + (reality == 'Y'); ``car`` and
      ``reality`` are then dropped.
    * every remaining object-dtype column is one-hot encoded with an encoder
      fitted on the train frame only (``gender`` is not recoded here, so it
      is encoded this way whenever it is still an object column).
    * ``income_per_size``, ``income_total``, ``begin_month``, ``DAYS_BIRTH``
      and ``DAYS_EMPLOYED`` are standardised with the train mean and
      standard deviation.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Raw feature frames. They are copied first, so the caller's objects
        are never modified.

    Returns
    -------
    tuple of pandas.DataFrame
        The transformed ``(train, test)`` frames.
    """
    # Work on private copies so nothing (e.g. the logged income_total) leaks
    # back into the caller's frames.
    train, test = train.copy(), test.copy()

    def simple_marry(x):
        # Coarse marital-status bucket: couple / formerly married / other.
        if x == 'Married' or x == 'Civil marriage':
            return '0'
        elif x == 'Separated' or x == 'Widow':
            return '1'
        else:
            return '2'

    for df in (train, test):
        # Married, Civil marriage: double the log income per family member.
        df['income_per_size'] = np.log(df['income_total'] / df['family_size'])
        is_couple = df['family_type'].isin(['Married', 'Civil marriage'])
        df.loc[is_couple, 'income_per_size'] = df['income_per_size'] * 2

        df['family_bins'] = df['family_type'].apply(simple_marry)

        # Log-transform income_total (after income_per_size has used the raw value).
        df['income_total'] = np.log(df['income_total'])

        # New column `careality` combining car and reality ownership.
        df['careality'] = (df['car'].eq('Y').astype('int64')
                           + df['reality'].eq('Y').astype('int64'))

    # `columns=` instead of a positional axis, which pandas 2.0 removed.
    train = train.drop(columns=['car', 'reality'])
    test = test.drop(columns=['car', 'reality'])

    object_col = [col for col in train.columns if train[col].dtype == 'object']

    enc = OneHotEncoder()
    enc.fit(train.loc[:, object_col])
    # scikit-learn >= 1.0 exposes get_feature_names_out; older releases only
    # have get_feature_names. The `or` short-circuits, so the old attribute
    # is looked up only when the new one is missing.
    name_fn = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names
    onehot_cols = name_fn(object_col)

    def encode(df):
        # Build the one-hot frame on the source index so concat aligns rows.
        onehot_df = pd.DataFrame(enc.transform(df.loc[:, object_col]).toarray(),
                                 columns=onehot_cols, index=df.index)
        return pd.concat([df.drop(columns=object_col), onehot_df], axis=1)

    train, test = encode(train), encode(test)

    ## Square
    for df in (train, test):
        df['income_per_size'] = df['income_per_size'] ** 2

    scale_cols = ['income_per_size', 'income_total', 'begin_month', 'DAYS_BIRTH', 'DAYS_EMPLOYED']

    for col in scale_cols:
        # Statistics come from train only and are reused for test.
        m = train[col].mean()
        s = train[col].std()

        train[col] = (train[col] - m) / s
        test[col] = (test[col] - m) / s

    return train, test

In [2]:
# Value passed as `max_epochs` when the TabNet model is fitted.
MAX_EPOCHS = 1000

# Use the GPU whenever PyTorch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

print("Using {}".format(DEVICE))

Using cuda


In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

# Level-0 model zoo. Every stochastic learner gets a fixed seed so that a
# fresh "Restart & Run All" reproduces the same out-of-fold predictions
# (TabNet is seeded through its own `seed` argument below).
forest = RandomForestClassifier(n_estimators = 1000,
                                    max_depth = 300,
                                    min_samples_leaf = 2,
                                    min_samples_split = 6,
                                    max_features = 9,
                                    random_state = 42)

nb = GaussianNB()
xgboost = XGBClassifier(#tree_method = "gpu_hist",
                        n_estimators = 700,
                        max_depth = 25,
                        learning_rate= 0.1,
                        min_child_weight = 2,
                        subsample = 0.85,
                        colsample_bytree = 0.31,
                        gamma = 0,
                        max_delta_step = 0.07,
                        nthread = -1,
                        eval_metric = 'mlogloss',
                        random_state = 42)

knn_16 = KNeighborsClassifier(n_neighbors = 16, metric='euclidean', weights='distance')

# NOTE(review): in LightGBM's scikit-learn API `eval_metric` is documented as
# a fit() argument; given to the constructor it is forwarded as an extra
# model parameter -- confirm it has the intended effect.
lightgbm = LGBMClassifier(boosting_type='gbdt',
                          objective='multiclass',
                          n_estimators=1000,
                          max_depth = 9,
                          subsample=0.9,
                          subsample_freq=2,
                          colsample_bytree=0.7,
                          n_jobs=-1,
                          eval_metric = 'mlogloss',
                          random_state=42)

tabnet = TabNetClassifier(n_d=9, n_a=9,
                            n_steps=6,
                            gamma=1.006,
                            n_independent=4,
                            n_shared=4,
                            lambda_sparse=0.01994,
                            seed=42,

                            optimizer_fn=torch.optim.Adam,
                            optimizer_params=dict(lr=1e-2),
                            scheduler_params = {"gamma": 0.95,
                              "step_size": 20},
                            scheduler_fn=torch.optim.lr_scheduler.StepLR, epsilon=1e-15,
                            device_name = 'auto',
                            verbose=0)


# Order matters: it determines the column order of the level-0 matrices.
models = [tabnet, forest, nb, xgboost, lightgbm, knn_16]

In [4]:
def oof_train(model, X, y, test):
    """Produce out-of-fold (OOF) predictions for one model.

    Runs an unshuffled 10-fold StratifiedKFold over ``X``/``y``. In every
    fold the model is fitted on the training part, its predicted labels for
    the held-out part are written into ``X_oof``, and its class
    probabilities for ``test`` are added to a running sum.

    Parameters
    ----------
    model : estimator
        Any object with ``fit``, ``predict`` and ``predict_proba``. If it is
        a ``TabNetClassifier`` the passed instance is NOT trained: a fresh
        TabNet with the hyper-parameters hard-coded below is built for each
        fold and fitted on NumPy arrays with early stopping on the held-out
        fold.
    X : pandas.DataFrame
        Training features.
    y : pandas.Series or array-like
        Training labels, row-aligned with ``X``.
    test : pandas.DataFrame
        Test features with the same columns as ``X``.

    Returns
    -------
    X_oof : numpy.ndarray
        Out-of-fold predicted label for every training row.
    test_oof : numpy.ndarray
        Soft-voting result for the test rows: argmax over the probabilities
        summed across the 10 fold models. The value is a column index, so it
        equals the class label only when labels are 0..k-1.
    """
    is_tabnet = isinstance(model, TabNetClassifier)

    # Positional view of the labels: fold indices from skf.split are
    # positions, not index labels.
    y_values = np.asarray(y)

    skf = StratifiedKFold(n_splits=10, shuffle=False)
    X_oof = np.zeros(y_values.shape)
    # Accumulator sized from the data instead of a fixed (10000, 3).
    ss = np.zeros((len(test), len(np.unique(y_values))))

    for train_idx, valid_idx in skf.split(X, y_values):
        X_train, y_train = X.iloc[train_idx, :], y_values[train_idx]
        X_valid, y_valid = X.iloc[valid_idx, :], y_values[valid_idx]

        if is_tabnet:
            # A new network per fold so no weights carry over between folds.
            model = TabNetClassifier(n_d=9, n_a=9,
                                    n_steps=6,
                                    gamma=1.006,
                                    n_independent=4,
                                    n_shared=4,
                                    lambda_sparse=0.01994,
                                    seed=42,

                                    optimizer_fn=torch.optim.Adam,
                                    optimizer_params=dict(lr=1e-2),
                                    scheduler_params = {"gamma": 0.95,
                                      "step_size": 20},
                                    scheduler_fn=torch.optim.lr_scheduler.StepLR, epsilon=1e-15,
                                    device_name = 'auto',
                                    verbose=0)

            model.fit(X_train=X_train.values, y_train=y_train,
                      eval_set=[(X_valid.values, y_valid)],
                      eval_metric=['logloss'],
                      max_epochs=MAX_EPOCHS ,
                      patience=50, # please be patient ^^
                      batch_size=32768,
                      virtual_batch_size=16384)

            X_oof[valid_idx] += model.predict(X_valid.values)
            ss += model.predict_proba(test.values)
        else:
            model.fit(X_train, y_train)
            X_oof[valid_idx] += model.predict(X_valid)
            ss += model.predict_proba(test)

    test_oof = np.argmax(ss, 1)

    return X_oof, test_oof

In [5]:
# Every variant receives its own copy of the raw frames, so no call can see
# in-place edits (e.g. car/reality recoded to 0/1) made by an earlier call.
x_train, x_test = simple_pp(train.copy(), test.copy())

# Log transform of the continuous features
x_train_log, x_test_log = log_pp(train.copy(), test.copy())

# Standardisation of the continuous features
x_train_scale, x_test_scale = scale_pp(train.copy(), test.copy())

In [None]:
# The three preprocessing variants, paired train/test.
trains = [x_train, x_train_log, x_train_scale]
tests = [x_test, x_test_log, x_test_scale]
columns = []

# Level-0 outputs: one OOF vector / test prediction per (model, variant).
train_lv0 = []
test_lv0 = []
for model in models:
    # Class name of the estimator, taken from the start of its repr.
    model_name = str(model).split('(')[0]
    # `X_test` (not `test`) so the raw global `test` frame is not rebound.
    for cnt, (X, X_test) in enumerate(zip(trains, tests), start=1):
        print(f'{model_name}의 {cnt}번째 학습중')
        col_name = model_name + '_' + str(cnt)

        X_oof, test_oof = oof_train(model, X, y, X_test)
        train_lv0.append(X_oof)
        test_lv0.append(test_oof)
        columns.append(col_name)

TabNetClassifier의 1번째 학습중


In [None]:
# Level-0 training matrix: the collected OOF vectors become the columns.
train_0 = pd.DataFrame(np.transpose(np.array(train_lv0)), columns=columns)

In [None]:
# Level-0 test matrix: the collected test predictions become the columns.
test_0 = pd.DataFrame(np.transpose(np.array(test_lv0)), columns=columns)