비슷한 대회 : https://www.kaggle.com/c/home-credit-default-risk
최종적으로 object형 변수는 따로 전처리를 하지 않고, catboost 모델에서 cat_features 파라미터로 전달하였을 때 가장 좋은 점수를 얻을 수 있었습니다.
범주형 변수(category features)를 사용하려면 보통 One-Hot-Encoding 등으로 데이터를 전처리해야 하지만,
CatBoost에서는 사용자가 별도의 작업을 하지 않아도 이를 자동으로 변환하여 사용할 수 있습니다.

In [3]:
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss
from tensorflow.keras.utils import to_categorical

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every warning for the rest of the session. This keeps the cell
# output short, but it also hides deprecation notices from the libraries above.
warnings.filterwarnings('ignore')

In [4]:
#pip install tensorflow

In [23]:
# Load both competition splits and discard the 'index' column from each.
train = pd.read_csv('train.csv').drop(columns=['index'])
test = pd.read_csv('test.csv').drop(columns=['index'])

In [24]:
def days_to_year(x):
    """Flip the sign of a day count and express it in 365-day years.

    Uses only ``*`` and ``/``, so it accepts scalars as well as objects that
    support those operators element-wise.
    """
    flipped = x * -1
    return flipped / 365

def minus(x):
    """Return ``x`` multiplied by -1 (sign flipped)."""
    negated = x * -1
    return negated

# 이상치제거  
다른 변수들은 이상치를 제거해도 큰 변화가 없었음. child_num은 값의 차이가 많이 났다.

In [25]:
def remove_outlier(train, column, iqr_multiplier=5):
    """Drop, in place, the rows whose ``column`` value lies outside IQR fences.

    The fences are ``Q1 - iqr_multiplier * IQR`` and
    ``Q3 + iqr_multiplier * IQR``, where Q1/Q3 are the 25th/75th percentiles
    of the column. The number of removed rows and their values are printed.

    Args:
        train: DataFrame to filter; it is modified in place.
        column: Name of the numeric column to inspect.
        iqr_multiplier: Width of the fences in IQR units (default 5, the
            value that was previously hard-coded).

    Returns:
        The same ``train`` object, after the outlier rows were dropped. The
        index is not reset.
    """
    values = train[column]

    # First and third quartiles in a single call.
    quan_25, quan_75 = np.percentile(values.values, [25, 75])
    iqr = quan_75 - quan_25

    lowest = quan_25 - iqr * iqr_multiplier
    highest = quan_75 + iqr * iqr_multiplier

    outlier_index = values[(values < lowest) | (values > highest)].index
    print('outlier의 수 : ' , len(outlier_index))
    # outlier_index holds index *labels*, so the lookup must be label-based.
    # Positional lookup (.iloc) only matched on an untouched 0..n-1 index and
    # raised or showed the wrong rows otherwise (e.g. after an earlier drop).
    print(values.loc[outlier_index])
    train.drop(outlier_index, axis=0, inplace=True)

    return train

In [26]:
# Columns to filter for outliers; only child_num is listed.
cols = ['child_num']
for column in cols:
    train = remove_outlier(train, column)

# Renumber the remaining rows 0..n-1, then display how many are left.
train = train.reset_index(drop=True)
len(train)

outlier의 수 :  6
8462     14
9021     14
10731    19
25313     7
25390    14
25638     7
Name: child_num, dtype: int64


26451

# 파생 변수 만들기

In [29]:
def add_var(data):
    """Add four dash-joined combination columns to ``data`` and return it.

    New columns (added in this order, in place):
        personal_id: gender, DAYS_BIRTH, income_total, income_type.
        personal_begin_id: the same four parts followed by begin_month.
        g_r_c: gender, reality, car.
        p_w_e: phone, work_phone, email.
    """
    def dash_join(first, *rest):
        # Left-to-right concatenation with '-' between consecutive parts.
        joined = first
        for part in rest:
            joined = joined + '-' + part
        return joined

    gender = data['gender']

    # Per-person identifier built from demographic and income columns.
    person_key = dash_join(
        gender,
        data['DAYS_BIRTH'].astype(str),
        data['income_total'].astype(str),
        data['income_type'].astype(str),
    )
    data['personal_id'] = person_key
    data['personal_begin_id'] = dash_join(person_key, data['begin_month'].astype(str))

    # Further column combinations; the author left it open whether these
    # also improve the model.
    data['g_r_c'] = dash_join(gender, data['reality'], data['car'])
    data['p_w_e'] = dash_join(
        data['phone'].astype(str),
        data['work_phone'].astype(str),
        data['email'].astype(str),
    )

    return data
    

In [30]:
# Add the combination columns to both splits (train first, then test).
train, test = add_var(train), add_var(test)

# 숫자형 변수 전처리

In [37]:
def numeric_preprocess(data):
    """Rescale the numeric columns of ``data`` and add derived features.

    The frame is modified in place and also returned. Existing columns
    rewritten: income_total, DAYS_EMPLOYED, begin_month, DAYS_BIRTH,
    child_num, family_size. Several derived columns are appended; their
    names (including the historical spellings 'EMPLOEYD' and 'BITH') are
    kept as they are because later steps select columns by name.

    Args:
        data: DataFrame with the columns income_total, DAYS_EMPLOYED,
            begin_month, DAYS_BIRTH, family_size and child_num.

    Returns:
        The same DataFrame object.
    """
    # income_total: express in units of 10,000, then add the squared
    # deviation from the mean of *this* frame and a log1p transform.
    data['income_total'] = data['income_total'] / 10000
    data['income_total_dev'] = (data['income_total'] - data['income_total'].mean()) ** 2
    data['income_total_log'] = np.log1p(data['income_total'])

    # DAYS_EMPLOYED: non-negative values are set to 0, then the sign is
    # flipped and days are converted to 365-day years (same arithmetic as
    # the days_to_year helper, applied to the whole column at once).
    data['DAYS_EMPLOYED'] = data['DAYS_EMPLOYED'].clip(upper=0) * -1 / 365
    data['DAYS_EMPLOYED_log'] = np.log1p(data['DAYS_EMPLOYED'])

    # begin_month: flip the sign so the values become positive.
    data['begin_month'] = data['begin_month'] * -1

    # DAYS_BIRTH: flip the sign and convert days to 365-day years.
    data['DAYS_BIRTH'] = data['DAYS_BIRTH'] * -1 / 365

    # Ratio features.
    # NOTE: DAYS_EMPLOYED is 0 for the rows zeroed above, so
    # INCOME_EMPLOYED_RATIO is inf there (pandas does not raise).
    data['EMPLOEYD_BIRTH_RATIO'] = data['DAYS_EMPLOYED'] / data['DAYS_BIRTH']
    data['INCOME_EMPLOYED_RATIO'] = data['income_total'] / data['DAYS_EMPLOYED']
    data['INCOME_BITH_RATIO'] = data['income_total'] / data['DAYS_BIRTH']

    # Family size minus number of children, taken before the caps below.
    data['diff_fam_child'] = data['family_size'] - data['child_num']

    # Cap child_num at 2 and family_size at 5. The second cap used to be
    # written into child_num, which overwrote the first cap with 5 for large
    # families and left family_size uncapped.
    data['child_num'] = data['child_num'].clip(upper=2)
    data['family_size'] = data['family_size'].clip(upper=5)

    # Sum of the (capped) child count and family size.
    data['FAM_CHILD_SUM'] = data[['child_num', 'family_size']].sum(axis=1)

    # Income per family member and per child, using the capped values.
    # NOTE: child_num can be 0, which yields inf for the second ratio.
    data['INCOME_FAM_RATIO'] = data['income_total'] / data['family_size']
    data['INCOME_child_num_RATIO'] = data['income_total'] / data['child_num']

    # Age in years minus years employed, and income divided by that value.
    data['BIRTH_MINUS_EMPLOYED'] = data['DAYS_BIRTH'] - data['DAYS_EMPLOYED']
    data['INCOME_BIRTH_MINUS_EMPLOYED_RATIO'] = data['income_total'] / data['BIRTH_MINUS_EMPLOYED']

    return data

In [39]:
# Run the numeric preprocessing on both splits (train first, then test).
train, test = numeric_preprocess(train), numeric_preprocess(test)

# occyp_type 처리

In [40]:
def occype_process(data):
    """Fill missing ``occyp_type`` values and print the resulting counts.

    Missing occupations become the string 'NAN'. Rows that are 'NAN' and
    also have ``DAYS_EMPLOYED == 0`` are relabelled 'no_work'. The column is
    rewritten in place and the same frame is returned.
    """
    # Placeholder label for missing occupations.
    occupation = data['occyp_type'].fillna('NAN')

    # No employment history and no recorded occupation -> 'no_work'.
    never_worked = (data['DAYS_EMPLOYED'] == 0) & (occupation == 'NAN')
    data['occyp_type'] = occupation.mask(never_worked, 'no_work')

    print(data['occyp_type'].value_counts(), '\n\n')

    return data

In [41]:
# Clean occyp_type on both splits (train first, then test).
train, test = occype_process(train), occype_process(test)

no_work                  8171
Laborers                 4512
Core staff               2646
Sales staff              2539
Managers                 2167
Drivers                  1572
High skill tech staff    1040
Accountants               902
Medicine staff            864
Cooking staff             457
Security staff            424
Cleaning staff            401
Private service staff     243
Low-skill Laborers        127
Waiters/barmen staff      123
Secretaries                97
Realty agents              63
HR staff                   62
IT staff                   41
Name: occyp_type, dtype: int64 


Laborers                 1699
no_work                  1697
NAN                      1455
Sales staff               946
Core staff                945
Managers                  845
Drivers                   563
Medicine staff            343
High skill tech staff     343
Accountants               339
Cooking staff             198
Security staff            168
Cleaning staff            148
Privat

# 구간화 변수 생성

In [None]:
def make_bin(df, variable, n):
    """Add ``<variable>_bin``: the equal-width bin number of each value.

    The column's min..max range is split into ``n`` equal-width bins and each
    row receives the integer position (0 .. n-1) of its bin, so larger values
    always get larger codes. Missing values get -1. The new column is printed.

    NOTE: the bin edges are derived from the frame passed in, so two frames
    binned separately use different edges.

    Args:
        df: DataFrame to extend; it is modified in place.
        variable: Name of the numeric column to bin.
        n: Number of bins.

    Returns:
        The same DataFrame object with the extra column.
    """
    bin_column = '%s_bin' % variable

    # np.histogram is used only for its n + 1 equal-width edges.
    _, bin_dividers = np.histogram(df[variable], bins=n)
    binned = pd.cut(x=df[variable], bins=bin_dividers, include_lowest=True)

    # Categorical codes follow bin order. pd.factorize numbered the bins by
    # order of first appearance instead, so the same bin could get different
    # codes in different frames and the codes carried no ordering.
    df[bin_column] = binned.cat.codes.astype(np.int64)
    print(df[bin_column], '\n\n')

    return df

In [None]:
# Author's note: binning only DAYS_BIRTH gave the best performance.
train, test = (
    make_bin(train, 'DAYS_BIRTH', n=10),
    make_bin(test, 'DAYS_BIRTH', n=10),
)

In [None]:
# Remove columns that are no longer needed (same three in both splits).
train = train.drop(columns=['income_total', 'DAYS_EMPLOYED', 'FLAG_MOBIL'])
test = test.drop(columns=['income_total', 'DAYS_EMPLOYED', 'FLAG_MOBIL'])

In [None]:
# Shapes of the fully preprocessed train and test frames.
(train.shape, test.shape)

In [None]:
# Separate the features from the 'credit' target.
# Every later cell refers to the feature frame as `train_x` (lowercase), so
# that is the name defined here; defining only `train_X` made those cells fail
# with a NameError.
train_x = train.drop(['credit'], axis=1)
train_X = train_x  # alias for any code that still uses the capitalised name
train_y = train['credit']
train_x.info()

In [None]:
# Columns with object dtype are the ones handed to CatBoost as cat_features.
cat_features = [name for name, dtype in train_x.dtypes.items() if dtype == 'object']

def column_index(df, cat_features):
    """Return the positions of ``cat_features`` among the columns of ``df``.

    The column names are sorted indirectly (argsort), each requested name is
    located in that sorted order, and the hit is mapped back to its original
    column position. Every requested name is expected to be a column of
    ``df``; nothing here checks for names that are absent.
    """
    names = df.columns.values
    order = np.argsort(names)
    hits = np.searchsorted(names, cat_features, sorter=order)
    return order[hits]

# Positions of the categorical columns inside train_x, shown for reference.
cat_features_idx = column_index(train_x, cat_features)
print("Cat features are: %s" % list(cat_features))
print(cat_features_idx)

In [None]:
# Fitted model per fold index. Entries are keyed by fold only, so with several
# seeds a later seed overwrites the model stored by an earlier one.
cat_models = {}


def cat_kfold(max_depth, learning_rate, random_seed):
    """Train CatBoost with 10-fold stratified CV for each seed and average test predictions.

    Reads the module-level ``train_x``, ``train_y``, ``test`` and
    ``cat_features`` and stores each fitted model in ``cat_models``. The
    per-fold validation log loss and the mean over all fits are printed.

    Args:
        max_depth: Passed to ``CatBoostClassifier``.
        learning_rate: Passed to ``CatBoostClassifier``.
        random_seed: Sized iterable of seeds; one full 10-fold pass is run
            per seed.

    Returns:
        Array of shape ``(len(test), 3)``: the test-set probabilities summed
        over all fits and divided by ``n_splits * len(random_seed)``.
        (The 3 is hard-coded; presumably the number of target classes.)
    """
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=55)
    fold_losses = []
    test_proba_sum = np.zeros((test.shape[0], 3))

    for seed in random_seed:
        for n_fold, (train_index, val_index) in enumerate(folds.split(train_x, train_y)):
            print(f'===================================={n_fold+1}============================================')

            X_train = train_x.iloc[train_index]
            y_train = train_y.iloc[train_index]
            X_val = train_x.iloc[val_index]
            y_val = train_y.iloc[val_index]

            # Author's note: use the learning_rate that scores best with
            # early stopping at 50 rounds.
            model_params = dict(
                n_estimators=3000,
                max_depth=max_depth,
                random_seed=seed,
                learning_rate=learning_rate,
                bootstrap_type ='Bernoulli',
            )
            cat = CatBoostClassifier(**model_params)
            # Both the training fold and the validation fold are passed as
            # evaluation sets.
            cat.fit(
                X_train, y_train,
                eval_set=[(X_train, y_train), (X_val, y_val)],
                early_stopping_rounds=50,
                cat_features=cat_features,
                verbose=100,
            )

            cat_models[n_fold] = cat

            # Class probabilities for the validation fold and the test set.
            val_proba = cat.predict_proba(X_val)
            test_proba = cat.predict_proba(test)

            # Validation log loss of this fit, computed on one-hot labels.
            logloss = log_loss(to_categorical(y_val), val_proba)
            fold_losses.append(logloss)
            print(f"FOLD {n_fold+1} : logloss:{logloss}")

            # Accumulate test predictions; they are averaged over all fits
            # at the end.
            test_proba_sum += test_proba

            print(f'================================================================================\n\n')

    # The mean validation log loss is the figure used to compare settings.
    mean_outcome = np.mean(fold_losses)
    print("Mean:{}".format(mean_outcome))

    total_fits = folds.n_splits * len(random_seed)
    return test_proba_sum / total_fits

In [None]:
def plot_feature_importance(importance, names, model_type):
    """Draw a horizontal bar chart of feature importances, largest first.

    Args:
        importance: Sequence of importance values, one per feature.
        names: Sequence of feature names, aligned with ``importance``.
        model_type: Label placed at the start of the plot title.
    """
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    }).sort_values(by=['feature_importance'], ascending=False)

    plt.figure(figsize=(10, 8))
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    # A space separates the label from the fixed text; plain concatenation
    # rendered e.g. "CatBOOSTFEATURE IMPORTANCE".
    plt.title(f'{model_type} FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

In [None]:
# Plot the importances of the model stored under fold index 0.
# NOTE(review): cat_models is only filled by cat_kfold, so that function must
# have been run before this cell - confirm.
plot_feature_importance(
    importance=cat_models[0].get_feature_importance(),
    names=train_x.columns,
    model_type='CatBOOST',
)

In [None]:
# Fit one model on the full training set with fixed hyper-parameters and
# predict class probabilities for the test set.
# NOTE(review): n_estimators=443 is presumably the best iteration found during
# cross-validation - confirm.
cat = CatBoostClassifier(
    n_estimators=443,
    max_depth=8,
    random_seed=2,
    learning_rate =0.04,
    bootstrap_type ='Bernoulli',
)
cat.fit(train_x, train_y, cat_features=cat_features, verbose=50)
test_predictions = cat.predict_proba(test)