In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import warnings, random
warnings.filterwarnings(action='ignore')

from sklearn.metrics import log_loss, confusion_matrix, precision_score, recall_score, f1_score, roc_curve
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from category_encoders.ordinal import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split, validation_curve, cross_val_score, GridSearchCV

from sklearn.cluster import KMeans
from catboost import CatBoostClassifier, Pool

import matplotlib
import mglearn
import joblib
import math

from sklearn.neural_network import MLPClassifier
from statsmodels.graphics.tsaplots import plot_acf, acf
from sklearn.linear_model import Ridge,Lasso,ElasticNet, LinearRegression, SGDRegressor, LogisticRegression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.svm import SVC, SVR
from sklearn.compose import make_column_transformer,ColumnTransformer
import math
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
import sklearn.metrics as metrics
from xgboost import XGBClassifier, XGBRFRegressor, XGBRegressor
from lightgbm import LGBMRegressor, LGBMClassifier
import os

# Global plot styling, applied in a single call. `mpl.rcParams` is the same
# settings object that `matplotlib.rcParams` / `plt.rcParams` expose.
mpl.rcParams.update({
    'font.family': 'Malgun Gothic',  # NOTE(review): presumably chosen so Korean labels render; confirm the font is installed on the host
    'font.size': 14,
    'axes.unicode_minus': False,     # draw minus signs with a plain hyphen
})

In [7]:
# Report the versions of the core libraries this notebook was run with.
for label, module in (('numpy', np), ('pandas', pd), ('matplotlib', mpl)):
    print(f'{label} version :', module.__version__)

numpy version : 1.19.2
pandas version : 1.2.4
matplotlib version : 3.3.2


## 데이터 불러오기

In [8]:
# Input locations; both CSVs live directly under BASE_DIR.
BASE_DIR = './data'
train_path, test_path = (
    os.path.join(BASE_DIR, file_name) for file_name in ('train.csv', 'test.csv')
)

In [9]:
# Load the raw train / test tables.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
# Label column as loaded. The modeling cells later rebind `target` to the
# column name 'credit', so this Series is not what they use.
target = train['credit']

## 데이터 전처리

### 1. 결측치 처리

In [10]:
# Replace missing values with the literal string 'NaN', but only in text
# (object) columns. A blanket fillna('NaN') would also turn any numeric column
# that contains a missing value into a mixed object column, silently moving it
# from the numeric to the categorical feature list further down.
for frame in (train, test):
    text_cols = frame.select_dtypes(include='object').columns
    frame[text_cols] = frame[text_cols].fillna('NaN')

### 2. 이상치 처리
- train['family_size'] > 7 인 데이터 제거

In [11]:
# Keep only training rows with family_size <= 7 and renumber the index from 0.
train = train.loc[train['family_size'] <= 7].reset_index(drop=True)

## Feature Engineering

### 1. 의미없는 변수 제거
- index 제거
- FLAG_MOBIL 삭제:모든 값이 1로 동일

In [12]:
# Remove the row identifier and the FLAG_MOBIL column from both tables.
for frame in (train, test):
    frame.drop(columns=['index', 'FLAG_MOBIL'], inplace=True)

### 2. DAYS_EMPLOYED
- 양수인 데이터는 현재 무직자로 판단, 0 처리

In [13]:
# Positive DAYS_EMPLOYED values are replaced by 0; values <= 0 are unchanged.
for frame in (train, test):
    frame['DAYS_EMPLOYED'] = frame['DAYS_EMPLOYED'].clip(upper=0)

### 3. DAYS_BIRTH, begin_month, DAYS_EMPLOYED
- 음수값 -> 양수 변환

In [14]:
# Flip the sign of the negative day/month counters so they become magnitudes.
feats = ['DAYS_BIRTH', 'begin_month', 'DAYS_EMPLOYED']
train[feats] = train[feats].abs()
test[feats] = test[feats].abs()

### 4. 파생변수
- numeric 변수는 최대한 다양한 특징을 보일 수 있도록 생성
- category 변수는 여러가지를 조합해 보았지만 전체 변수를 합친 ID 하나만 만들었을때 가장 logloss가 낮았음
- ref) rollcake님 글 https://dacon.io/competitions/official/235713/codeshare/2526?page=1&dtype=recent

In [15]:
def _cycle_position(days, unit_days, cycle_len):
    """Return where a day count falls inside a repeating cycle.

    `days` is converted to whole units of `unit_days` (floor); complete cycles
    of `cycle_len` units are then subtracted, leaving a value in [0, cycle_len)
    for non-negative input. The arithmetic (including the int truncation) is
    the same as the inline expressions this helper replaces.
    """
    units = np.floor(days / unit_days)
    return units - (units / cycle_len).astype(int) * cycle_len


# Columns concatenated (in this order) to build the per-person ID string.
# begin_month is left out on purpose: one person may hold several cards.
_ID_COLS = [
    'child_num', 'income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
    'work_phone', 'phone', 'email', 'family_size',
    'gender', 'car', 'reality', 'income_type',
    'edu_type', 'family_type', 'house_type', 'occyp_type',
]


def _add_derived_features(df):
    """Add the engineered columns to `df` in place (column order matters downstream)."""
    # before_EMPLOYED: days lived before the current employment started.
    df['before_EMPLOYED'] = df['DAYS_BIRTH'] - df['DAYS_EMPLOYED']
    df['income_total_befofeEMP_ratio'] = df['income_total'] / df['before_EMPLOYED']
    # *_m: 30-day months modulo 12; *_w: 7-day weeks modulo 4.
    df['before_EMPLOYED_m'] = _cycle_position(df['before_EMPLOYED'], 30, 12)
    df['before_EMPLOYED_w'] = _cycle_position(df['before_EMPLOYED'], 7, 4)

    # DAYS_BIRTH derivatives: age in whole years plus the month / week cycle positions.
    df['Age'] = df['DAYS_BIRTH'] // 365
    df['DAYS_BIRTH_m'] = _cycle_position(df['DAYS_BIRTH'], 30, 12)
    df['DAYS_BIRTH_w'] = _cycle_position(df['DAYS_BIRTH'], 7, 4)

    # DAYS_EMPLOYED derivatives: whole years employed plus the cycle positions.
    df['EMPLOYED'] = df['DAYS_EMPLOYED'] // 365
    df['DAYS_EMPLOYED_m'] = _cycle_position(df['DAYS_EMPLOYED'], 30, 12)
    df['DAYS_EMPLOYED_w'] = _cycle_position(df['DAYS_EMPLOYED'], 7, 4)

    # ability: income divided by (days lived + days employed).
    df['ability'] = df['income_total'] / (df['DAYS_BIRTH'] + df['DAYS_EMPLOYED'])

    # income_mean: income per family member.
    df['income_mean'] = df['income_total'] / df['family_size']

    # ID: '_'-joined string of the columns above, used to recognise the same person.
    person_key = df[_ID_COLS[0]].astype(str)
    for col in _ID_COLS[1:]:
        person_key = person_key + '_' + df[col].astype(str)
    df['ID'] = person_key


for df in [train, test]:
    _add_derived_features(df)

### 5. 파생변수와 다중공선을 보이는 컬럼 삭제

In [16]:
# Source columns that the derived features above were computed from.
cols = ['child_num', 'DAYS_BIRTH', 'DAYS_EMPLOYED']
for frame in (train, test):
    frame.drop(columns=cols, inplace=True)

## Scaling, Encoding

### 1. Numeric, Category 컬럼 분류

In [17]:
# Partition the training columns by dtype: object columns are categorical,
# everything else is numeric (the label 'credit' is removed from that list).
column_dtypes = train.dtypes
numerical_feats = [col for col, dtype in column_dtypes.items() if dtype != "object"]
numerical_feats.remove('credit')
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = [col for col, dtype in column_dtypes.items() if dtype == "object"]
print("Number of Categorical features: ", len(categorical_feats))

Number of Numerical features:  18
Number of Categorical features:  9


In [18]:
# Display the numeric feature names collected in the previous cell.
numerical_feats

['income_total',
 'work_phone',
 'phone',
 'email',
 'family_size',
 'begin_month',
 'before_EMPLOYED',
 'income_total_befofeEMP_ratio',
 'before_EMPLOYED_m',
 'before_EMPLOYED_w',
 'Age',
 'DAYS_BIRTH_m',
 'DAYS_BIRTH_w',
 'EMPLOYED',
 'DAYS_EMPLOYED_m',
 'DAYS_EMPLOYED_w',
 'ability',
 'income_mean']

In [19]:
# Display the categorical (object-dtype) feature names.
categorical_feats

['gender',
 'car',
 'reality',
 'income_type',
 'edu_type',
 'family_type',
 'house_type',
 'occyp_type',
 'ID']

### 2. Log Scale
- income_total

In [20]:
# Log-scale income_total. np.log1p(x) already evaluates log(1 + x), so the
# extra "1 +" that used to be here made the transform log(2 + x).
# NOTE: this cell is not idempotent; re-running it applies the log again.
for df in [train, test]:
    df['income_total'] = np.log1p(df['income_total'])

In [21]:
# Scratch inspection only: nothing is assigned here. Reads `test` explicitly;
# the original used the loop variable `df` left over from the previous cell,
# which is `test` (the last frame iterated there).
np.log1p(1 + test['income_total'])

0       2.612327
1       2.625613
2       2.576214
3       2.612327
4       2.661927
          ...   
9995    2.654544
9996    2.654544
9997    2.680078
9998    2.646226
9999    2.674575
Name: income_total, Length: 10000, dtype: float64

### 3. OrdinalEncoder
- 카테고리 변수는 ordinal_encoder 변환
- ID는 변환 후 정수 처리

In [22]:
# Ordinal-encode the categorical columns. The column list must be passed as
# `cols=`; given positionally it binds to the encoder's first parameter
# (`verbose`) and the encoder only worked by auto-detecting object columns.
encoder = OrdinalEncoder(cols=categorical_feats)
train[categorical_feats] = encoder.fit_transform(train[categorical_feats], train['credit'])
# NOTE(review): test categories never seen in train (e.g. new ID strings) are
# presumably mapped to the encoder's "unknown" code; confirm the default.
test[categorical_feats] = encoder.transform(test[categorical_feats])

# ID is stored as an integer code after encoding.
for frame in (train, test):
    frame['ID'] = frame['ID'].astype('int64')

### 4. 클러스터링 구성
- 타겟을 결정짓는 뚜렷한 특징을 갖는 피쳐를 찾지 못해 clustering 시도

In [23]:
# Fit KMeans on every training feature (label excluded) and attach the cluster
# id to both tables as an extra feature.
kmeans_train = train.drop(['credit'], axis=1)
kmeans = KMeans(n_clusters=36, random_state=42).fit(kmeans_train)
train['cluster'] = kmeans.predict(kmeans_train)
# Select the test columns by name in the training order: predict() is fed the
# features positionally, so a reordered test frame would otherwise be
# clustered on the wrong axes without any error.
test['cluster'] = kmeans.predict(test[kmeans_train.columns])

### 5. StandardScale
- 이미 로그변환을 진행한 income_total을 제외한 나머지 numeric 컬럼 표준화(StandardScaler)

In [24]:
# Standardise every numeric feature except income_total (already log-scaled).
# Rebinding the list instead of calling .remove() leaves `numerical_feats` in
# the same final state but no longer raises ValueError when the cell is re-run.
numerical_feats = [col for col in numerical_feats if col != 'income_total']
scaler = StandardScaler()
train[numerical_feats] = scaler.fit_transform(train[numerical_feats])
# Test data reuses the statistics learned from train.
test[numerical_feats] = scaler.transform(test[numerical_feats])

## Modeling - catboost
- fold 수를 5부터 17까지 돌려보고 최적 fold 15로 판단 후 선택
- parameter를 default로 두는 것이 logloss가 가장 낮았음
- ref) Catboost Documentation - https://catboost.ai/docs/concepts/python-reference_catboostclassifier.html

In [25]:
# Cross-validation settings.
# NOTE(review): n_est is not referenced by the CatBoost cell below, which
# builds the classifier with default arguments.
n_est, seed, n_fold, n_class = 2000, 42, 15, 3

# Feature matrix / label split; the test frame is used as-is.
target = 'credit'
X, y = train.drop(columns=target), train[target]
X_test = test

In [26]:
# Stratified K-fold CV with CatBoost: collects out-of-fold probabilities for the
# training rows and averages the per-fold probabilities for the test rows.
skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
folds = list(skfold.split(X, y))

cat_pred = np.zeros((X.shape[0], n_class))            # out-of-fold predictions
cat_pred_test = np.zeros((X_test.shape[0], n_class))  # fold-averaged test predictions
cat_cols = ['income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type', 'ID']
for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f'\n----------------- Fold {fold} -----------------\n')
    # The split yields positional indices, so both X and y are sliced with
    # .iloc; plain y[idx] is label-based and only agrees with positions while
    # the index happens to be a fresh RangeIndex.
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    train_data = Pool(data=X_train, label=y_train, cat_features=cat_cols)
    valid_data = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

    # Default hyper-parameters; the validation fold drives early stopping.
    model_cat = CatBoostClassifier()
    model_cat.fit(train_data, eval_set=valid_data, use_best_model=True, early_stopping_rounds=100, verbose=100)

    cat_pred[valid_idx] = model_cat.predict_proba(X_valid)
    cat_pred_test += model_cat.predict_proba(X_test) / n_fold
    print(f'CV Log Loss Score: {log_loss(y_valid, cat_pred[valid_idx]):.6f}')

# Overall out-of-fold log loss across all training rows.
print(f'\tLog Loss: {log_loss(y, cat_pred):.6f}')


----------------- Fold 0 -----------------

Learning rate set to 0.115127
0:	learn: 1.0346960	test: 1.0333820	best: 1.0333820 (0)	total: 172ms	remaining: 2m 51s
100:	learn: 0.7044704	test: 0.6427096	best: 0.6427096 (100)	total: 4.49s	remaining: 39.9s
200:	learn: 0.6843687	test: 0.6403953	best: 0.6402982 (176)	total: 9.31s	remaining: 37s
300:	learn: 0.6680286	test: 0.6392776	best: 0.6392019 (296)	total: 13.9s	remaining: 32.3s
400:	learn: 0.6515064	test: 0.6392338	best: 0.6388684 (397)	total: 18.4s	remaining: 27.5s
500:	learn: 0.6338447	test: 0.6414953	best: 0.6386980 (412)	total: 22.7s	remaining: 22.6s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6386980408
bestIteration = 412

Shrink model to first 413 iterations.
CV Log Loss Score: 0.638698

----------------- Fold 1 -----------------

Learning rate set to 0.115127
0:	learn: 1.0346110	test: 1.0339653	best: 1.0339653 (0)	total: 12.5ms	remaining: 12.5s
100:	learn: 0.7034876	test: 0.6759823	best: 0.6757272 (93)	to

In [27]:
# Rebuild the feature matrix / label (same definitions as in the CatBoost section).
target = 'credit'
X, y = train.drop(columns=target), train[target]
X_test = test

# Stratified 75/25 hold-out split.
# NOTE(review): the visible cells below fit on the full X, y rather than on
# this split; confirm whether it is still needed.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=1
)

In [28]:
# Display the final feature matrix (encoded, clustered and scaled columns).
X

Unnamed: 0,gender,car,reality,income_total,income_type,edu_type,family_type,house_type,work_phone,phone,...,Age,DAYS_BIRTH_m,DAYS_BIRTH_w,EMPLOYED,DAYS_EMPLOYED_m,DAYS_EMPLOYED_w,ability,income_mean,ID,cluster
0,1,1,1,12.218505,1,1,1,1,-0.538321,-0.645632,...,-0.452826,0.442795,-0.443485,0.994253,-1.230046,-1.077087,-0.032496,0.002062,1,35
1,1,1,2,12.419174,1,2,2,2,-0.538321,-0.645632,...,-1.060773,0.442795,-0.443485,-0.250471,-0.424295,-1.077087,1.190137,-0.254157,2,7
2,2,2,2,13.017007,2,1,1,2,-0.538321,1.548870,...,0.763069,-1.582567,0.451504,0.994253,-0.424295,-0.223607,1.186515,1.693108,3,18
3,1,1,2,12.218505,1,2,1,2,-0.538321,1.548870,...,-0.192277,1.310808,1.346494,-0.094880,1.187206,0.629874,0.101168,0.002062,4,35
4,1,2,2,11.967193,3,1,1,2,-0.538321,-0.645632,...,-0.192277,1.021471,-1.338475,-0.094880,1.455790,-1.077087,-0.282885,-0.305401,5,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26446,1,1,1,12.323865,3,2,1,2,-0.538321,-0.645632,...,-0.887074,0.153458,-0.443485,-0.094880,0.381456,1.483355,0.723641,-0.612864,3301,31
26447,1,1,2,12.100723,2,1,3,2,-0.538321,-0.645632,...,-0.192277,-0.135880,-1.338475,0.060710,1.455790,-0.223607,-0.143427,-0.151670,8753,10
26448,1,2,1,12.586227,2,2,2,3,-0.538321,-0.645632,...,-1.408172,-1.582567,-1.338475,-0.094880,0.650039,1.483355,1.932411,0.616988,8754,32
26449,2,1,2,12.049431,2,3,4,2,-0.538321,-0.645632,...,-1.408172,-1.003892,-0.443485,-0.872832,-0.424295,1.483355,0.824160,0.955197,8755,13


In [29]:
# LightGBM baseline: scaler + classifier pipeline fitted on the full training set.
lgb_estimator = LGBMClassifier(
    learning_rate=0.05,
    max_depth=5,
    min_samples_leaf=1,
    n_estimators=100,
)
model_lgb = make_pipeline(StandardScaler(), lgb_estimator)
model_lgb.fit(X, y)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('lgbmclassifier',
                 LGBMClassifier(learning_rate=0.05, max_depth=5,
                                min_samples_leaf=1))])

In [30]:
# grid search
model_xgb = Pipeline( [ ('scl', StandardScaler()), ('xgb', XGBClassifier(n_jobs = 12)) ] )
param_value = {'xgb__n_estimators' : [200, 250, 300, 350],
               'xgb__learning_rate' : [0.01, 0.05, 0.1, 0.5],
               'xgb__max_depth' : list(range(2, 20, 3)),
               'xgb__subsample' : [0.25, 0.5, 0.75, 1],
               'xgb__min_child_weight' : [0.5, 1, 1.5, 2]
              }
gridSearch = GridSearchCV(model_xgb, param_grid = param_value, cv = 5, n_jobs = 12, verbose = 1)
gridSearch.fit(X, y)

Fitting 5 folds for each of 1536 candidates, totalling 7680 fits


KeyboardInterrupt: 

In [None]:
# XGBoost with explicitly pinned hyper-parameters, fitted on the full training set.
xgb_params = dict(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    gpu_id=-1,
    importance_type='gain',
    interaction_constraints='',
    learning_rate=0.05,
    max_delta_step=0,
    max_depth=17,
    min_child_weight=1.5,
    monotone_constraints='()',
    n_estimators=300,
    n_jobs=12,
    num_parallel_tree=1,
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=0.5,
    tree_method='exact',
    validate_parameters=1,
    verbosity=None,
)
model_xgb = Pipeline([
    ('scl', StandardScaler()),
    ('xgb', XGBClassifier(**xgb_params)),
])
model_xgb.fit(X, y)

## Feature Importance
- ID의 중요도가 상당히 높게 나오는 것을 볼 수 있었음
- plot_feature_importance 함수
- ref) https://stackoverflow.com/questions/64988694/how-can-i-get-the-feature-importance-of-a-catboost-in-a-pandas-dataframe

In [None]:
def plot_feature_importance(importance,names,model_type):
    """Draw a horizontal bar chart of feature importances, largest first.

    Parameters
    ----------
    importance : array-like
        One importance value per feature.
    names : array-like
        Feature names, in the same order as `importance`.
    model_type : str
        Label prepended to the chart title.

    Returns
    -------
    None. The chart is drawn on a new 10x8 matplotlib figure.
    """
    # Pair names with scores, then rank from most to least important.
    ranked = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    }).sort_values(by=['feature_importance'], ascending=False)

    plt.figure(figsize=(10,8))
    sns.barplot(x=ranked['feature_importance'], y=ranked['feature_names'])

    plt.title(model_type + ' Feature Importance')
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature Names')

In [None]:
# Plot CatBoost feature importances against the test-frame column names.
# NOTE(review): `model_cat` is rebound on every CV fold, so this shows only the
# model left over from the last fold, not an average across folds.
plot_feature_importance(model_cat.get_feature_importance(),X_test.columns,'CATBOOST')

In [None]:
# Load the submission template from the data directory.
sample_path = os.path.join(BASE_DIR, 'sample_submission.csv')
sub = pd.read_csv(sample_path)

In [None]:
# Preview the LightGBM class probabilities for the test frame (result not stored).
model_lgb.predict_proba(test)

In [None]:
# In-sample log loss: model_lgb was fitted on this same X, y, so this is a
# training score, not a validation score.
log_loss(y, model_lgb.predict_proba(X))

In [None]:
# In-sample log loss for the XGBoost pipeline (also fitted on the full X, y).
log_loss(y, model_xgb.predict_proba(X))

In [None]:
# Write the LightGBM class probabilities into every column after the first.
# NOTE(review): the fold-averaged CatBoost predictions (`cat_pred_test`) are
# not used for this submission; confirm that is intended.
lgb_proba = model_lgb.predict_proba(test)
sub.iloc[:, 1:] = lgb_proba
sub

In [None]:
# Save the submission to the working directory, without the index column.
sub.to_csv('20210731_lgbm.csv', index=False)