In [1]:
# Suppress all warning output for the rest of the notebook.
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the dataset from the relative data/ directory and show its (rows, columns) shape.
data = pd.read_csv('data/data-v01.csv')
data.shape

(150000, 11)

In [4]:
# Column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   seriousdlqin2yrs                      150000 non-null  int64  
 1   revolvingutilizationofunsecuredlines  150000 non-null  float64
 2   age                                   150000 non-null  int64  
 3   numberoftime30-59dayspastduenotworse  150000 non-null  int64  
 4   debtratio                             150000 non-null  float64
 5   monthlyincome                         150000 non-null  float64
 6   numberofopencreditlinesandloans       150000 non-null  int64  
 7   numberoftimes90dayslate               150000 non-null  int64  
 8   numberrealestateloansorlines          150000 non-null  int64  
 9   numberoftime60-89dayspastduenotworse  150000 non-null  int64  
 10  numberofdependents                    150000 non-null  float64
dtypes: float64(4), int64(7)

In [5]:
# Preview the first five rows.
data.head()

Unnamed: 0,seriousdlqin2yrs,revolvingutilizationofunsecuredlines,age,numberoftime30-59dayspastduenotworse,debtratio,monthlyincome,numberofopencreditlinesandloans,numberoftimes90dayslate,numberrealestateloansorlines,numberoftime60-89dayspastduenotworse,numberofdependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,12645.0,7,0,1,0,0.0


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.pipeline import make_pipeline

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score

import pickle

In [7]:
# Separate the feature matrix (X) from the target column (y).
X = data.drop(columns='seriousdlqin2yrs')
y = data['seriousdlqin2yrs']

In [8]:
# Stratified train/test split on y with a fixed seed; test_size is left at its default.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

In [9]:
# Sizes of the train and test targets.
y_train.shape,y_test.shape

((112500,), (37500,))

In [10]:
# Share of each label in y_train (class-balance check).
np.unique(y_train, return_counts=True)[1]/y_train.size

array([0.93315556, 0.06684444])

## Feature Scaler생성

In [11]:
# Feature scaler passed to the KNN and logistic-regression pipelines.
scaler = StandardScaler()
# Alternative scaler, kept for quick swapping:
# scaler = MinMaxScaler()

## Base-line 모델 정의

In [12]:
# KNN and logistic regression get the scaler applied through a pipeline;
# the tree-based models are built without any scaling step.
# Note: both pipelines hold the very same `scaler` object, so fitting one
# pipeline refits the scaler instance that the other pipeline also uses.
knn = make_pipeline(scaler, KNeighborsClassifier())
lr = make_pipeline(scaler, LogisticRegression(max_iter=2000, random_state=0))
rf = RandomForestClassifier(random_state=0)
grb = GradientBoostingClassifier(random_state=0)
xgb = XGBClassifier(random_state=0)

In [13]:
# Fit all five baseline models on the training split.
knn.fit(X_train, y_train)
lr.fit(X_train, y_train)
xgb.fit(X_train, y_train)
grb.fit(X_train, y_train)
rf.fit(X_train, y_train)

In [14]:
# Baseline models and their display names, listed in matching order for the evaluation loop.
base_line = [knn, lr, xgb, grb, rf]
model_names = ['KNN', 'LogisticRegression', 'XGBoost', 'GradientBoosting', 'RandomForest']

In [15]:
# Report accuracy and ROC AUC on both splits for every baseline model.
for model, name in zip(base_line, model_names):
    # Hard labels feed accuracy; column 1 of predict_proba feeds ROC AUC.
    pred_train, pred_test = model.predict(X_train), model.predict(X_test)
    pred_train_proba, pred_test_proba = model.predict_proba(X_train), model.predict_proba(X_test)

    # Both metrics are rounded to three decimals for display.
    acc_train, acc_test = (
        np.round(accuracy_score(truth, labels), 3)
        for truth, labels in ((y_train, pred_train), (y_test, pred_test))
    )
    auc_train, auc_test = (
        np.round(roc_auc_score(truth, proba[:, 1]), 3)
        for truth, proba in ((y_train, pred_train_proba), (y_test, pred_test_proba))
    )

    print(f'{name}')
    print(f'train정확도:{acc_train}, Test정확도:{acc_test}\t train AUC:{auc_train}, Test AUC:{auc_test}')
    print('='*50)

KNN
train정확도:0.942, Test정확도:0.933	 train AUC:0.951, Test AUC:0.694
LogisticRegression
train정확도:0.934, Test정확도:0.934	 train AUC:0.804, Test AUC:0.803
XGBoost
train정확도:0.949, Test정확도:0.936	 train AUC:0.917, Test AUC:0.861
GradientBoosting
train정확도:0.939, Test정확도:0.936	 train AUC:0.868, Test AUC:0.866
RandomForest
train정확도:0.999, Test정확도:0.935	 train AUC:1.0, Test AUC:0.842


# RandomizedSearchCV와 GridSearchCV를 이용한 하이퍼파라미터 튜닝

### XGBoost

In [16]:
# Hyperparameter search space for XGBoost, sampled by the randomized search.
param = {
    'learning_rate':[0.01,0.1,0.5,1],
    'n_estimators':[100,200,300,400,500],
    'max_depth':range(1,6),
    'subsample':[0.6,0.7,0.8,0.9,1],
}

In [17]:
# Randomized search over the XGBoost space: 60 sampled candidates, 5-fold CV,
# ranked by ROC AUC. random_state pins which candidates are sampled, so
# best_params_ / cv_results_ are reproducible after a kernel restart.
rs_xgb = RandomizedSearchCV(XGBClassifier(random_state=0),
                            param,
                            n_iter=60,
                            scoring='roc_auc',
                            cv=5,
                            random_state=0,
                            n_jobs=-1)

In [None]:
# Run the randomized search for XGBoost on the training split.
rs_xgb.fit(X_train, y_train)

In [None]:
# Best-scoring parameter combination found by the randomized search.
rs_xgb.best_params_

In [None]:
# Tabulate the CV results and show the five best-ranked candidates.
rs_df = pd.DataFrame(rs_xgb.cv_results_)
rs_df.sort_values('rank_test_score').head()

In [None]:
# Fine-grained search: exhaustive grid over subsample and max_depth.
# n_estimators=400 and learning_rate=0.1 are held fixed — presumably taken from
# the randomized-search result; verify against rs_xgb.best_params_.
# Note: this rebinds `param`, the same name the XGBoost randomized search was built from.
param = {
    "subsample":[0.6,0.7,0.8,0.9,1], 
    "max_depth":[2,3,4]
}
gs_xgb = GridSearchCV(XGBClassifier(n_estimators=400, learning_rate=0.1, random_state=0), 
                      param, 
                      scoring='roc_auc', 
                      cv=5,
                      n_jobs=-1 )

gs_xgb.fit(X_train, y_train)

In [None]:
# Best-scoring parameter combination found by the XGBoost grid search.
gs_xgb.best_params_

In [None]:
# Mean cross-validated ROC AUC of the best XGBoost candidate.
gs_xgb.best_score_

In [None]:
# Tabulate the grid-search CV results and show the five best-ranked candidates.
gs_df = pd.DataFrame(gs_xgb.cv_results_)
gs_df.sort_values('rank_test_score').head()

In [None]:
# Keep the best XGBoost estimator selected by the grid search.
best_model_xgb = gs_xgb.best_estimator_

In [None]:
# Save the tuned XGBoost model.
import os
save_dir = 'saved_model'
os.makedirs(save_dir, exist_ok=True)  # create the directory if missing; no-op when it already exists

# save_model() writes XGBoost's own model format, not a pickle, and the format
# follows the file extension. Use '.json' instead of the misleading '.pkl' so the
# file name matches its content (the unsupported-extension warning would
# otherwise be hidden by the warnings filter set at the top of the notebook).
xgb_file_path = os.path.join(save_dir, 'xgb_best.json')
best_model_xgb.save_model(xgb_file_path)

In [None]:
# Load Model
# Restore the saved XGBoost model from xgb_file_path into a fresh XGBClassifier.
saved_xgb = XGBClassifier()
saved_xgb.load_model(xgb_file_path)

### GradientBoosting

In [None]:
# Hyperparameter search space for GradientBoosting.
param_gb = {
    'learning_rate':[0.001, 0.01,0.1,0.5,1,10],
    'n_estimators':[100,200,300,400,500],
    'max_depth':range(1,6),
    'subsample':[0.6,0.7,0.8,0.9,1],
}
# Randomized search: 60 sampled candidates, 5-fold CV, ranked by ROC AUC.
# random_state pins which candidates are sampled, so the search result is
# reproducible after a kernel restart.
rs_gb = RandomizedSearchCV(GradientBoostingClassifier(random_state=0),
                           param_distributions=param_gb,
                           n_iter=60,
                           cv=5,
                           scoring='roc_auc',
                           random_state=0,
                           n_jobs=-1)

In [None]:
# Run the randomized search for GradientBoosting on the training split.
rs_gb.fit(X_train, y_train)

In [None]:
# Best-scoring parameter combination found by the randomized search.
rs_gb.best_params_

In [None]:
# Mean cross-validated ROC AUC of the best sampled candidate.
rs_gb.best_score_

In [None]:
# Tabulate the CV results and show the five best-ranked candidates.
rs_df2 = pd.DataFrame(rs_gb.cv_results_)
rs_df2.sort_values('rank_test_score').head()

In [None]:
# Exhaustive grid over subsample, n_estimators and max_depth with learning_rate
# fixed at 0.1 — presumably chosen from the randomized-search result; verify
# against rs_gb.best_params_.
# Note: this rebinds the shared name `param`.
param={
    "subsample":[0.6, 0.7, 0.8], 
    "n_estimators":[300,400,500],
    "max_depth":[2,3,4]
}
gs_gb = GridSearchCV(GradientBoostingClassifier(learning_rate=0.1, random_state=0),
                    param_grid=param, 
                    scoring='roc_auc',
                    cv=5,
                    n_jobs=-1)

In [None]:
# Run the GradientBoosting grid search on the training split.
gs_gb.fit(X_train, y_train)

In [None]:
# Best-scoring parameter combination found by the GradientBoosting grid search.
gs_gb.best_params_

In [None]:
# Mean cross-validated ROC AUC of the best GradientBoosting candidate.
gs_gb.best_score_

In [None]:
# Tabulate the grid-search CV results and show the five best-ranked candidates.
gs_df2 = pd.DataFrame(gs_gb.cv_results_)
gs_df2.sort_values('rank_test_score').head()

In [None]:
# Keep the best GradientBoosting estimator selected by the grid search.
best_model_gb = gs_gb.best_estimator_

In [None]:
# Save the model: serialize the tuned GradientBoosting estimator with pickle.
gb_file_path = os.path.join(save_dir, 'gradient_boosting_best.pkl')
with open(gb_file_path, 'wb') as fw:
    pickle.dump(best_model_gb, fw) 

In [None]:
# Reload the pickled GradientBoosting model.
# pickle.load can execute arbitrary code embedded in the file, so only load
# files that were written by this notebook.
gb_file_path = os.path.join(save_dir, 'gradient_boosting_best.pkl')
with open(gb_file_path, 'rb') as fr:
    saved_gb = pickle.load(fr) 

### RandomForest

In [None]:
# Hyperparameter search space for RandomForest.
param_rf = {
    'n_estimators':[100,200,300,400,500],
    'max_depth':range(1,5),
    'max_features':range(5,11)
}
# Randomized search: 60 sampled candidates, 5-fold CV, ranked by ROC AUC.
# random_state pins which candidates are sampled, so the search result is
# reproducible after a kernel restart.
rs_rf = RandomizedSearchCV(RandomForestClassifier(random_state=0),
                           param_distributions=param_rf,
                           n_iter=60,
                           cv=5,
                           scoring='roc_auc',
                           random_state=0,
                           n_jobs=-1)

In [None]:
# Run the randomized search for RandomForest on the training split.
rs_rf.fit(X_train, y_train)

In [None]:
# Best-scoring parameter combination found by the randomized search.
rs_rf.best_params_

In [None]:
# Mean cross-validated ROC AUC of the best sampled candidate.
rs_rf.best_score_

In [None]:
# Tabulate the CV results and show the five best-ranked candidates.
rs_df3 = pd.DataFrame(rs_rf.cv_results_)
rs_df3.sort_values('rank_test_score').head()

In [None]:
# Exhaustive grid over n_estimators, max_features and max_depth for RandomForest.
# The ranges are presumably centered on the randomized-search result; verify
# against rs_rf.best_params_.
# Note: this rebinds the shared name `param`.
param = {
    "n_estimators":[400,500,600,700], 
    "max_features":[3,4,5,6],
    "max_depth":[3,4,5,6]
}

gs_rf = GridSearchCV(RandomForestClassifier(random_state=0),
                    param_grid=param, 
                    scoring='roc_auc',
                    cv=5,
                    n_jobs=-1)

In [None]:
# Run the RandomForest grid search on the training split.
gs_rf.fit(X_train, y_train)

In [None]:
# Best-scoring parameter combination found by the RandomForest grid search.
gs_rf.best_params_

In [None]:
# Mean cross-validated ROC AUC of the best RandomForest candidate.
gs_rf.best_score_

In [None]:
# Tabulate the grid-search CV results and show the five best-ranked candidates.
gs_df3 = pd.DataFrame(gs_rf.cv_results_)
gs_df3.sort_values('rank_test_score').head()

In [None]:
# Keep the best RandomForest estimator selected by the grid search.
best_model_rf = gs_rf.best_estimator_

In [None]:
# Serialize the tuned RandomForest estimator with pickle.
rf_file_path = os.path.join(save_dir, "random_forest_best.pkl")
with open(rf_file_path, 'wb') as fo:
    pickle.dump(best_model_rf, fo)

In [None]:
# Reload the pickled RandomForest model.
# pickle.load can execute arbitrary code embedded in the file, so only load
# files that were written by this notebook.
rf_file_path = os.path.join(save_dir, "random_forest_best.pkl")
with open(rf_file_path, 'rb') as fo:
    saved_rf = pickle.load(fo)


## VotingClassifier
- best model들 사용

In [None]:
from sklearn.ensemble import VotingClassifier
# Soft-voting ensemble over the three reloaded models.
# NOTE(review): VotingClassifier.fit clones each estimator and refits it from its
# constructor parameters. saved_xgb was created as XGBClassifier() and populated
# through load_model, so it may not expose the tuned hyperparameters to the
# clone — TODO confirm with saved_xgb.get_params().
estimators = [('xgb', saved_xgb), ('gradient boost', saved_gb), ('random forest', saved_rf)]
voting_clf = VotingClassifier(estimators=estimators, voting='soft', n_jobs=-1)

In [None]:
# Fit the voting ensemble on the training split.
voting_clf.fit(X_train, y_train)

In [None]:
# ROC AUC of the voting ensemble on the test split, scored from column 1 of predict_proba.
pred_proba  = voting_clf.predict_proba(X_test)
auc_score = roc_auc_score(y_test, pred_proba[:, 1])
auc_score

- xgboost : 0.8657477556440014
- gradient boosting : 0.8651901976338401
- RandomForest: 0.861018496990928
- voting: 0.8618094979187302

## Test Set 으로 검증

In [None]:
def test(estimator, X, y):
    """Compute the ROC AUC of a fitted classifier on the given data.

    Parameters
    ----------
    estimator : object exposing ``predict_proba``.
    X : features handed to ``estimator.predict_proba``.
    y : true labels handed to ``roc_auc_score``.

    Returns
    -------
    The result of ``roc_auc_score`` for ``y`` against column 1 of the
    predicted probabilities.
    """
    positive_scores = estimator.predict_proba(X)[:, 1]
    return roc_auc_score(y, positive_scores)

In [None]:
# XGB: ROC AUC of the reloaded XGBoost model on the test split.
test(saved_xgb, X_test, y_test)

In [None]:
# Gradient Boosting: ROC AUC of the reloaded GradientBoosting model on the test split.
test(saved_gb, X_test, y_test)

### Test set 최종 검증결과
- xgboost : 0.870387089730717
- gradient boosting : 0.8734470316904399