# 모듈 불러오기

#### 기본

In [None]:
import numpy as np
import pandas as pd
import joblib

In [None]:
# !conda install numpy 
# !conda install pandas
# !conda install scikit-learn
# !conda install scipy
# !conda install tensorflow
# !conda install matplotlib
# !conda install seaborn
# !pip install ydata-profiling

# !pip install xgboost
# !pip install lightgbm
# !pip install catboost
# !pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension
# !pip install hyperopt
# !pip install -U imbalanced-learn
# !pip install missingno
# !pip install shap

#### 전처리

In [None]:
from sklearn.model_selection import train_test_split

from sklearn import impute
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer

#### 리샘플링

In [None]:
from imblearn.over_sampling import (
    RandomOverSampler, 
    ADASYN, 
    SMOTE
)
from imblearn.under_sampling import (
    RandomUnderSampler, 
    TomekLinks, 
    CondensedNearestNeighbour, 
    OneSidedSelection, 
    EditedNearestNeighbours, 
    NeighbourhoodCleaningRule
)

#### 분석

In [None]:
from scipy.stats import skew, kurtosis
from scipy.stats import ttest_ind, f_oneway, pearsonr, chi2_contingency
from ydata_profiling import ProfileReport

#### 회귀

In [None]:
from sklearn.linear_model import LinearRegression as RL
from sklearn.neighbors import KNeighborsRegressor as KNNR
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.svm import SVR
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor as RFR
from xgboost import XGBRegressor as XGBR
from lightgbm import LGBMRegressor as LGBMR
from catboost import CatBoostRegressor as CBR

from lightgbm import plot_importance as lgbm_plot_importance
from xgboost import plot_importance as xgb_plot_importance
from catboost import Pool

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

#### 분류

In [None]:
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNNC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier as RFC
from xgboost import XGBClassifier as XGBC
from lightgbm import LGBMClassifier as LGBMC
from catboost import CatBoostClassifier as CBC

from sklearn.metrics import confusion_matrix as cmatrix
from sklearn.metrics import classification_report as creport
from sklearn.metrics import recall_score as recall
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer

#### 교차검증

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from catboost import cv

from hyperopt import hp
from hyperopt import STATUS_OK
from hyperopt import fmin, tpe, Trials

from sklearn.model_selection import (
    StratifiedKFold, # 분류
    KFold, # 회귀
    # GroupKFold, 
    # RepeatedKFold, 
    # StratifiedGroupKFold, 
    # RepeatedStratifiedKFold
)

#### 비즈니스 이해

In [None]:
from sklearn.inspection import permutation_importance
from sklearn.inspection import PartialDependenceDisplay, partial_dependence

#### 시각화

In [None]:
# Plotting, warning and SHAP setup for the notebook session.
import matplotlib.pyplot as plt
import seaborn as sns

# Render figures inline in the notebook output.
%matplotlib inline
# Set the 'Malgun Gothic' font for both matplotlib and seaborn
# (presumably so Korean labels render; verify the font is installed).
plt.rc('font', family='Malgun Gothic')
sns.set(font="Malgun Gothic",
        rc={"axes.unicode_minus":False}, # keep the minus sign from rendering as a broken glyph
        style='darkgrid')  

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')
%config InlineBackend.figure_format = 'retina'

import shap
# Disable TF2 behavior so SHAP plots run against the TF1-compatible API.
# NOTE(review): `tf` is not imported in any visible cell, so this line raises
# NameError unless `import tensorflow as tf` has been executed elsewhere.
tf.compat.v1.disable_v2_behavior()

# 데이터 로드

In [None]:
# Directory that holds the pre-split classification CSV files.
folder_path = '../분류데이터'

# Reader options shared by all four files.
_csv_options = {'sep': ',', 'encoding': 'utf-8'}

# Training split.
x_train = pd.read_csv(f'{folder_path}/x_train.csv', **_csv_options)
y_train = pd.read_csv(f'{folder_path}/y_train.csv', **_csv_options)

# The *_test.csv files are kept under the names x_val / y_val.
x_val = pd.read_csv(f'{folder_path}/x_test.csv', **_csv_options)
y_val = pd.read_csv(f'{folder_path}/y_test.csv', **_csv_options)

# HyperOpt CV

- search space
- 목적 함수
- 목적 함수의 최솟값을 찾는 함수

In [None]:
# Search space handed to hyperopt for the XGBoost classifier.
# quniform samples on a step-1 grid but still yields floats, so the
# objective casts these values to int before using them.
xgbc_search_space = dict(
    max_depth=hp.quniform('max_depth', 5, 20, 1),
    min_child_weight=hp.quniform('min_child_weight', 1, 2, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
)

In [None]:
# Objective function (XGBoost)
def objective_func(xgbc_search_space):
    """Hyperopt objective: negative mean 3-fold CV accuracy of an XGBClassifier.

    Args:
        xgbc_search_space: dict of sampled hyper-parameters. Must contain
            'max_depth', 'min_child_weight' and 'colsample_bytree';
            'learning_rate' is used only when the search space provides it.

    Returns:
        dict with 'loss' (negated mean accuracy) and 'status' (STATUS_OK).
    """
    # hyperopt samples floats, so integer-valued parameters are cast to int.
    params = {
        'max_depth': int(xgbc_search_space['max_depth']),
        'min_child_weight': int(xgbc_search_space['min_child_weight']),
        'colsample_bytree': xgbc_search_space['colsample_bytree'],
    }
    # The search space may not define learning_rate; indexing it
    # unconditionally raised KeyError on every evaluation.
    if 'learning_rate' in xgbc_search_space:
        params['learning_rate'] = xgbc_search_space['learning_rate']

    xgb_clf = XGBC(n_estimators=100, eval_metric='logloss', **params)
    cv_score = cross_val_score(
        xgb_clf,
        x_train,
        y_train,
        scoring=make_scorer(accuracy, greater_is_better=True),
        cv=3,
    )

    # fmin minimises, while a higher accuracy is better, hence the sign flip.
    return {'loss': -1 * np.mean(cv_score), 'status': STATUS_OK}

In [None]:
# Trials records the sampled values and the result of every evaluation.
trial = Trials()

xgbc_best_score = fmin(
    fn=objective_func,                     # objective to minimise
    space=xgbc_search_space,               # search space
    algo=tpe.suggest,                      # Tree-structured Parzen Estimator
    max_evals=5,                           # maximum number of evaluations
    trials=trial,                          # history container defined above
    # Fixed seed for this run. Author's note: runs without a fixed seed
    # reportedly scored better.
    rstate=np.random.default_rng(seed=0),
)

print('best:', xgbc_best_score)

In [None]:
# Display what the XGBoost search stored on the Trials object:
# its results, vals and best_trial attributes.
trial.results, trial.vals, trial.best_trial

In [None]:
# Print the best XGBoost hyper-parameters returned by fmin.
# colsample_bytree is rounded to 5 decimals; the two integer-valued
# parameters are cast to int for display.
print(f'colsample_bytree : {round(xgbc_best_score["colsample_bytree"], 5)}')
print(f'max_depth : {int(xgbc_best_score["max_depth"])}')
print(f'min_child_weight : {int(xgbc_best_score["min_child_weight"])}')

In [None]:
# Retrain XGBoost with the hyper-parameters selected by fmin.
# eval_metric and early_stopping_rounds are passed to the constructor
# (as the objective function already does for eval_metric); recent XGBoost
# releases no longer accept them as fit() arguments.
model = XGBC(
    n_estimators=200,
    max_depth=int(xgbc_best_score['max_depth']),
    min_child_weight=int(xgbc_best_score['min_child_weight']),
    colsample_bytree=round(xgbc_best_score['colsample_bytree'], 5),
    eval_metric='auc',
    early_stopping_rounds=100,
)

# With several eval sets, XGBoost uses the last one (the hold-out split)
# for early stopping.
model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_val, y_val)])

# The hold-out data is loaded as x_val / y_val; x_test / y_test are never
# defined in this notebook and raised NameError here.
y_pred = model.predict_proba(x_val)[:, 1]

xgb_roc_score = roc_auc_score(y_val, y_pred)

print(f'ROC AUC: {xgb_roc_score:.4f}')

In [None]:
def objective_func(search_space):
    """Hyperopt objective: negative mean ROC AUC of an LGBMClassifier over 3 stratified folds.

    Note: this definition replaces the earlier XGBoost ``objective_func``
    once the cell is executed.

    Args:
        search_space: dict of sampled hyper-parameters with the keys
            'num_leaves', 'max_depth', 'min_child_samples', 'subsample'
            and 'learning_rate'.

    Returns:
        float: mean validation ROC AUC multiplied by -1 (fmin minimises).
    """
    # Local import: only the estimator class is imported at file level.
    from lightgbm import early_stopping

    # hyperopt samples floats, so integer-valued parameters are cast to int.
    model = LGBMC(
        n_estimators=100,
        num_leaves=int(search_space['num_leaves']),
        max_depth=int(search_space['max_depth']),
        min_child_samples=int(search_space['min_child_samples']),
        subsample=search_space['subsample'],
        learning_rate=search_space['learning_rate'],
    )

    roc_auc_list = []

    kf = StratifiedKFold(n_splits=3)
    # StratifiedKFold.split needs the labels to stratify on; calling it with
    # x_train alone raised TypeError.
    for train_index, val_index in kf.split(x_train, y_train):
        # Fold-local names so the module-level x_val / y_val are not shadowed.
        x_train_kf, y_train_kf = x_train.iloc[train_index], y_train.iloc[train_index]
        x_val_kf, y_val_kf = x_train.iloc[val_index], y_train.iloc[val_index]

        # Early stopping is passed as a callback; the early_stopping_rounds
        # fit() argument is not available in LightGBM 4.
        model.fit(
            x_train_kf, y_train_kf,
            eval_metric="auc",
            eval_set=[(x_train_kf, y_train_kf), (x_val_kf, y_val_kf)],
            callbacks=[early_stopping(stopping_rounds=30)],
        )

        # ROC AUC from the predicted probability of class 1.
        score = roc_auc_score(y_val_kf, model.predict_proba(x_val_kf)[:, 1])
        roc_auc_list.append(score)

    return -1 * np.mean(roc_auc_list)

In [None]:
# Search space for the LightGBM objective. It was referenced below but never
# defined, which raised NameError. The keys match what objective_func reads;
# the ranges are starting values and should be tuned for the data set.
lgbmc_search_space = {
    'num_leaves': hp.quniform('num_leaves', 32, 64, 1),
    'max_depth': hp.quniform('max_depth', 5, 20, 1),
    'min_child_samples': hp.quniform('min_child_samples', 60, 100, 1),
    'subsample': hp.uniform('subsample', 0.7, 1),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
}

trials = Trials()

# Run fmin for up to max_evals evaluations and keep the inputs that give the
# lowest objective value.
lgbmc_best_score = fmin(
    fn=objective_func,
    space=lgbmc_search_space,
    algo=tpe.suggest,
    max_evals=50,  # maximum number of evaluations
    trials=trials,
    rstate=np.random.default_rng(seed=30),
)

print('best:', lgbmc_best_score)

In [None]:
# Retrain LightGBM with the hyper-parameters selected by fmin.
# Imported here because only the estimator class is imported at file level.
from lightgbm import early_stopping

lgbm_clf = LGBMC(
    n_estimators=500,
    num_leaves=int(lgbmc_best_score['num_leaves']),
    max_depth=int(lgbmc_best_score['max_depth']),
    min_child_samples=int(lgbmc_best_score['min_child_samples']),
    subsample=round(lgbmc_best_score['subsample'], 5),
    learning_rate=round(lgbmc_best_score['learning_rate'], 5),
)

# Early stopping is passed as a callback; the early_stopping_rounds fit()
# argument is not available in LightGBM 4.
lgbm_clf.fit(
    x_train, y_train,
    eval_metric="auc",
    eval_set=[(x_train, y_train), (x_val, y_val)],
    callbacks=[early_stopping(stopping_rounds=100)],
)

# The hold-out data is loaded as x_val / y_val; x_test / y_test are never
# defined in this notebook and raised NameError here.
y_pred = lgbm_clf.predict_proba(x_val)[:, 1]

lgbm_roc_score = roc_auc_score(y_val, y_pred)
print(f'ROC AUC: {lgbm_roc_score:.4f}')

In [None]:
# Collect the sampled hyper-parameters and the loss of every evaluation
# stored in `trial` (the XGBoost search) into one DataFrame.
losses = [loss_dict['loss'] for loss_dict in trial.results]

# trial.vals maps each search-space label to its sampled values. The
# previous hard-coded 'x' / 'y' keys are not part of this search space and
# raised KeyError.
result_df = pd.DataFrame({**trial.vals, 'losses': losses})
result_df