In [155]:
import numpy as np
import xgboost as xgb
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from xgboost import plot_importance


%matplotlib inline

In [156]:
import os

### 데이터를 불러오자

In [157]:
# Read the pre-split feature (x_*) and label (y_*) CSV files from
# <current working directory>/data/preprocessed_data.
# NOTE(review): `dir` shadows the Python builtin of the same name; it is kept
# because other cells may rely on it.
dir = os.getcwd()
x_train, x_test, x_valid = (
    pd.read_csv(dir + relative_csv)
    for relative_csv in (
        "/data/preprocessed_data/x_train.csv",
        "/data/preprocessed_data/x_test.csv",
        "/data/preprocessed_data/x_valid.csv",
    )
)
y_train, y_test, y_valid = (
    pd.read_csv(dir + relative_csv)
    for relative_csv in (
        "/data/preprocessed_data/y_train.csv",
        "/data/preprocessed_data/y_test.csv",
        "/data/preprocessed_data/y_valid.csv",
    )
)

In [158]:
# Remap the label value -1 to 0 in every split, mutating each frame in place.
# Presumably the classifier expects labels in {0, 1} - TODO confirm against the
# preprocessing step that produced the CSVs.
for label_frame in (y_train, y_test, y_valid):
    label_frame.replace(-1, 0, inplace=True)

### Model Train

In [159]:
# Hyper-parameters for the untuned baseline classifier.
param = dict(
    n_estimators=20,
    learning_rate=0.3,
)

In [160]:
# Baseline: fit a classifier with the fixed `param` settings on the training
# split, then report accuracy on the validation split as a percentage.
model = xgb.XGBClassifier(**param)
model.fit(X=x_train, y=y_train)
valid_pred = model.predict(x_valid)
accuracy = accuracy_score(y_true=y_valid, y_pred=valid_pred)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

Accuracy: 88.19%


### optuna 최적화

In [161]:
# Candidate search space for hyper-parameter tuning.
# Tuple entries are (low, high) bounds; 'objective' is a fixed setting, not a range.
# XGBoost has no objective called 'logistic'; for two-class labels the valid
# identifier is 'binary:logistic', so that is what is stored here.
# NOTE(review): this dict is not consumed by any cell visible here - confirm
# whether it is still needed.
param_bounds = {
    'objective': 'binary:logistic',
    'learning_rate': (0.1, 0.3),
    'n_estimators': (20, 100),
    'max_depth': (3, 12),
    'early_stopping_rounds': (30, 50),
}



### Bayesian Optimization

In [165]:
from bayes_opt import BayesianOptimization


In [179]:
# Objective function for the hyper-parameter search (XGBClassifier)
def XGB_cv(max_depth, learning_rate, n_estimators, gamma,
           min_child_weight, subsample,
           colsample_bytree, silent=True, nthread=-1):
    """Score one hyper-parameter candidate by its validation F1.

    Fits an ``xgb.XGBClassifier`` on the notebook-level ``x_train`` /
    ``y_train`` and returns ``f1_score`` of its predictions on ``x_valid``
    against ``y_valid``.

    Parameters
    ----------
    max_depth, n_estimators : float
        Truncated to ``int`` before being handed to the classifier.
    learning_rate, gamma, min_child_weight, subsample, colsample_bytree : float
        Forwarded to the classifier unchanged.
    silent : bool
        Accepted but not used inside this function.
    nthread : int
        Forwarded to the classifier as ``nthread``.

    Returns
    -------
    The value returned by ``f1_score(y_valid, predictions)``.
    """
    # Collect the candidate settings first; the two integer-valued ones are cast here.
    candidate_settings = dict(
        max_depth=int(max_depth),
        learning_rate=learning_rate,
        n_estimators=int(n_estimators),
        gamma=gamma,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        nthread=nthread,
    )

    # Train on the training split.
    candidate = xgb.XGBClassifier(**candidate_settings)
    candidate.fit(x_train, y_train)

    # Evaluate on the validation split.
    return f1_score(y_valid, candidate.predict(x_valid))



In [181]:
# (low, high) search interval for each argument of XGB_cv explored by the optimizer.
pbounds = dict(
    max_depth=(3, 7),
    learning_rate=(0.01, 0.3),
    n_estimators=(20, 100),
    gamma=(0, 100),
    min_child_weight=(0, 3),
    subsample=(0.5, 1),
    colsample_bytree=(0.2, 1),
)

# Build the optimizer over XGB_cv with a fixed random_state, then run
# 2 initial points followed by 10 optimization iterations.
bo = BayesianOptimization(
    f=XGB_cv,
    pbounds=pbounds,
    verbose=2,
    random_state=1,
)

bo.maximize(init_points=2, n_iter=10)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------------------
| 1         | 0.92      | 0.5336    | 72.03     | 0.01003   | 4.209     | 0.4403    | 27.39     | 0.5931    |
| 2         | 0.926     | 0.4764    | 39.68     | 0.1663    | 4.677     | 2.056     | 36.36     | 0.9391    |
| 3         | 0.9189    | 0.2046    | 40.19     | 0.1041    | 3.836     | 2.294     | 37.26     | 0.6836    |
| 4         | 0.9295    | 0.8121    | 10.95     | 0.1614    | 6.073     | 2.061     | 30.64     | 0.601     |
| 5         | 0.9253  

In [186]:
# Refit with the tuned hyper-parameters (rounded values from the optimizer log)
# before scoring. Without this refit, `model` would still be whatever an earlier
# cell bound to that name, so a fresh Restart & Run All would silently score a
# different model than the one this cell is meant to evaluate.
model = xgb.XGBClassifier(
    max_depth=6,
    learning_rate=0.1614,
    n_estimators=30,
    gamma=10.95,
    min_child_weight=2.061,
    subsample=0.601,
    colsample_bytree=0.8121,
)
model.fit(x_train, y_train)

# Optional diagnostics (left disabled so the cell keeps a single output):
# print(accuracy_score(y_train, model.predict(x_train)))
# print(accuracy_score(y_valid, model.predict(x_valid)))
# print(f1_score(y_train, model.predict(x_train)))

# Validation F1 of the tuned model (last expression, so it is the cell output).
f1_score(y_valid, model.predict(x_valid))

0.9296174536760312