In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn. neighbors import LocalOutlierFactor
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures

from scipy.stats import expon, reciprocal
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier
from missingpy import MissForest

import warnings
warnings.filterwarnings('ignore')



In [2]:
df = pd.read_csv('./data/galaxy_final.csv', index_col=0)
X = df.drop('sold', axis=1)
y = df.sold
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11,
                                                       stratify=y, shuffle=True)

# XGBoost Tuning
- https://www.kaggle.com/lifesailor/xgboost
- https://brunch.co.kr/@snobberys/137
- https://xzz201920.medium.com/xgbosst-booster-gbtree-v-s-dart-v-s-gblinear-82d8fcbb07d2

General Parameter
- booster: tree 기반 모델 / 선형 모델
    - https://xzz201920.medium.com/xgbosst-booster-gbtree-v-s-dart-v-s-gblinear-82d8fcbb07d2
    - skip_drop(default = 0, range [0, 1]) is the probability of skipping dropout. It has a higher priority than other DART parameters.
        - If skip_drop = 1, the dropout procedure would be skipped and dart is the same as gbtree.
    - If skip_drop≠0, rate_drop (default = 0, range [0, 1]) will drop a fraction of the trees before the model update in every iteration.
        - dropout makes dart between gbtree and random forest: “If no tree is dropped, dart is the same as (gbtree); if all the trees are dropped, dart is no different than random forest.”
- silent: 메세지 조절
- nthread: 병렬 처리 조절

Boost Parameter
- eta: Learning rate(일반적으로 0.01 - 0.2)
- min_child_weight: min_child_weight를 기준으로 추가 분기 결정(크면 Underfitting)
- max_depth: Tree 깊이 수
- max_leaf_node: 하나의 트리에서 node 개수
- gamma: split 하기 위한 최소의 loss 감소 정의
- subsample: 데이터 중 샘플링(0.5 - 1)
- colsample_bytree: column 중 sampling(0.5 - 1)
- colsample_bylevel: 각 level마다 샘플링 비율
- lambda: L2 nrom
- alpha: L1 norm
- scale_pos_weight: positive, negative weight 지정
- 기타 등

Learning Task Parameter
- object: 목적함수 종류
    - binary:logistic(이진 분류)
    - multi:softmax(다중 분류)
    - multi-softprob(다중 확률)
- eval_metric: 평가 지표
    - rmse – root mean square error
    - mae – mean absolute error
    - logloss – negative log-likelihood
    - error – Binary classification error rate (0.5 threshold)
    - merror – Multiclass classification error rate
    - mlogloss – Multiclass logloss
    - auc: Area under the curve
seed

### Overview
- high learning rate(0.05 - 0.3)를 선택하고 이 학습률에 맞는 tree 개수를 선정한다.
- tree-specific parameter를 수정한다.
- max_depth, min_child_weight, gamma, subsample, colsample_bytree
- regularization parameter를 수정한다.
- 학습률을 낮추고 다시 반복한다.

### Initialization
초기값은 다음과 같이 선정한다.
- max_depth = 5: 보통 4-6 를 시작점으로 한다.
- min_child_weight = 1 : 향후에 튜닝할 것이다.
- gamma = 0 : 0.1 - 0.2로 시작해도 된다. 그런데 어짜피 튜닝할 것이다.
- subsample, colsample_bytree = 0.8 : 보통 0.5 - 0.9로 시작한다.
- scale_pos_weight = 1: Because of high class imbalance.


### 1-1. Scaler + PolynomialFeatures

In [4]:
pipe1_1 = Pipeline([
                ('scale', MinMaxScaler()),
                ('poly', PolynomialFeatures()),
                ('classifier', XGBClassifier())
                ])

param_grid1_1 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [0.15],
              'classifier__skip_drop': [0.33],
             'classifier__learning_rate':[0.1],
             'classifier__n_estimators':[1000],
               'classifier__max_depth':[5],
               'classifier__min_child_weight':[1],
               'classifier__gamma':[0],
               'classifier__subsample':[0.8],
               'classifier__colsample_bytree':[0.8],
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
               'scale':[MinMaxScaler(), StandardScaler(), RobustScaler()],
               'poly':[PolynomialFeatures()],
               'poly__degree':[1, 2, 3]
              }
             ] 
grid1_1 = GridSearchCV(pipe1_1, param_grid1_1, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid1_1.fit(X_train, y_train)
print(grid1_1.best_params_)
print(grid1_1.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed: 12.2min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
              gamma=0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.1,
              max_delta_step=None, max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.15, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021, skip_drop=0.33,
              subsample=0.8, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.8, 'classifier__eval_metric': 'error', 'classifier__gamma': 0, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 1000, 'classifier__n_jobs': -1,

### 1-2. Scaler Only

In [6]:
pipe1_2 = Pipeline([
                ('scale', MinMaxScaler()),
#                 ('poly', PolynomialFeatures()),
                ('classifier', XGBClassifier())
                ])

param_grid1_2 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [0.15],
              'classifier__skip_drop': [0.33],
             'classifier__learning_rate':[0.1],
             'classifier__n_estimators':[1000],
               'classifier__max_depth':[5],
               'classifier__min_child_weight':[1],
               'classifier__gamma':[0],
               'classifier__subsample':[0.8],
               'classifier__colsample_bytree':[0.8],
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
               'scale':[MinMaxScaler(), StandardScaler(), RobustScaler()],
#                'poly':[PolynomialFeatures()],
#                'poly__degree':[1, 2, 3]
              }
             ] 
grid1_2 = GridSearchCV(pipe, param_grid1_2, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid1_2.fit(X_train, y_train)
print(grid1_2.best_params_)
print(grid1_2.best_score_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  2.8min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
              gamma=0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.1,
              max_delta_step=None, max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.15, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021, skip_drop=0.33,
              subsample=0.8, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.8, 'classifier__eval_metric': 'error', 'classifier__gamma': 0, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 1000, 'classifier__n_jobs': -1,

### 1-3. Scale -> Poly -> ReduceDim

- PCA & LDA
    - https://huidea.tistory.com/126
- LDA
    - https://yamalab.tistory.com/41
    - http://www.datamarket.kr/xe/board_oFxn34/26649
- TSNE
    - https://agiantmind.tistory.com/215
    - https://lovit.github.io/nlp/representation/2018/09/28/tsne/

In [12]:
pipe1_3 = Pipeline([
                ('scale', MinMaxScaler()),
                 ('poly', PolynomialFeatures()),
                ('reduce_dims', PCA()),
                ('classifier', XGBClassifier())
                ])

param_grid1_3 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [0.15],
              'classifier__skip_drop': [0.33],
             'classifier__learning_rate':[0.1],
             'classifier__n_estimators':[1000],
               'classifier__max_depth':[5],
               'classifier__min_child_weight':[1],
               'classifier__gamma':[0],
               'classifier__subsample':[0.8],
               'classifier__colsample_bytree':[0.8],
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
               'scale':[MinMaxScaler(), StandardScaler(), RobustScaler()],
               'poly':[PolynomialFeatures()],
               'poly__degree': [1, 2, 3],
               'reduce_dims' : [PCA(), LDA(), TSNE()],
               'reduce_dims__n_components' : [5, 7, 9, 11]
              }
             ] 
grid1_3 = GridSearchCV(pipe1_3, param_grid1_3, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid1_3.fit(X_train, y_train)
print(grid1_3.best_params_)
print(grid1_3.best_score_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 39.7min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 64.3min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
              gamma=0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.1,
              max_delta_step=None, max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.15, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021, skip_drop=0.33,
              subsample=0.8, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.8, 'classifier__eval_metric': 'error', 'classifier__gamma': 0, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 1000, 'classifier__n_jobs': -1,

### 1-4. Scale -> Poly -> Feature Selection

```python
len(PolynomialFeatures(3).fit_transform(X_train)[0, :]) = 560
```
- 280
- 140
- 70
- 35

# REF estimator로 XGBClassifier 사용한다면?
- https://gentlej90.tistory.com/86

In [11]:
pipe1_4 = Pipeline([
                ('scale', MinMaxScaler()),
                 ('poly', PolynomialFeatures()),
                ('feature_selection', RFE(LogisticRegression())),
                ('classifier', XGBClassifier())
                ])

param_grid1_4 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [0.15],
              'classifier__skip_drop': [0.33],
             'classifier__learning_rate':[0.1],
             'classifier__n_estimators':[1000],
               'classifier__max_depth':[5],
               'classifier__min_child_weight':[1],
               'classifier__gamma':[0],
               'classifier__subsample':[0.8],
               'classifier__colsample_bytree':[0.8],
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
               'scale':[MinMaxScaler(), StandardScaler(), RobustScaler()],
               'poly':[PolynomialFeatures()],
               'poly__degree': [1, 2, 3],
               'feature_selection' : [RFE(LogisticRegression()),
                                     RFE(RandomForestClassifier())],
               'feature_selection__n_features_to_select' : [280, 140, 70, 35]
#                'reduce_dims' : [PCA(), LDA(), TSNE()],
#                'reduce_dims__n_components' : [5, 7, 9, 11]
              }
             ]
grid1_4 = GridSearchCV(pipe1_4, param_grid1_4, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid1_4.fit(X_train, y_train)
print(grid1_4.best_params_)
print(grid1_4.best_score_)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 47.3min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 162.7min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
              gamma=0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.1,
              max_delta_step=None, max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.15, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021, skip_drop=0.33,
              subsample=0.8, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.8, 'classifier__eval_metric': 'error', 'classifier__gamma': 0, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 1000, 'classifier__n_jobs': -1,

### Scaler+Poly+RFE + Poly > Scaler Only > Scaler + Poly + DimReduction
- 0.808 > 0.805 > 0.797 > 0.789


In [6]:
# grid1_3 base, grid1_4 base 두 가지로 파라미터 튜닝 진행
# grid1_4를 위해, Scaler-poly-rfe 완료한 데이터를 미리 생성 (매번 만들지 않도록)
X_train_c, X_test_c, y_train_c, y_test_c = X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy()
# X_train_c, X_valid_c, y_train_c, y_valid_c = train_test_split(X_train_c, y_train_c,
#                                                                     test_size=0.2,
#                                                                     shuffle=True,
#                                                                     stratify=y_train_c,
#                                                                     random_state=11)
    
scaler = MinMaxScaler()
poly = PolynomialFeatures(degree=3)
rfe = RFE(RandomForestClassifier(), n_features_to_select=140)

X_train_c = scaler.fit_transform(X_train_c)
# X_valid_c = scaler.transform(X_valid_c)
X_test_c = scaler.transform(X_test_c)

X_train_c = poly.fit_transform(X_train_c)
# X_valid_c = poly.transform(X_valid_c)
X_test_c = poly.transform(X_test_c)

X_train_c = rfe.fit_transform(X_train_c, y_train_c)
# X_valid_c = rfe.transform(X_valid_c)
X_test_c = rfe.transform(X_test_c)

In [7]:
print('done')

done


In [10]:
X_train.shape, poly.transform(X_train).shape, X_train_c.shape

((1188, 13), (1188, 560), (1188, 140))

### max_depth, min_child_weight

In [7]:
# grid1_3 base (scaler-poly-xgb)
# max_depth와 min_child_weight를 튜닝한다.

pipe = Pipeline([
                ('scale', MinMaxScaler()),
                ('poly', PolynomialFeatures()),
                ('classifier', XGBClassifier())
                ])

param_grid2 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [0.15],
              'classifier__skip_drop': [0.33],
             'classifier__learning_rate':[0.1],
             'classifier__n_estimators':[1000],
               'classifier__max_depth':range(3,10,3),
               'classifier__min_child_weight':range(1,6,2),
               'classifier__gamma':[0],
               'classifier__subsample':[0.8],
               'classifier__colsample_bytree':[0.8],
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
               'scale':[RobustScaler()],
               'poly':[PolynomialFeatures()],
               'poly__degree':[3]
              }
             ] 
grid2 = GridSearchCV(pipe, param_grid2, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid2.fit(X_train, y_train)
print(grid2.best_params_)
print(grid2.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed: 15.9min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
              gamma=0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.1,
              max_delta_step=None, max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.15, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021, skip_drop=0.33,
              subsample=0.8, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.8, 'classifier__eval_metric': 'error', 'classifier__gamma': 0, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 1000, 'classifier__n_jobs': -1,

In [12]:
# grid1_4 base (scaler-poly-rfe-xgb)
# max_depth와 min_child_weight를 튜닝한다.

pipe_1_4 = Pipeline([
#                 ('scale', MinMaxScaler()),
#                 ('poly', PolynomialFeatures()),
#                 ('feature_selection', RFE(RandomForestClassifier())),
                ('classifier', XGBClassifier())
                ])

param_grid2_1 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [0.15],
              'classifier__skip_drop': [0.33],
             'classifier__learning_rate':[0.1],
             'classifier__n_estimators':[1000],
               'classifier__max_depth':range(3,10,3),
               'classifier__min_child_weight':range(1,6,2),
               'classifier__gamma':[0],
               'classifier__subsample':[0.8],
               'classifier__colsample_bytree':[0.8],
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
#                'scale':[MinMaxScaler()],
#                'poly':[PolynomialFeatures()],
#                'poly__degree':[3],
#                'feature_selection' : [RFE(RandomForestClassifier())],
#                'feature_selection__n_features_to_select' : [140]
              }
             ]
grid2_1 = GridSearchCV(pipe_1_4, param_grid2_1, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid2_1.fit(X_train_c, y_train_c)
print(grid2_1.best_params_)
print(grid2_1.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed: 11.7min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
              gamma=0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.1,
              max_delta_step=None, max_depth=3, min_child_weight=5, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.15, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021, skip_drop=0.33,
              subsample=0.8, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.8, 'classifier__eval_metric': 'error', 'classifier__gamma': 0, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__min_child_weight': 5, 'classifier__n_estimators': 1000, 'classifier__n_jobs': -1,

### Gamma

In [18]:
# grid1_3 base (scaler-poly-xgb)
# Gamma를 튜닝한다.
param_grid3 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [0.15],
              'classifier__skip_drop': [0.33],
             'classifier__learning_rate':[0.1],
             'classifier__n_estimators':[1000],
               'classifier__max_depth': [grid2.best_params_['classifier__max_depth']], # 3
               'classifier__min_child_weight': [grid2.best_params_['classifier__min_child_weight']], # 1
               'classifier__gamma':[i/10.0 for i in range(0,10)],
               'classifier__subsample':[0.8],
               'classifier__colsample_bytree':[0.8],
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
               'scale':[RobustScaler()],
               'poly':[PolynomialFeatures()],
               'poly__degree':[3]
              }
             ] 
grid3 = GridSearchCV(pipe, param_grid3, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid3.fit(X_train, y_train)
print(grid3.best_params_)
print(grid3.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 11.9min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
              gamma=0.0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.1,
              max_delta_step=None, max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.15, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021, skip_drop=0.33,
              subsample=0.8, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.8, 'classifier__eval_metric': 'error', 'classifier__gamma': 0.0, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 1000, 'classifier__n_jobs':

In [13]:
# grid1_4 base (scaler-poly-rfe-xgb)
# Gamma 튜닝

pipe_1_4 = Pipeline([
#                 ('scale', MinMaxScaler()),
#                 ('poly', PolynomialFeatures()),
#                 ('feature_selection', RFE(RandomForestClassifier())),
                ('classifier', XGBClassifier())
                ])

param_grid3_1 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [0.15],
              'classifier__skip_drop': [0.33],
             'classifier__learning_rate':[0.1],
             'classifier__n_estimators':[1000],
               'classifier__max_depth': [grid2_1.best_params_['classifier__max_depth']], # 3
               'classifier__min_child_weight': [grid2_1.best_params_['classifier__min_child_weight']],# 5
               'classifier__gamma':[i/10.0 for i in range(0,10)],
               'classifier__subsample':[0.8],
               'classifier__colsample_bytree':[0.8],
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
#                'scale':[MinMaxScaler()],
#                'poly':[PolynomialFeatures()],
#                'poly__degree':[3],
#                'feature_selection' : [RFE(RandomForestClassifier())],
#                'feature_selection__n_features_to_select' : [140]
              }
             ]
grid3_1 = GridSearchCV(pipe_1_4, param_grid3_1, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid3_1.fit(X_train_c, y_train_c)
print(grid3_1.best_params_)
print(grid3_1.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.1min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
              gamma=0.7, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.1,
              max_delta_step=None, max_depth=3, min_child_weight=5, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.15, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021, skip_drop=0.33,
              subsample=0.8, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.8, 'classifier__eval_metric': 'error', 'classifier__gamma': 0.7, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__min_child_weight': 5, 'classifier__n_estimators': 1000, 'classifier__n_jobs':

### subsample, colsample_bytree

In [20]:
# subsample and colsample_bytree를 튜닝한다.
param_grid4 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [0.15],
              'classifier__skip_drop': [0.33],
             'classifier__learning_rate':[0.1],
             'classifier__n_estimators':[1000],
               'classifier__max_depth': [grid2.best_params_['classifier__max_depth']], # 3
               'classifier__min_child_weight': [grid2.best_params_['classifier__min_child_weight']], # 1
               'classifier__gamma':[grid3.best_params_['classifier__gamma']], # 0.0
               'classifier__subsample':[i/10.0 for i in range(6,10)],
               'classifier__colsample_bytree':[i/10.0 for i in range(6,10)],
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
               'scale':[RobustScaler()],
               'poly':[PolynomialFeatures()],
               'poly__degree':[3]
              }
             ] 
grid4 = GridSearchCV(pipe, param_grid4, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid4.fit(X_train, y_train)
print(grid4.best_params_)
print(grid4.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 19.2min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.9, eval_metric='error',
              gamma=0.0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.1,
              max_delta_step=None, max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.15, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021, skip_drop=0.33,
              subsample=0.7, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.9, 'classifier__eval_metric': 'error', 'classifier__gamma': 0.0, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 1000, 'classifier__n_jobs':

In [14]:
# grid1_4 base (scaler-poly-rfe-xgb)
# subsample and colsample_bytree를 튜닝한다.

pipe_1_4 = Pipeline([
#                 ('scale', MinMaxScaler()),
#                 ('poly', PolynomialFeatures()),
#                 ('feature_selection', RFE(RandomForestClassifier())),
                ('classifier', XGBClassifier())
                ])

param_grid4_1 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [0.15],
              'classifier__skip_drop': [0.33],
             'classifier__learning_rate':[0.1],
             'classifier__n_estimators':[1000],
               'classifier__max_depth': [grid2_1.best_params_['classifier__max_depth']], # 3
               'classifier__min_child_weight': [grid2_1.best_params_['classifier__min_child_weight']],# 5
               'classifier__gamma':[grid3_1.best_params_['classifier__gamma']], # 0.7
               'classifier__subsample':[i/10.0 for i in range(6,10)],
               'classifier__colsample_bytree':[i/10.0 for i in range(6,10)],
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
#                'scale':[MinMaxScaler()],
#                'poly':[PolynomialFeatures()],
#                'poly__degree':[3],
#                'feature_selection' : [RFE(RandomForestClassifier())],
#                'feature_selection__n_features_to_select' : [140]
              }
             ]
grid4_1 = GridSearchCV(pipe_1_4, param_grid4_1, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid4_1.fit(X_train_c, y_train_c)
print(grid4_1.best_params_)
print(grid4_1.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 15.4min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
              gamma=0.7, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.1,
              max_delta_step=None, max_depth=3, min_child_weight=5, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.15, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021, skip_drop=0.33,
              subsample=0.8, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.8, 'classifier__eval_metric': 'error', 'classifier__gamma': 0.7, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__min_child_weight': 5, 'classifier__n_estimators': 1000, 'classifier__n_jobs':

### n_estimators

In [21]:
# n_estimators 튜닝

param_grid5 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [0.15],
              'classifier__skip_drop': [0.33],
             'classifier__learning_rate':[0.1],
             'classifier__n_estimators':[50, 100, 300, 500, 750, 1000],
               'classifier__max_depth': [grid2.best_params_['classifier__max_depth']], # 3
               'classifier__min_child_weight': [grid2.best_params_['classifier__min_child_weight']], # 1
               'classifier__gamma':[grid3.best_params_['classifier__gamma']], # 0.0
               'classifier__subsample':[grid4.best_params_['classifier__subsample']], # 0.7
               'classifier__colsample_bytree':[grid4.best_params_['classifier__colsample_bytree']], # 0.9
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
               'scale':[RobustScaler()],
               'poly':[PolynomialFeatures()],
               'poly__degree':[3]
              }
             ] 
grid5 = GridSearchCV(pipe, param_grid5, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid5.fit(X_train, y_train)
print(grid5.best_params_)
print(grid5.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.8min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.9, eval_metric='error',
              gamma=0.0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.1,
              max_delta_step=None, max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.15, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021, skip_drop=0.33,
              subsample=0.7, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.9, 'classifier__eval_metric': 'error', 'classifier__gamma': 0.0, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 1000, 'classifier__n_jobs':

In [15]:
# grid1_4 base (scaler-poly-rfe-xgb)
# n_estimator 튜닝

pipe_1_4 = Pipeline([
#                 ('scale', MinMaxScaler()),
#                 ('poly', PolynomialFeatures()),
#                 ('feature_selection', RFE(RandomForestClassifier())),
                ('classifier', XGBClassifier())
                ])

param_grid5_1 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [0.15],
              'classifier__skip_drop': [0.33],
             'classifier__learning_rate':[0.1],
             'classifier__n_estimators':[10, 300, 500, 700, 1000, 2000, 3000],
               'classifier__max_depth': [grid2_1.best_params_['classifier__max_depth']], # 3
               'classifier__min_child_weight': [grid2_1.best_params_['classifier__min_child_weight']],# 5
               'classifier__gamma':[grid3_1.best_params_['classifier__gamma']], # 0.7
               'classifier__subsample':[grid4_1.best_params_['classifier__subsample']], # 0.8
               'classifier__colsample_bytree':[grid4_1.best_params_['classifier__colsample_bytree']], # 0.8
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
#                'scale':[MinMaxScaler()],
#                'poly':[PolynomialFeatures()],
#                'poly__degree':[3],
#                'feature_selection' : [RFE(RandomForestClassifier())],
#                'feature_selection__n_features_to_select' : [140]
              }
             ]
grid5_1 = GridSearchCV(pipe_1_4, param_grid5_1, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid5_1.fit(X_train_c, y_train_c)
print(grid5_1.best_params_)
print(grid5_1.best_score_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed: 13.8min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
              gamma=0.7, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.1,
              max_delta_step=None, max_depth=3, min_child_weight=5, missing=nan,
              monotone_constraints=None, n_estimators=700, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.15, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021, skip_drop=0.33,
              subsample=0.8, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.8, 'classifier__eval_metric': 'error', 'classifier__gamma': 0.7, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__min_child_weight': 5, 'classifier__n_estimators': 700, 'classifier__n_jobs': -

### rate_drop, skip_drop

In [28]:
# rate_drop, skip_drop 튜닝
param_grid6 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': np.arange(0.1, 0.55, 0.05),
              'classifier__skip_drop': np.arange(0.1, 0.55, 0.05),
             'classifier__learning_rate':[0.1],
             'classifier__n_estimators':[grid5.best_params_['classifier__n_estimators']], # 1000
               'classifier__max_depth': [grid2.best_params_['classifier__max_depth']], # 3
               'classifier__min_child_weight': [grid2.best_params_['classifier__min_child_weight']], # 1
               'classifier__gamma':[grid3.best_params_['classifier__gamma']], # 0.0
               'classifier__subsample':[grid4.best_params_['classifier__subsample']], # 0.7
               'classifier__colsample_bytree':[grid4.best_params_['classifier__colsample_bytree']], # 0.9
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
               'scale':[RobustScaler()],
               'poly':[PolynomialFeatures()],
               'poly__degree':[3]
              }
             ] 
grid6 = GridSearchCV(pipe, param_grid6, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid6.fit(X_train, y_train)
print(grid6.best_params_)
print(grid6.best_score_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 47.9min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 96.6min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.9, eval_metric='error',
              gamma=0.0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.1,
              max_delta_step=None, max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.3500000000000001, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021,
              skip_drop=0.30000000000000004, subsample=0.7, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.9, 'classifier__eval_metric': 'error', 'classifier__gamma': 0.0, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__min_child_weight': 1, 'classifier__n_estimators'

In [16]:
# grid1_4 base (scaler-poly-rfe-xgb)
# rate_drop, skip_drop 튜닝

pipe_1_4 = Pipeline([
#                 ('scale', MinMaxScaler()),
#                 ('poly', PolynomialFeatures()),
#                 ('feature_selection', RFE(RandomForestClassifier())),
                ('classifier', XGBClassifier())
                ])

param_grid6_1 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': np.arange(0.1, 0.55, 0.05),
              'classifier__skip_drop': np.arange(0.1, 0.55, 0.05),
               'classifier__learning_rate':[0.1],
             'classifier__n_estimators':[grid5_1.best_params_['classifier__n_estimators']], # 700
               'classifier__max_depth': [grid2_1.best_params_['classifier__max_depth']], # 3
               'classifier__min_child_weight': [grid2_1.best_params_['classifier__min_child_weight']],# 5
               'classifier__gamma':[grid3_1.best_params_['classifier__gamma']], # 0.7
               'classifier__subsample':[grid4_1.best_params_['classifier__subsample']], # 0.8
               'classifier__colsample_bytree':[grid4_1.best_params_['classifier__colsample_bytree']], # 0.8
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
#                'scale':[MinMaxScaler()],
#                'poly':[PolynomialFeatures()],
#                'poly__degree':[3],
#                'feature_selection' : [RFE(RandomForestClassifier())],
#                'feature_selection__n_features_to_select' : [140]
              }
             ]
grid6_1 = GridSearchCV(pipe_1_4, param_grid6_1, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid6_1.fit(X_train_c, y_train_c)
print(grid6_1.best_params_)
print(grid6_1.best_score_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 17.1min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 38.7min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
              gamma=0.7, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.1,
              max_delta_step=None, max_depth=3, min_child_weight=5, missing=nan,
              monotone_constraints=None, n_estimators=700, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.25000000000000006, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021,
              skip_drop=0.25000000000000006, subsample=0.8, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.8, 'classifier__eval_metric': 'error', 'classifier__gamma': 0.7, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__min_child_weight': 5, 'classifier__n_estimators'

### learning_rate

In [29]:
# learning rate

param_grid7 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [grid6.best_params_['classifier__rate_drop']], 
              'classifier__skip_drop': [grid6.best_params_['classifier__skip_drop']], 
             'classifier__learning_rate': [0.005, 0.01, 0.03, 0.05, 0.1],
             'classifier__n_estimators':[grid5.best_params_['classifier__n_estimators']], # 1000
               'classifier__max_depth': [grid2.best_params_['classifier__max_depth']], # 3
               'classifier__min_child_weight': [grid2.best_params_['classifier__min_child_weight']], # 1
               'classifier__gamma':[grid3.best_params_['classifier__gamma']], # 0.0
               'classifier__subsample':[grid4.best_params_['classifier__subsample']], # 0.7
               'classifier__colsample_bytree':[grid4.best_params_['classifier__colsample_bytree']], # 0.9
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
               'scale':[RobustScaler()],
               'poly':[PolynomialFeatures()],
               'poly__degree':[3]
              }
             ]
grid7 = GridSearchCV(pipe, param_grid7, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid7.fit(X_train, y_train)
print(grid7.best_params_)
print(grid7.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  5.4min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.9, eval_metric='error',
              gamma=0.0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.1,
              max_delta_step=None, max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.3500000000000001, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021,
              skip_drop=0.30000000000000004, subsample=0.7, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.9, 'classifier__eval_metric': 'error', 'classifier__gamma': 0.0, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__min_child_weight': 1, 'classifier__n_estimators'

In [17]:
# grid1_4 base (scaler-poly-rfe-xgb)
# learning rate 튜닝

pipe_1_4 = Pipeline([
#                 ('scale', MinMaxScaler()),
#                 ('poly', PolynomialFeatures()),
#                 ('feature_selection', RFE(RandomForestClassifier())),
                ('classifier', XGBClassifier())
                ])

param_grid7_1 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [grid6_1.best_params_['classifier__rate_drop']], # 0.25
              'classifier__skip_drop': [grid6_1.best_params_['classifier__skip_drop']], # 0.25
               'classifier__learning_rate': [0.01, 0.03, 0.05, 0.1],
             'classifier__n_estimators':[grid5_1.best_params_['classifier__n_estimators']], # 700
               'classifier__max_depth': [grid2_1.best_params_['classifier__max_depth']], # 3
               'classifier__min_child_weight': [grid2_1.best_params_['classifier__min_child_weight']],# 5
               'classifier__gamma':[grid3_1.best_params_['classifier__gamma']], # 0.7
               'classifier__subsample':[grid4_1.best_params_['classifier__subsample']], # 0.8
               'classifier__colsample_bytree':[grid4_1.best_params_['classifier__colsample_bytree']], # 0.8
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
#                'scale':[MinMaxScaler()],
#                'poly':[PolynomialFeatures()],
#                'poly__degree':[3],
#                'feature_selection' : [RFE(RandomForestClassifier())],
#                'feature_selection__n_features_to_select' : [140]
              }
             ]
grid7_1 = GridSearchCV(pipe_1_4, param_grid7_1, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid7_1.fit(X_train_c, y_train_c)
print(grid7_1.best_params_)
print(grid7_1.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  2.0min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
              gamma=0.7, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.1,
              max_delta_step=None, max_depth=3, min_child_weight=5, missing=nan,
              monotone_constraints=None, n_estimators=700, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.25000000000000006, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021,
              skip_drop=0.25000000000000006, subsample=0.8, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.8, 'classifier__eval_metric': 'error', 'classifier__gamma': 0.7, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__min_child_weight': 5, 'classifier__n_estimators'

In [20]:
grid7_1.best_params_['classifier']

XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
              gamma=0.7, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.1,
              max_delta_step=None, max_depth=3, min_child_weight=5, missing=nan,
              monotone_constraints=None, n_estimators=700, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.25000000000000006, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021,
              skip_drop=0.25000000000000006, subsample=0.8, tree_method=None, ...)

### Early Stopping Rounds

In [15]:
# fit - early stopping rounds
#xgb_best = grid7.best_estimator_[2]
xgb_best = XGBClassifier(booster='dart', colsample_bytree=0.9, eval_metric='error',
              gamma=0.0, importance_type='gain', learning_rate=0.1,
              max_depth=3, min_child_weight=1,
              n_estimators=1000, n_jobs=-1,
              nthread=-1, objective='binary:logistic',
              rate_drop=0.3500000000000001, 
              scale_pos_weight=1, seed=2021,
              skip_drop=0.30000000000000004, subsample=0.7)

best_esr_stoprounds = (-1, -1)
for i, esr in enumerate(np.arange(50, 550, 50)):
    X_train_c, X_test_c, y_train_c, y_test_c = X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy()
    X_train_c, X_valid_c, y_train_c, y_valid_c = train_test_split(X_train_c, y_train_c,
                                                                    test_size=0.2,
                                                                    shuffle=True,
                                                                    stratify=y_train_c,
                                                                    random_state=11)
    
    scaler = MinMaxScaler()
    poly = PolynomialFeatures(degree=3)
    
    X_train_c = scaler.fit_transform(X_train_c)
    X_valid_c = scaler.transform(X_valid_c)
    X_test_c = scaler.transform(X_test_c)
    
    X_train_c = poly.fit_transform(X_train_c)
    X_valid_c = poly.transform(X_valid_c)
    X_test_c = poly.transform(X_test_c)
    
    xgb_best.fit(X_train_c, y_train_c, early_stopping_rounds=esr, eval_metric="error",
                 eval_set=[(X_train_c, y_train_c), (X_valid_c, y_valid_c)], verbose=0)
    acc = accuracy_score(y_test, xgb_best.predict(X_test_c))
    
    if acc > best_esr_stoprounds[1]:
        best_esr_stoprounds = (esr, acc)
        print(i, best_esr_stoprounds)

KeyboardInterrupt: 

In [85]:
best_esr_stoprounds

(50, 0.8383838383838383)

In [21]:
# grid1_4 base (scaler-poly-rfe-xgb)
# early stop test

# xgb_best_c = XGBClassifier(booster='dart', colsample_bytree=0.9, eval_metric='error',
#               gamma=0.0, importance_type='gain', learning_rate=0.1,
#               max_depth=3, min_child_weight=1,
#               n_estimators=1000, n_jobs=-1,
#               nthread=-1, objective='binary:logistic',
#               rate_drop=0.3500000000000001, 
#               scale_pos_weight=1, seed=2021,
#               skip_drop=0.30000000000000004, subsample=0.7)


scaleing-poly-rfe completed
0/(50, 0.8249158249158249)
1/2/3/4/5/6/7/8/9/

In [30]:
xgb_best_c = grid7_1.best_params_['classifier']

X_train_c, X_test_c, y_train_c, y_test_c = X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy()
X_train_c, X_valid_c, y_train_c, y_valid_c = train_test_split(X_train_c, y_train_c,
                                                                    test_size=0.2,
                                                                    shuffle=True,
                                                                    stratify=y_train_c,
                                                                    random_state=11)

scaler = MinMaxScaler()
poly = PolynomialFeatures(degree=3)
rfe = RFE(RandomForestClassifier(), n_features_to_select=140)

X_train_c = scaler.fit_transform(X_train_c)
X_valid_c = scaler.transform(X_valid_c)
X_test_c = scaler.transform(X_test_c)

X_train_c = poly.fit_transform(X_train_c)
X_valid_c = poly.transform(X_valid_c)
X_test_c = poly.transform(X_test_c)

X_train_c = rfe.fit_transform(X_train_c, y_train_c)
X_valid_c = rfe.transform(X_valid_c)
X_test_c = rfe.transform(X_test_c)

best_esr_stoprounds_rfe = (-1, -1)
for i, esr in enumerate(np.arange(50, 550, 50)):
    print(i, end='/')
    xgb_best_c = grid7_1.best_params_['classifier']
    xgb_best_c.fit(X_train_c, y_train_c, early_stopping_rounds=esr, eval_metric="error",
                 eval_set=[(X_train_c, y_train_c), (X_valid_c, y_valid_c)], verbose=0)
    acc = accuracy_score(y_test, xgb_best_c.predict(X_test_c))
    
    if acc > best_esr_stoprounds_rfe[1]:
        best_esr_stoprounds_rfe = (esr, acc)
        print(best_esr_stoprounds_rfe)

0/(50, 0.8249158249158249)
1/2/3/4/5/6/7/8/9/

In [33]:
# 2nd try
for i, esr in enumerate(np.arange(10, 110, 10)):
    print(i, end='/')
    xgb_best_c = grid7_1.best_params_['classifier']
    xgb_best_c.fit(X_train_c, y_train_c, early_stopping_rounds=esr, eval_metric="error",
                 eval_set=[(X_train_c, y_train_c), (X_valid_c, y_valid_c)], verbose=0)
    acc = accuracy_score(y_test, xgb_best_c.predict(X_test_c))
    
    if acc > best_esr_stoprounds_rfe[1]:
        best_esr_stoprounds_rfe = (esr, acc)
        print(best_esr_stoprounds_rfe)

0/(10, 0.8316498316498316)
1/2/3/4/5/6/7/8/9/

# Feature Selection

In [3]:
scaler = MinMaxScaler()
poly = PolynomialFeatures(degree=3)

X_train_c, X_test_c, y_train_c, y_test_c = X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy()
X_train_c = scaler.fit_transform(X_train_c)
X_test_c = scaler.transform(X_test_c)
X_train_p = poly.fit_transform(X_train_c)
X_test_p = poly.transform(X_test_c)

In [99]:
len(X_train_c[0, :]), len(X_train_p[0, :])

(13, 560)

In [4]:
from sklearn.ensemble import ExtraTreesClassifier

etc_model = ExtraTreesClassifier()
etc_model.fit(X_train_p, y_train_c)
etc_model.
#print(etc_model.feature_importances_)
feature_list = pd.concat([pd.Series(X.columns), pd.Series(etc_model.feature_importances_)], axis=1)
feature_list.columns = ['features_name', 'importance']
feature_list.sort_values("importance", ascending =False)[:30]

Unnamed: 0,features_name,importance
2,startprice,0.039908
27,,0.026178
196,,0.020621
16,,0.019841
3,productSeries_imputed,0.016792
38,,0.015045
207,,0.013881
29,,0.013553
107,,0.012192
118,,0.011159


In [5]:
model = LogisticRegression()
rfe = RFE(model, 20)
fit = rfe.fit(X_train_p, y_train_c)

print(f"Num Features: {fit.n_features_}")
print(f"Selected features: {X[:,fit.support_]}")
print(f"Feature Ranking: {fit.ranking_}")

Num Features: 20


TypeError: '(slice(None, None, None), array([False, False, False,  True,  True, False, False, False, False,
       False, False, False, False, False, False,  True,  True, False,
       False, False, False, False, False, False, False, False, False,
        True,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
        True, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
        True, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False]))' is an invalid key

In [6]:
X_train_p.shape, rfe.transform(X_train_p).shape

((1188, 560), (1188, 20))

In [115]:
print(f"Num Features: {fit.n_features_}")
#print(f"Selected features: {X[:, fit.support_]}")
print(f"Feature Ranking: {fit.ranking_}")

Num Features: 20
Feature Ranking: [459  10   3   1   1 401 380 275 377 248 305  22  56 389 283   1   1  71
 450 325 441 114  78  97 477 219 429   1   1 301 416 252  63 357 175 411
 196 197  58 117 458 431   1  36  45   1 206   8  55  80  15 355 406 419
   1 360 310 229 332 304 448 253  53 473 290 204 225 165 447  33 113 386
 145  34 257 268 444 184  62 465 494 115 222 430 337 452 394 190 480 158
 354 498 151 511 286 361 522 439 453 228 330 478 139 538 132  40  67  65
   9 141  72  37 217  79 417 128  11 284 168   4  14 369 328 334 216 322
 314   1  44 368 230 289  95  29 485 212 246 313  26 155   1 126 317  70
  69 194 356  96 127 154 131 471  51  66 482 466  50 407 449 162 238 481
  38   1 507 104 220 475 462 182 255 135 442 489 134 281 265 446 315  82
 474 174 245 432 179 285 218 533 440 502 457 261 343 276 530 463   1   1
 338 342 348  94 259 202 306 129 235 183 318 207 391 170 266 346   1  83
   1  24  31 157   1 366 490 500 200 349  86  84  54 399 464   7 277 374
 352 156  25 339 

In [None]:
model = RandomForestClassifier()
rfe = RFE(model, 5)
fit = rfe.fit(X_train_p, y_train_c)

print(f"Num Features: {fit.n_features_}")
print(f"Selected features: {X.columns[fit.support_]}")
print(f"Feature Ranking: {fit.ranking_}")

In [109]:
# Install mlxtend
from mlxtend.feature_selection import SequentialFeatureSelector
selector = SequentialFeatureSelector(LogisticRegression(), scoring='accuracy', 
                                     verbose=1, k_features=20, forward=False, n_jobs=-1)
selector.fit(X_train_p, y_train_c)
selected_f = selector.k_feature_names_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   46.1s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:  2.2min finished
Features: 559/20[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   39.0s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 559 out of 559 | elapsed:  1.9min finished
Features: 558/20[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   38.1s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 558 out of 5

# Local Outlier Factor

In [38]:
# grid1_3 base
# lof for original data (df) : non-scaled & non-poly

def tune_lof_xgb(model, df, scaler=None, poly=None, dim_reduction=None, ref=None):
    test_neighbors = np.linspace(1, 101, num=50).astype(int)
    test_contams = np.linspace(0.01, 0.3, num=30)
    best_params, best_acc, X2, y2 = 0, 0, 0, 0
    
    for i, tn in enumerate(test_neighbors):
        for j, tc in enumerate(test_contams):
            
            clf = LocalOutlierFactor(n_neighbors=tn, contamination=tc)
            y_pred = clf.fit_predict(df.drop('sold', axis=1))
            lof_outlier_idx = pd.Series(y_pred)[pd.Series(y_pred)==-1].index
            df_lof2 = df.drop(lof_outlier_idx)
            
            X2 = df_lof2.drop('sold', axis=1)
            y2 = df_lof2.sold
            X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2,
                                                                    test_size=0.2,
                                                                    shuffle=True,
                                                                    stratify=y2,
                                                                    random_state=11)
            X2_train, X2_valid, y2_train, y2_valid = train_test_split(X2_train, y2_train,
                                                                    test_size=0.2,
                                                                    shuffle=True,
                                                                    stratify=y2_train,
                                                                    random_state=11)
            
            if scaler:
                X2_train = scaler.fit_transform(X2_train)
                X2_valid = scaler.transform(X2_valid)
                X2_test = scaler.transform(X2_test)
                
            if poly:
                X2_train = poly.fit_transform(X2_train)
                X2_valid = poly.transform(X2_valid)
                X2_test = poly.transform(X2_test)
            
            if dim_reduction:
                X2_train = dim_reduction.fit_transform(X2_train)
                X2_valid = dim_reduction.transform(X2_valid)
                X2_test = dim_reduction.transform(X2_test)
                
            if ref:
                X2_train = ref.fit_transform(X2_train, y2_train)
                X2_valid = ref.transform(X2_valid)
                X2_test = ref.transform(X2_test)
            
            mod = model
            mod.fit(X2_train, y2_train, early_stopping_rounds=50, eval_metric="error",
                 eval_set=[(X2_train, y2_train), (X2_valid, y2_valid)], verbose=0)
            mod_acc = accuracy_score(y2_test, mod.predict(X2_test))
            if best_acc < mod_acc:
                best_acc = mod_acc
                best_params = (tn, tc)
                X2 = X2
                y2 = y2
                print((i, j, tn, tc, best_acc))
    
    return best_params, best_acc, X2, y2

In [40]:
xgb_best = XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.9, eval_metric='error',
              gamma=0.0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.1,
              max_delta_step=None, max_depth=3, min_child_weight=1, missing=np.nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.3500000000000001, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021,
              skip_drop=0.30000000000000004, subsample=0.7, tree_method=None,)

In [42]:
#xgb_best = grid7.best_params_['classifier']
xgb_scaler = RobustScaler()
xgb_poly = PolynomialFeatures(degree=3)
# xgb_rfe = RFE(RandomForestClassifier(),
#               n_features_to_select=140)
xgb_lof_tune = tune_lof_xgb(xgb_best, df,
                                  scaler=xgb_scaler,
                                  poly=xgb_poly)
xgb_lof_tune[:2]

(0, 0, 1, 0.01, 0.8095238095238095)
(0, 2, 1, 0.03, 0.8159722222222222)
(0, 4, 1, 0.049999999999999996, 0.8368794326241135)
(0, 10, 1, 0.10999999999999997, 0.8377358490566038)
(2, 0, 5, 0.01, 0.8401360544217688)
(4, 25, 9, 0.25999999999999995, 0.8409090909090909)
(4, 27, 9, 0.27999999999999997, 0.8551401869158879)
(29, 26, 60, 0.26999999999999996, 0.8617511520737328)


((60, 0.26999999999999996), 0.8617511520737328)

In [9]:
def tune_lof_xgb2(model, df, stoprounds=50,
                  scaler=None, poly=None, dim_reduction=None, rfe=None,
                  preset=False):    
    best_params, best_acc = 0, 0    
    test_neighbors = np.linspace(1, 101, num=50).astype(int)
    test_contams = np.linspace(0.01, 0.3, num=30)
    
    if preset:
        X0_train, X0_valid, X0_test, y0_train, y0_valid, y0_test = df
        
    else:
        X0 = df.drop('sold', axis=1)
        y0 = df.sold
        X0_train, X0_test, y0_train, y0_test = train_test_split(X0, y0,
                                                                test_size=0.2,
                                                                shuffle=True,
                                                                stratify=y0,
                                                                random_state=11)
        X0_train, X0_valid, y0_train, y0_valid = train_test_split(X0_train, y0_train,
                                                                test_size=0.2,
                                                                shuffle=True,
                                                                stratify=y0_train,
                                                                random_state=11)

        if scaler:
            X0_train = scaler.fit_transform(X0_train)
            X0_valid = scaler.transform(X0_valid)
            X0_test = scaler.transform(X0_test)

        if poly:
            X0_train = poly.fit_transform(X0_train)
            X0_valid = poly.transform(X0_valid)
            X0_test = poly.transform(X0_test)

        if dim_reduction:
            X0_train = dim_reduction.fit_transform(X0_train)
            X0_valid = dim_reduction.transform(X0_valid)
            X0_test = dim_reduction.transform(X0_test)

        if rfe:
            X0_train = rfe.fit_transform(X0_train, y0_train)
            X0_valid = rfe.transform(X0_valid)
            X0_test = rfe.transform(X0_test)

        print('preprocessing complete')
    
    for i, tn in enumerate(test_neighbors):
        for j, tc in enumerate(test_contams):
            
            # 원본 보존을 위해 복사본 사용
            X_train_copy, X_valid_copy, X_test_copy = X0_train.copy(), X0_valid.copy(), X0_test.copy()
            y_train_copy, y_valid_copy, y_test_copy = y0_train.copy(), y0_valid.copy(), y0_test.copy()
            
            # LOF 모델 생성 및 트레인셋 학습
            clf = LocalOutlierFactor(n_neighbors=tn, contamination=tc,
                                    novelty=True)
            clf.fit(X_train_copy)
            
            # 트레인셋 아웃라이어 제거
            y_pred = clf.predict(X_train_copy)
            lof_outlier_idx_train = pd.Series(y_pred)[pd.Series(y_pred)==-1].index
            X_train_copy = pd.DataFrame(X_train_copy).drop(lof_outlier_idx_train)
            y_train_copy = y_train_copy.reset_index(drop=True).drop(lof_outlier_idx_train)
            
            # 밸리데이션 셋 아웃라이어 제거
            yval_pred = clf.predict(X_valid_copy)
            lof_outlier_idx_valid = pd.Series(yval_pred)[pd.Series(yval_pred)==-1].index
            X_valid_copy = pd.DataFrame(X_valid_copy).drop(lof_outlier_idx_valid)
            y_valid_copy = y_valid_copy.reset_index(drop=True).drop(lof_outlier_idx_valid)
            
            # 테스트 셋 아웃라이어 제거
            ytest_pred = clf.predict(X_test_copy)
            lof_outlier_idx_test = pd.Series(ytest_pred)[pd.Series(ytest_pred)==-1].index
            X_test_copy = pd.DataFrame(X_test_copy).drop(lof_outlier_idx_test)
            y_test_copy = y_test_copy.reset_index(drop=True).drop(lof_outlier_idx_test)
            
            # 예측모델 정의 및 트레인/벨리데이션 셋으로 학습
            mod = model
            mod.fit(X_train_copy, y_train_copy, early_stopping_rounds=stoprounds, eval_metric="error",
                 eval_set=[(X_train_copy, y_train_copy), (X_valid_copy, y_valid_copy)], verbose=0)
            
            # 테스트 정확도 측정 및 최고기록 업데이트
            mod_acc = accuracy_score(y_test_copy, mod.predict(X_test_copy))
            if best_acc < mod_acc:
                best_acc = mod_acc
                best_params = (tn, tc)
#                 X2 = X2
#                 y2 = y2
                print((i, j, tn, tc, best_acc))
    
    return {'best_params':best_params,
           'best_accuracy':best_acc,
           'preprocessed_data':[X0_train, X0_valid, X0_test, y0_train, y0_valid, y0_test],
           'LOF_data':[X_train_copy, X_valid_copy, X_test_copy,
                      y_train_copy, y_valid_copy, y_test_copy]}

In [81]:
# grid1_3 base
# lof for preprocessed data (scaled & poly applied)

xgb_best = XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.9, eval_metric='error',
              gamma=0.0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.1,
              max_delta_step=None, max_depth=3, min_child_weight=1, missing=np.nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.3500000000000001, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021,
              skip_drop=0.30000000000000004, subsample=0.7, tree_method=None,)

test_neighbors = np.linspace(1, 101, num=50).astype(int)
test_contams = np.linspace(0.01, 0.3, num=30)

xgb_scaler = RobustScaler()
xgb_poly = PolynomialFeatures(degree=3)
xgb_lof_tune = tune_lof_xgb2(xgb_best, df,
                                  scaler=xgb_scaler,
                                  poly=xgb_poly)
xgb_lof_tune['best_params'], xgb_lof_tune['best_accuracy']

preprocessing complete
(0, 0, 1, 0.01, 0.8082191780821918)
(0, 2, 1, 0.03, 0.8096885813148789)
(0, 3, 1, 0.039999999999999994, 0.8105263157894737)
(0, 4, 1, 0.049999999999999996, 0.8120567375886525)
(0, 5, 1, 0.05999999999999999, 0.8122743682310469)
(0, 8, 1, 0.08999999999999998, 0.8164794007490637)
(0, 9, 1, 0.09999999999999998, 0.8181818181818182)
(0, 10, 1, 0.10999999999999997, 0.8185328185328186)
(1, 1, 3, 0.019999999999999997, 0.823728813559322)
(1, 2, 3, 0.03, 0.8247422680412371)
(1, 5, 3, 0.05999999999999999, 0.8257839721254355)
(1, 10, 3, 0.10999999999999997, 0.8302583025830258)
(1, 18, 3, 0.18999999999999997, 0.836)


KeyboardInterrupt: 

In [None]:
print('done')

In [None]:
# grid1_4 base (+rfe)
# lof for preprocessed data (scaled & poly applied)

In [None]:
#grid7_1.best_params_['classifier']

In [7]:
#xgb_best_c = grid7_1.best_params_['classifier']
xgb_best_c = XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
                          colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
                          gamma=0.7, gpu_id=None, importance_type='gain',
                          interaction_constraints=None, learning_rate=0.1,
                          max_delta_step=None, max_depth=3, min_child_weight=5, missing=np.nan,
                          monotone_constraints=None, n_estimators=700, n_jobs=-1,
                          nthread=-1, num_parallel_tree=None, objective='binary:logistic',
                          random_state=None, rate_drop=0.25000000000000006, reg_alpha=None,
                          reg_lambda=None, scale_pos_weight=1, seed=2021,
                          skip_drop=0.25000000000000006, subsample=0.8, tree_method=None)

xgb_scaler = MinMaxScaler()
xgb_poly = PolynomialFeatures(degree=3)
xgb_rfe = RFE(RandomForestClassifier(),
              n_features_to_select=140)
xgb_lof_tune2 = tune_lof_xgb2(xgb_best_c, df,
                              stoprounds=10,
                              scaler=xgb_scaler,
                              poly=xgb_poly,
                              rfe=xgb_rfe)
xgb_lof_tune2['best_params'], xgb_lof_tune2['best_accuracy']

preprocessing complete
(0, 0, 1, 0.01, 0.8316151202749141)
(0, 2, 1, 0.03, 0.8321678321678322)
(0, 3, 1, 0.039999999999999994, 0.8333333333333334)
(0, 18, 1, 0.18999999999999997, 0.8367346938775511)
(0, 19, 1, 0.19999999999999998, 0.8403361344537815)
(0, 21, 1, 0.21999999999999997, 0.8405172413793104)
(1, 5, 3, 0.05999999999999999, 0.85)
(1, 12, 3, 0.12999999999999998, 0.8532818532818532)


((3, 0.12999999999999998), 0.8532818532818532)

### what if: RFE using XGBClassifier

In [11]:
# RFE estimator == XGBClassifier일때 성능 더 나오면
# 다시 처음부터 parameter tuning 진행해보기

xgb_best_c = XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
                          colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
                          gamma=0.7, gpu_id=None, importance_type='gain',
                          interaction_constraints=None, learning_rate=0.1,
                          max_delta_step=None, max_depth=3, min_child_weight=5, missing=np.nan,
                          monotone_constraints=None, n_estimators=700, n_jobs=-1,
                          nthread=-1, num_parallel_tree=None, objective='binary:logistic',
                          random_state=None, rate_drop=0.25000000000000006, reg_alpha=None,
                          reg_lambda=None, scale_pos_weight=1, seed=2021,
                          skip_drop=0.25000000000000006, subsample=0.8, tree_method=None)

xgb_scaler = MinMaxScaler()
xgb_poly = PolynomialFeatures(degree=3)
xgb_rfe = RFE(XGBClassifier(objective='binary:logistic',
                           eval_metric='error'),
              n_features_to_select=140)
xgb_lof_tune3 = tune_lof_xgb2(xgb_best_c, df,
                              stoprounds=10,
                              scaler=xgb_scaler,
                              poly=xgb_poly,
                              rfe=xgb_rfe)

xgb_lof_tune3['best_params'], xgb_lof_tune3['best_accuracy']

preprocessing complete
(0, 0, 1, 0.01, 0.8350515463917526)
(0, 1, 1, 0.019999999999999997, 0.8368055555555556)
(0, 2, 1, 0.03, 0.8385964912280702)
(0, 9, 1, 0.09999999999999998, 0.8388278388278388)
(1, 1, 3, 0.019999999999999997, 0.8401360544217688)
(1, 18, 3, 0.18999999999999997, 0.85)
(1, 20, 3, 0.20999999999999996, 0.8516949152542372)
(1, 21, 3, 0.21999999999999997, 0.8547008547008547)
(1, 22, 3, 0.22999999999999998, 0.8558951965065502)
(1, 23, 3, 0.23999999999999996, 0.8565022421524664)
(3, 18, 7, 0.18999999999999997, 0.8647540983606558)


((7, 0.18999999999999997), 0.8647540983606558)