In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# from sklearn.linear_model import LogisticRegression
# from sklearn.naive_bayes import GaussianNB
# from sklearn.ensemble import RandomForestClassifier, VotingClassifier
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn import svm
from sklearn. neighbors import LocalOutlierFactor
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score
# from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

# from sklearn.decomposition import PCA
# from sklearn.manifold import TSNE
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures

from scipy.stats import expon, reciprocal
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier
from missingpy import MissForest

import warnings
warnings.filterwarnings('ignore')



In [2]:
df = pd.read_csv('./data/galaxy_final.csv', index_col=0)
X = df.drop('sold', axis=1)
y = df.sold
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11,
                                                       stratify=y, shuffle=True)

### LIGHTGBM 이란? 그리고 PARAMETER 튜닝하기
- https://nurilee.com/2020/04/03/lightgbm-definition-parameter-tuning/
- http://machinelearningkorea.com/2019/09/29/lightgbm-파라미터/

---
- ask : 데이터에 대해서 수행하고자 하는 임무를 구체화합니다. train 트레이닝일 수도 있고 predict 예측일 수도 있습니다.

- application : 가장 중요한 파라미터로, 모델의 어플리케이션을 정하는데 이것이 regression 회귀분석 문제인지 또는 classification 분류 문제인지를 정합니다. Light GBM에서 디폴트는 regression 회귀분석 모델입니다.
    - regression: 회귀분석
    - binary: 이진 분류
    - multiclass: 다중 분류

- boosting : 실행하고자 하는 알고리즘 타입을 정의합니다. 디폴트값은 gdbt 입니다.
    - gdbt : Traditional Gradient Boosting Decision Tree
    - rf : Random Forest
    - dart : Dropouts meet Multiple Additive Regression Trees
    - goss : Gradient-based One-Side Sampling

- num_boost_round : boosting iteration 수로 일반적으로 100 이상입니다.

- learning_rate : 최종 결과에 대한 각각의 Tree에 영향을 미치는 변수입니다. GBM은 초기의 추정값에서 시작하여 각각의Tree 결과를 사용하여 추정값을 업데이트 합니다. 학습 파라미터는 이러한 추정에서 발생하는 변화의 크기를 컨트롤합니다. 일반적인 값은 0.1, 0.001, 0.003 등등이 있습니다.

- num_leaves : 전체 Tree의 leave 수 이고, 디폴트값은 31입니다.

- device : 디폴트 값은 cpu 인데 gpu로 변경할 수도 있습니다.


- metric : 모델을 구현할 때 손실을 정하기 때문에 중요한 변수 중에 하나입니다. regression과 classification 을 위한 일반적인 손실 값이 아래에 나와있습니다.

    - mae : mean absolute error
    - mse : mean squared error
    - binary_logloss : loss for binary classification
    - multi_logloss : loss for multi classification

- max_bin : feature 값의 최대 bin 수를 의미합니다.
- categorical_feature : 범주형 feature의 인덱스를 의미합니다. 만약 categorical_features 가 0, 1, 2 이면 column 0, column 1, column 2 가 범주형 변수들입니다.
- ignore_column : categorical_features와 동일한 것인데 범주형 feature로써 특정 칼럼을 고려하지 않는 것입니다. 그 변수들을 무시하는 것입니다.
- save_binary : 데이터 파일의 메모리 사이즈를 처리해야 한다면 이 파라미터 값을 True로 설정하십시오. 이 값을 True로 세팅하면 데이터 세트를 바이너리 파일로 저장할 것이고, 이 바이너리 파일은 다음에 데이터를 읽어올 때 그 속도를 줄여줄 것입니다.

# 하이퍼파라미터 튜닝 관련
아래 소개되는 기법들은 모델 정확도를 향상시키기 위해 사용될 수 있습니다.
- num_leaves : Tree 모델의 복잡성을 컨트롤하는 주요 파라미터입니다. 이상적으로 num_leaves 값은 2 ^ (max_depth) 값보다 적거나 같아야 합니다. 이것보다 많은 값은 과적합을 유발할 것입니다.
- min_data_in_leaf : 큰 값으로 세팅하는 것은 Tree가 너무 깊게 확장되는 것을 막을 수 있지만 under-fitting 언더 피팅이 발생할 수도 있습니다. 관행적으로, 수백 또는 수천 개로 정하는 것이 큰 데이터 세트에 충분합니다.
- max_depth : Tree 깊이를 명확하게 제한하기 위해 max_depth 값을 설정할 수도 있습니다.

더 빠른 속도를 위하여 :
- bagging_fraction과 baggin_freq 을 설정하여 bagging 을 적용하십시오
- feature_fraction을 설정하여 feature sub-sampling을 하십시오
작은 max_bin 값을 사용하십시오
- save_binary 를 값을 통해 다가오는 학습에서 데이터 로딩 속도를 줄이십시오
- parallel learning 병렬 학습을 적용하십시오

더 나은 정확도를 위해 :
- 큰 max_bin 값을 사용하십시오 (아마 속도는 느려질 수 있습니다)
- 작은 learning_rate 값을 큰 num_iterations 값과 함께 사용하십시오
- 큰 num_leaves 값을 사용하십시오 (아마 과적합을 유발할 수도 있습니다)
- 더 큰 트레이닝 데이터를 사용하십시오
- dart 를 사용하십시오
- 범주형 feature를 사용하십시오

과적합을 해결하기 위해 :
- 작은 max_bin 값을 사용하십시오
- 작은 num_leaves 값을 사용하십시오
- min_data_in_leaf 와 min_sum_hessian_in_leaf 파라미터를 사용하십시오
- bagging_fraction 과 bagging_freq 을 사용하여 bagging 을 적용하십시오
- feature_fraction을 세팅하여 feature sub-sampling을 하십시오
- lambda_l1, lambda_l2 그리고 min_gain_to_split 파라미터를 이용해 regularization (정규화) 를 적용하십시오
- max_depth 를 설정해 Deep Tree 가 만들어지는 것을 방지하십시오

In [None]:
param_grid = [              
              {'classifier': [LGBMClassifier()],
              'classifier__max_depth': [3, 5, 7, 9],
              'classifier__num_leaves':[2**2-1, 2**4-1, 2**5-1, 2**7-1],
              'classifier__min_child_samples': [10, 15],
              'classifier__subsample': [0.25, 0.5, 0.75, 1],
             'classifier__learning_rate':[0.03, 0.1],
             'classifier__n_estimators':[64, 128, 256],
               'classifier__application':['binary'],
               'classifier__metric':['binary_logloss'],
               'classifier__categorical_feature':[[0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12]],
               'classifier__feature_fraction':[0.7, 0.9, 1],
               'classifier__boosting_type':['gbdt', 'dart'],
               'classifier__num_iterations':[1000, 3000],
               'classifier__drop_rate':[0.1, 0.2, 0.3],
               'classifier__skip_drop':[0.3, 0.5]
               'scale':[MinMaxScaler(), StandardScaler(), RobustScaler()],
#                'poly':[PolynomialFeatures()],
#                'poly__degree':[1, 2, 3]
              }    
             ] 
grid = GridSearchCV(pipe, param_grid, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)

# Tuning Plan

### Overview
- high learning rate(0.05 - 0.3)를 선택하고 이 학습률에 맞는 tree 개수를 선정한다.
- tree-specific parameter를 수정한다.
- max_depth, min_child_weight, gamma, subsample, colsample_bytree
- regularization parameter를 수정한다.
- 학습률을 낮추고 다시 반복한다.

### 1. Learning rate와 estimator 수를 고정한다.
초기값은 다음과 같이 선정한다.
- max_depth = 5: 보통 4-6 를 시작점으로 한다.
- min_child_weight = 1 : 향후에 튜닝할 것이다.
- gamma = 0 : 0.1 - 0.2로 시작해도 된다. 그런데 어짜피 튜닝할 것이다.
- subsample, colsample_bytree = 0.8 : 보통 0.5 - 0.9로 시작한다.
- scale_pos_weight = 1: Because of high class imbalance.


In [26]:
pipe = Pipeline([
                ('scale', MinMaxScaler()),
                ('poly', PolynomialFeatures()),
                ('classifier', LGBMClassifier())
                ])

### First Grid
Earliest / Major Param
- scaler
- polynomial degree
- learning rate
- boosting type

In [None]:
LGBMClassifier()

In [27]:
param_grid1 = [              
              {'classifier': [LGBMClassifier()],
               'classifier__categorical_feature':[[0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12]],
               'classifier__application':['binary'],
               'classifier__metric':['binary_logloss'],
               'classifier__learning_rate':[0.03, 0.05, 0.1],
              'classifier__max_depth': [5],
              'classifier__num_leaves':[2**3],
              'classifier__subsample': [0.8],
               'classifier__colsample_bytree':[0.8],       
              'classifier__n_estimators':[1000],
               'classifier__boosting_type':['gbdt', 'dart', 'rf'],
               'classifier__drop_rate':[0.1],
               'classifier__skip_drop':[0.5],
               'classifier__num_iterations':[1000],
               'scale':[MinMaxScaler(), StandardScaler(), RobustScaler()],
              'poly':[PolynomialFeatures()],
               'poly__degree':[1, 2, 3]
              }
             ]
grid1 = GridSearchCV(pipe, param_grid1, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid1.fit(X_train, y_train)
print(grid1.best_params_)
print(grid1.best_score_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  3.6min finished


{'classifier': LGBMClassifier(application='binary', boosting_type='dart',
               categorical_feature=[0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12],
               class_weight=None, colsample_bytree=0.8, drop_rate=0.1,
               importance_type='split', learning_rate=0.03, max_depth=5,
               metric='binary_logloss', min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=1000,
               n_jobs=-1, num_iterations=1000, num_leaves=8, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               skip_drop=0.5, subsample=0.8, subsample_for_bin=200000,
               subsample_freq=0), 'classifier__application': 'binary', 'classifier__boosting_type': 'dart', 'classifier__categorical_feature': [0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12], 'classifier__colsample_bytree': 0.8, 'classifier__drop_rate': 0.1, 'classifier__learning_rate': 0.03, 'classifier__max_depth': 5, 'classifier__metric': 'binary_lo

In [None]:
0.8080984292451158

### Second Grid
major hyper parameters
- max_depth
- num_iterations

In [28]:
param_grid2 = [              
              {'classifier': [LGBMClassifier()],
               'classifier__categorical_feature':[[0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12]],
               'classifier__application':['binary'],
               'classifier__metric':['binary_logloss'],
               'classifier__learning_rate':[grid1.best_params_['classifier__learning_rate']], # 0.03
              'classifier__max_depth': [3, 5, 7, 9],
              'classifier__num_leaves':[2**3, 2**4, 2**5],
              'classifier__subsample': [0.8],
               'classifier__colsample_bytree':[0.8],       
              'classifier__n_estimators':[1000],
               'classifier__boosting_type':[grid1.best_params_['classifier__boosting_type']], # dart
               'classifier__drop_rate':[0.1],
               'classifier__skip_drop':[0.5],
               'classifier__num_iterations':[1000, 3000, 5000],
               'scale':[grid1.best_params_['scale']], #MinMaxScaler
               'poly':[PolynomialFeatures()],
               'poly__degree':[grid1.best_params_['poly__degree']] # 3
              }
             ]
grid2 = GridSearchCV(pipe, param_grid2, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid2.fit(X_train, y_train)
print(grid2.best_params_)
print(grid2.best_score_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 16.3min finished


{'classifier': LGBMClassifier(application='binary', boosting_type='dart',
               categorical_feature=[0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12],
               class_weight=None, colsample_bytree=0.8, drop_rate=0.1,
               importance_type='split', learning_rate=0.03, max_depth=9,
               metric='binary_logloss', min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=1000,
               n_jobs=-1, num_iterations=1000, num_leaves=16, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               skip_drop=0.5, subsample=0.8, subsample_for_bin=200000,
               subsample_freq=0), 'classifier__application': 'binary', 'classifier__boosting_type': 'dart', 'classifier__categorical_feature': [0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12], 'classifier__colsample_bytree': 0.8, 'classifier__drop_rate': 0.1, 'classifier__learning_rate': 0.03, 'classifier__max_depth': 9, 'classifier__metric': 'binary_l

### Third Grid
major hyper parameters
- subsample
- colsample_bytree

In [29]:
param_grid3 = [              
              {'classifier': [LGBMClassifier()],
               'classifier__categorical_feature':[[0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12]],
               'classifier__application':['binary'],
               'classifier__metric':['binary_logloss'],
               'classifier__learning_rate':[grid1.best_params_['classifier__learning_rate']], # 0.03
              'classifier__max_depth': [grid2.best_params_['classifier__max_depth']], # 9
              'classifier__num_leaves':[grid2.best_params_['classifier__num_leaves']], # 16
              'classifier__subsample': [0.5, 0.6, 0.7, 0.8, 0.9],
               'classifier__colsample_bytree':[0.5, 0.6, 0.7, 0.8, 0.9],       
              'classifier__n_estimators':[1000],
               'classifier__boosting_type':[grid1.best_params_['classifier__boosting_type']], # dart
               'classifier__drop_rate':[0.1],
               'classifier__skip_drop':[0.5],
               'classifier__num_iterations':[grid2.best_params_['classifier__num_iterations']], # 1000
               'scale':[grid1.best_params_['scale']], # MinMaxScaler
               'poly':[PolynomialFeatures()],
               'poly__degree':[grid1.best_params_['poly__degree']] # 3
              }
             ]
grid3 = GridSearchCV(pipe, param_grid3, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid3.fit(X_train, y_train)
print(grid3.best_params_)
print(grid3.best_score_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:  4.7min finished


{'classifier': LGBMClassifier(application='binary', boosting_type='dart',
               categorical_feature=[0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12],
               class_weight=None, colsample_bytree=0.8, drop_rate=0.1,
               importance_type='split', learning_rate=0.03, max_depth=9,
               metric='binary_logloss', min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=1000,
               n_jobs=-1, num_iterations=1000, num_leaves=16, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               skip_drop=0.5, subsample=0.5, subsample_for_bin=200000,
               subsample_freq=0), 'classifier__application': 'binary', 'classifier__boosting_type': 'dart', 'classifier__categorical_feature': [0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12], 'classifier__colsample_bytree': 0.8, 'classifier__drop_rate': 0.1, 'classifier__learning_rate': 0.03, 'classifier__max_depth': 9, 'classifier__metric': 'binary_l

### Fourth
minor parameters + micro tuning
- drop_rate (dart)
- skip_drop (dart)
- n_estimators

In [30]:
param_grid4 = [              
              {'classifier': [LGBMClassifier()],
               'classifier__categorical_feature':[[0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12]],
               'classifier__application':['binary'],
               'classifier__metric':['binary_logloss'],
               'classifier__learning_rate':[grid1.best_params_['classifier__learning_rate']], # 0.03
              'classifier__max_depth': [grid2.best_params_['classifier__max_depth']], # 9
              'classifier__num_leaves':[grid2.best_params_['classifier__num_leaves']], # 16
              'classifier__subsample': [grid3.best_params_['classifier__subsample']], # 0.5
               'classifier__colsample_bytree':[grid3.best_params_['classifier__colsample_bytree']], # 0.8       
              'classifier__n_estimators':[100, 300, 500, 1000, 3000],
               'classifier__boosting_type':[grid1.best_params_['classifier__boosting_type']], # dart
               'classifier__drop_rate':[0.1, 0.2, 0.3, 0.4, 0.5],
               'classifier__skip_drop':[0.1, 0.2, 0.3, 0.4, 0.5],
               'classifier__num_iterations':[grid2.best_params_['classifier__num_iterations']], # 1000
               'scale':[grid1.best_params_['scale']], # MinMaxScaler
               'poly':[PolynomialFeatures()],
               'poly__degree':[grid1.best_params_['poly__degree']] # 3
              }
             ]
grid4 = GridSearchCV(pipe, param_grid4, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid4.fit(X_train, y_train)
print(grid4.best_params_)
print(grid4.best_score_)

Fitting 5 folds for each of 125 candidates, totalling 625 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 20.0min
[Parallel(n_jobs=-1)]: Done 625 out of 625 | elapsed: 29.0min finished


{'classifier': LGBMClassifier(application='binary', boosting_type='dart',
               categorical_feature=[0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12],
               class_weight=None, colsample_bytree=0.8, drop_rate=0.4,
               importance_type='split', learning_rate=0.03, max_depth=9,
               metric='binary_logloss', min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
               n_jobs=-1, num_iterations=1000, num_leaves=16, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               skip_drop=0.4, subsample=0.5, subsample_for_bin=200000,
               subsample_freq=0), 'classifier__application': 'binary', 'classifier__boosting_type': 'dart', 'classifier__categorical_feature': [0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12], 'classifier__colsample_bytree': 0.8, 'classifier__drop_rate': 0.4, 'classifier__learning_rate': 0.03, 'classifier__max_depth': 9, 'classifier__metric': 'binary_lo

In [34]:
param_grid_best = [              
              {'classifier': [LGBMClassifier()],
               'classifier__categorical_feature':[[0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12]],
               'classifier__application':['binary'],
               'classifier__metric':['binary_logloss'],
               'classifier__learning_rate':[grid1.best_params_['classifier__learning_rate']], # 0.03
              'classifier__max_depth': [grid2.best_params_['classifier__max_depth']], # 9
              'classifier__num_leaves':[grid2.best_params_['classifier__num_leaves']], # 16
              'classifier__subsample': [grid3.best_params_['classifier__subsample']], # 0.5
               'classifier__colsample_bytree':[grid3.best_params_['classifier__colsample_bytree']], # 0.8       
              'classifier__n_estimators':[grid4.best_params_['classifier__n_estimators']], # 100
               'classifier__boosting_type':[grid1.best_params_['classifier__boosting_type']], # dart
               'classifier__drop_rate':[grid4.best_params_['classifier__drop_rate']], # 0.4
               'classifier__skip_drop':[grid4.best_params_['classifier__skip_drop']], # 0.4
               'classifier__num_iterations':[grid2.best_params_['classifier__num_iterations']], # 1000
               'scale':[grid1.best_params_['scale']], # MinMaxScaler
               'poly':[PolynomialFeatures()],
               'poly__degree':[grid1.best_params_['poly__degree']] # 3
              }
             ]

lgbm_best = LGBMClassifier(application='binary', boosting_type='dart',
               categorical_feature=[0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12],
               class_weight=None, colsample_bytree=0.8, importance_type='split',
               learning_rate=0.03, max_depth=9, metric='binary_logloss',
               min_child_samples=16, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=16, objective='binary',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=0.5, subsample_for_bin=200000, subsample_freq=0,
                          drop_rate=0.4, skip_drop=0.4, num_iterations=1000)

lgbm_best.fit(X_train, y_train)
accuracy_score(y_test, lgbm_best.predict(X_test))

0.8181818181818182

# Local Outlier Factor
- The number of neighbors considered (parameter n_neighbors) is typically set 
    - 1) greater than the minimum number of samples a cluster has to contain, so that other samples can be local outliers relative to this cluster
    - 2) smaller than the maximum number of close by samples that can potentially be local outliers. 
- In practice, such informations are generally not available, and taking n_neighbors=20 appears to work well in general.

In [32]:
test_neighbors = np.linspace(1, 100, num=20).astype(int)
test_contams = np.linspace(0.01, 0.2, num=20)
best_params, best_acc, X2, y2 = 0, 0, 0, 0

def tune_lof_by_model(model, df, scaler=None, poly=None, dim_reduction=None):
    best_params, best_acc, X2, y2 = 0, 0, 0, 0
    for i, tn in enumerate(test_neighbors):
        print(i, end='/')
        for j, tc in enumerate(test_contams):
            
            clf = LocalOutlierFactor(n_neighbors=tn, contamination=tc)
            y_pred = clf.fit_predict(df.drop('sold', axis=1))
            lof_outlier_idx = pd.Series(y_pred)[pd.Series(y_pred)==-1].index
            df_lof2 = df.drop(lof_outlier_idx)
            
            X2 = df_lof2.drop('sold', axis=1)
            y2 = df_lof2.sold
            X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2,
                                                                    test_size=0.2,
                                                                    shuffle=True,
                                                                    stratify=y2,
                                                                    random_state=11)
            
            if scaler:
                X2_train = scaler.fit_transform(X2_train)
                X2_test = scaler.transform(X2_test)
                
            if poly:
                X2_train = poly.fit_transform(X2_train)
                X2_test = poly.transform(X2_test)
            
            if dim_reduction:
                X2_train = dim_reduction.fit_transform(X2_train)
                X2_test = dim_reduction.transform(X2_test)
            
            mod = model
            mod.fit(X2_train, y2_train)
            mod_acc = accuracy_score(y2_test, mod.predict(X2_test))
            if best_acc < mod_acc:
                best_acc = mod_acc
                best_params = (tn, tc)
                X2 = X2
                y2 = y2
    return best_params, best_acc, X2, y2
        #print(accuracy_score(y2_test, lr.predict(X2_test)))
        #print(test_ensemble(X2_train, y2_train, X2_test, y2_test))

In [35]:
lgbm_best = LGBMClassifier(application='binary', boosting_type='dart',
               categorical_feature=[0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12],
               class_weight=None, colsample_bytree=0.8, importance_type='split',
               learning_rate=0.03, max_depth=9, metric='binary_logloss',
               min_child_samples=16, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=16, objective='binary',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=0.5, subsample_for_bin=200000, subsample_freq=0,
                          drop_rate=0.4, skip_drop=0.4, num_iterations=1000)

lgbm_scaler = MinMaxScaler()
lgbm_poly = PolynomialFeatures(degree=3)
lgbm_lof_tune = tune_lof_by_model(lgbm_best, df,
                                  scaler=lgbm_scaler,
                                 poly=lgbm_poly)
lgbm_lof_tune[:2]

0/1/2/3/4/5/6/7/8/9/10/11/12/13/14/15/16/17/18/19/

((37, 0.09999999999999999), 0.8507462686567164)