In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn. neighbors import LocalOutlierFactor
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures

from scipy.stats import expon, reciprocal
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier
from missingpy import MissForest

import warnings
warnings.filterwarnings('ignore')

# Light GBM > 원핫 인코딩 안한 카테고리 변수?
https://www.kaggle.com/mlisovyi/beware-of-categorical-features-in-lgbm
- 카테고리 변수 지정이 의미 있는가?
- LGBM은 연속성 실수 변수와 카테고리 변수를 어떻게 처리하는가?
    - 토론 재밌음!
    
# max_bin 파라미터 튜닝 추가!
- 값이 작으면 빨라지고,크면 느려지지만 정확해진다 

# SHAP - tensorflow backed feature evaluator
https://github.com/slundberg/shap
- 그림이 예쁨

In [17]:
def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            # Print current column type
            #print("******************************")
            #print("Column: ",col)
            #print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # Print new column type
            #print("dtype after: ",props[col].dtype)
            #print("******************************")
    
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [143]:
df = pd.read_csv('./data/galaxy_final_naive.csv', index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1485 entries, 0 to 1484
Data columns (total 12 columns):
index                        1485 non-null int64
BuyItNow                     1485 non-null int64
startprice                   1485 non-null float64
color_sentiment2             1485 non-null int64
carrier_none                 1485 non-null int64
productSeries_imputed        1485 non-null int64
product_isNote_imputed       1485 non-null int64
hasDescription               1485 non-null int64
charCountDescription         1485 non-null int64
upperCaseDescription_rate    1485 non-null float64
startprice_point9            1485 non-null int64
sold                         1485 non-null int64
dtypes: float64(2), int64(10)
memory usage: 150.8 KB


### Advanced Topics for LGBM
https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html

#### Categorical Feature Support
- LightGBM offers good accuracy with integer-encoded categorical features. LightGBM applies Fisher (1958) to find the optimal split over categories as described here. This often performs better than one-hot encoding.
- Use categorical_feature to specify the categorical features. Refer to the parameter categorical_feature in Parameters.
- Categorical features must be encoded as non-negative integers (int) less than Int32.MaxValue (2147483647). It is best to use a contiguous range of integers started from zero.
- Use min_data_per_group, cat_smooth to deal with over-fitting (when #data is small or #category is large).
- For a categorical feature with high cardinality (#category is large), it often works best to treat the feature as numeric, either by simply ignoring the categorical interpretation of the integers or by embedding the categories in a low-dimensional numeric space.


In [144]:
pd.concat([df.min(), df.max()], axis=1)

Unnamed: 0,0,1
index,0.0,1484.0
BuyItNow,0.0,1.0
startprice,0.01,999.0
color_sentiment2,-1.0,1.0
carrier_none,-1.0,1.0
productSeries_imputed,0.0,3.0
product_isNote_imputed,0.0,1.0
hasDescription,0.0,1.0
charCountDescription,0.0,111.0
upperCaseDescription_rate,0.0,1.0


In [145]:
df['color_sentiment2'] += 1
df['carrier_none'] += 1
df['color_sentiment2'].unique(), df['carrier_none'].unique()

(array([1, 0, 2]), array([2, 0, 1]))

In [146]:
df, NAli = reduce_mem_usage(df)

Memory usage of properties dataframe is : 0.14728546142578125  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.038237571716308594  MB
This is  25.96153846153846 % of the initial size


In [147]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1485 entries, 0 to 1484
Data columns (total 12 columns):
index                        1485 non-null uint16
BuyItNow                     1485 non-null uint8
startprice                   1485 non-null float32
color_sentiment2             1485 non-null uint8
carrier_none                 1485 non-null uint8
productSeries_imputed        1485 non-null uint8
product_isNote_imputed       1485 non-null uint8
hasDescription               1485 non-null uint8
charCountDescription         1485 non-null uint8
upperCaseDescription_rate    1485 non-null float32
startprice_point9            1485 non-null uint8
sold                         1485 non-null uint8
dtypes: float32(2), uint16(1), uint8(9)
memory usage: 39.2 KB


In [148]:
X = df.drop('sold', axis=1)
y = df.sold
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11,
                                                       stratify=y, shuffle=True)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=11,
                                                       stratify=y_train, shuffle=True)

![lgbm](https://cdn-images-1.medium.com/max/1000/1*A0b_ahXOrrijazzJengwYw.png)

### Note
- Similar to CatBoost, LightGBM can also handle categorical features by taking the input of feature names.
- **It does not convert to one-hot coding, and is much faster than one-hot coding.**
- LGBM uses a special algorithm to find the split value of categorical features [Link](http://www.csiss.org/SPACE/workshops/2004/SAC/files/fisher.pdf).

![](https://cdn-images-1.medium.com/max/800/1*fR5nLi61SkS031Spb3qgLg.png)

- Note: You should convert your categorical features to int type before you construct Dataset for LGBM. It does not accept string values even if you passes it through categorical_feature parameter.

# Target Encoding
### XAM / Additive Smoothing / Feature Extraction.
```bash
pip install fabric3
pip install git+https://github.com/MaxHalford/xam --upgrade
```
- https://maxhalford.github.io/blog/target-encoding/
    - https://github.com/MaxHalford/xam/blob/master/docs/feature-extraction.md#smooth-target-encoding
- https://www.wikiwand.com/en/Additive_smoothing

### Light GBM / CatBoost
- Optimal Binning (최적구간으로 연속형 실수 등 숫자 칼럼을 쪼갬)

In [13]:
import xam
XX = pd.DataFrame({'x_0': ['a'] * 5 + ['b'] * 5, 'x_1': ['a'] * 9 + ['b'] * 1})
yy = pd.Series([1, 1, 1, 1, 0, 1, 0, 0, 0, 0])

encoder = xam.feature_extraction.BayesianTargetEncoder(
     columns=['x_0', 'x_1'],
     prior_weight=3,
     suffix=''
)
encoder.fit_transform(XX, yy)

Unnamed: 0,x_0,x_1
0,0.6875,0.541667
1,0.6875,0.541667
2,0.6875,0.541667
3,0.6875,0.541667
4,0.6875,0.541667
5,0.3125,0.541667
6,0.3125,0.541667
7,0.3125,0.541667
8,0.3125,0.541667
9,0.3125,0.375


# Gentle Intro to Light GBM for applied ML
https://sefiks.com/2018/10/13/a-gentle-introduction-to-lightgbm-for-applied-machine-learning/

Light GBM은 어떤 이유로 엄청난 인기를 얻게 되었을까요?
- 데이터 사이즈는 날이 갈수록 커지고 있고 전통적인 데이터 분석 알고리즘으로 빠른 결과를 얻기란 더욱 어려워졌습니다. Light GBM은 말 그대로 “Light” 가벼운 것인데요, 왜냐면 속도가 빠르기 때문입니다. Light GBM은 큰 사이즈의 데이터를 다룰 수 있고 실행시킬 때 적은 메모리를 차지합니다. Light GBM이 인기있는 또 다른 이유는 바로 결과의 정확도에 초점을 맞추기 때문입니다. LGBM은 또한 GPU 학습을 지원하기 때문에 데이터 사이언티스트가 데이터 분석 어플리케이션을 개발할 때 LGBM을 폭넓게 사용하고 있습니다.

Light GBM은 어디서나 사용할 수 있을까요?
- 아닙니다. LGBM을 작은 데이터 세트에 사용하는 것은 추천되지 않습니다. Light GBM은 overfitting (과적합)에 민감하고 작은 데이터에 대해서 과적합하기 쉽습니다. row (행) 수에 대한 제한은 없지만 제 경험상 10,000 이상의 row (행) 을 가진 데이터에 사용하는 것을 권유해드립니다.

### Params & default values
https://testlightgbm.readthedocs.io/en/latest/Parameters.html

### First Grid
Earliest / Major Param
- scaler
- learning rate
- boosting type
- num_iterations

In [36]:
# categorical
cat_features_idx = [(i) for (i, colname) in enumerate((df.columns)) if (len(df[colname].unique()) <= 3)
                   & (colname != 'sold')]

In [42]:
# without PolynomialFeatures/ feature selection

pipe = Pipeline([
                ('scale', MinMaxScaler()),
#                 ('poly', PolynomialFeatures()),
#                 ('feature_selection', RFE(LGBMClassifier())),
                ('classifier', LGBMClassifier())
                ])

param_grid1 = [              
              {'classifier': [LGBMClassifier()],
               'classifier__categorical_feature':[cat_features_idx],
               'classifier__objective':['binary'],
               'classifier__metric':['binary_logloss'],
              'classifier__boosting_type':['gbdt', 'dart'],
              'classifier__drop_rate':[0.1],
               'classifier__skip_drop':[0.5],
               'classifier__learning_rate':[0.01, 0.03, 0.1],
               'classifier__num_iterations':[500, 1000, 2000, 3000, 5000],
              'classifier__bagging_fraction': [0.8],
               'classifier__feature_fraction':[0.8],
               'classifier__early_stopping_round':[0],
               'classifier__max_depth': [5],
               'classifier__num_leaves':[2**3],
               'classifier__min_data_in_leaf':[20],
               'classifier__max_bin':[255],
               'classifier__n_estimators':[1000],
               'classifier__lambda_l1':[0],
               'classifier__lambda_l2':[0],
               'classifier__scale_pos_weight':[1.0],
               'scale':[MinMaxScaler(), StandardScaler(), RobustScaler()],
#                'poly':[PolynomialFeatures()],
#                'poly__degree':[3],
#               'feature_selection' : [RFE(LGBMClassifier(objective='binary',
#                                                         metric='binary_logloss'))],
#                 'feature_selection__n_features_to_select' : [140, 70, 35]
              }
             ]
grid1 = GridSearchCV(pipe, param_grid1, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid1.fit(X_train, y_train)
print(grid1.best_params_)
print(grid1.best_score_)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   45.0s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:  6.3min finished


{'classifier': LGBMClassifier(bagging_fraction=0.8, boosting_type='dart',
               categorical_feature=[1, 3, 4, 6, 7, 10], class_weight=None,
               colsample_bytree=1.0, drop_rate=0.1, feature_fraction=0.8,
               importance_type='split', lambda_l1=0, lambda_l2=0,
               learning_rate=0.01, max_bin=255, max_depth=5,
               metric='binary_logloss', min_child_samples=20,
               min_child_weight=0.001, min_data_in_leaf=20, min_split_gain=0.0,
               n_estimators=1000, n_jobs=-1, num_iterations=5000, num_leaves=8,
               objective='binary', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, scale_pos_weight=1.0, silent=True, skip_drop=0.5,
               subsample=1.0, ...), 'classifier__bagging_fraction': 0.8, 'classifier__boosting_type': 'dart', 'classifier__categorical_feature': [1, 3, 4, 6, 7, 10], 'classifier__drop_rate': 0.1, 'classifier__feature_fraction': 0.8, 'classifier__lambda_l1': 0, 'classifier__lamb

### Second Grid
major hyper parameters
- drop_rate
- skip_drop

In [44]:
# drop_Rate, skip_drop

pipe = Pipeline([
                ('scale', RobustScaler()),
#                 ('poly', PolynomialFeatures()),
#                 ('feature_selection', RFE(LGBMClassifier())),
                ('classifier', LGBMClassifier())
                ])

param_grid2 = [              
              {'classifier': [LGBMClassifier()],
               'classifier__categorical_feature':[cat_features_idx],
               'classifier__objective':['binary'],
               'classifier__metric':['binary_logloss'],
              'classifier__boosting_type':[grid1.best_params_['classifier__boosting_type']],
              'classifier__drop_rate':np.arange(0.1, 0.55, 0.05),
               'classifier__skip_drop':np.arange(0.1, 0.55, 0.05),
               'classifier__learning_rate':[grid1.best_params_['classifier__learning_rate']],
               'classifier__num_iterations':[grid1.best_params_['classifier__num_iterations']],
              'classifier__bagging_fraction': [0.8],
               'classifier__feature_fraction':[0.8],
               'classifier__early_stopping_round':[0],
               'classifier__max_depth': [5],
               'classifier__num_leaves':[2**3],
               'classifier__min_data_in_leaf':[20],
               'classifier__max_bin':[255],
               'classifier__n_estimators':[1000],
               'classifier__lambda_l1':[0],
               'classifier__lambda_l2':[0],
               'classifier__scale_pos_weight':[1.0],
               'scale':[RobustScaler()],
#                'poly':[PolynomialFeatures()],
#                'poly__degree':[3],
#               'feature_selection' : [RFE(LGBMClassifier(objective='binary',
#                                                         metric='binary_logloss'))],
#                 'feature_selection__n_features_to_select' : [140, 70, 35]
              }
             ]
grid2 = GridSearchCV(pipe, param_grid2, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid2.fit(X_train, y_train)
print(grid2.best_params_)
print(grid2.best_score_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 15.4min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 33.8min finished


{'classifier': LGBMClassifier(bagging_fraction=0.8, boosting_type='dart',
               categorical_feature=[1, 3, 4, 6, 7, 10], class_weight=None,
               colsample_bytree=1.0, drop_rate=0.3500000000000001,
               early_stopping_round=0, feature_fraction=0.8,
               importance_type='split', lambda_l1=0, lambda_l2=0,
               learning_rate=0.01, max_bin=255, max_depth=5,
               metric='binary_logloss', min_child_samples=20,
               min_child_weight=0.001, min_data_in_leaf=20, min_split_gain=0.0,
               n_estimators=1000, n_jobs=-1, num_iterations=5000, num_leaves=8,
               objective='binary', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, scale_pos_weight=1.0, silent=True,
               skip_drop=0.45000000000000007, ...), 'classifier__bagging_fraction': 0.8, 'classifier__boosting_type': 'dart', 'classifier__categorical_feature': [1, 3, 4, 6, 7, 10], 'classifier__drop_rate': 0.3500000000000001, 'classifier_

### Third
- max_depth
- num_leaves
- min_data_in_leaf

In [47]:
pipe = Pipeline([
                ('scale', RobustScaler()),
#                 ('poly', PolynomialFeatures()),
#                 ('feature_selection', RFE(LGBMClassifier())),
                ('classifier', LGBMClassifier())
                ])

param_grid3 = [              
              {'classifier': [LGBMClassifier()],
               'classifier__categorical_feature':[cat_features_idx],
               'classifier__objective':['binary'],
               'classifier__metric':['binary_logloss'],
              'classifier__boosting_type':[grid1.best_params_['classifier__boosting_type']],
              'classifier__drop_rate':[grid2.best_params_['classifier__drop_rate']],
               'classifier__skip_drop':[grid2.best_params_['classifier__skip_drop']],
               'classifier__learning_rate':[grid1.best_params_['classifier__learning_rate']],
               'classifier__num_iterations':[grid1.best_params_['classifier__num_iterations']],
              'classifier__bagging_fraction': [0.8],
               'classifier__feature_fraction':[0.8],
               'classifier__max_depth': [3, 6, 9, 12, 15, 20],
               'classifier__num_leaves':[2**3-1, 2**5-1, 2**7-1, 2**9-1],
               'classifier__min_data_in_leaf':[20, 100, 250, 500, 1000],
               'classifier__max_bin':[255],
               'classifier__n_estimators':[1000],
               'classifier__early_stopping_round':[0],
               'classifier__lambda_l1':[0],
               'classifier__lambda_l2':[0],
               'classifier__scale_pos_weight':[1.0],
               'scale':[RobustScaler()],
#                'poly':[PolynomialFeatures()],
#                'poly__degree':[3],
#               'feature_selection' : [RFE(LGBMClassifier(objective='binary',
#                                                         metric='binary_logloss'))],
#                 'feature_selection__n_features_to_select' : [140, 70, 35]
              }
             ]

grid3 = GridSearchCV(pipe, param_grid3, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid3.fit(X_train, y_train)
print(grid3.best_params_)
print(grid3.best_score_)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 17.1min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed: 21.1min finished


{'classifier': LGBMClassifier(bagging_fraction=0.8, boosting_type='dart',
               categorical_feature=[1, 3, 4, 6, 7, 10], class_weight=None,
               colsample_bytree=1.0, drop_rate=0.3500000000000001,
               early_stopping_round=0, feature_fraction=0.8,
               importance_type='split', lambda_l1=0, lambda_l2=0,
               learning_rate=0.01, max_bin=255, max_depth=6,
               metric='binary_logloss', min_child_samples=20,
               min_child_weight=0.001, min_data_in_leaf=20, min_split_gain=0.0,
               n_estimators=1000, n_jobs=-1, num_iterations=5000, num_leaves=7,
               objective='binary', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, scale_pos_weight=1.0, silent=True,
               skip_drop=0.45000000000000007, ...), 'classifier__bagging_fraction': 0.8, 'classifier__boosting_type': 'dart', 'classifier__categorical_feature': [1, 3, 4, 6, 7, 10], 'classifier__drop_rate': 0.3500000000000001, 'classifier_

### Fourth Grid
major hyper parameters
- bagging_fraction
    - bagging_freq
- feature_fraction

In [48]:
pipe = Pipeline([
                ('scale', RobustScaler()),
#                 ('poly', PolynomialFeatures()),
#                 ('feature_selection', RFE(LGBMClassifier())),
                ('classifier', LGBMClassifier())
                ])

param_grid4 = [              
              {'classifier': [LGBMClassifier()],
               'classifier__categorical_feature':[cat_features_idx],
               'classifier__objective':['binary'],
               'classifier__metric':['binary_logloss'],
              'classifier__boosting_type':[grid1.best_params_['classifier__boosting_type']],
              'classifier__drop_rate':[grid2.best_params_['classifier__drop_rate']],
               'classifier__skip_drop':[grid2.best_params_['classifier__skip_drop']],
               'classifier__learning_rate':[grid1.best_params_['classifier__learning_rate']],
               'classifier__num_iterations':[grid1.best_params_['classifier__num_iterations']],
              'classifier__bagging_fraction': [0.5, 0.6, 0.7, 0.8, 0.9],
               'classifier__bagging_freq':[100, 250, 500, 750, 1000],
               'classifier__feature_fraction':[0.5, 0.6, 0.7, 0.8, 0.9],
               'classifier__max_depth': [grid3.best_params_['classifier__max_depth']],
               'classifier__num_leaves':[grid3.best_params_['classifier__num_leaves']],
               'classifier__min_data_in_leaf':[grid3.best_params_['classifier__min_data_in_leaf']],
               'classifier__max_bin':[255],
               'classifier__n_estimators':[1000],
               'classifier__early_stopping_round':[0],
               'classifier__lambda_l1':[0],
               'classifier__lambda_l2':[0],
               'classifier__scale_pos_weight':[1.0],
               'scale':[RobustScaler()],
#                'poly':[PolynomialFeatures()],
#                'poly__degree':[3],
#               'feature_selection' : [RFE(LGBMClassifier(objective='binary',
#                                                         metric='binary_logloss'))],
#                 'feature_selection__n_features_to_select' : [140, 70, 35]
              }
             ]
grid4 = GridSearchCV(pipe, param_grid4, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid4.fit(X_train, y_train)
print(grid4.best_params_)
print(grid4.best_score_)

Fitting 5 folds for each of 125 candidates, totalling 625 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 24.0min
[Parallel(n_jobs=-1)]: Done 625 out of 625 | elapsed: 33.6min finished


{'classifier': LGBMClassifier(bagging_fraction=0.5, bagging_freq=1000, boosting_type='dart',
               categorical_feature=[1, 3, 4, 6, 7, 10], class_weight=None,
               colsample_bytree=1.0, drop_rate=0.3500000000000001,
               early_stopping_round=0, feature_fraction=0.9,
               importance_type='split', lambda_l1=0, lambda_l2=0,
               learning_rate=0.01, max_bin=255, max_depth=6,
               metric='binary_logloss', min_child_samples=20,
               min_child_weight=0.001, min_data_in_leaf=20, min_split_gain=0.0,
               n_estimators=1000, n_jobs=-1, num_iterations=5000, num_leaves=7,
               objective='binary', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, scale_pos_weight=1.0, silent=True, ...), 'classifier__bagging_fraction': 0.5, 'classifier__bagging_freq': 1000, 'classifier__boosting_type': 'dart', 'classifier__categorical_feature': [1, 3, 4, 6, 7, 10], 'classifier__drop_rate': 0.3500000000000001, 'clas

### Fifth
- max_bin
- n_estimators
- early_stopping_round

In [49]:
pipe = Pipeline([
                ('scale', RobustScaler()),
#                 ('poly', PolynomialFeatures()),
#                 ('feature_selection', RFE(LGBMClassifier())),
                ('classifier', LGBMClassifier())
                ])

param_grid5 = [              
              {'classifier': [LGBMClassifier()],
               'classifier__categorical_feature':[cat_features_idx],
               'classifier__objective':['binary'],
               'classifier__metric':['binary_logloss'],
              'classifier__boosting_type':[grid1.best_params_['classifier__boosting_type']],
              'classifier__drop_rate':[grid2.best_params_['classifier__drop_rate']],
               'classifier__skip_drop':[grid2.best_params_['classifier__skip_drop']],
               'classifier__learning_rate':[grid1.best_params_['classifier__learning_rate']],
               'classifier__num_iterations':[grid1.best_params_['classifier__num_iterations']],
              'classifier__bagging_fraction': [grid4.best_params_['classifier__bagging_fraction']],
               'classifier__bagging_freq': [grid4.best_params_['classifier__bagging_freq']],
               'classifier__feature_fraction':[grid4.best_params_['classifier__feature_fraction']],
               'classifier__max_depth': [grid3.best_params_['classifier__max_depth']],
               'classifier__num_leaves': [grid3.best_params_['classifier__num_leaves']],
               'classifier__min_data_in_leaf': [grid3.best_params_['classifier__min_data_in_leaf']],
               'classifier__max_bin':[100, 150, 200, 255, 300],
               'classifier__n_estimators':[250, 500, 1000, 2000, 3000],
               'classifier__early_stopping_round':[10, 25, 50, 75, 100, 200],
               'classifier__lambda_l1':[0],
               'classifier__lambda_l2':[0],
               'classifier__scale_pos_weight':[1.0],
               'scale':[RobustScaler()],
#                'poly':[PolynomialFeatures()],
#                'poly__degree':[3],
#               'feature_selection' : [RFE(LGBMClassifier(objective='binary',
#                                                         metric='binary_logloss'))],
#                 'feature_selection__n_features_to_select' : [140, 70, 35]
              }
             ]

grid5 = GridSearchCV(pipe, param_grid5, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid5.fit(X_train, y_train)
print(grid5.best_params_)
print(grid5.best_score_)

Fitting 5 folds for each of 150 candidates, totalling 750 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 24.7min
[Parallel(n_jobs=-1)]: Done 750 out of 750 | elapsed: 41.9min finished


{'classifier': LGBMClassifier(bagging_fraction=0.5, bagging_freq=1000, boosting_type='dart',
               categorical_feature=[1, 3, 4, 6, 7, 10], class_weight=None,
               colsample_bytree=1.0, drop_rate=0.3500000000000001,
               early_stopping_round=10, feature_fraction=0.9,
               importance_type='split', lambda_l1=0, lambda_l2=0,
               learning_rate=0.01, max_bin=150, max_depth=6,
               metric='binary_logloss', min_child_samples=20,
               min_child_weight=0.001, min_data_in_leaf=20, min_split_gain=0.0,
               n_estimators=250, n_jobs=-1, num_iterations=5000, num_leaves=7,
               objective='binary', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, scale_pos_weight=1.0, silent=True, ...), 'classifier__bagging_fraction': 0.5, 'classifier__bagging_freq': 1000, 'classifier__boosting_type': 'dart', 'classifier__categorical_feature': [1, 3, 4, 6, 7, 10], 'classifier__drop_rate': 0.3500000000000001, 'clas

### Sixth
- lambda_l1
- lambda_l2
- scale_pos_weight

In [50]:
pipe = Pipeline([
                ('scale', RobustScaler()),
#                 ('poly', PolynomialFeatures()),
#                 ('feature_selection', RFE(LGBMClassifier())),
                ('classifier', LGBMClassifier())
                ])

param_grid6 = [              
              {'classifier': [LGBMClassifier()],
               'classifier__categorical_feature':[cat_features_idx],
               'classifier__objective':['binary'],
               'classifier__metric':['binary_logloss'],
              'classifier__boosting_type':[grid1.best_params_['classifier__boosting_type']],
              'classifier__drop_rate':[grid2.best_params_['classifier__drop_rate']],
               'classifier__skip_drop':[grid2.best_params_['classifier__skip_drop']],
               'classifier__learning_rate':[grid1.best_params_['classifier__learning_rate']],
               'classifier__num_iterations':[grid1.best_params_['classifier__num_iterations']],
              'classifier__bagging_fraction': [grid4.best_params_['classifier__bagging_fraction']],
               'classifier__bagging_freq': [grid4.best_params_['classifier__bagging_freq']],
               'classifier__feature_fraction':[grid4.best_params_['classifier__feature_fraction']],
               'classifier__max_depth': [grid3.best_params_['classifier__max_depth']],
               'classifier__num_leaves': [grid3.best_params_['classifier__num_leaves']],
               'classifier__min_data_in_leaf': [grid3.best_params_['classifier__min_data_in_leaf']],
               'classifier__max_bin':[grid5.best_params_['classifier__max_bin']],
               'classifier__n_estimators':[grid5.best_params_['classifier__n_estimators']],
               'classifier__early_stopping_round':[grid5.best_params_['classifier__early_stopping_round']],
               'classifier__lambda_l1':[0, 1e-4, 1e-3, 1e-2, 0.1, 1],
               'classifier__lambda_l2':[0, 1e-4, 1e-3, 1e-2, 0.1, 1],
               'classifier__scale_pos_weight':[1.0, 1.1, 1.2, 1.3, 1.4, 1.5],
               'scale':[RobustScaler()],
#                'poly':[PolynomialFeatures()],
#                'poly__degree':[3],
#               'feature_selection' : [RFE(LGBMClassifier(objective='binary',
#                                                         metric='binary_logloss'))],
#                 'feature_selection__n_features_to_select' : [140, 70, 35]
              }
             ]

grid6 = GridSearchCV(pipe, param_grid6, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid6.fit(X_train, y_train)
print(grid6.best_params_)
print(grid6.best_score_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 24.6min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 43.6min
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed: 59.0min finished


{'classifier': LGBMClassifier(bagging_fraction=0.5, bagging_freq=1000, boosting_type='dart',
               categorical_feature=[1, 3, 4, 6, 7, 10], class_weight=None,
               colsample_bytree=1.0, drop_rate=0.3500000000000001,
               early_stopping_round=10, feature_fraction=0.9,
               importance_type='split', lambda_l1=0.01, lambda_l2=0.001,
               learning_rate=0.01, max_bin=150, max_depth=6,
               metric='binary_logloss', min_child_samples=20,
               min_child_weight=0.001, min_data_in_leaf=20, min_split_gain=0.0,
               n_estimators=250, n_jobs=-1, num_iterations=5000, num_leaves=7,
               objective='binary', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, scale_pos_weight=1.0, silent=True, ...), 'classifier__bagging_fraction': 0.5, 'classifier__bagging_freq': 1000, 'classifier__boosting_type': 'dart', 'classifier__categorical_feature': [1, 3, 4, 6, 7, 10], 'classifier__drop_rate': 0.3500000000000001

In [102]:
param_grid_fin = [              
              {'classifier': [LGBMClassifier()],
               'classifier__categorical_feature':[cat_features_idx],
               'classifier__objective':['binary'],
               'classifier__metric':['binary_logloss'],
              'classifier__boosting_type':[grid1.best_params_['classifier__boosting_type']],
              'classifier__drop_rate':[grid2.best_params_['classifier__drop_rate']],
               'classifier__skip_drop':[grid2.best_params_['classifier__skip_drop']],
               'classifier__learning_rate':[grid1.best_params_['classifier__learning_rate']],
               'classifier__num_iterations':[grid1.best_params_['classifier__num_iterations']],
              'classifier__bagging_fraction': [grid4.best_params_['classifier__bagging_fraction']],
               'classifier__bagging_freq': [grid4.best_params_['classifier__bagging_freq']],
               'classifier__feature_fraction':[grid4.best_params_['classifier__feature_fraction']],
               'classifier__max_depth': [grid3.best_params_['classifier__max_depth']],
               'classifier__num_leaves': [grid3.best_params_['classifier__num_leaves']],
               'classifier__min_data_in_leaf': [grid3.best_params_['classifier__min_data_in_leaf']],
               'classifier__max_bin':[grid5.best_params_['classifier__max_bin']],
               'classifier__n_estimators':[grid5.best_params_['classifier__n_estimators']],
               'classifier__early_stopping_round':[grid5.best_params_['classifier__early_stopping_round']],
               'classifier__lambda_l1':[grid6.best_params_['classifier__lambda_l1']],
               'classifier__lambda_l2':[grid6.best_params_['classifier__lambda_l2']],
               'classifier__scale_pos_weight':[grid6.best_params_['classifier__scale_pos_weight']]
              }]

param_grid_fin

[{'classifier': [LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                  importance_type='split', learning_rate=0.1, max_depth=-1,
                  min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                  n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
                  random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
                  subsample=1.0, subsample_for_bin=200000, subsample_freq=0)],
  'classifier__categorical_feature': [[1, 3, 4, 6, 7, 10]],
  'classifier__objective': ['binary'],
  'classifier__metric': ['binary_logloss'],
  'classifier__boosting_type': ['dart'],
  'classifier__drop_rate': [0.3500000000000001],
  'classifier__skip_drop': [0.45000000000000007],
  'classifier__learning_rate': [0.01],
  'classifier__num_iterations': [5000],
  'classifier__bagging_fraction': [0.5],
  'classifier__bagging_freq': [1000],
  'classifier__feature_fraction': [0.9],
  'classifier__max_depth'

In [152]:
lgbm_best = LGBMClassifier(categorical_feature=[1, 3, 4, 6, 7, 10],
                          objective='binary', metric='binary_logloss',
                          boosting_type='dart', drop_rate=0.35, skip_drop=0.45,
                           learning_rate=0.01, num_iterations=5000,
                           subsample=0.5, subsample_freq=1000,
                           colsample_bytree=0.9, max_depth=6, num_leaves=7,
                           min_child_samples=20, max_bin=150, n_estimators=250,
                           early_stopping_round=10, reg_alpha=0.01, reg_lambda=0.001,
                           scale_pos_weight=1.0, verbose=-1
                      )

sc = RobustScaler()
Xtr = sc.fit_transform(X_train)
Xte = sc.transform(X_test)

lgbm_best.fit(Xtr, y_train)
accuracy_score(y_test, lgbm_best.predict(Xte))

0.8047138047138047

# Local Outlier Factor

In [132]:
def tune_lof2(model, df, 
                  scaler=None, poly=None, dim_reduction=None, rfe=None,
                  preset=False):    
    best_params, best_acc = 0, 0  
    test_neighbors = np.linspace(1, 31, num=30).astype(int)
    test_contams = np.linspace(0.01, 0.26, num=25)
    
    if preset:
        X0_train, X0_valid, X0_test, y0_train, y0_valid, y0_test = df
        
    else:
        X0 = df.drop('sold', axis=1)
        y0 = df.sold
        X0_train, X0_test, y0_train, y0_test = train_test_split(X0, y0,
                                                                test_size=0.2,
                                                                shuffle=True,
                                                                stratify=y0,
                                                                random_state=11)
        X0_train, X0_valid, y0_train, y0_valid = train_test_split(X0_train, y0_train,
                                                                test_size=0.2,
                                                                shuffle=True,
                                                                stratify=y0_train,
                                                                random_state=11)

        if scaler:
            X0_train = scaler.fit_transform(X0_train)
            X0_valid = scaler.transform(X0_valid)
            X0_test = scaler.transform(X0_test)

        if poly:
            X0_train = poly.fit_transform(X0_train)
            X0_valid = poly.transform(X0_valid)
            X0_test = poly.transform(X0_test)

        if dim_reduction:
            X0_train = dim_reduction.fit_transform(X0_train)
            X0_valid = dim_reduction.transform(X0_valid)
            X0_test = dim_reduction.transform(X0_test)

        if rfe:
            X0_train = rfe.fit_transform(X0_train, y0_train)
            X0_valid = rfe.transform(X0_valid)
            X0_test = rfe.transform(X0_test)

        print('preprocessing complete')
    
    for i, tn in enumerate(test_neighbors):
        print(i, end='/')
        for j, tc in enumerate(test_contams):
            
            # 원본 보존을 위해 복사본 사용
            X_train_copy, X_valid_copy, X_test_copy = X0_train.copy(), X0_valid.copy(), X0_test.copy()
            y_train_copy, y_valid_copy, y_test_copy = y0_train.copy(), y0_valid.copy(), y0_test.copy()

            # LOF 모델 생성 및 트레인셋 학습
            clf = LocalOutlierFactor(n_neighbors=tn, contamination=tc,
                                    novelty=True, n_jobs=-1)
            clf.fit(X_train_copy)

            # 트레인셋 아웃라이어 제거
            y_pred = clf.predict(X_train_copy)
            lof_outlier_idx_train = pd.Series(y_pred)[pd.Series(y_pred)==-1].index
            X_train_copy = pd.DataFrame(X_train_copy).reset_index(drop=True).drop(lof_outlier_idx_train)
            y_train_copy = y_train_copy.reset_index(drop=True).drop(lof_outlier_idx_train)

            # 밸리데이션 셋 아웃라이어 제거
#             yval_pred = clf.predict(X_valid_copy)
#             lof_outlier_idx_valid = pd.Series(yval_pred)[pd.Series(yval_pred)==-1].index
#             X_valid_copy = pd.DataFrame(X_valid_copy).reset_index(drop=True).drop(lof_outlier_idx_valid)
#             y_valid_copy = y_valid_copy.reset_index(drop=True).drop(lof_outlier_idx_valid)

            # 테스트 셋 아웃라이어 제거
#             ytest_pred = clf.predict(X_test_copy)
#             lof_outlier_idx_test = pd.Series(ytest_pred)[pd.Series(ytest_pred)==-1].index
#             X_test_copy = pd.DataFrame(X_test_copy).reset_index(drop=True).drop(lof_outlier_idx_test)
#             y_test_copy = y_test_copy.reset_index(drop=True).drop(lof_outlier_idx_test)

            # 예측모델 정의 및 트레인 셋으로 학습
            mod = model
            mod.fit(X_train_copy, y_train_copy, verbose=-1)

            # scaling만 적용한 밸리데이션 셋으로 정확도 측정
            mod_acc = accuracy_score(y_valid_copy, mod.predict(X_valid_copy))
            
            if best_acc < mod_acc:
                best_acc = mod_acc
                best_params = (tn, tc)
                print((tn, tc, best_acc), end='/')
#                 X2 = X2
#                 y2 = y2
                
    
    return {'best_params':best_params,
           'best_accuracy':best_acc,
           'preprocessed_data':[X0_train, X0_valid, X0_test, y0_train, y0_valid, y0_test],
           'LOF_data':[X_train_copy, X_valid_copy, X_test_copy,
                      y_train_copy, y_valid_copy, y_test_copy]}

In [133]:
# converting parameter name for sklearn wrapper version LGBM
lgbm_best = LGBMClassifier(categorical_feature=[1, 3, 4, 6, 7, 10],
                          objective='binary', metric='binary_logloss',
                          boosting_type='dart', drop_rate=0.35, skip_drop=0.45,
                           learning_rate=0.01, num_iterations=5000,
                           subsample=0.5, subsample_freq=1000,
                           colsample_bytree=0.9, max_depth=6, num_leaves=7,
                           min_child_samples=20, max_bin=150, n_estimators=250,
                           early_stopping_round=10, reg_alpha=0.01, reg_lambda=0.001,
                           scale_pos_weight=1.0, verbose=-1
                      )
lgbm_scaler = RobustScaler()
lgbm_lof_tune = tune_lof2(lgbm_best, df,
                              scaler=lgbm_scaler)
lgbm_lof_tune['best_params'], lgbm_lof_tune['best_accuracy']

preprocessing complete
0/(1, 0.01, 0.7647058823529411)/1/(2, 0.01, 0.7773109243697479)/(2, 0.020416666666666666, 0.8067226890756303)/(2, 0.09333333333333332, 0.8151260504201681)/(2, 0.14541666666666667, 0.819327731092437)/(2, 0.26, 0.8235294117647058)/2/3/4/5/6/7/8/9/10/11/12/13/14/15/16/17/18/19/20/21/22/23/24/25/26/27/28/29/

((2, 0.26), 0.8235294117647058)

In [142]:
lgbm_scaler = RobustScaler()
X_train_rs = lgbm_scaler.fit_transform(X_train)
X_test_rs = lgbm_scaler.transform(X_test)

clf = LocalOutlierFactor(n_neighbors=2, contamination=0.26,
                                    novelty=True, n_jobs=-1)
clf.fit(X_train_rs)
y_pred = clf.predict(X_train_rs)

lof_outlier_idx_train = pd.Series(y_pred)[pd.Series(y_pred)==-1].index
X_train_lof = pd.DataFrame(X_train_rs).reset_index(drop=True).drop(lof_outlier_idx_train)
y_train_lof = y_train.reset_index(drop=True).drop(lof_outlier_idx_train)

lgbm_best = LGBMClassifier(categorical_feature=[1, 3, 4, 6, 7, 10],
                          objective='binary', metric='binary_logloss',
                          boosting_type='dart', drop_rate=0.35, skip_drop=0.45,
                           learning_rate=0.01, num_iterations=5000,
                           subsample=0.5, subsample_freq=1000,
                           colsample_bytree=0.9, max_depth=6, num_leaves=7,
                           min_child_samples=20, max_bin=150, n_estimators=250,
                           early_stopping_round=10, reg_alpha=0.01, reg_lambda=0.001,
                           scale_pos_weight=1.0, verbose=-1
                      )

lgbm_best.fit(X_train_lof, y_train_lof)
# accuracy_score(y_test, lgbm_best.predict(X_test))
accuracy_score(y_valid, lgbm_best.predict(X_valid))

0.5546218487394958

0.5488215488215489

# 최적 LOF 적용한 트레인셋 -> 2차 하이퍼파라미터 튜닝

# Last Try
- LGBM은 하이퍼 파라미터 튜닝으로, outlier 포함된 데이터에 FIT 된 상태 (마른걸레 짜기)
- outlier를 제거하였으므로 모델과 데이터 간의 FIT은 감소했다고 생각할 수 있음
- LGBM 오버핏 제어 파라미터를 조정하여 조금 더 robust한 상태로 LOF data를 학습시켜 보기.

In [None]:
def remove_outlier_by_lof(lof_fit, Xdf, ydf):
    ypred = lof_fit.predict(Xdf)
    lof_outlier_idx = pd.Series(ypred)[pd.Series(ypred)==-1].index
    Xdf_ = pd.DataFrame(Xdf).reset_index(drop=True).drop(lof_outlier_idx)
    ydf_ = ydf.reset_index(drop=True).drop(lof_outlier_idx)
    return Xdf_, ydf_

rscaler = RobustScaler()
X_train_ = rscaler.fit_transform(X_train)
X_valid_ = rscaler.transform(X_valid)
X_test_ = rscaler.transform(X_test)

clf = LocalOutlierFactor(n_neighbors=tn, contamination=tc,
                        novelty=True, n_jobs=-1)
clf.fit(X_train_copy)

Xtrain_lof, ytrain_lof = remove_outlier_by_lof(clf, X_train_, y_train)
Xvalid_lof, yvalid_lof = remove_outlier_by_lof(clf, X_valid_, y_valid)
Xtest_lof, ytest_lof = remove_outlier_by_lof(clf, X_test_, y_test)

In [None]:
pipe = Pipeline([
#                 ('scale', RobustScaler()),
#                 ('poly', PolynomialFeatures()),
#                 ('feature_selection', RFE(LGBMClassifier())),
                ('classifier', LGBMClassifier())
                ])

param_grid6 = [              
              {'classifier': [LGBMClassifier()],
               'classifier__categorical_feature':[cat_features_idx],
               'classifier__objective':['binary'],
               'classifier__metric':['binary_logloss'],
              'classifier__boosting_type':[grid1.best_params_['classifier__boosting_type']],
              'classifier__drop_rate':[grid2.best_params_['classifier__drop_rate']],
               'classifier__skip_drop':[grid2.best_params_['classifier__skip_drop']],
               'classifier__learning_rate':[grid1.best_params_['classifier__learning_rate']],
               'classifier__num_iterations':[grid1.best_params_['classifier__num_iterations']],
              'classifier__bagging_fraction': [grid4.best_params_['classifier__bagging_fraction']],
               'classifier__bagging_freq': [grid4.best_params_['classifier__bagging_freq']],
               'classifier__feature_fraction':[grid4.best_params_['classifier__feature_fraction']],
               'classifier__max_depth': [grid3.best_params_['classifier__max_depth']],
               'classifier__num_leaves': [grid3.best_params_['classifier__num_leaves']],
               'classifier__min_data_in_leaf': [grid3.best_params_['classifier__min_data_in_leaf']],
               'classifier__max_bin':[grid5.best_params_['classifier__max_bin']],
               'classifier__n_estimators':[grid5.best_params_['classifier__n_estimators']],
               'classifier__early_stopping_round':[grid5.best_params_['classifier__early_stopping_round']],
               'classifier__lambda_l1':[0, 1e-4, 1e-3, 1e-2, 0.1, 1],
               'classifier__lambda_l2':[0, 1e-4, 1e-3, 1e-2, 0.1, 1],
               'classifier__scale_pos_weight':[1.0, 1.1, 1.2, 1.3, 1.4, 1.5],
#                'scale':[RobustScaler()],
#                'poly':[PolynomialFeatures()],
#                'poly__degree':[3],
#               'feature_selection' : [RFE(LGBMClassifier(objective='binary',
#                                                         metric='binary_logloss'))],
#                 'feature_selection__n_features_to_select' : [140, 70, 35]
              }
             ]

grid6 = GridSearchCV(pipe, param_grid6, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid6.fit(X_train, y_train)
print(grid6.best_params_)
print(grid6.best_score_)

# Stage 2:

Poly + RFE 적용

```
model.fit(categorical_feature=[0, 1, ...]
```

In [None]:
pipe1 = Pipeline([
                ('scale', MinMaxScaler()),
                 ('poly', PolynomialFeatures()),
                ('feature_selection', RFE(XGBClassifier())),
                ('classifier', XGBClassifier())
                ])

param_grid1 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [0.15],
              'classifier__skip_drop': [0.33],
             'classifier__learning_rate':[0.1],
             'classifier__n_estimators':[1000],
               'classifier__max_depth':[5],
               'classifier__min_child_weight':[1],
               'classifier__gamma':[0],
               'classifier__subsample':[0.8],
               'classifier__colsample_bytree':[0.8],
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
               'scale':[MinMaxScaler()],
               'poly':[PolynomialFeatures()],
               'poly__degree': [3],
               'feature_selection' : [RFE(XGBClassifier(objective='binary:logistic',
                                                       eval_metric='error'))],
               'feature_selection__n_features_to_select' : [140, 70, 35, 18]
#                'reduce_dims' : [PCA(), LDA(), TSNE()],
#                'reduce_dims__n_components' : [5, 7, 9, 11]
              }
             ]
grid1 = GridSearchCV(pipe1, param_grid1, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid1.fit(X_valid, y_valid)
print(grid1.best_params_)
print(grid1.best_score_)