In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn. neighbors import LocalOutlierFactor
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures

from scipy.stats import expon, reciprocal
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier
from missingpy import MissForest

import warnings
warnings.filterwarnings('ignore')



In [2]:
def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            # Print current column type
            #print("******************************")
            #print("Column: ",col)
            #print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # Print new column type
            #print("dtype after: ",props[col].dtype)
            #print("******************************")
    
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [3]:
df, NAli = reduce_mem_usage(pd.read_csv('./data/galaxy_final.csv', index_col=0))

Memory usage of properties dataframe is : 0.16994476318359375  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.041069984436035156  MB
This is  24.166666666666668 % of the initial size


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1485 entries, 0 to 1484
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   index                      1485 non-null   uint16 
 1   BuyItNow                   1485 non-null   uint8  
 2   startprice                 1485 non-null   float32
 3   productSeries_imputed      1485 non-null   uint8  
 4   product_isNote_imputed     1485 non-null   uint8  
 5   hasDescription             1485 non-null   uint8  
 6   charCountDescriptionBins   1485 non-null   uint8  
 7   upperCaseDescription_rate  1485 non-null   float32
 8   startprice_point9          1485 non-null   uint8  
 9   sold                       1485 non-null   uint8  
 10  color_sentiment_0          1485 non-null   uint8  
 11  color_sentiment_1          1485 non-null   uint8  
 12  carrier_none_0             1485 non-null   uint8  
 13  carrier_none_1             1485 non-null   uint8

In [5]:
X = df.drop('sold', axis=1)
y = df.sold
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11,
                                                       stratify=y, shuffle=True)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=11,
                                                       stratify=y_train, shuffle=True)

In [8]:
pipe1 = Pipeline([
                ('scale', MinMaxScaler()),
                 ('poly', PolynomialFeatures()),
                ('feature_selection', RFE(XGBClassifier())),
                ('classifier', XGBClassifier())
                ])

param_grid1 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [0.15],
              'classifier__skip_drop': [0.33],
             'classifier__learning_rate':[0.1],
             'classifier__n_estimators':[1000],
               'classifier__max_depth':[5],
               'classifier__min_child_weight':[1],
               'classifier__gamma':[0],
               'classifier__subsample':[0.8],
               'classifier__colsample_bytree':[0.8],
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
               'scale':[MinMaxScaler()],
               'poly':[PolynomialFeatures()],
               'poly__degree': [3],
               'feature_selection' : [RFE(XGBClassifier(objective='binary:logistic',
                                                       eval_metric='error'))],
               'feature_selection__n_features_to_select' : [140, 70, 35, 18]
#                'reduce_dims' : [PCA(), LDA(), TSNE()],
#                'reduce_dims__n_components' : [5, 7, 9, 11]
              }
             ]
grid1 = GridSearchCV(pipe1, param_grid1, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid1.fit(X_valid, y_valid)
print(grid1.best_params_)
print(grid1.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 33.1min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
              gamma=0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.1,
              max_delta_step=None, max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.15, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021, skip_drop=0.33,
              subsample=0.8, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.8, 'classifier__eval_metric': 'error', 'classifier__gamma': 0, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 1000, 'classifier__n_jobs': -1,

In [6]:
scaler = MinMaxScaler()
poly = PolynomialFeatures(degree=3)
rfe = RFE(XGBClassifier(objective='binary:logistic', eval_metric='error'),
          n_features_to_select=70)

X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

X_train = poly.fit_transform(X_train)
X_valid = poly.transform(X_valid)
X_test = poly.transform(X_test)

In [7]:
%%time
X_train = rfe.fit_transform(X_train, y_train)
X_valid = rfe.transform(X_valid)
X_test = rfe.transform(X_test)

CPU times: user 29min 30s, sys: 6.21 s, total: 29min 36s
Wall time: 7min 56s


In [9]:
for i in range(X_train.shape[0]):
    print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

Column: 0, Selected False, Rank: 491.000
Column: 1, Selected True, Rank: 1.000
Column: 2, Selected True, Rank: 1.000
Column: 3, Selected True, Rank: 1.000
Column: 4, Selected True, Rank: 1.000
Column: 5, Selected True, Rank: 1.000
Column: 6, Selected False, Rank: 82.000
Column: 7, Selected False, Rank: 28.000
Column: 8, Selected True, Rank: 1.000
Column: 9, Selected False, Rank: 49.000
Column: 10, Selected False, Rank: 91.000
Column: 11, Selected False, Rank: 66.000
Column: 12, Selected False, Rank: 94.000
Column: 13, Selected False, Rank: 101.000
Column: 14, Selected False, Rank: 103.000
Column: 15, Selected True, Rank: 1.000
Column: 16, Selected True, Rank: 1.000
Column: 17, Selected True, Rank: 1.000
Column: 18, Selected True, Rank: 1.000
Column: 19, Selected False, Rank: 9.000
Column: 20, Selected False, Rank: 63.000
Column: 21, Selected True, Rank: 1.000
Column: 22, Selected False, Rank: 30.000
Column: 23, Selected False, Rank: 17.000
Column: 24, Selected True, Rank: 1.000
Column:

IndexError: index 560 is out of bounds for axis 0 with size 560

In [18]:
print(f'num of columns:{len(X_train[0, :])}')

num of columns:70


In [58]:
pipe = Pipeline([
                ('classifier', XGBClassifier())
                ])

param_grid2 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [0.15],
              'classifier__skip_drop': [0.33],
             'classifier__learning_rate':[0.01, 0.02, 0.03],
             'classifier__n_estimators':[1000],
               'classifier__max_depth':range(3,10),
               'classifier__min_child_weight':range(1,6),
               'classifier__gamma':[0],
               'classifier__subsample':[0.8],
               'classifier__colsample_bytree':[0.8],
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
              }
             ]
grid2 = GridSearchCV(pipe, param_grid2, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid2.fit(X_train, y_train)
print(grid2.best_params_)
print(grid2.best_score_)

Fitting 5 folds for each of 105 candidates, totalling 525 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 29.7min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 64.8min
[Parallel(n_jobs=-1)]: Done 525 out of 525 | elapsed: 78.4min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
              gamma=0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.02,
              max_delta_step=None, max_depth=9, min_child_weight=4, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.15, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021, skip_drop=0.33,
              subsample=0.8, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.8, 'classifier__eval_metric': 'error', 'classifier__gamma': 0, 'classifier__learning_rate': 0.02, 'classifier__max_depth': 9, 'classifier__min_child_weight': 4, 'classifier__n_estimators': 1000, 'classifier__n_jobs': -

In [60]:
# Gamma 튜닝

pipe = Pipeline([
                ('classifier', XGBClassifier())
                ])

param_grid3 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [0.15],
              'classifier__skip_drop': [0.33],
             'classifier__learning_rate':[grid2.best_params_['classifier__learning_rate']],
             'classifier__n_estimators':[1000],
               'classifier__max_depth': [grid2.best_params_['classifier__max_depth']], # 3
               'classifier__min_child_weight': [grid2.best_params_['classifier__min_child_weight']],# 3
               'classifier__gamma':[i/10.0 for i in range(0,10)],
               'classifier__subsample':[0.8],
               'classifier__colsample_bytree':[0.8],
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
#                'scale':[MinMaxScaler()],
#                'poly':[PolynomialFeatures()],
#                'poly__degree':[3],
#                'feature_selection' : [RFE(RandomForestClassifier())],
#                'feature_selection__n_features_to_select' : [140]
              }
             ]
grid3 = GridSearchCV(pipe, param_grid3, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid3.fit(X_train, y_train)
print(grid3.best_params_)
print(grid3.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.0min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
              gamma=0.0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.02,
              max_delta_step=None, max_depth=9, min_child_weight=4, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.15, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021, skip_drop=0.33,
              subsample=0.8, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.8, 'classifier__eval_metric': 'error', 'classifier__gamma': 0.0, 'classifier__learning_rate': 0.02, 'classifier__max_depth': 9, 'classifier__min_child_weight': 4, 'classifier__n_estimators': 1000, 'classifier__n_jobs

In [61]:
# subsample and colsample_bytree를 튜닝한다.

pipe= Pipeline([
                ('classifier', XGBClassifier())
                ])

param_grid4 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [0.15],
              'classifier__skip_drop': [0.33],
             'classifier__learning_rate':[grid2.best_params_['classifier__learning_rate']],
             'classifier__n_estimators':[1000],
               'classifier__max_depth': [grid2.best_params_['classifier__max_depth']], # 3
               'classifier__min_child_weight': [grid2.best_params_['classifier__min_child_weight']],# 3
               'classifier__gamma':[grid3.best_params_['classifier__gamma']], # 0.5
               'classifier__subsample':[i/10.0 for i in range(6,10)],
               'classifier__colsample_bytree':[i/10.0 for i in range(6,10)],
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1]
              }
             ]
grid4 = GridSearchCV(pipe, param_grid4, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid4.fit(X_train, y_train)
print(grid4.best_params_)
print(grid4.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 13.1min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
              gamma=0.0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.02,
              max_delta_step=None, max_depth=9, min_child_weight=4, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.15, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021, skip_drop=0.33,
              subsample=0.8, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.8, 'classifier__eval_metric': 'error', 'classifier__gamma': 0.0, 'classifier__learning_rate': 0.02, 'classifier__max_depth': 9, 'classifier__min_child_weight': 4, 'classifier__n_estimators': 1000, 'classifier__n_jobs

In [62]:
# n_estimator 튜닝

pipe = Pipeline([
                ('classifier', XGBClassifier())
                ])

param_grid5 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [0.15],
              'classifier__skip_drop': [0.33],
             'classifier__learning_rate':[grid2.best_params_['classifier__learning_rate']],
             'classifier__n_estimators':[100, 200, 300, 400, 500, 700, 1000],
               'classifier__max_depth': [grid2.best_params_['classifier__max_depth']], # 3
               'classifier__min_child_weight': [grid2.best_params_['classifier__min_child_weight']],# 3
               'classifier__gamma':[grid3.best_params_['classifier__gamma']], # 0.5
               'classifier__subsample':[grid4.best_params_['classifier__subsample']], # 0.8
               'classifier__colsample_bytree':[grid4.best_params_['classifier__colsample_bytree']], # 0.8
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],

              }
             ]
grid5 = GridSearchCV(pipe, param_grid5, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid5.fit(X_train, y_train)
print(grid5.best_params_)
print(grid5.best_score_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:  1.8min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
              gamma=0.0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.02,
              max_delta_step=None, max_depth=9, min_child_weight=4, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.15, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021, skip_drop=0.33,
              subsample=0.8, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.8, 'classifier__eval_metric': 'error', 'classifier__gamma': 0.0, 'classifier__learning_rate': 0.02, 'classifier__max_depth': 9, 'classifier__min_child_weight': 4, 'classifier__n_estimators': 1000, 'classifier__n_jobs

In [63]:
# rate_drop, skip_drop 튜닝

pipe = Pipeline([
                ('classifier', XGBClassifier())
                ])

param_grid6 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': np.arange(0.1, 0.55, 0.05),
              'classifier__skip_drop': np.arange(0.1, 0.55, 0.05),
               'classifier__learning_rate':[grid2.best_params_['classifier__learning_rate']], # 
             'classifier__n_estimators':[grid5.best_params_['classifier__n_estimators']], # 
               'classifier__max_depth': [grid2.best_params_['classifier__max_depth']], # 3
               'classifier__min_child_weight': [grid2.best_params_['classifier__min_child_weight']],# 3
               'classifier__gamma':[grid3.best_params_['classifier__gamma']], # 0.5
               'classifier__subsample':[grid4.best_params_['classifier__subsample']], # 0.8
               'classifier__colsample_bytree':[grid4.best_params_['classifier__colsample_bytree']], # 0.8
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
              }
             ]
grid6 = GridSearchCV(pipe, param_grid6, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid6.fit(X_train, y_train)
print(grid6.best_params_)
print(grid6.best_score_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 33.8min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 66.5min finished


{'classifier': XGBClassifier(base_score=None, booster='dart', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, eval_metric='error',
              gamma=0.0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.02,
              max_delta_step=None, max_depth=9, min_child_weight=4, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=None, objective='binary:logistic',
              random_state=None, rate_drop=0.20000000000000004, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=1, seed=2021,
              skip_drop=0.30000000000000004, subsample=0.8, tree_method=None, ...), 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.8, 'classifier__eval_metric': 'error', 'classifier__gamma': 0.0, 'classifier__learning_rate': 0.02, 'classifier__max_depth': 9, 'classifier__min_child_weight': 4, 'classifier__n_estimato

In [64]:
# 정규화 상수 튜닝

pipe = Pipeline([
                ('classifier', XGBClassifier())
                ])

param_grid7 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [grid6.best_params_['classifier__rate_drop']],
              'classifier__skip_drop': [grid6.best_params_['classifier__skip_drop']],
               'classifier__learning_rate':[grid2.best_params_['classifier__learning_rate']], # 
             'classifier__n_estimators':[grid5.best_params_['classifier__n_estimators']], # 
               'classifier__max_depth': [grid2.best_params_['classifier__max_depth']], # 3
               'classifier__min_child_weight': [grid2.best_params_['classifier__min_child_weight']],# 3
               'classifier__gamma':[grid3.best_params_['classifier__gamma']], # 0.5
               'classifier__subsample':[grid4.best_params_['classifier__subsample']], # 0.8
               'classifier__colsample_bytree':[grid4.best_params_['classifier__colsample_bytree']], # 0.8
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
               'classifier__alpha':[0, 1e-4, 1e-3, 1e-2, 0.1, 1],
               'classifier__lambda':[1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100]
              }
             ]
grid7 = GridSearchCV(pipe, param_grid7, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid7.fit(X_train, y_train)
print(grid7.best_params_)
print(grid7.best_score_)

Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 29.8min
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed: 37.7min finished


{'classifier': XGBClassifier(alpha=0.0001, base_score=None, booster='dart',
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, eval_metric='error', gamma=0.0, gpu_id=None,
              importance_type='gain', interaction_constraints=None, lambda=1,
              learning_rate=0.02, max_delta_step=None, max_depth=9,
              min_child_weight=4, missing=nan, monotone_constraints=None,
              n_estimators=1000, n_jobs=-1, nthread=-1, num_parallel_tree=None,
              objective='binary:logistic', random_state=None,
              rate_drop=0.20000000000000004, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=1, seed=2021, skip_drop=0.30000000000000004, ...), 'classifier__alpha': 0.0001, 'classifier__booster': 'dart', 'classifier__colsample_bytree': 0.8, 'classifier__eval_metric': 'error', 'classifier__gamma': 0.0, 'classifier__lambda': 1, 'classifier__learning_rate': 0.02, 'classifier__max_depth': 9, 'classifier__

# Early Stopping

In [65]:
xgb_best = grid7.best_params_['classifier']
best_esr_stoprounds_rfe = (-1, -1)
for i, esr in enumerate(np.arange(10, 510, 10)):
    print(i, end='/')
    xgb_best = grid7.best_params_['classifier']
    xgb_best.fit(X_train, y_train, early_stopping_rounds=esr, eval_metric="error",
                 eval_set=[(X_train, y_train), (X_valid, y_valid)], verbose=0)
    acc = accuracy_score(y_test, xgb_best.predict(X_test))
    
    if acc > best_esr_stoprounds_rfe[1]:
        best_esr_stoprounds_rfe = (esr, acc)
        print(best_esr_stoprounds_rfe)

0/(10, 0.8215488215488216)
1/(20, 0.835016835016835)
2/3/4/5/6/7/8/9/10/11/12/13/14/15/16/17/18/19/20/21/22/23/24/25/26/27/28/29/30/31/32/33/34/35/36/37/(380, 0.8417508417508418)
38/39/40/41/42/43/44/45/46/47/48/49/

# Local Outlier Factor

In [18]:
def tune_lof_xgb2(model, df, stoprounds=50,
                  scaler=None, poly=None, dim_reduction=None, rfe=None,
                  preset=False):    
    best_params, best_acc = 0, 0  
    test_neighbors = np.linspace(1, 30, num=30).astype(int)
    test_contams = np.linspace(0.01, 0.3, num=30)
    metrics = ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
      'manhattan']
    
    if preset:
        X0_train, X0_valid, X0_test, y0_train, y0_valid, y0_test = df
        
    else:
        X0 = df.drop('sold', axis=1)
        y0 = df.sold
        X0_train, X0_test, y0_train, y0_test = train_test_split(X0, y0,
                                                                test_size=0.2,
                                                                shuffle=True,
                                                                stratify=y0,
                                                                random_state=11)
        X0_train, X0_valid, y0_train, y0_valid = train_test_split(X0_train, y0_train,
                                                                test_size=0.2,
                                                                shuffle=True,
                                                                stratify=y0_train,
                                                                random_state=11)

        if scaler:
            X0_train = scaler.fit_transform(X0_train)
            X0_valid = scaler.transform(X0_valid)
            X0_test = scaler.transform(X0_test)

        if poly:
            X0_train = poly.fit_transform(X0_train)
            X0_valid = poly.transform(X0_valid)
            X0_test = poly.transform(X0_test)

        if dim_reduction:
            X0_train = dim_reduction.fit_transform(X0_train)
            X0_valid = dim_reduction.transform(X0_valid)
            X0_test = dim_reduction.transform(X0_test)

        if rfe:
            X0_train = rfe.fit_transform(X0_train, y0_train)
            X0_valid = rfe.transform(X0_valid)
            X0_test = rfe.transform(X0_test)

        print('preprocessing complete')
    
    for i, tn in enumerate(test_neighbors):
        for j, tc in enumerate(test_contams):
            print(i, j, end=' / ')
            #for m in metrics:
            
            # 원본 보존을 위해 복사본 사용
            X_train_copy, X_valid_copy, X_test_copy = X0_train.copy(), X0_valid.copy(), X0_test.copy()
            y_train_copy, y_valid_copy, y_test_copy = y0_train.copy(), y0_valid.copy(), y0_test.copy()

            # LOF 모델 생성 및 트레인셋 학습
            clf = LocalOutlierFactor(n_neighbors=tn, contamination=tc,
                                    novelty=True, n_jobs=-1)
            clf.fit(X_train_copy)

            # 트레인셋 아웃라이어 제거
            y_pred = clf.predict(X_train_copy)
            lof_outlier_idx_train = pd.Series(y_pred)[pd.Series(y_pred)==-1].index
            X_train_copy = pd.DataFrame(X_train_copy).drop(lof_outlier_idx_train)
            y_train_copy = y_train_copy.reset_index(drop=True).drop(lof_outlier_idx_train)

            # 밸리데이션 셋 아웃라이어 제거
            yval_pred = clf.predict(X_valid_copy)
            lof_outlier_idx_valid = pd.Series(yval_pred)[pd.Series(yval_pred)==-1].index
            X_valid_copy = pd.DataFrame(X_valid_copy).drop(lof_outlier_idx_valid)
            y_valid_copy = y_valid_copy.reset_index(drop=True).drop(lof_outlier_idx_valid)

            # 테스트 셋 아웃라이어 제거
            ytest_pred = clf.predict(X_test_copy)
            lof_outlier_idx_test = pd.Series(ytest_pred)[pd.Series(ytest_pred)==-1].index
            X_test_copy = pd.DataFrame(X_test_copy).drop(lof_outlier_idx_test)
            y_test_copy = y_test_copy.reset_index(drop=True).drop(lof_outlier_idx_test)

            # 예측모델 정의 및 트레인/벨리데이션 셋으로 학습
            mod = model
            mod.fit(X_train_copy, y_train_copy, early_stopping_rounds=stoprounds, eval_metric="error",
                 eval_set=[(X_train_copy, y_train_copy), (X_valid_copy, y_valid_copy)], verbose=0)

            # 테스트 정확도 측정 및 최고기록 업데이트
            mod_acc = accuracy_score(y_test_copy, mod.predict(X_test_copy))
            if best_acc < mod_acc:
                best_acc = mod_acc
                best_params = (tn, tc)
#                 X2 = X2
#                 y2 = y2
                print((tn, tc, best_acc))
    
    return {'best_params':best_params,
           'best_accuracy':best_acc,
           'preprocessed_data':[X0_train, X0_valid, X0_test, y0_train, y0_valid, y0_test],
           'LOF_data':[X_train_copy, X_valid_copy, X_test_copy,
                      y_train_copy, y_valid_copy, y_test_copy]}

In [74]:
grid7.best_params_['classifier']

XGBClassifier(alpha=0.0001, base_score=0.5, booster='dart', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eval_metric='error',
              gamma=0.0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', lambda=1, learning_rate=0.02,
              max_delta_step=0, max_depth=9, min_child_weight=4, missing=nan,
              monotone_constraints='()', n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=1, objective='binary:logistic',
              random_state=2021, rate_drop=0.20000000000000004,
              reg_alpha=9.99999975e-05, reg_lambda=1, scale_pos_weight=1,
              seed=2021, skip_drop=0.30000000000000004, ...)

In [19]:
xgb_best = XGBClassifier(alpha=0.0001, base_score=0.5, booster='dart', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eval_metric='error',
              gamma=0.0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.02,
              max_delta_step=0, max_depth=9, min_child_weight=4,
              monotone_constraints='()', n_estimators=1000, n_jobs=-1,
              nthread=-1, num_parallel_tree=1, objective='binary:logistic',
              random_state=2021, rate_drop=0.20000000000000004,
              reg_alpha=9.99999975e-05, reg_lambda=1, scale_pos_weight=1,
              seed=2021, skip_drop=0.30000000000000004)

In [20]:
#xgb_best = grid7.best_params_['classifier']

xgb_scaler = MinMaxScaler()
xgb_poly = PolynomialFeatures(degree=3)
xgb_rfe = RFE(XGBClassifier(objective='binary:logistic',
                           eval_metric='error'),
                          n_features_to_select=70)

df = (X_train.copy(), X_valid.copy(), X_test.copy(),
      y_train.copy(), y_valid.copy(), y_test.copy())

xgb_lof_tune = tune_lof_xgb2(xgb_best, df,
                              #stoprounds=best_esr_stoprounds_rfe[0],
                              stoprounds=380,
                              scaler=xgb_scaler,
                              poly=xgb_poly,
                              rfe=xgb_rfe,
                             preset=True)

xgb_lof_tune['best_params'], xgb_lof_tune['best_accuracy']

0 0 / (1, 0.01, 0.8175675675675675)
0 1 / (1, 0.019999999999999997, 0.8191126279863481)
0 2 / 0 3 / 0 4 / (1, 0.049999999999999996, 0.8204225352112676)
0 5 / 0 6 / 0 7 / 0 8 / 0 9 / 0 10 / 0 11 / 0 12 / 0 13 / 0 14 / 0 15 / 0 16 / 0 17 / (1, 0.18, 0.8210116731517509)
0 18 / (1, 0.18999999999999997, 0.82421875)
0 19 / (1, 0.19999999999999998, 0.8260869565217391)
0 20 / (1, 0.20999999999999996, 0.8285714285714286)
0 21 / (1, 0.21999999999999997, 0.8319672131147541)
0 22 / (1, 0.22999999999999998, 0.8326359832635983)
0 23 / 0 24 / (1, 0.24999999999999997, 0.8333333333333334)
0 25 / (1, 0.25999999999999995, 0.8362068965517241)
0 26 / (1, 0.26999999999999996, 0.8384279475982532)
0 27 / (1, 0.27999999999999997, 0.8392857142857143)
0 28 / (1, 0.29, 0.8423423423423423)
0 29 / 1 0 / 1 1 / 1 2 / 1 3 / 1 4 / 1 5 / 1 6 / 1 7 / 1 8 / 1 9 / 1 10 / (2, 0.10999999999999997, 0.8446969696969697)
1 11 / (2, 0.11999999999999998, 0.8461538461538461)
1 12 / 1 13 / 1 14 / 1 15 / (2, 0.15999999999999998, 0.85

((2, 0.18), 0.8669354838709677)