# Imports

In [19]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, AdaBoostClassifier
import scikitplot as skplt
from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix, f1_score,auc,roc_curve,roc_auc_score, precision_recall_curve
from xgboost.sklearn import XGBClassifier
from sklearn import metrics
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ParameterSampler, ParameterGrid
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.svm import SVC
import pickle
from scipy import stats
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from tqdm import tqdm
import datetime
import time
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate, StratifiedKFold, KFold
from sklearn.decomposition import PCA

# Global Var

In [20]:
KFOLDS=10

# Data Processing

In [21]:
# loading data and initial inspection to make sure data is correct
train_data = pd.read_csv("train.csv",encoding="utf_8_sig")
print(train_data.shape)
print(list(train_data), len(list(train_data)))
train_data.head()

(9681, 25)
['id', 'neighbourhood', 'room_type', 'minimum_nights', 'number_of_reviews', 'last_review', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365', 'host_since', 'host_is_superhost', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'cleaning_fee', 'guests_included', 'extra_people', 'maximum_nights', 'instant_bookable', 'is_business_travel_ready', 'cancellation_policy', 'require_guest_profile_picture', 'require_guest_phone_verification', 'price'] 25


Unnamed: 0,id,neighbourhood,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,host_since,...,cleaning_fee,guests_included,extra_people,maximum_nights,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,price
0,727,Palermo,Entire home/apt,1,170,5/12/20,2.48,4,346,8/1/13,...,2092,1,0,365,t,f,strict_14_with_grace_period,f,f,2
1,6274,Colegiales,Private room,1,11,7/1/19,0.57,1,0,2/14/14,...,558,1,0,15,f,f,moderate,f,f,1
2,6025,Recoleta,Entire home/apt,3,2,11/23/19,0.08,19,360,10/19/17,...,1052,3,351,90,t,f,moderate,f,f,3
3,8931,Recoleta,Entire home/apt,2,1,11/1/19,0.13,1,267,2/1/19,...,0,4,0,1125,t,f,flexible,f,f,2
4,7524,San Nicolás,Entire home/apt,2,31,12/26/19,1.54,5,365,1/24/15,...,2092,1,0,365,t,f,moderate,f,f,3


In [22]:
# initialize one hot encoder from sklearn
neigh_encoder = LabelBinarizer()
room_encoder = LabelBinarizer()
bed_encoder = LabelBinarizer()
cancel_encoder = LabelBinarizer()
neigh_encoder.fit(train_data['neighbourhood'])
room_encoder.fit(train_data['room_type'])
bed_encoder.fit(train_data['bed_type'])
cancel_encoder.fit(train_data['cancellation_policy'])

def data_processing_onehotencode(data):
    # converting date features to the number of days from today (11/03/2020)
    current_date = ["11/03/20"]*len(data["number_of_reviews"])
    data["today_date"] = current_date
    data["last_review"] = (pd.to_datetime(data['today_date']) - pd.to_datetime(data['last_review']))
    data["host_since"] = (pd.to_datetime(data['today_date']) - pd.to_datetime(data['host_since']))
    data['last_review'] = data["last_review"].astype('timedelta64[D]').astype(float)
    data['host_since'] = data["host_since"].astype('timedelta64[D]').astype(float)
    data = data.drop(["today_date"],axis=1)
    
    # encode the neighbourhood feature with one hot encoding
    neigh_onehot = neigh_encoder.transform(data['neighbourhood'])
    neigh_onehot_df = pd.DataFrame(neigh_onehot)
    data.drop('neighbourhood', axis=1, inplace=True)
    data = pd.concat([neigh_onehot_df, data], axis=1)
    
    # encode the room_type feature with one hot encoding
    room_onehot = room_encoder.transform(data['room_type'])
    room_onehot_df = pd.DataFrame(room_onehot)
    data.drop('room_type', axis=1, inplace=True)
    data = pd.concat([room_onehot_df, data], axis=1)
    
    # encode the bed_type feature with one hot encoding
    bed_onehot = bed_encoder.transform(data['bed_type'])
    bed_onehot_df = pd.DataFrame(bed_onehot)
    data.drop('bed_type', axis=1, inplace=True)
    data = pd.concat([bed_onehot_df, data], axis=1)
    
    # encode the cancellation_policy feature with one hot encoding
    cancel_onehot = cancel_encoder.transform(data['cancellation_policy'])
    cancel_onehot_df = pd.DataFrame(cancel_onehot)
    data.drop('cancellation_policy', axis=1, inplace=True)
    data = pd.concat([cancel_onehot_df, data], axis=1)
    
    # encode the boolean features by mapping t to 2 and f to 1
    data['host_is_superhost'] = data.host_is_superhost.map(dict(t=2, f=1))
    data['instant_bookable'] = data.instant_bookable.map(dict(t=2, f=1))
    data['require_guest_profile_picture'] = data.require_guest_profile_picture.map(dict(t=2, f=1))
    data['require_guest_phone_verification'] = data.require_guest_phone_verification.map(dict(t=2, f=1))
    
    # dropping features that are useless for training purposes
    data.drop('id', axis=1, inplace=True)
    data.drop("is_business_travel_ready", axis=1, inplace=True)
    
    return data

In [24]:
#inspect data and encoding to make sure it is correct
print(list(train_data))
train_data = data_processing_onehotencode(train_data)
print(list(train_data))
print(train_data.shape)

['id', 'neighbourhood', 'room_type', 'minimum_nights', 'number_of_reviews', 'last_review', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365', 'host_since', 'host_is_superhost', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'cleaning_fee', 'guests_included', 'extra_people', 'maximum_nights', 'instant_bookable', 'is_business_travel_ready', 'cancellation_policy', 'require_guest_profile_picture', 'require_guest_phone_verification', 'price']
[0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 'minimum_nights', 'number_of_reviews', 'last_review', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365', 'host_since', 'host_is_superhost', 'bathrooms', 'bedrooms', 'beds', 'cleaning_fee', 'guests_included', 'extra_people', 'maximum_nights', 'instant_bookable', 'require_guest_profile_picture', 'req

In [26]:
# split data into X and y (data and label)
X_unscaled = train_data.drop(columns=['price'], axis=1)
y_train = train_data['price']

In [27]:
# apply standardscaler onto the training data
scale = StandardScaler()
scale.fit(X_unscaled)
X_train = scale.transform(X_unscaled)

# Model Eval Utility

In [28]:
# model evaluation utility to display cross validation results (accuracy, AUC)
def model_eval(clf, X=X_train, print_=True, k=KFOLDS, cat=False):
    scores = cross_validate(clf, X, y_train, cv=k, n_jobs=-1, scoring=['roc_auc_ovo', 'accuracy'], return_train_score=True)
    if print_:
        print("Train: Acc: {:.5f} +/- {:.5f},  Auc: {:.5f} | Test: Acc: {:.5f} +/- {:.5f},  Auc: {:.5f}".
              format(scores['train_accuracy'].mean(), scores['train_accuracy'].std() * 2, scores['train_roc_auc_ovo'].mean(), 
                     scores['test_accuracy'].mean(), scores['test_accuracy'].std() * 2, scores['test_roc_auc_ovo'].mean()))
    return scores['train_accuracy'].mean(), scores['train_roc_auc_ovo'].mean(), scores['test_accuracy'].mean(), scores['test_roc_auc_ovo'].mean()

# LlightGBM Tuning

In [13]:
lgb = LGBMClassifier(objective='multiclass',num_class=4, random_state=1, learning_rate=0.086, 
                         max_depth=10, num_leaves=200, min_child_samples=3, min_child_weight=0.007,
                        reg_alpha = 0.8, reg_lambda=0.8)
model_eval(lgb)

Acc: 0.55511 +/- 0.02413 | Auc: 0.80195


(0.5551082764093745, 0.8019513149131952)

### Tuning max_depth, num_leaves for LightGBM

In [25]:
params = {
    'max_depth': range(3, 11),
    'num_leaves': np.arange(10, 1500, 50)
}
lgb = LGBMClassifier(objective='multiclass',num_class=4, random_state=1, learning_rate=0.17, n_jobs=-1)
grid = GridSearchCV(estimator=lgb, param_grid=params, cv=KFOLDS, n_jobs=-1, scoring=['accuracy', 'roc_auc_ovo'], refit='accuracy', verbose=10)
grid.fit(X_train,y_train)

print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

Fitting 10 folds for each of 240 candidates, totalling 2400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 233 tasks      | elapsed:  


 The best estimator across ALL searched params:
 LGBMClassifier(learning_rate=0.17, max_depth=9, num_class=4, num_leaves=160,
               objective='multiclass', random_state=1)

 The best score across ALL searched params:
 0.5569649847759897

 The best parameters across ALL searched params:
 {'max_depth': 9, 'num_leaves': 160}


### Tuning min_child_samples, min_child_weight for LightGBM

In [36]:
params = {
    'min_child_samples': range(1, 30),
    'min_child_weight': np.arange(0.001, 0.01, 0.001)
}
lgb = LGBMClassifier(objective='multiclass',num_class=4, random_state=1, learning_rate=0.17, n_jobs=-1, max_depth=9, num_leaves=160)
grid = GridSearchCV(estimator=lgb, param_grid=params, cv=KFOLDS, n_jobs=-1, scoring=['accuracy', 'roc_auc_ovo'], refit='accuracy', verbose=10)
grid.fit(X_train,y_train)

print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

Fitting 10 folds for each of 261 candidates, totalling 2610 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   20.7s
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   29.7s
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:   39.9s
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:   45.4s
[Parallel(n_jobs=-1)]: Done 233 tasks      | elapsed:  


 The best estimator across ALL searched params:
 LGBMClassifier(learning_rate=0.17, max_depth=9, min_child_samples=3,
               min_child_weight=0.009000000000000001, num_class=4,
               num_leaves=160, objective='multiclass', random_state=1)

 The best score across ALL searched params:
 0.5586200095523204

 The best parameters across ALL searched params:
 {'min_child_samples': 3, 'min_child_weight': 0.009000000000000001}


### Tuning reg_alpha, reg_lambda for LightGBM

In [38]:
params = {
    'reg_alpha': np.arange(0.0, 1.1, 0.1),
    'reg_lambda': np.arange(0.0, 1.1, 0.1)
}
lgb = LGBMClassifier(objective='multiclass',num_class=4, random_state=1, learning_rate=0.17, n_jobs=-1, 
                     max_depth=9, num_leaves=160,  min_child_samples=3, min_child_weight=0.009)
grid = GridSearchCV(estimator=lgb, param_grid=params, cv=KFOLDS, n_jobs=-1, scoring=['accuracy', 'roc_auc_ovo'], refit='accuracy', verbose=10)
grid.fit(X_train,y_train)

print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

Fitting 10 folds for each of 121 candidates, totalling 1210 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:   13.8s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   26.5s
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:   30.1s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   33.6s
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:   37.3s
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:   43.3s
[Parallel(n_jobs=-1)]: Done 233 tasks      | elapsed:  


 The best estimator across ALL searched params:
 LGBMClassifier(learning_rate=0.17, max_depth=9, min_child_samples=3,
               min_child_weight=0.009, num_class=4, num_leaves=160,
               objective='multiclass', random_state=1, reg_alpha=0.5,
               reg_lambda=0.6000000000000001)

 The best score across ALL searched params:
 0.559136218645788

 The best parameters across ALL searched params:
 {'reg_alpha': 0.5, 'reg_lambda': 0.6000000000000001}


### Tuning learning_rate for LightGBM

In [39]:
params = {
    'learning_rate': np.arange(0.001, 0.3, 0.001)
}
lgb = LGBMClassifier(objective='multiclass',num_class=4, random_state=1, learning_rate=0.17, n_jobs=-1, 
                     max_depth=9, num_leaves=160,  min_child_samples=3, min_child_weight=0.009,
                   reg_alpha=0.5, reg_lambda=0.6)
grid = GridSearchCV(estimator=lgb, param_grid=params, cv=KFOLDS, n_jobs=-1, scoring=['accuracy', 'roc_auc_ovo'], refit='accuracy', verbose=10)
grid.fit(X_train,y_train)

print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

Fitting 10 folds for each of 299 candidates, totalling 2990 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   24.0s
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:   30.8s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   35.0s
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:   39.0s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   42.9s
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:   47.3s
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:   54.0s
[Parallel(n_jobs=-1)]: Done 233 tasks      | elapsed:  


 The best estimator across ALL searched params:
 LGBMClassifier(learning_rate=0.074, max_depth=9, min_child_samples=3,
               min_child_weight=0.009, num_class=4, num_leaves=160,
               objective='multiclass', random_state=1, reg_alpha=0.5,
               reg_lambda=0.6)

 The best score across ALL searched params:
 0.5614099054149716

 The best parameters across ALL searched params:
 {'learning_rate': 0.074}


### Revisit num_leaves for LightGBM with smaller steps

In [57]:
params = {
    'num_leaves': np.arange(110, 210, 1)
}
lgb = LGBMClassifier(objective='multiclass',num_class=4, random_state=1, learning_rate=0.074, n_jobs=-1, 
                     max_depth=9,  min_child_samples=3, min_child_weight=0.009,
                   reg_alpha=0.5, reg_lambda=0.6)
grid = GridSearchCV(estimator=lgb, param_grid=params, cv=KFOLDS, n_jobs=-1, scoring=['accuracy', 'roc_auc_ovo'], refit='accuracy', verbose=10)
grid.fit(X_train,y_train)

print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   27.1s
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   33.9s
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:   37.9s
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:   42.8s
[Parallel(n_jobs=-1)]: Done 233 tasks      | elapsed:  


 The best estimator across ALL searched params:
 LGBMClassifier(learning_rate=0.074, max_depth=9, min_child_samples=3,
               min_child_weight=0.009, num_class=4, num_leaves=151,
               objective='multiclass', random_state=1, reg_alpha=0.5,
               reg_lambda=0.6)

 The best score across ALL searched params:
 0.5637855120299533

 The best parameters across ALL searched params:
 {'num_leaves': 151}


### Evaluate with the best parameters

In [58]:
lgb = LGBMClassifier(objective='multiclass',num_class=4, random_state=1, learning_rate=0.074, n_jobs=-1, 
                     max_depth=9, num_leaves=151,  min_child_samples=3, min_child_weight=0.009,
                   reg_alpha=0.5, reg_lambda=0.6)
model_eval(lgb)

Train: Acc: 0.97165 +/- 0.00461,  Auc: 0.99854 | Test: Acc: 0.56379 +/- 0.02714,  Auc: 0.80478


(0.9716511954999639,
 0.9985403889605514,
 0.5637855120299533,
 0.8047785769290776)

# Random Forest Tuning

In [49]:
rf = RandomForestClassifier(n_estimators=887, criterion='gini', max_depth=None,random_state=1,
                            min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                            max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0,
                            min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=-1,
                            verbose=0, warm_start=False, class_weight=None)
model_eval(rf)

Train: Acc: 1.00000 +/- 0.00000,  Auc: 1.00000 | Test: Acc: 0.56007 +/- 0.04154,  Auc: 0.80941


(1.0, 1.0, 0.5600651178261649, 0.8094091706324203)

### Tuning n_estimators,  max_depth for Random Forest

In [56]:
params = {
    'n_estimators': np.arange(100, 1500, 100),
    'max_depth': range(5, 20)
}
rf = RandomForestClassifier(n_jobs=-1, min_samples_split=2, min_samples_leaf=1, random_state=1)
grid = GridSearchCV(estimator=rf, param_grid=params, cv=KFOLDS, n_jobs=-1, scoring=['accuracy', 'roc_auc_ovo'], refit='accuracy', verbose=10)
grid.fit(X_train,y_train)

print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

Fitting 10 folds for each of 210 candidates, totalling 2100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   21.6s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   27.6s
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:   34.8s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   46.3s
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:   50.5s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   52.9s
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:   58.9s
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 233 tasks      | elapsed:  


 The best estimator across ALL searched params:
 RandomForestClassifier(max_depth=19, n_estimators=1300, n_jobs=-1,
                       random_state=1)

 The best score across ALL searched params:
 0.5616148112137417

 The best parameters across ALL searched params:
 {'max_depth': 19, 'n_estimators': 1300}


### Tuning min_samples_split,  min_samples_leaf for Random Forest

In [66]:
params = {
    "min_samples_split": [1, 2, 3, 4, 5],
    "min_samples_leaf": [1, 2, 3, 4, 5]
}
rf = RandomForestClassifier(n_jobs=-1, random_state=1, 
                            max_depth=19, n_estimators=1300)
grid = GridSearchCV(estimator=rf, param_grid=params, cv=KFOLDS, n_jobs=-1, scoring=['accuracy', 'roc_auc_ovo'], refit='accuracy', verbose=10)
grid.fit(X_train,y_train)

print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

Fitting 10 folds for each of 25 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   20.2s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:   36.9s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:   41.1s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   49.9s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 245 out of 250 | elapsed:  


 The best estimator across ALL searched params:
 RandomForestClassifier(max_depth=19, n_estimators=1300, n_jobs=-1,
                       random_state=1)

 The best score across ALL searched params:
 0.5616148112137417

 The best parameters across ALL searched params:
 {'min_samples_leaf': 1, 'min_samples_split': 2}


### Tuning bootstrap,  oob_score, max_features,  criterion for Random Forest

In [11]:
params = {
    "bootstrap": [True, False],
    "oob_score": [True, False],
    "max_features": ['sqrt', 'log2', None],
    "criterion" : ["gini", "entropy"]
}
rf = RandomForestClassifier(n_jobs=-1, random_state=1, 
                            max_depth=19, n_estimators=1300,
                           min_samples_leaf=1, min_samples_split=2)
grid = GridSearchCV(estimator=rf, param_grid=params, cv=KFOLDS, n_jobs=-1, scoring=['accuracy', 'roc_auc_ovo'], refit='accuracy', verbose=10)
grid.fit(X_train,y_train)

print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   26.8s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   35.9s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:   50.4s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 234 out of 240 | elapsed: 11.5min remaining:   17.6s
[Parallel(n_jobs=-1)]: Done 240 out 


 The best estimator across ALL searched params:
 RandomForestClassifier(max_depth=19, max_features='sqrt', n_estimators=1300,
                       n_jobs=-1, oob_score=True, random_state=1)

 The best score across ALL searched params:
 0.5616148112137417

 The best parameters across ALL searched params:
 {'bootstrap': True, 'criterion': 'gini', 'max_features': 'sqrt', 'oob_score': True}


### Revisit n_estimators for Random Forest with smaller steps

In [12]:
params = {
    'n_estimators' : np.arange(1250, 1360, 10)
}
rf = RandomForestClassifier(n_jobs=-1, random_state=1, 
                            max_depth=19,
                           min_samples_leaf=1, min_samples_split=2, bootstrap=True, oob_score=True)
grid = GridSearchCV(estimator=rf, param_grid=params, cv=KFOLDS, n_jobs=-1, scoring=['accuracy', 'roc_auc_ovo'], refit='accuracy', verbose=10)
grid.fit(X_train,y_train)

print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

Fitting 10 folds for each of 11 candidates, totalling 110 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   39.2s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:   56.3s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  91 out of 110 | elapsed:  2.4min remaining:   30.5s
[Parallel(n_jobs=-1)]: Done 103 out of 110 | elapsed:  2.7min remaining:   10.9s
[Parallel(n_jobs=-1)]: Done 110 out of 110 | elapsed:  2.7min finished



 The best estimator across ALL searched params:
 RandomForestClassifier(max_depth=19, n_estimators=1300, n_jobs=-1,
                       oob_score=True, random_state=1)

 The best score across ALL searched params:
 0.5616148112137417

 The best parameters across ALL searched params:
 {'n_estimators': 1300}


### Evaluate the model with the best parameters

In [14]:
rf = RandomForestClassifier(n_jobs=-1, random_state=1, 
                            max_depth=19,n_estimators=1300,
                           min_samples_leaf=1, min_samples_split=2, bootstrap=True, oob_score=True)
model_eval(rf)

Train: Acc: 0.99207 +/- 0.00093,  Auc: 0.99971 | Test: Acc: 0.56161 +/- 0.03801,  Auc: 0.80943


(0.9920692268511097,
 0.9997137057593756,
 0.5616148112137417,
 0.8094318578626869)

# XGBoost Tuning

In [19]:
xgb = XGBClassifier(
    learning_rate= 0.17,
    random_state=1,
    n_jobs=-1,
    eval_metric= 'auc'
)
model_eval(xgb)

Train: Acc: 0.83229 +/- 0.01174,  Auc: 0.96696 | Test: Acc: 0.55149 +/- 0.03586,  Auc: 0.80271


(0.8322945935782815, 0.9669564742428742, 0.551492336821636, 0.8027099885947953)

### Tuning subsample, colsample_bytree, colsample_bylevel for XGBoost

In [20]:
params = {
    'subsample': np.arange(0.6, 1.0, 0.1),
    'colsample_bytree': np.arange(0.6, 1.0, 0.1),
    'colsample_bylevel': np.arange(0.6, 1.0, 0.1),
}
xgb = XGBClassifier(
    learning_rate= 0.17,
    random_state=1,
    n_jobs=-1,
    eval_metric= 'auc'
)

grid = GridSearchCV(estimator=xgb, param_grid=params, cv=KFOLDS, n_jobs=-1, scoring=['accuracy', 'roc_auc_ovo'], refit='accuracy', verbose=10)
grid.fit(X_train,y_train)

print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

Fitting 10 folds for each of 64 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:   31.1s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   38.6s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   40.4s
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:   52.8s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 233 tasks      | elapsed:  


 The best estimator across ALL searched params:
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.6,
              colsample_bynode=1, colsample_bytree=0.7, eval_metric='auc',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.17, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=-1,
              num_parallel_tree=1, objective='multi:softprob', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=0.8999999999999999, tree_method='exact',
              validate_parameters=1, verbosity=None)

 The best score across ALL searched params:
 0.5619256880655698

 The best parameters across ALL searched params:
 {'colsample_bylevel': 0.6, 'colsample_bytree': 0.7, 'subsample': 0.8999999999999999}


### Tuning min_child_weight for XGBoost

In [22]:
params = {
    'min_child_weight': np.arange(0.1, 10.1, 0.1)
}
xgb = XGBClassifier(
    learning_rate= 0.17,
    colsample_bylevel=0.6,
    colsample_bytree=0.7,
    subsample=0.8999,
    random_state=1,
    n_jobs=-1,
    eval_metric= 'auc'
)

grid = GridSearchCV(estimator=xgb, param_grid=params, cv=KFOLDS, n_jobs=-1, scoring=['accuracy', 'roc_auc_ovo'], refit='accuracy', verbose=10)
grid.fit(X_train,y_train)

print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:   34.3s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   40.8s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   43.5s
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:   53.7s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 233 tasks      | elapsed:  


 The best estimator across ALL searched params:
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.6,
              colsample_bynode=1, colsample_bytree=0.7, eval_metric='auc',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.17, max_delta_step=0,
              max_depth=6, min_child_weight=0.4, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=-1,
              num_parallel_tree=1, objective='multi:softprob', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=0.8999, tree_method='exact', validate_parameters=1,
              verbosity=None)

 The best score across ALL searched params:
 0.5601679971684195

 The best parameters across ALL searched params:
 {'min_child_weight': 0.4}


### Tuning n_estimators, max_depth for XGBoost

In [15]:
params = {
    'n_estimators': np.arange(100, 1600, 100),
    'max_depth': range(3, 13)
}
xgb = XGBClassifier(
    learning_rate= 0.17,
    colsample_bylevel=0.6,
    colsample_bytree=0.7,
    subsample=0.8999,
    random_state=1,
    n_jobs=-1,
    min_child_weight=0.4,
    eval_metric= 'auc'
)

grid = GridSearchCV(estimator=xgb, param_grid=params, cv=KFOLDS, n_jobs=-1, scoring=['accuracy', 'roc_auc_ovo'], refit='accuracy', verbose=10)
grid.fit(X_train,y_train)

print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

Fitting 10 folds for each of 150 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:   31.3s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   51.5s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 233 tasks      | elapsed:  


 The best estimator across ALL searched params:
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.6,
              colsample_bynode=1, colsample_bytree=0.7, eval_metric='auc',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.17, max_delta_step=0,
              max_depth=12, min_child_weight=0.4, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=-1,
              num_parallel_tree=1, objective='multi:softprob', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=0.8999, tree_method='exact', validate_parameters=1,
              verbosity=None)

 The best score across ALL searched params:
 0.5625421112333581

 The best parameters across ALL searched params:
 {'max_depth': 12, 'n_estimators': 200}


### Revisit n_estimators for XGBoost with a smaller step

In [11]:
params = {
    'n_estimators': np.arange(10, 260, 50),
    'max_depth': range(10, 15)
}
xgb = XGBClassifier(
    learning_rate= 0.17,
    colsample_bylevel=0.6,
    colsample_bytree=0.7,
    subsample=0.8999,
    random_state=1,
    n_jobs=-1,
    min_child_weight=0.4,
    eval_metric= 'auc'
)

grid = GridSearchCV(estimator=xgb, param_grid=params, cv=KFOLDS, n_jobs=-1, scoring=['accuracy', 'roc_auc_ovo'], refit='accuracy', verbose=10)
grid.fit(X_train,y_train)

print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

Fitting 10 folds for each of 25 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:   21.5s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   41.3s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:   48.7s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 245 out of 250 | elapsed:  


 The best estimator across ALL searched params:
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.6,
              colsample_bynode=1, colsample_bytree=0.7, eval_metric='auc',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.17, max_delta_step=0,
              max_depth=12, min_child_weight=0.4, missing=nan,
              monotone_constraints='()', n_estimators=210, n_jobs=-1,
              num_parallel_tree=1, objective='multi:softprob', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=0.8999, tree_method='exact', validate_parameters=1,
              verbosity=None)

 The best score across ALL searched params:
 0.5628522418101646

 The best parameters across ALL searched params:
 {'max_depth': 12, 'n_estimators': 210}


### Tuning learning_rate for XGBoost

In [11]:
params = {
    'learning_rate': np.arange(0.01, 0.3, 0.01)
}
xgb = XGBClassifier(
    colsample_bylevel=0.6,
    colsample_bytree=0.7,
    subsample=0.8999,
    random_state=1,
    n_jobs=-1,
    min_child_weight=0.4,
    max_depth=12,
    n_estimators=210,
    eval_metric= 'auc'
)

grid = GridSearchCV(estimator=xgb, param_grid=params, cv=KFOLDS, n_jobs=-1, scoring=['accuracy', 'roc_auc_ovo'], refit='accuracy', verbose=10)
grid.fit(X_train,y_train)

print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

Fitting 10 folds for each of 29 candidates, totalling 290 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   33.7s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   58.1s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 233 tasks      | elapsed:  


 The best estimator across ALL searched params:
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.6,
              colsample_bynode=1, colsample_bytree=0.7, eval_metric='auc',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.05, max_delta_step=0,
              max_depth=12, min_child_weight=0.4, missing=nan,
              monotone_constraints='()', n_estimators=210, n_jobs=-1,
              num_parallel_tree=1, objective='multi:softprob', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=0.8999, tree_method='exact', validate_parameters=1,
              verbosity=None)

 The best score across ALL searched params:
 0.5663660244437054

 The best parameters across ALL searched params:
 {'learning_rate': 0.05}


In [13]:
xgb = XGBClassifier(
    colsample_bylevel=0.6,
    colsample_bytree=0.7,
    subsample=0.8999,
    random_state=1,
    n_jobs=-1,
    min_child_weight=0.4,
    max_depth=12,
    n_estimators=210,
    learning_rate=0.05,
    eval_metric= 'auc'
)
model_eval(xgb)

Train: Acc: 0.99984 +/- 0.00021,  Auc: 1.00000 | Test: Acc: 0.56637 +/- 0.03694,  Auc: 0.81222


(0.9998393192381018,
 0.9999999826794903,
 0.5663660244437054,
 0.8122189684499043)

# CatBoost Tuning (Discarded)

In [14]:
cat = CatBoostClassifier(verbose=False, random_state=1, thread_count=-1, eval_metric='AUC')
model_eval(cat)

Train: Acc: 0.84383 +/- 0.00451,  Auc: 0.96866 | Test: Acc: 0.55263 +/- 0.02428,  Auc: 0.80421


(0.843829228503644, 0.968662227634279, 0.552627634350826, 0.8042147544068671)

In [12]:
params = {
    'rsm': np.arange(0.6, 1.1, 0.1)
}
cat = CatBoostClassifier(
    verbose=False, 
    random_state=1, 
    thread_count=-1
)


grid = GridSearchCV(estimator=cat, param_grid=params, cv=KFOLDS, n_jobs=-1, scoring=['accuracy', 'roc_auc_ovo'], refit='accuracy', verbose=10)
grid.fit(X_train,y_train)

print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done  36 out of  60 | elapsed:  5.0min remaining:  3.3min
[Parallel(n_jobs=-1)]: Done  43 out of  60 | elapsed:  5.2min remaining:  2.0min
[Parallel(n_jobs=-1)]: Done  50 out of  60 | elapsed:  5.4min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done  57 out of  60 | elapsed:  5.7min remaining:   18.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  5.8min finished



 The best estimator across ALL searched params:
 <catboost.core.CatBoostClassifier object at 0x0000027C393ADE80>

 The best score across ALL searched params:
 0.5547976955027336

 The best parameters across ALL searched params:
 {'rsm': 0.7999999999999999}


In [11]:
params = {
    'iterations': np.arange(800, 1500, 100),
    'depth': range(3, 13)
}
cat = CatBoostClassifier(
    verbose=False, 
    random_state=1, 
    thread_count=-1,
    rsm=0.8
)


grid = GridSearchCV(estimator=cat, param_grid=params, cv=KFOLDS, n_jobs=-1, scoring=['accuracy', 'roc_auc_ovo'], refit='accuracy', verbose=10)
grid.fit(X_train,y_train)

print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

Fitting 10 folds for each of 70 candidates, totalling 700 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   56.0s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed: 14.2min
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed: 18.7min
[Parallel(n_jobs=-1)]: Done 233 tasks      | elapsed: 2


 The best estimator across ALL searched params:
 <catboost.core.CatBoostClassifier object at 0x00000159B8FECB00>

 The best score across ALL searched params:
 0.5567598657557847

 The best parameters across ALL searched params:
 {'depth': 7, 'iterations': 1000}


In [16]:
cat = CatBoostClassifier(
    verbose=False, 
    random_state=1, 
    thread_count=-1,
    rsm=0.8,
    depth=7,
    iterations=1000
)

kf = KFold(n_splits=10, random_state=1, shuffle=True)

test_accs = []
test_aucs = []
train_accs = []
train_aucs = []

for train_index, test_index in kf.split(X_train):
    X_train_k, X_test_k = X_train[train_index], X_train[test_index]
    y_train_k, y_test_k = y_train[train_index], y_train[test_index]
    
    cat = cat.fit(X_train_k, y_train_k,
         eval_set=(X_test_k, y_test_k),
         use_best_model=True
         )

    y_train_ypred = cat.predict(X_train_k)
    y_train_prob = cat.predict_proba(X_train_k)

    y_test_ypred = cat.predict(X_test_k)
    y_test_prob = cat.predict_proba(X_test_k)
    y_probas = cat.predict_proba(X_test_k)
    
    train_acc, train_auc, test_acc, test_auc = metrics.accuracy_score(y_train_k, y_train_ypred), roc_auc_score(y_train_k, y_train_prob, multi_class='ovo'),metrics.accuracy_score(y_test_k, y_test_ypred), roc_auc_score(y_test_k, y_test_prob, multi_class='ovo')
    
    train_accs.append(train_acc)
    train_aucs.append(train_auc)
    test_accs.append(test_acc)
    test_aucs.append(test_auc)

test_accs = np.asarray(test_accs)
test_aucs = np.asarray(test_aucs)
train_accs = np.asarray(train_accs)
train_aucs = np.asarray(train_aucs)
print("Train: Acc: {:.5f} +/- {:.5f},  Auc: {:.5f} | Test: Acc: {:.5f} +/- {:.5f},  Auc: {:.5f}".
              format(train_accs.mean(), train_accs.std() * 2, train_aucs.mean(), 
                     test_accs.mean(), test_accs.std() * 2, test_aucs.mean()))
    

Train: Acc: 0.76785 +/- 0.06553,  Auc: 0.93710 | Test: Acc: 0.54850 +/- 0.01249,  Auc: 0.80256


# Stacking (Final SVM) Tuning
### Tuning C parameter for the final classifier SVM of the stacking classifier 

In [36]:
rf = RandomForestClassifier(n_jobs=-1, random_state=1, 
                            max_depth=19,n_estimators=1300,
                           min_samples_leaf=1, min_samples_split=2, bootstrap=True, oob_score=True)

xgb = XGBClassifier(
    colsample_bylevel=0.6,
    colsample_bytree=0.7,
    subsample=0.8999,
    random_state=1,
    n_jobs=-1,
    min_child_weight=0.4,
    max_depth=12,
    n_estimators=210,
    learning_rate=0.05,
    eval_metric= 'auc'
)


lgb = LGBMClassifier(objective='multiclass',num_class=4, random_state=1, learning_rate=0.074, n_jobs=-1, 
                     max_depth=9, num_leaves=151,  min_child_samples=3, min_child_weight=0.009,
                   reg_alpha=0.5, reg_lambda=0.6)

cat = CatBoostClassifier(
    verbose=False, 
    random_state=1, 
    thread_count=-1,
    rsm=0.8,
    depth=7,
    iterations=1000
)

estimators = [
    ('rf', rf),
    ('xgb', xgb),
    ('lgb', lgb)
#     ('cat', cat)
]

params = {'C': [0.1, 1, 10, 100, 1000]
#           'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 'scale'], 
}

all_params = []
all_train_accs = []
all_train_aucs = []
all_test_accs = []
all_test_aucs = []


pbar = tqdm(ParameterGrid(params))
for param in pbar:
    stack = StackingClassifier(estimators=estimators, 
                               final_estimator=SVC(kernel='linear',
#                                                    gamma=param['gamma'],
                                                   C=param['C'],
                                                   random_state=1, probability=True), n_jobs=-1)
    kf = KFold(n_splits=5, random_state=1, shuffle=True)
    train_accs = []
    test_accs = []
    train_aucs = []
    test_aucs = []
    split_count = 1
    for train_index, test_index in kf.split(X_train):
        pbar.set_description('{} '.format(split_count)+str(param))
        X_train_k, X_test_k = X_train[train_index], X_train[test_index]
        y_train_k, y_test_k = y_train[train_index], y_train[test_index]
        stack.fit(X_train_k,y_train_k)

        y_train_ypred = stack.predict(X_train_k)
        y_train_prob = stack.predict_proba(X_train_k)

        y_test_ypred = stack.predict(X_test_k)
        y_test_prob = stack.predict_proba(X_test_k)

        train_acc, train_auc, test_acc, test_auc = metrics.accuracy_score(y_train_k, y_train_ypred), roc_auc_score(y_train_k, y_train_prob, multi_class='ovo'),metrics.accuracy_score(y_test_k, y_test_ypred), roc_auc_score(y_test_k, y_test_prob, multi_class='ovo')
        train_accs.append(train_acc)
        test_accs.append(test_acc)
        train_aucs.append(train_auc)
        test_aucs.append(test_auc)
        split_count += 1
    train_accs = np.asarray(train_accs)
    test_accs = np.asarray(test_accs)
    train_aucs = np.asarray(train_aucs)
    test_aucs = np.asarray(test_aucs)

    print("Train: Acc: {:.5f} +/- {:.5f},  Auc: {:.5f} | Test: Acc: {:.5f} +/- {:.5f},  Auc: {:.5f}".
              format(train_accs.mean(), train_accs.std() * 2, train_aucs.mean(),
                     test_accs.mean(), test_accs.std() * 2, test_aucs.mean()))
    
    all_params.append(param)
    all_train_accs.append(train_accs)
    all_train_aucs.append(train_aucs)
    all_test_accs.append(test_accs)
    all_test_aucs.append(test_aucs)


1 {'C': 1}:  20%|█████████████████████████████▍                                                                                                                     | 1/5 [08:14<32:59, 494.89s/it]

Train: Acc: 0.99907 +/- 0.00085,  Auc: 1.00000 | Test: Acc: 0.56451 +/- 0.00775,  Auc: 0.81167


1 {'C': 10}:  40%|██████████████████████████████████████████████████████████▍                                                                                       | 2/5 [16:00<24:18, 486.16s/it]

Train: Acc: 0.99979 +/- 0.00039,  Auc: 1.00000 | Test: Acc: 0.56275 +/- 0.00687,  Auc: 0.81118


1 {'C': 100}:  60%|███████████████████████████████████████████████████████████████████████████████████████                                                          | 3/5 [23:50<16:02, 481.21s/it]

Train: Acc: 0.99987 +/- 0.00023,  Auc: 1.00000 | Test: Acc: 0.56347 +/- 0.00752,  Auc: 0.81087


1 {'C': 1000}:  80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                            | 4/5 [31:51<08:01, 481.09s/it]

Train: Acc: 0.99987 +/- 0.00023,  Auc: 1.00000 | Test: Acc: 0.56358 +/- 0.00889,  Auc: 0.81081


5 {'C': 1000}: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [48:26<00:00, 581.40s/it]

Train: Acc: 0.99987 +/- 0.00023,  Auc: 1.00000 | Test: Acc: 0.56347 +/- 0.00887,  Auc: 0.81082





In [41]:
mean_test_accs = [x.mean() for x in all_test_accs]
mean_test_aucs = [x.mean() for x in all_test_aucs]
print(mean_test_accs)
print(mean_test_aucs)
print('Best C:', all_params[mean_test_accs.index(max(mean_test_accs))])

[0.5645076095350652, 0.5627517845181054, 0.5634746050167039, 0.5635778574689496, 0.5634744983509474]
[0.8116732111692183, 0.8111841687308171, 0.8108694627409158, 0.8108104329506667, 0.8108164993032758]
Best C: {'C': 0.1}
