# Tuning

In [61]:
import xgboost as xgb

In [62]:
from xgboost import XGBClassifier

## Baseline Model

In [130]:
# base model
base_model = XGBClassifier(random_state=10)

In [131]:
base_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=10,
              reg_alpha=0, reg_lambda=1, ...)

In [134]:
accuracy_score(y_train,base_model.predict(X_train)) , accuracy_score(y_test,base_model.predict(X_test)) 

(0.9990108803165183, 0.8003952569169961)

## Steps

min_samples_split = 500 : ~0.5-1% of total values. Since this is imbalanced class problem, we'll take small value

min_samples_leaf = 50 : Just using for preventing overfitting. will be tuned later.

max_depth = 8 : since high number of observations and predictors, choose relatively high value

max_features = 'sqrt' : general thumbrule to start with

subsample = 0.8 : typically used value (will be tuned later)

0.1 is assumed to be a good learning rate to start with. 

n_estimators: 

Note that 60 is a reasonable value and can be used as it is. But it might not be the same in all cases. Other situations:

If the value is around 20, you might want to try lowering the learning rate to 0.05 and re-run grid search

If the values are too high ~100, tuning the other parameters will take long time and you can try a higher learning rate

In [141]:
#Grid seach on n_estimators
param_test1 = {'n_estimators':range(20,81,10)}
gsearch1 = GridSearchCV(estimator = XGBClassifier(learning_rate=0.05, min_samples_split=20,
                                  min_samples_leaf=50,max_depth=8,max_features='sqrt', subsample=0.8,random_state=10), 
                       param_grid = param_test1, scoring='accuracy',n_jobs=4, cv=5)
gsearch1.fit(X_train, y_train)

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=None, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=0.05, max_bin=None,
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=8,
                                     max_features='sqrt', max_leaves=None,
                                     min_child_weight=None, min_samples_leaf=50,
     

In [142]:
gsearch1.best_params_, gsearch1.best_score_

({'n_estimators': 60}, 0.8185026280405818)

In [143]:
accuracy_score(y_train,gsearch1.predict(X_train)) , accuracy_score(y_test,gsearch1.predict(X_test)) 

(0.9124629080118695, 0.8201581027667985)

In [245]:
#Grid seach on subsample and max_features
param_test2 = {'max_depth':range(5,16,2), 'min_samples_split':range(20,101,20)}
gsearch2 = GridSearchCV(estimator = XGBClassifier(learning_rate=0.05, n_estimators=60,
                                                min_samples_leaf=50,max_features='sqrt', subsample=0.8, random_state=10), 
                       param_grid = param_test2, scoring='accuracy',n_jobs=4, cv=5)
gsearch2.fit(X_train, y_train)

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=None, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=0.05, max_bin=None,
                                     max_ca...ot=None,
                                     max_delta_step=None, max_depth=None,
                                     max_features='sqrt', max_leaves=None,
                                     min_child_weight=None, min_samples_leaf=50,
        

In [246]:
gsearch2.best_params_, gsearch2.best_score_

({'max_depth': 7, 'min_samples_split': 20}, 0.8185087397628653)

In [247]:
accuracy_score(y_train,gsearch2.predict(X_train)) , accuracy_score(y_test,gsearch2.predict(X_test)) 

(0.8951533135509396, 0.8241106719367589)

Since we reached the minimum of min_sales_split, we should check higher values as well. Also, we can tune min_samples_leaf with it now as max_depth is fixed. 

One might argue that max depth might change for higher value but if you observe the output closely, a max_depth of 9 had a better model for most of cases. 

So lets perform a grid search on them:

In [176]:
#Grid seach on subsample and max_features
param_test3 = {'min_samples_split':range(2,10,2), 'min_samples_leaf':range(30,71,10)}
gsearch3 = GridSearchCV(estimator = XGBClassifier(learning_rate=0.05, n_estimators=60,max_depth=7,
                                                    max_features='sqrt', subsample=0.8, random_state=10), 
                       param_grid = param_test3, scoring='accuracy',n_jobs=4, cv=5)
gsearch3.fit(X_train, y_train)

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=None, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=0.05, max_bin=None,
                                     max_ca...ehot=None,
                                     max_delta_step=None, max_depth=7,
                                     max_features='sqrt', max_leaves=None,
                                     min_child_weight=None, missing=nan,
                 

In [177]:
gsearch3.best_params_, gsearch3.best_score_

({'min_samples_leaf': 30, 'min_samples_split': 2}, 0.8185087397628653)

In [178]:
accuracy_score(y_train,gsearch3.predict(X_train)) , accuracy_score(y_test,gsearch3.predict(X_test)) 

(0.8951533135509396, 0.8241106719367589)

In [204]:
#Grid seach on subsample and max_features
param_test4 = {'max_features':range(7,20,2)}
gsearch4 = GridSearchCV(estimator = XGBClassifier(learning_rate=0.05, n_estimators=60,max_depth=7, 
                            min_samples_split=2, min_samples_leaf=30, subsample=0.8, random_state=10),
                       param_grid = param_test4, scoring='roc_auc',n_jobs=4, cv=5)
gsearch4.fit(X_train, y_train)

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=None, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=0.05, max_bin=None,
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=7,
                                     max_leaves=None, min_child_weight=None,
                                     min_samples_leaf=30, min_samples_split=2,
     

In [205]:
gsearch4.best_params_, gsearch4.best_score_

({'max_features': 7}, 0.8811841562525526)

In [206]:
accuracy_score(y_train,gsearch4.predict(X_train)) , accuracy_score(y_test,gsearch4.predict(X_test)) 

(0.8951533135509396, 0.8241106719367589)

In [216]:
#Grid seach on subsample and max_features
param_test5 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}
gsearch5 = GridSearchCV(estimator = XGBClassifier(learning_rate=0.05, n_estimators=60,max_depth=7, 
                            min_samples_split=2, min_samples_leaf=30, random_state=10, max_features=7),
                       param_grid = param_test5, scoring='accuracy',n_jobs=4, cv=5)
gsearch5.fit(X_train, y_train)

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=None, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=0.05, max_bin=None,
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=7,
                                     max_features=7, max_leaves=None,
                                     min_child_weight=None, min_samples_leaf=30,
          

In [217]:
gsearch5.best_params_, gsearch5.best_score_

({'subsample': 0.8}, 0.8185087397628653)

In [218]:
accuracy_score(y_train,gsearch5.predict(X_train)) , accuracy_score(y_test,gsearch5.predict(X_test)) 

(0.8951533135509396, 0.8241106719367589)

With all tuned lets try reducing the learning rate and proportionally increasing the number of estimators to get more robust results:

In [221]:
#Choose all predictors except target & IDcols
gbm_tuned_1  = XGBClassifier(learning_rate=0.01, n_estimators=300,max_depth=7, subsample=0.8,
                            min_samples_split=2, min_samples_leaf=30, random_state=10, max_features=7)
gbm_tuned_1.fit(X_train, y_train)

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=7, max_features=7, max_leaves=0,
              min_child_weight=1, min_samples_leaf=30, min_samples_split=2,
              missing=nan, monotone_constraints='()', n_estimators=300,
              n_jobs=0, num_parallel_tree=1, predictor='auto', ...)

In [222]:
accuracy_score(y_train,gbm_tuned_1.predict(X_train)) , param_grid = param_test5, accuracy_score(y_test,gbm_tuned_1.predict(X_test)) 

(0.8986152324431256, 0.8181818181818182)

In [226]:
#Choose all predictors except target & IDcols
gbm_tuned_2  = XGBClassifier(learning_rate=0.005, n_estimators=600,max_depth=7, subsample=0.8,
                            min_samples_split=2, min_samples_leaf=30, random_state=10, max_features=7)
gbm_tuned_2.fit(X_train, y_train)

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.005, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=7, max_features=7, max_leaves=0,
              min_child_weight=1, min_samples_leaf=30, min_samples_split=2,
              missing=nan, monotone_constraints='()', n_estimators=600,
              n_jobs=0, num_parallel_tree=1, predictor='auto', ...)

In [225]:
accuracy_score(y_train,gbm_tuned_2.predict(X_train)) , accuracy_score(y_test,gbm_tuned_2.predict(X_test)) 

(0.9742828882294757, 0.8221343873517787)

In [229]:
#Choose all predictors except target & IDcols
gbm_tuned_3  = XGBClassifier(learning_rate=0.001, n_estimators=3000,max_depth=7, subsample=0.8,
                            min_samples_split=2, min_samples_leaf=30, random_state=10, max_features=7)
gbm_tuned_3.fit(X_train, y_train)

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.001, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=7, max_features=7, max_leaves=0,
              min_child_weight=1, min_samples_leaf=30, min_samples_split=2,
              missing=nan, monotone_constraints='()', n_estimators=3000,
              n_jobs=0, num_parallel_tree=1, predictor='auto', ...)

In [230]:
accuracy_score(y_train,gbm_tuned_3.predict(X_train)) , accuracy_score(y_test,gbm_tuned_3.predict(X_test)) 

(0.9010880316518298, 0.8181818181818182)

In [237]:
#Choose all predictors except target & IDcols
gbm_tuned_4  = XGBClassifier(learning_rate=0.005, n_estimators=800,max_depth=7, subsample=0.8,
                            min_samples_split=2, min_samples_leaf=30, random_state=10, max_features=7)
gbm_tuned_4.fit(X_train, y_train)

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.005, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=7, max_features=7, max_leaves=0,
              min_child_weight=1, min_samples_leaf=30, min_samples_split=2,
              missing=nan, monotone_constraints='()', n_estimators=800,
              n_jobs=0, num_parallel_tree=1, predictor='auto', ...)

In [238]:
accuracy_score(y_train,gbm_tuned_4.predict(X_train)) , accuracy_score(y_test,gbm_tuned_4.predict(X_test)) 

(0.9144411473788329, 0.8221343873517787)

# Early Stopping

In [239]:
from sklearn.model_selection import train_test_split
X_train_2, X_valid, y_train_2, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=42,stratify=y_train)

In [275]:
my_model = XGBClassifier(n_estimators=2000,learning_rate=0.01,n_jobs=-1)
my_model.fit(X_train_2, y_train_2, 
             early_stopping_rounds=5, 
             eval_set=[(X_valid, y_valid)],
             verbose=False)



XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=2000,
              n_jobs=-1, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [276]:
my_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=2000,
              n_jobs=-1, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [277]:
accuracy_score(y_train,my_model.predict(X_train)) 

0.9817012858555886

In [278]:
y_pred = my_model.predict(X_test) 

In [279]:
accuracy_score(y_test, y_pred) 

0.8181818181818182