# gSearch

This notebook takes arbitrary input data and runs a complete, stepwise grid search over XGBoost parameters. At the end, a dictionary containing all optimal parameter:value pairs is produced.

In [1]:
import xgboost as xgb
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import make_classification

# Random seed reused by the xgb.cv calls and stored in xgb_params.
seed = 1337

In [2]:
# Synthetic classification data. random_state is pinned to the notebook seed so
# that Restart & Run All reproduces the same samples and the same split.
X, y = make_classification(n_samples=1000, n_features=20, random_state=seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed)

# DMatrix containers for the native xgb.cv API; the sklearn-style grid search
# works on the raw X_train / y_train arrays instead.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

In [None]:
def gSearch(X, y, params, tune_params):
    """Run a cross-validated grid search over XGBClassifier parameters.

    Parameters
    ----------
    X, y
        Training features and labels, passed unchanged to ``GridSearchCV.fit``.
    params : dict
        Fixed settings. The keys 'learning_rate', 'n_estimator', 'max_depth',
        'min_child_weight', 'gamma', 'subsample', 'colsample_bytree',
        'objective', 'scale_pos_weight' and 'seed' configure the classifier;
        'scoring' is handed to GridSearchCV. Any other key is ignored.
    tune_params : dict
        Parameter grid handed to GridSearchCV as ``param_grid``.

    Returns
    -------
    GridSearchCV
        The fitted search object; the winning combination is available as
        ``best_params_`` (it is also printed).

    Raises
    ------
    KeyError
        If one of the required keys is missing from ``params``.
    """
    # define model
    model = xgb.XGBClassifier(learning_rate = params['learning_rate'], n_estimators = params['n_estimator']
                              , max_depth = params['max_depth'], min_child_weight = params['min_child_weight']
                              , gamma = params['gamma'], subsample = params['subsample']
                              , colsample_bytree = params['colsample_bytree'], objective = params['objective']
                              , scale_pos_weight = params['scale_pos_weight'], seed = params['seed'])

    # define gridsearch
    search_kwargs = dict(estimator=model, param_grid=tune_params, scoring=params['scoring']
                         , n_jobs = 1, verbose = 1)
    try:
        # Older scikit-learn releases still accept `iid`; keep the original
        # iid=False behaviour there.
        gsearch = GridSearchCV(iid = False, **search_kwargs)
    except TypeError:
        # `iid` was removed in scikit-learn 0.24. Any unrelated TypeError is
        # raised again by this second call, so nothing is swallowed.
        gsearch = GridSearchCV(**search_kwargs)

    # obtain optimal parameters
    gsearch.fit(X, y)
    print(gsearch.best_params_)

    # return the fitted search object (callers read .best_params_ from it)
    return gsearch

def updatePars(base, new):
    """Copy every key/value pair of ``new`` into ``base`` and return ``base``.

    ``base`` is modified in place; keys already present are overwritten.
    """
    base.update(new)
    return base

## n_estimator

In [22]:
# Baseline booster settings for the native xgb.cv API. The number of rows in
# the returned history (res.shape[0]) is used further down as the starting
# number of estimators.
xgb_params = dict(
    objective='multi:softmax',
    eval_metric='merror',
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=1,
    subsample=0.7,
    colsample_bytree=0.7,
    seed=seed,
    silent=1,
    num_parallel_tree=1,
    num_class=2,
)

# 4-fold CV over at most 750 boosting rounds with early stopping after 15
# rounds; progress is printed every 20 rounds. maximize=False because the
# metric (merror) is an error rate.
res = xgb.cv(
    xgb_params,
    dtrain,
    num_boost_round=750,
    nfold=4,
    seed=seed,
    stratified=False,
    early_stopping_rounds=15,
    verbose_eval=20,
    show_stdv=True,
    maximize=False,
)

[0]	train-merror:0.04625+0.00246515	test-merror:0.08875+0.0147373
[20]	train-merror:0.0129168+0.0024651	test-merror:0.08125+0.0134048


## Hyperparameter tuning

In [23]:
# Fixed settings for the sklearn-style grid search (consumed by gSearch).
# 'scoring' goes to GridSearchCV; 'n_estimator' starts from the number of
# rounds kept by the xgb.cv run above.
params = {
    'objective': 'binary:logistic',
    'scoring': 'f1',
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'scale_pos_weight': 1,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    # reuse the notebook-wide seed instead of repeating the literal, so the
    # two cannot drift apart
    'seed': seed,
    'silent': 1,
    'num_parallel_tree': 1,
    'gamma': 0,
    'n_estimator': res.shape[0]
}

### max_depth and min_child_weight

In [24]:
# Coarse pass: odd max_depth values 3..9 and odd min_child_weight values 1..5.
tunePar = {
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
}
tmp = gSearch(X_train, y_train, params, tunePar)

# Fine pass: the coarse optimum and its two direct neighbours (best - 1,
# best, best + 1) for each parameter.
tunePar = {
    name: [tmp.best_params_[name] + offset for offset in (-1, 0, 1)]
    for name in ('max_depth', 'min_child_weight')
}
tmp = gSearch(X_train, y_train, params, tunePar)

# Keep the refined optimum for the following tuning steps.
params = updatePars(params, tmp.best_params_)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:    0.4s finished


{'max_depth': 3, 'min_child_weight': 1}
Fitting 3 folds for each of 9 candidates, totalling 27 fits
{'max_depth': 3, 'min_child_weight': 1}


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    0.3s finished


### gamma

In [25]:
# Search gamma over 0.0, 0.1, 0.2, 0.3, 0.4 and store the best value.
tunePar = {'gamma': [step / 10.0 for step in range(5)]}
tmp = gSearch(X_train, y_train, params, tunePar)
params = updatePars(params, tmp.best_params_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
{'gamma': 0.0}


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    0.2s finished


## Update n_estimator

In [26]:
# Copy the values tuned so far into the native-API parameter dictionary.
pars = ['scale_pos_weight','gamma','colsample_bytree','max_depth'
        ,'subsample','num_parallel_tree','min_child_weight']
for par in pars:
    xgb_params[par] = params[par]

# Re-run the cross-validation with the tuned tree parameters.
res = xgb.cv(xgb_params,
             dtrain,
             num_boost_round=750,
             nfold=4,
             seed=seed,
             stratified=False,
             early_stopping_rounds=15,
             verbose_eval=20,
             show_stdv=True,
             maximize=False)

# Write the new round count back. Without this assignment the later grid
# searches keep using the n_estimator value from the first xgb.cv run.
params['n_estimator'] = res.shape[0]

[0]	train-merror:0.06125+0.00430981	test-merror:0.08125+0.0155624
[20]	train-merror:0.0495833+0.00246485	test-merror:0.0775+0.00901388


### subsample and colsample_bytree 

In [27]:
# Coarse pass: both sampling ratios from 0.5 to 0.9 in steps of 0.1.
tunePar = {
 'subsample': [i/10.0 for i in range(5,10)],
 'colsample_bytree': [i/10.0 for i in range(5,10)]
}
tmp = gSearch(X_train, y_train, params, tunePar)
params = updatePars(params, tmp.best_params_)

# Fine pass: steps of 0.05 from (best - 0.15) up to (best + 0.10), with each
# grid centred on that parameter's OWN coarse optimum. The colsample_bytree
# grid was previously centred on the subsample optimum by mistake.
# round() instead of plain int() so that a product such as 56.999999 does not
# get truncated to the wrong percentage.
sub_pct = int(round(tmp.best_params_['subsample'] * 100))
col_pct = int(round(tmp.best_params_['colsample_bytree'] * 100))
tunePar = {
 'subsample': [i/100.0 for i in range(sub_pct - 15, sub_pct + 15, 5)],
 'colsample_bytree': [i/100.0 for i in range(col_pct - 15, col_pct + 15, 5)]
}
tmp = gSearch(X_train, y_train, params, tunePar)
params = updatePars(params, tmp.best_params_)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    0.9s finished


{'colsample_bytree': 0.6, 'subsample': 0.6}
Fitting 3 folds for each of 36 candidates, totalling 108 fits
{'colsample_bytree': 0.6, 'subsample': 0.6}


[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:    1.1s finished


### alpha

In [28]:
# Search a handful of small reg_alpha values and store the best one in params.
tunePar = dict(reg_alpha=[0, 0.001, 0.005, 0.01, 0.05])
tmp = gSearch(X_train, y_train, params, tunePar)
params = updatePars(params, tmp.best_params_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
{'reg_alpha': 0}


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    0.2s finished


## Results
This dictionary contains all optimal parameter values

In [32]:
# Display the parameter dictionary accumulated over all tuning steps.
params

{'colsample_bytree': 0.6,
 'gamma': 0.0,
 'learning_rate': 0.1,
 'max_depth': 3,
 'min_child_weight': 1,
 'n_estimator': 23,
 'num_parallel_tree': 1,
 'objective': 'binary:logistic',
 'reg_alpha': 0,
 'scale_pos_weight': 1,
 'scoring': 'f1',
 'seed': 1337,
 'silent': 1,
 'subsample': 0.6}