In [1]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

''' Function to perform gridsearch parameter tuning on xgboost algorithm.

# Arguments:
    x_train:     ndarray, the trainset features
    y_train:     array, the trainset labels
    params:      dict, initial xgboost parameters
    tune_params: dict, parameters to tune with value grid
    
# Returns:
    gsearch:     dict, optimal values for tune_params parameters
'''

def gridsearch(X, y, params, tune_params):

    model = xgb.XGBClassifier(learning_rate = params['learning_rate'], n_estimators = params['n_estimator']
                              , max_depth = params['max_depth'], min_child_weight = params['min_child_weight']
                              , gamma = params['gamma'], subsample = params['subsample']
                              , colsample_bytree = params['colsample_bytree'], objective = params['objective']
                              , scale_pos_weight = params['scale_pos_weight'], seed = params['seed'])

    gsearch = GridSearchCV(estimator=model, param_grid=tune_params, scoring=params['scoring']
                           , n_jobs=1, iid=False, verbose=1)

    gsearch.fit(X, y)
    print(gsearch.best_params_)

    return gsearch.best_params_

def update(base, new):
    for par in new.keys():
        base[par] = new[par]
    return base

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('../../data/processed.csv')
X = df.drop('target', axis=1).values
y = df['target'].values

# keep 15% test data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=33)

dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

In [3]:
# Set initial parameters and find optimal number of boosting rounds

xgb_params = {
    'objective': 'multi:softmax',
    'eval_metric': 'merror',
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'seed': 2017,
    'silent': 1,
    'num_parallel_tree': 1,
    'num_class': 5
}

res = xgb.cv(xgb_params,
             dtrain,
             num_boost_round=750,
             nfold=4,
             seed=2017,
             stratified=False,
             early_stopping_rounds=15,
             verbose_eval=20,
             show_stdv=True,
             maximize=False)

[0]	train-merror:0.435078+0.00192528	test-merror:0.493408+0.0110683
[20]	train-merror:0.383806+0.00834742	test-merror:0.471334+0.00742257


In [4]:
# formulate initial parameters
params = {
    'objective': 'multi:softmax',
    'num_class': 3,
    'scoring': 'accuracy',
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'scale_pos_weight': 1,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'seed': 1337,
    'silent': 1,
    'num_parallel_tree': 1,
    'gamma': 0,
    'n_estimator': res.shape[0]
}

In [5]:
# formulate the grid
tune_params = {
 'max_depth': list(range(3,10,2)),
 'min_child_weight': list(range(1,6,2))}
tmp = gridsearch(x_train, y_train, params, tune_params)

# narrow down grid
tune_params = {
 'max_depth': [tmp['max_depth'] + i for i in range(-1, 2)],
 'min_child_weight': [tmp['min_child_weight'] + i for i in range(-1, 2)]}
tmp = gridsearch(x_train, y_train, params, tune_params)

# update base parameters with optimal values
params = update(params, tmp)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:   19.2s finished


{'max_depth': 3, 'min_child_weight': 1}
Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    7.2s finished


{'max_depth': 3, 'min_child_weight': 1}


In [6]:
tune_params = {
 'gamma':[i/10.0 for i in list(range(0,5))]
}
tmp = gridsearch(x_train, y_train, params, tune_params)
params = update(params, tmp)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    4.1s finished


{'gamma': 0.0}


In [7]:
pars = ['scale_pos_weight','gamma','colsample_bytree','max_depth'
        ,'subsample','num_parallel_tree','min_child_weight']
for par in pars:
    xgb_params[par] = params[par]

res = xgb.cv(xgb_params,
             dtrain,
             num_boost_round=750,
             nfold=4,
             seed=2017,
             stratified=False,
             early_stopping_rounds=15,
             verbose_eval=20,
             show_stdv=True,
             maximize=False)

[0]	train-merror:0.469237+0.00275582	test-merror:0.476728+0.00853778
[20]	train-merror:0.460547+0.00428149	test-merror:0.470135+0.00842014


In [8]:
tune_params = {
 'subsample': [i/10.0 for i in list(range(5,10))],
 'colsample_bytree': [i/10.0 for i in list(range(5,10))]
}
tmp = gridsearch(x_train, y_train, params, tune_params)
params = update(params, tmp)

tune_params = {
 'subsample': [i/100.0 for i in list(range(int(tmp['subsample'] * 100) - 15
                                           , int(tmp['subsample'] * 100) + 15, 5))],
 'colsample_bytree': [i/100.0 for i in list(range(int(tmp['subsample'] * 100) - 15
                                                  , int(tmp['subsample'] * 100) + 15, 5))]
}
tmp = gridsearch(x_train, y_train, params, tune_params)
params = update(params, tmp)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:   20.3s finished


{'subsample': 0.5, 'colsample_bytree': 0.7}
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:   21.7s finished


{'subsample': 0.55, 'colsample_bytree': 0.35}


In [9]:
tune_params = {
 'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05]
}
tmp = gridsearch(x_train, y_train, params, tune_params)
params = update(params, tmp)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    2.8s finished


{'reg_alpha': 0}


In [10]:
params

{'colsample_bytree': 0.35,
 'gamma': 0.0,
 'learning_rate': 0.1,
 'max_depth': 3,
 'min_child_weight': 1,
 'n_estimator': 17,
 'num_class': 3,
 'num_parallel_tree': 1,
 'objective': 'multi:softmax',
 'reg_alpha': 0,
 'scale_pos_weight': 1,
 'scoring': 'accuracy',
 'seed': 1337,
 'silent': 1,
 'subsample': 0.55}