# Manually tuning the hyper-parameters

In this file, we manually tune our hyper-parameters.

First, we repeat the earlier data-preparation steps.

In [2]:
# Suppress unnecessary warnings so that the presentation looks clean
import warnings
warnings.filterwarnings('ignore')

# importing the necessary modules
import pandas                                      # to read and manipulate data
import zipfile                                     # to extract data
import numpy as np                                 # for matrix operations
# rest will be imported as and when required

# Extract the zipped training file into the current working directory.
# The context manager closes the archive even if extraction fails.
with zipfile.ZipFile("train.csv.zip", 'r') as zip_ref:
    zip_ref.extractall()

# Load the extracted CSV; it is split into train / hold-out parts further down.
train_data = pandas.read_csv("train.csv")

import copy

# Hold out every row from position 150000 onwards as a local test set and
# keep the first 150000 rows for training.
test_data = copy.deepcopy(train_data.iloc[150000:])
train_data = train_data.iloc[:150000]

# Ground-truth losses and ids of the held-out rows.
y_true = test_data['loss']
ids = test_data['id']

# Training target (transformed below).
target = train_data['loss']

# Drop the id and loss columns from both sets.
# `columns=` is used instead of a positional axis argument (which pandas 2.x
# rejects), and new frames are returned instead of mutating an iloc slice
# in place.
train_data = train_data.drop(columns=['id', 'loss'])
test_data = test_data.drop(columns=['id', 'loss'])

# Train on log(loss + shift) rather than on the raw loss.
shift = 200
target = np.log(target + shift)

# Merge both datasets into a single frame so that the categorical columns
# are encoded consistently across train and test.
joined = pandas.concat([train_data, test_data], ignore_index=True)
del train_data, test_data                          # free memory; both are re-created from `joined`

# Names of all columns holding categorical values (prefixed with 'cat').
cat_feature = list(filter(lambda name: name.startswith('cat'), joined.columns))

# Replace each categorical column by its integer codes.
for column in cat_feature:
    codes = pandas.factorize(joined[column].values, sort=True)[0]
    joined[column] = codes

del cat_feature

# Split the encoded frame back into the training rows and the held-out rows.
train_data, test_data = joined.iloc[:150000, :], joined.iloc[150000:, :]

In [3]:
from sklearn.metrics import mean_absolute_error
import xgboost as xgb

def evalerror(preds, dtrain):
    """Evaluation function returning the MAE of exponentiated values.

    Parameters
    ----------
    preds : array-like
        Predicted values; they are passed through ``np.exp`` before scoring.
    dtrain : object
        Anything exposing ``get_label()``; the labels are passed through
        ``np.exp`` as well.

    Returns
    -------
    tuple
        ``('mae', value)`` where ``value`` is the mean absolute error between
        ``exp(preds)`` and ``exp(labels)``. No shift is subtracted; a constant
        offset on both sides would not change the absolute differences.
    """
    exp_labels = np.exp(dtrain.get_label())
    exp_preds = np.exp(preds)
    return 'mae', mean_absolute_error(exp_preds, exp_labels)



## Fine-tuning max_depth and min_child_weight

First, we vary max_depth in (10,12,14) and min_child_weight in (1,3,5,7)

NOTE - Since I ran it on the cloud, I'm going to post the results obtained.

In [3]:
RANDOM_STATE = 2016

# Baseline booster parameters; the grid searches below overwrite single
# entries of this dict in place.
params = {
    'min_child_weight': 1,
    'eta': 0.1,
    'colsample_bytree': 0.8,
    'max_depth': 5,
    'subsample': 0.8,
    'alpha': 1,
    'gamma': 0,
    'silent': 1,
    'seed': RANDOM_STATE,
    'eval_metric': 'mae',
    # The original literal listed 'verbose_eval' twice (True, then 2); only
    # the last value took effect, so a single entry is kept.
    # NOTE(review): verbose_eval is a keyword argument of xgb.cv rather than
    # a booster parameter, so it probably has no effect here - confirm.
    'verbose_eval': 2,
}

max_depth_list = [10, 12, 14]
min_child_weight_list = [1, 3, 5, 7]
num_rounds = 3000

xgtrain = xgb.DMatrix(train_data, label=target)
xgtest = xgb.DMatrix(test_data)

# NOTE: the score below is xgboost's built-in 'mae' on the log-transformed
# target; `evalerror` is not passed to xgb.cv.
best_mae = float('inf')                 # lowest CV score seen so far
bestmaxdep = bestminchild = None        # stay None if no run yields a finite score
for maxdep in max_depth_list:
    for minchild in min_child_weight_list:
        params['max_depth'] = maxdep
        params['min_child_weight'] = minchild

        # `early_stopping_rounds` replaces the legacy
        # `xgb.callback.early_stop(50)` callback, which is not available in
        # current XGBoost releases; the keyword works in old and new versions.
        cv_result = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5,
                           seed=RANDOM_STATE, early_stopping_rounds=50)

        # With early stopping the last history row is the best iteration.
        score = cv_result['test-mae-mean'].values[-1]
        if score < best_mae:
            best_mae = score
            bestmaxdep = maxdep
            bestminchild = minchild
        print (maxdep,'     ',minchild,"     ",score)

print (bestmaxdep,"           ",bestminchild)




It gives max_depth = 10 and min_child_weight = 7 as the best values. Since both lie on the border of the ranges we searched, we need to search beyond them.

Now, we check for max_depth in (4,5,6,7,8,9,10,11) and min_child_weight in (6,7,8,9)

NOTE - I ran it several times, e.g. first for max_depth = 7,8,9 and then for max_depth = 5,6,7, etc. However, for the sake of simplicity, and to avoid making this file longer than required, I have combined the results in a single file.

In [None]:
max_depth_list = [4, 5, 6, 7, 8, 9, 10, 11]
min_child_weight_list = [6, 7, 8, 9]
num_rounds = 3000

best_mae = float('inf')                 # lowest CV score seen so far
bestmaxdep = bestminchild = None        # stay None if no run yields a finite score
for maxdep in max_depth_list:
    for minchild in min_child_weight_list:
        params['max_depth'] = maxdep
        params['min_child_weight'] = minchild

        # `early_stopping_rounds` replaces the legacy
        # `xgb.callback.early_stop(50)` callback, which is not available in
        # current XGBoost releases; the keyword works in old and new versions.
        cv_result = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5,
                           seed=RANDOM_STATE, early_stopping_rounds=50)

        # With early stopping the last history row is the best iteration.
        score = cv_result['test-mae-mean'].values[-1]
        if score < best_mae:
            best_mae = score
            bestmaxdep = maxdep
            bestminchild = minchild
        print (maxdep,'     ',minchild,"     ",score)

print (bestmaxdep,"           ",bestminchild)



Finally, we get the best value of max_depth = 8 and min_child_weight = 8.

## Tuning Gamma now

Now, we fix the values of max_depth and min_child_weight and vary gamma in (0.0,0.1,0.2,0.3,0.4)

In [None]:
# Fix the tree-shape parameters before searching over gamma.
# NOTE(review): the text above reports max_depth = 8 as the best value, but
# 6 is used here - confirm which one is intended.
params['max_depth'] = 6
params['min_child_weight'] = 8

gamma_list = [0.0, 0.1, 0.2, 0.3, 0.4]

num_rounds = 3000

best_mae = float('inf')     # lowest CV score seen so far
bestgamma = None            # stays None if no run yields a finite score

for gamma_val in gamma_list:
    params['gamma'] = gamma_val

    # `early_stopping_rounds` replaces the legacy
    # `xgb.callback.early_stop(50)` callback, which is not available in
    # current XGBoost releases; the keyword works in old and new versions.
    cv_result = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5,
                       seed=RANDOM_STATE, early_stopping_rounds=50)

    # With early stopping the last history row is the best iteration.
    score = cv_result['test-mae-mean'].values[-1]
    if score < best_mae:
        best_mae = score
        bestgamma = gamma_val

    print ('gamma:','     ',gamma_val,"     ",score)

print ('best gamma value:',bestgamma)


The best value for gamma is 0.0.

## Tuning colsample_bytree and subsample

Now, we also fix the value of gamma and vary colsample_bytree in (0.3,0.5,0.7,0.9) and subsample in (0.3,0.5,0.7,0.9)

In [None]:
# Fix gamma before searching over the two sampling ratios.
params['gamma'] = 0.0

col_sample_list = [0.3, 0.5, 0.7, 0.9]
subsample_list = [0.3, 0.5, 0.7, 0.9]

num_rounds = 3000

best_mae = float('inf')         # lowest CV score seen so far
bestcol = bestsub = None        # stay None if no run yields a finite score

for colsample in col_sample_list:
    for subsample in subsample_list:
        params['colsample_bytree'] = colsample
        params['subsample'] = subsample

        # `early_stopping_rounds` replaces the legacy
        # `xgb.callback.early_stop(50)` callback, which is not available in
        # current XGBoost releases; the keyword works in old and new versions.
        cv_result = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5,
                           seed=RANDOM_STATE, early_stopping_rounds=50)

        # With early stopping the last history row is the best iteration.
        score = cv_result['test-mae-mean'].values[-1]
        if score < best_mae:
            best_mae = score
            bestcol = colsample
            bestsub = subsample
        print ("colsample:",colsample,'     ',"subsample:",subsample,"     ",score)

print ("col:",bestcol,"           ","sub:",bestsub)


The best values obtained are col_sample = 0.5 and sub_sample = 0.9. Next, we try values around these in steps of 0.05.

In [None]:
col_sample_list = [0.45, 0.50, 0.55]
subsample_list = [0.85, 0.90, 0.95, 1.0]

num_rounds = 3000

best_mae = float('inf')         # lowest CV score seen so far
bestcol = bestsub = None        # stay None if no run yields a finite score

for colsample in col_sample_list:
    for subsample in subsample_list:
        params['colsample_bytree'] = colsample
        params['subsample'] = subsample

        # `early_stopping_rounds` replaces the legacy
        # `xgb.callback.early_stop(50)` callback, which is not available in
        # current XGBoost releases; the keyword works in old and new versions.
        cv_result = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5,
                           seed=RANDOM_STATE, early_stopping_rounds=50)

        # With early stopping the last history row is the best iteration.
        score = cv_result['test-mae-mean'].values[-1]
        if score < best_mae:
            best_mae = score
            bestcol = colsample
            bestsub = subsample
        print ("colsample:",colsample,'     ',"subsample:",subsample,"     ",score)

print ("col:",bestcol,"           ","sub:",bestsub)

The best values obtained are col_sample = 0.45 and sub_sample = 1.0

We don't need to tune alpha as most of the regularization is provided by gamma itself.


Let's lower the learning rate now to 0.01 and use these values in the final model.

## Open Tuned_XGBoost_part4.ipynb