# XGBoost parameters tuning

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split,  GridSearchCV
import sklearn.metrics as metrics
import time

### Parameters tunning

General approach to [parameters tuning](https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/):
1. Choose a relatively high learning rate. Generally a default learning rate = 0.1 of xgboost() function works but somewhere between 0.05 to 0.3 should work for different problems. Determine the optimum number of trees for this learning rate.
2. Tune tree-specific parameters ( max_depth, min_child_weight, gamma, subsample, colsample_bytree) for decided learning rate and number of trees. Note that we can choose different parameters to define a tree and I’ll take up an example here.
3. Tune regularization parameters (lambda, alpha) for xgboost which can help reduce model complexity and enhance performance.
4. Lower the learning rate and decide the optimal parameters .

Data is from [bnp-paribas-cardif-claims-management](https://www.kaggle.com/c/bnp-paribas-cardif-claims-management/leaderboard) 

In [2]:
data = pd.read_csv('train.csv', nrows = 10000)
target = 'target'
IDcol = 'ID'
data.head()

Unnamed: 0,ID,target,v1,v2,v3,v4,v5,v6,v7,v8,...,v122,v123,v124,v125,v126,v127,v128,v129,v130,v131
0,4,1,,,C,,9.191265,,,2.30163,...,,,0.598896,AF,,,1.957825,0,,
1,5,1,0.943877,5.310079,C,4.410969,5.326159,3.979592,3.928571,0.019645,...,9.333333,2.477596,0.013452,AE,1.773709,3.922193,1.120468,2,0.883118,1.176472
2,6,1,0.797415,8.304757,C,4.22593,11.627438,2.0977,1.987549,0.171947,...,7.018256,1.812795,0.002267,CJ,1.41523,2.954381,1.990847,1,1.677108,1.034483
3,8,1,,,C,,,,,,...,,,,Z,,,,0,,
4,9,0,,,C,,8.856791,,,0.359993,...,,,0.049861,X,,,1.536222,0,,


Keeping only numerical features for now

In [3]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_col = data.select_dtypes(include=numerics).fillna(0)
categorical_col = data.select_dtypes(exclude=numerics)
df = numeric_col
predictors = [x for x in df.columns if x not in [target, IDcol]]
df.head()

Unnamed: 0,ID,target,v1,v2,v4,v5,v6,v7,v8,v9,...,v121,v122,v123,v124,v126,v127,v128,v129,v130,v131
0,4,1,0.0,0.0,0.0,9.191265,0.0,0.0,2.30163,0.0,...,0.0,0.0,0.0,0.598896,0.0,0.0,1.957825,0,0.0,0.0
1,5,1,0.943877,5.310079,4.410969,5.326159,3.979592,3.928571,0.019645,12.666667,...,2.238806,9.333333,2.477596,0.013452,1.773709,3.922193,1.120468,2,0.883118,1.176472
2,6,1,0.797415,8.304757,4.22593,11.627438,2.0977,1.987549,0.171947,8.965516,...,1.956521,7.018256,1.812795,0.002267,1.41523,2.954381,1.990847,1,1.677108,1.034483
3,8,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
4,9,0,0.0,0.0,0.0,8.856791,0.0,0.0,0.359993,0.0,...,0.0,0.0,0.0,0.049861,0.0,0.0,1.536222,0,0.0,0.0


Split in train and test datasets

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df[predictors], df[target], test_size=0.3, random_state=2)

function that calculates AUC on test and train datasets

In [5]:
def alg_auc(alg, X_train, y_train, X_test, y_test):
    alg.fit(X_train, y_train)
    auc_train = metrics.roc_auc_score(y_train, alg.predict_proba(X_train)[:,1]) 
    auc_test = metrics.roc_auc_score(y_test, alg.predict_proba(X_test)[:,1])
    return auc_train, auc_test

XGBoost out of the box AUC scores

In [47]:
start_time = time.time()
reg_base = xgb.XGBClassifier()
alg_accuracy = alg_auc(reg_base, X_train, y_train, X_test, y_test)
execution_time = round((time.time() - start_time),4)
print("execution time %s seconds"%execution_time, 
      "train set AUC is %s"%alg_accuracy[0],
        "validation set AUC is %s"%alg_accuracy[1])


execution time 6.5827 seconds train set AUC is 0.7965274990551255 validation set AUC is 0.7297868159365959


Data frame to store improvements

In [149]:
auc_improvement = pd.DataFrame({
                   'auc_validation': [alg_accuracy[1]],
                   'current_auc_improvement': [np.nan], 
                   'total_auc_improvement': [np.nan],
                   'execution_time': [execution_time],
                   'comment': ['out of box'] 
})                               
auc_improvement

Unnamed: 0,auc_validation,current_auc_improvement,total_auc_improvement,execution_time,comment
0,0.729787,,,6.6643,out of box


Function that calculates optimal number of trees for given set of parameters

In [26]:
def optimal_trees(alg, X_train, y_train, X_test, y_test):
    xgb_param = alg.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train, y_train)
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=5,
            metrics='auc', early_stopping_rounds=10)#, verbose_eval = True)
    alg.set_params(n_estimators=cvresult.shape[0])
    alg.fit(X_train, y_train)
    auc_train = metrics.roc_auc_score(y_train, alg.predict_proba(X_train)[:,1])    
    auc_test = metrics.roc_auc_score(y_test, alg.predict_proba(X_test)[:,1])
    return cvresult.shape[0], auc_train, auc_test
    

### 1. Select default learning rate and optimize for number of trees

In [150]:
start_time = time.time()
reg = xgb.XGBClassifier()
opt_tree = optimal_trees(reg, X_train, y_train,X_test, y_test)
execution_time = round((time.time() - start_time),4)
print("execution time %s seconds"%execution_time, opt_tree)      

execution time 20.3111 seconds (49, 0.7548814950694323, 0.7331687612208257)


In [151]:
improvement = opt_tree[2]/auc_improvement['auc_validation'][len(auc_improvement.index)-1] - 1
auc_improvement = auc_improvement.append(pd.DataFrame({'auc_validation': [opt_tree[2]],
                   'current_auc_improvement': [improvement], 
                   'total_auc_improvement': [improvement],                                    
                   'execution_time': [execution_time],
                   'comment': ['optimized number of trees']}), ignore_index = True)
auc_improvement

Unnamed: 0,auc_validation,current_auc_improvement,total_auc_improvement,execution_time,comment
0,0.729787,,,6.6643,out of box
1,0.733169,0.004634,0.004634,20.3111,optimized number of trees


### 2. Tune tree specific parameters max_depth and min_child_weight

In [110]:
start_time = time.time()
param_test = {
 'max_depth':range(1,6,1),
 'min_child_weight':range(1,6,1)
}
gsearch = GridSearchCV(estimator = xgb.XGBClassifier(n_estimators = opt_tree[0]), 
 param_grid = param_test, scoring='roc_auc', cv=5)
gsearch.fit(X_train,y_train)
gsearch.best_params_, gsearch.best_score_

({'max_depth': 2, 'min_child_weight': 5}, 0.7048278603858668)

Optimize for number of trees with optimal depth and child weight

In [111]:
reg = xgb.XGBClassifier(max_depth = gsearch.best_params_['max_depth'], 
                        min_child_weight = gsearch.best_params_['min_child_weight'])
opt_tree = optimal_trees(reg, X_train, y_train,X_test, y_test)
execution_time = round((time.time() - start_time),4)

In [152]:
current_improvement = opt_tree[2]/auc_improvement['auc_validation'][len(auc_improvement.index)-1] - 1
total_improvement = opt_tree[2]/auc_improvement['auc_validation'][0] - 1
auc_improvement = auc_improvement.append(pd.DataFrame({'auc_validation': [opt_tree[2]],
                   'current_auc_improvement': [current_improvement], 
                   'total_auc_improvement': [total_improvement],                                    
                   'execution_time': [execution_time],
                   'comment': ['optimized depth and child weight']}) , ignore_index = True)
auc_improvement

Unnamed: 0,auc_validation,current_auc_improvement,total_auc_improvement,execution_time,comment
0,0.729787,,,6.6643,out of box
1,0.733169,0.004634,0.004634,20.3111,optimized number of trees
2,0.733692,0.000714,0.005352,382.5486,optimized depth and child weight


### 3. Tune gamma

In [154]:
start_time = time.time()
param_test2 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch2 = GridSearchCV(estimator = xgb.XGBClassifier(max_depth = gsearch.best_params_['max_depth'], 
                        min_child_weight = gsearch.best_params_['min_child_weight'],
                        n_estimators = opt_tree[0]), 
 param_grid = param_test2, scoring='roc_auc', cv=5)
gsearch2.fit(X_train,y_train)
execution_time = round((time.time() - start_time),4)
gsearch2.best_params_, gsearch2.best_score_, execution_time

({'gamma': 0.0}, 0.7048278603858668, 58.6462)

gamma = 0 is default value, don't need to optimize anything further

### 4. Tune subsample and colsample_bytree

In [155]:
start_time = time.time()
param_test3 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch3 = GridSearchCV(estimator = xgb.XGBClassifier(
            n_estimators = opt_tree[0],
            max_depth = gsearch.best_params_['max_depth'], 
            min_child_weight = gsearch.best_params_['min_child_weight']), 
 param_grid = param_test3, scoring='roc_auc', cv=5)
gsearch3.fit(X_train,y_train)
gsearch3.best_params_, gsearch3.best_score_

({'colsample_bytree': 0.9, 'subsample': 0.8}, 0.7034725786950405)

Optimize for number of trees with optimal subsample and colsample_bytree

In [170]:
reg = xgb.XGBClassifier(max_depth = gsearch.best_params_['max_depth'], 
                        min_child_weight = gsearch.best_params_['min_child_weight'],
                        colsample_bytree = gsearch3.best_params_['colsample_bytree'],
                        subsample = gsearch3.best_params_['subsample'])
opt_tree = optimal_trees(reg, X_train, y_train, X_test, y_test)
execution_time = round((time.time() - start_time),4)

In [179]:
auc_improvement

Unnamed: 0,auc_validation,current_auc_improvement,total_auc_improvement,execution_time,comment
0,0.729787,,,6.6643,out of box
1,0.733169,0.004634,0.004634,20.3111,optimized number of trees
2,0.733692,0.000714,0.005352,382.5486,optimized depth and child weight
3,0.735892,0.002998,0.008365,166.7705,optimized subsample and colsample_bytree
4,0.736202,0.00342,0.00879,48.6152,optimized regularization parameter


In [160]:
current_improvement = opt_tree[2]/auc_improvement['auc_validation'][len(auc_improvement.index)-1] - 1
total_improvement = opt_tree[2]/auc_improvement['auc_validation'][0] - 1
auc_improvement = auc_improvement.append(pd.DataFrame({'auc_validation': [opt_tree[2]],
                   'current_auc_improvement': [current_improvement], 
                   'total_auc_improvement': [total_improvement],                                    
                   'execution_time': [execution_time],
                   'comment': ['optimized subsample and colsample_bytree']}) , ignore_index = True)
auc_improvement

Unnamed: 0,auc_validation,current_auc_improvement,total_auc_improvement,execution_time,comment
0,0.729787,,,6.6643,out of box
1,0.733169,0.004634,0.004634,20.3111,optimized number of trees
2,0.733692,0.000714,0.005352,382.5486,optimized depth and child weight
3,0.733692,0.002998,0.008365,166.7705,optimized subsample and colsample_bytree


### 5. Tune regularization parameter

In [164]:
start_time = time.time()
param_test4 = {'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]}

gsearch4 = GridSearchCV(estimator = xgb.XGBClassifier(n_estimators = opt_tree[0],
                        max_depth = gsearch.best_params_['max_depth'], 
                        min_child_weight = gsearch.best_params_['min_child_weight'],
                        colsample_bytree = gsearch3.best_params_['colsample_bytree'],
                        subsample = gsearch3.best_params_['subsample']), 
 param_grid = param_test4, scoring='roc_auc', cv=5)
gsearch4.fit(X_train,y_train)
gsearch4.best_params_, gsearch4.best_score_

({'reg_alpha': 1}, 0.7030374169917939)

In [165]:
reg = xgb.XGBClassifier(max_depth = gsearch.best_params_['max_depth'], 
                        min_child_weight = gsearch.best_params_['min_child_weight'],
                        colsample_bytree = gsearch3.best_params_['colsample_bytree'],
                        subsample = gsearch3.best_params_['subsample'],
                        reg_alpha = gsearch4.best_params_['reg_alpha'])
opt_tree = optimal_trees(reg, X_train, y_train, X_test, y_test)
execution_time = round((time.time() - start_time),4)

In [169]:
current_improvement = opt_tree[2]/auc_improvement['auc_validation'][len(auc_improvement.index)-1] - 1
total_improvement = opt_tree[2]/auc_improvement['auc_validation'][0] - 1
auc_improvement = auc_improvement.append(pd.DataFrame({'auc_validation': [opt_tree[2]],
                   'current_auc_improvement': [current_improvement], 
                   'total_auc_improvement': [total_improvement],                                    
                   'execution_time': [execution_time],
                   'comment': ['optimized regularization parameter']}) , ignore_index = True)
auc_improvement

Unnamed: 0,auc_validation,current_auc_improvement,total_auc_improvement,execution_time,comment
0,0.729787,,,6.6643,out of box
1,0.733169,0.004634,0.004634,20.3111,optimized number of trees
2,0.733692,0.000714,0.005352,382.5486,optimized depth and child weight
3,0.733692,0.002998,0.008365,166.7705,optimized subsample and colsample_bytree
4,0.736202,0.00342,0.00879,48.6152,optimized regularization parameter


In [185]:
auc_improvement

Unnamed: 0,auc_validation,current_auc_improvement,total_auc_improvement,execution_time,comment
0,0.729787,,,6.6643,out of box
1,0.733169,0.004634,0.004634,20.3111,optimized number of trees
2,0.733692,0.000714,0.005352,382.5486,optimized depth and child weight
3,0.735892,0.002998,0.008365,166.7705,optimized subsample and colsample_bytree
4,0.736202,0.000421,0.00879,48.6152,optimized regularization parameter


### 6. Checking if decreased learning rate improves AUC

In [106]:
reg = xgb.XGBClassifier(n_estimators =1000, learning_rate = 0.01, max_depth = 2, min_child_weight = 5, 
                        colsample_bytree = 0.9, subsample = 0.8, reg_alpha = 1)

optimal_trees(reg, X_train, y_train, X_test, y_test)

(420, 0.7221150038864224, 0.7359065264509167)

AUC is lower hence we don't change it

### Total improvement

In [186]:
auc_improvement

Unnamed: 0,auc_validation,current_auc_improvement,total_auc_improvement,execution_time,comment
0,0.729787,,,6.6643,out of box
1,0.733169,0.004634,0.004634,20.3111,optimized number of trees
2,0.733692,0.000714,0.005352,382.5486,optimized depth and child weight
3,0.735892,0.002998,0.008365,166.7705,optimized subsample and colsample_bytree
4,0.736202,0.000421,0.00879,48.6152,optimized regularization parameter


### Adding categorical features

In [156]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_col = data.select_dtypes(include=numerics).fillna(0)
categorical_col = data.select_dtypes(exclude=numerics)
categorical_col.describe()

Unnamed: 0,v3,v22,v24,v30,v31,v47,v52,v56,v66,v71,v74,v75,v79,v91,v107,v110,v112,v113,v125
count,9714,9953,9999,4726,9714,9999,9999,9405,9999,9999,9999,9999,9999,9999,9999,9999,9961,5133,9993
unique,3,4874,5,7,3,9,12,84,3,4,3,4,16,7,7,3,22,34,89
top,C,AGDF,E,C,A,C,J,BW,A,F,B,D,C,A,E,B,F,G,BM
freq,9700,215,4823,2799,7758,4884,969,1005,6147,6552,9941,6552,3034,2397,2397,4884,1916,1384,471


Keeping only variables with less than 100 unique values and then apply one hot encoding

In [189]:
list_index = categorical_col.describe().loc['unique']
list_index = list_index[list_index< 100].index
def transform_data(df, list_index):
    # filling nan
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_col = df.select_dtypes(include=numerics).fillna(0)
    categorical_col = df.select_dtypes(exclude=numerics)
    categorical_col = categorical_col[list_index] 
    

    categorical_tr = pd.get_dummies(categorical_col, prefix=categorical_col.columns)#.reset_index()
    
    out = pd.concat([numeric_col, categorical_tr], axis=1)
    return(out)

predictor_transformed = transform_data(data, list_index) 
predictor_transformed.head()

Unnamed: 0,ID,target,v1,v2,v4,v5,v6,v7,v8,v9,...,v125_Q,v125_R,v125_S,v125_T,v125_U,v125_V,v125_W,v125_X,v125_Y,v125_Z
1,4,1,0.0,0.0,0.0,9.191265,0.0,0.0,2.30163,0.0,...,0,0,0,0,0,0,0,0,0,0
2,5,1,0.943877,5.310079,4.410969,5.326159,3.979592,3.928571,0.019645,12.666667,...,0,0,0,0,0,0,0,0,0,0
3,6,1,0.797415,8.304757,4.22593,11.627438,2.0977,1.987549,0.171947,8.965516,...,0,0,0,0,0,0,0,0,0,0
4,8,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
5,9,0,0.0,0.0,0.0,8.856791,0.0,0.0,0.359993,0.0,...,0,0,0,0,0,0,0,1,0,0


In [191]:
predictors = [x for x in predictor_transformed.columns if x not in [target, IDcol]]
X_train, X_test, y_train, y_test = train_test_split(predictor_transformed[predictors], predictor_transformed[target], test_size=0.3, random_state=2)

start_time = time.time()
reg_base = xgb.XGBClassifier()
opt_tree = optimal_trees(reg_base, X_train, y_train, X_test, y_test)
execution_time = round((time.time() - start_time),4)


In [195]:
total_improvement = opt_tree[2]/auc_improvement['auc_validation'][0] - 1
current_improvement = total_improvement
auc_improvement = auc_improvement.append(pd.DataFrame({'auc_validation': [opt_tree[2]],
                   'current_auc_improvement': [current_improvement], 
                   'total_auc_improvement': [total_improvement],                                    
                   'execution_time': [execution_time],
                   'comment': ['categorical variables and optimized number of trees']}) , ignore_index = True)
auc_improvement

Unnamed: 0,auc_validation,current_auc_improvement,total_auc_improvement,execution_time,comment
0,0.729787,,,6.6643,out of box
1,0.733169,0.004634,0.004634,20.3111,optimized number of trees
2,0.733692,0.000714,0.005352,382.5486,optimized depth and child weight
3,0.735892,0.002998,0.008365,166.7705,optimized subsample and colsample_bytree
4,0.736202,0.000421,0.00879,48.6152,optimized regularization parameter
5,0.751729,0.030067,0.030067,60.0074,categorical variables optimized for number of ...


### Optimizing max depth of the tree and child weight

In [196]:
start_time = time.time()
param_test = {
 'max_depth':range(1,6,1),
 'min_child_weight':range(1,6,1)
}
gsearch5 = GridSearchCV(estimator = xgb.XGBClassifier(n_estimators = opt_tree[0]), 
 param_grid = param_test, scoring='roc_auc', cv=5)
gsearch5.fit(X_train,y_train)
gsearch5.best_params_, gsearch.best_score_

({'max_depth': 3, 'min_child_weight': 3}, 0.7048278603858668)

We increased number of variables and because of that depth of the tree increased as well

In [197]:
reg_base = xgb.XGBClassifier(max_depth = gsearch5.best_params_['max_depth'], 
                        min_child_weight = gsearch5.best_params_['min_child_weight'])
opt_tree = optimal_trees(reg_base, X_train, y_train, X_test, y_test)
execution_time = round((time.time() - start_time),4)

In [201]:
current_improvement = opt_tree[2]/auc_improvement['auc_validation'][len(auc_improvement.index)-1] - 1
total_improvement = opt_tree[2]/auc_improvement['auc_validation'][0] - 1
auc_improvement = auc_improvement.append(pd.DataFrame({'auc_validation': [opt_tree[2]],
                   'current_auc_improvement': [current_improvement], 
                   'total_auc_improvement': [total_improvement],                                    
                   'execution_time': [execution_time],
                   'comment': ['categorical variables and optimized depth and child weight']}) , ignore_index = True)

auc_improvement

Unnamed: 0,auc_validation,current_auc_improvement,total_auc_improvement,execution_time,comment
0,0.729787,,,6.6643,out of box
1,0.733169,0.004634,0.004634,20.3111,optimized number of trees
2,0.733692,0.000714,0.005352,382.5486,optimized depth and child weight
3,0.735892,0.002998,0.008365,166.7705,optimized subsample and colsample_bytree
4,0.736202,0.000421,0.00879,48.6152,optimized regularization parameter
5,0.751729,0.030067,0.030067,60.0074,categorical variables and optimized number of ...
6,0.752109,0.000505,0.030587,1022.3153,categorical variables and optimized depth and ...


So having right variables usually gives more improvement than hyperparameter tuning