Template for parameter tuning of xgboost model using grid seach CV

In [13]:
## Load libraries
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.model_selection import GridSearchCV
import os
import xgboost as xgb 
from sklearn import metrics
from sklearn.metrics import r2_score

In [2]:
## Configurable options
REMOVE_ZERO_VAR = True
REMOVE_PERFECT_CORRELATED_VAR = False
USE_TSVD = True
USE_PCA = True
USE_ICA = True
USE_GRP = True
USE_SRP = True 
NCOMP = 10
USE_PROP_BINARY_VARS = True
CVFOLD = 5
SEED = 12345

In [3]:
## Read in and process train and test datasets
datadir = '../input'
trainFile = os.path.join(datadir,'train.csv')
testFile = os.path.join(datadir,'test.csv')
train = pd.read_csv(trainFile)
test = pd.read_csv(testFile) 

## remove non-feature columns from train/test datasets
y_train = train['y'].values
test_ids = test['ID'].values
train.drop(['ID','y'],axis=1,inplace=True)
test.drop(['ID'],axis=1,inplace=True)

## seperate categorical and binary variables
catg_vars = ['X'+str(i) for i in [0,1,2,3,4,5,6,8]]
binary_vars = [var for var in list(train.columns) if not var in catg_vars]

print('Number of variables : {}'.format(len(train.columns)))
print()
print('Categorical variables :\n')
print(', '.join(catg_vars))
print()
print('Binary variables :\n')
print(', '.join(binary_vars))

Number of variables : 376

Categorical variables :

X0, X1, X2, X3, X4, X5, X6, X8

Binary variables :

X10, X11, X12, X13, X14, X15, X16, X17, X18, X19, X20, X21, X22, X23, X24, X26, X27, X28, X29, X30, X31, X32, X33, X34, X35, X36, X37, X38, X39, X40, X41, X42, X43, X44, X45, X46, X47, X48, X49, X50, X51, X52, X53, X54, X55, X56, X57, X58, X59, X60, X61, X62, X63, X64, X65, X66, X67, X68, X69, X70, X71, X73, X74, X75, X76, X77, X78, X79, X80, X81, X82, X83, X84, X85, X86, X87, X88, X89, X90, X91, X92, X93, X94, X95, X96, X97, X98, X99, X100, X101, X102, X103, X104, X105, X106, X107, X108, X109, X110, X111, X112, X113, X114, X115, X116, X117, X118, X119, X120, X122, X123, X124, X125, X126, X127, X128, X129, X130, X131, X132, X133, X134, X135, X136, X137, X138, X139, X140, X141, X142, X143, X144, X145, X146, X147, X148, X150, X151, X152, X153, X154, X155, X156, X157, X158, X159, X160, X161, X162, X163, X164, X165, X166, X167, X168, X169, X170, X171, X172, X173, X174, X175, X176, X177, 

In [4]:
## LabelEncode categorical variables 
for c in catg_vars:
    lbl = LabelEncoder() 
    lbl.fit(list(train[c].values) + list(test[c].values)) 
    train[c] = lbl.transform(list(train[c].values))
    test[c] = lbl.transform(list(test[c].values))

In [5]:
## Remove zero variance variables
zero_variance_vars = []
for c in train.columns:
    if len(set(train[c].values))==1:
        zero_variance_vars.append(c)
        
print(' ,'.join(zero_variance_vars))

X11 ,X93 ,X107 ,X233 ,X235 ,X268 ,X289 ,X290 ,X293 ,X297 ,X330 ,X347


In [6]:
## Remove perfectly correlated variables
from scipy.stats import pearsonr
vars_perfect_corr = []
var_pairs_perfect_corr = []

for i in range(0,len(train.columns)-1):
    for j in range(i+1,len(train.columns)):
        tmpcorr = pearsonr(train[train.columns[i]],train[train.columns[j]])[0]
        if(abs(tmpcorr)==1):
            vars_perfect_corr.append(train.columns[i])
            var_pairs_perfect_corr.append((train.columns[i],train.columns[j]))

for i,j in var_pairs_perfect_corr:
    print('({},{})'.format(i,j),end='\t')

  r = r_num / r_den


(X17,X382)	(X29,X232)	(X29,X263)	(X29,X279)	(X31,X35)	(X31,X37)	(X33,X39)	(X35,X37)	(X44,X302)	(X48,X113)	(X48,X134)	(X48,X147)	(X48,X222)	(X52,X120)	(X53,X102)	(X53,X214)	(X53,X239)	(X54,X76)	(X54,X136)	(X58,X324)	(X60,X248)	(X60,X253)	(X60,X385)	(X62,X172)	(X62,X216)	(X67,X213)	(X71,X84)	(X71,X244)	(X76,X136)	(X84,X244)	(X88,X122)	(X88,X243)	(X88,X320)	(X89,X245)	(X90,X94)	(X90,X242)	(X94,X242)	(X102,X214)	(X102,X239)	(X112,X199)	(X113,X134)	(X113,X147)	(X113,X222)	(X118,X119)	(X122,X243)	(X122,X320)	(X125,X227)	(X128,X130)	(X134,X147)	(X134,X222)	(X138,X146)	(X142,X158)	(X147,X222)	(X152,X226)	(X152,X326)	(X155,X360)	(X156,X157)	(X172,X216)	(X184,X262)	(X184,X266)	(X186,X194)	(X202,X247)	(X204,X205)	(X214,X239)	(X226,X326)	(X230,X254)	(X232,X263)	(X232,X279)	(X240,X364)	(X240,X365)	(X243,X320)	(X248,X253)	(X248,X385)	(X253,X385)	(X262,X266)	(X263,X279)	(X295,X296)	(X298,X299)	(X364,X365)	

In [7]:
## Dimension reduction 

# tSVD
tsvd = TruncatedSVD(n_components=NCOMP, random_state=SEED)
tsvd_results_train = tsvd.fit_transform(train)
tsvd_results_test = tsvd.transform(test)
for i in range(1, NCOMP+1):
    train['tsvd_' + str(i)] = tsvd_results_train[:,i-1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i-1]

# PCA
pca = PCA(n_components=NCOMP, random_state=SEED)
pca_results_train = pca.fit_transform(train)
pca_results_test = pca.transform(test)
for i in range(1, NCOMP+1):
    train['pca_' + str(i)] = pca_results_train[:,i-1]
    test['pca_' + str(i)] = pca_results_test[:, i-1]
    
# ICA
ica = FastICA(n_components=NCOMP, random_state=SEED)
ica_results_train = ica.fit_transform(train)
ica_results_test = ica.transform(test)
for i in range(1, NCOMP+1):
    train['ica_' + str(i)] = ica_results_train[:,i-1]
    test['ica_' + str(i)] = ica_results_test[:, i-1]
    
# GRP
grp = GaussianRandomProjection(n_components=NCOMP, eps=0.1, random_state=SEED)
grp_results_train = grp.fit_transform(train)
grp_results_test = grp.transform(test)
for i in range(1, NCOMP+1):
    train['grp_' + str(i)] = grp_results_train[:,i-1]
    test['grp_' + str(i)] = grp_results_test[:, i-1]

# SRP
srp = SparseRandomProjection(n_components=NCOMP, dense_output=True, random_state=SEED)
srp_results_train = srp.fit_transform(train)
srp_results_test = srp.transform(test)
for i in range(1, NCOMP+1):
    train['srp_' + str(i)] = srp_results_train[:,i-1]
    test['srp_' + str(i)] = srp_results_test[:, i-1]

In [8]:
# for each record, find the proportion of binary variables with value = 1
train['prop_binary_vars_equals_1'] = [np.mean(i) for i in train[binary_vars].values]
test['prop_binary_vars_equals_1'] = [np.mean(i) for i in test[binary_vars].values]

In [9]:
## Vars to drop
features = set(train.columns)
if REMOVE_ZERO_VAR:
    features = set(features)-set(zero_variance_vars)
if REMOVE_PERFECT_CORRELATED_VAR:
    features = set(features)-set(vars_perfect_corr)
if not USE_TSVD:
    features = set(features)-set(['tsvd_' + str(i) for i in range(1, NCOMP+1)])
if not USE_PCA:
    features = set(features)-set(['pca_' + str(i) for i in range(1, NCOMP+1)])
if not USE_ICA:
    features = set(features)-set(['ica_' + str(i) for i in range(1, NCOMP+1)])
if not USE_GRP:
    features = set(features)-set(['grp_' + str(i) for i in range(1, NCOMP+1)])
if not USE_SRP:
    features = set(features)-set(['srp_' + str(i) for i in range(1, NCOMP+1)])
if not USE_PROP_BINARY_VARS:
    features = set(features)-set(['prop_binary_vars_1'])
    
features = list(features)
print('Features selected : ')
print()
print(', '.join(features))

train = train[features]
test = test[features]

Features selected : 

X362, X186, grp_10, X36, X23, X76, X239, X131, X134, X2, ica_5, X238, X298, X317, X378, X101, X250, X199, X200, X179, X50, X61, tsvd_5, X138, X137, X171, X0, X205, X24, X305, pca_8, X176, ica_7, X13, X338, X364, X10, X29, X256, X353, X27, X197, X259, X65, X282, X154, X339, X187, X367, X219, X291, X232, ica_1, X270, X45, X155, X185, X216, X150, pca_2, X114, X184, X335, tsvd_2, X203, X234, X195, X350, X278, X194, X277, X336, grp_6, X153, X252, tsvd_7, X174, pca_9, X151, X323, grp_9, X373, X144, ica_10, X16, X321, X343, X340, X249, X78, tsvd_8, X344, X361, X226, X58, X103, X30, grp_8, X375, X162, X136, grp_2, X326, X118, X146, X82, X108, X368, X314, X341, X181, X127, X286, X287, X217, X304, grp_5, X167, X20, X351, X59, X95, X218, X172, X152, X311, X189, X300, X166, X192, X56, tsvd_10, X222, X4, X337, X130, X62, X14, X284, ica_8, X126, X276, srp_5, X320, X359, X358, X257, X160, X135, X124, X85, X8, X49, X148, X170, X306, X202, X70, X308, X360, X365, X96, X263, X281, X

In [11]:
## Parameter grid (edit if neccessary)
params = {}
params['n_estimators'] = [800]
params['learning_rate'] = [0.0040]
params['max_depth'] = [3]
params['subsample'] = [0.7]

In [14]:
## Parameter tuning using Grid Search

# xgb regressor
xgb_model = xgb.XGBRegressor(
    silent=1,
    objective='reg:linear',
    base_score=np.mean(y_train),
    random_state=SEED
)

# create r2 scorer
r2_scorer = metrics.make_scorer(r2_score, greater_is_better = True)

# grid search
model = GridSearchCV(
    estimator = xgb_model,
    param_grid = params,
    scoring = r2_scorer,
    cv = CVFOLD,
    verbose=100,
    n_jobs=-1
)

In [15]:
## Run grid search
model.fit(train,y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[CV] learning_rate=0.004, max_depth=3, n_estimators=800, subsample=0.7 


[CV] learning_rate=0.004, max_depth=3, n_estimators=800, subsample=0.7 


[CV] learning_rate=0.004, max_depth=3, n_estimators=800, subsample=0.7 


[CV] learning_rate=0.004, max_depth=3, n_estimators=800, subsample=0.7 


Pickling array (shape=(415,), dtype=object).
Memmaping (shape=(364, 4209), dtype=int64) to new file /dev/shm/joblib_memmaping_pool_12_140362642837120/12-140362641433376-83d3c55425e4384ef09a78e32aa3675c.pkl
Memmaping (shape=(51, 4209), dtype=float64) to new file /dev/shm/joblib_memmaping_pool_12_140362642837120/12-140362641433376-7a91064fa17a454727a22d994d6d1e88.pkl
Pickling array (shape=(364,), dtype=object).
Pickling array (shape=(51,), dtype=object).
Pickling array (shape=(364,), dtype=int64).
Pickling array (shape=(51,), dtype=int64).
Pickling array (shape=(4209,), dtype=float64).
Pickling array (shape=(3367,), dtype=int64).
Pickling array (shape=(842,), dtype=int64).
Pickling array (shape=(415,), dtype=object).
Memmaping (shape=(364, 4209), dtype=int64) to old file /dev/shm/joblib_memmaping_pool_12_140362642837120/12-140362641433376-83d3c55425e4384ef09a78e32aa3675c.pkl
Memmaping (shape=(51, 4209), dtype=float64) to old file /dev/shm/joblib_memmaping_pool_12_140362642837120/12-14036

[CV] learning_rate=0.004, max_depth=3, n_estimators=800, subsample=0.7 


Memmaping (shape=(364, 4209), dtype=int64) to old file /dev/shm/joblib_memmaping_pool_12_140362642837120/12-140362641433376-83d3c55425e4384ef09a78e32aa3675c.pkl
Memmaping (shape=(51, 4209), dtype=float64) to old file /dev/shm/joblib_memmaping_pool_12_140362642837120/12-140362641433376-7a91064fa17a454727a22d994d6d1e88.pkl
Pickling array (shape=(364,), dtype=object).
Pickling array (shape=(51,), dtype=object).
Pickling array (shape=(364,), dtype=int64).
Pickling array (shape=(51,), dtype=int64).
Pickling array (shape=(4209,), dtype=float64).
Pickling array (shape=(3368,), dtype=int64).
Pickling array (shape=(841,), dtype=int64).


[CV]  learning_rate=0.004, max_depth=3, n_estimators=800, subsample=0.7, score=0.536461, total= 1.2min


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.3min


[CV]  learning_rate=0.004, max_depth=3, n_estimators=800, subsample=0.7, score=0.619919, total= 1.2min


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.3min remaining:  1.9min


[CV]  learning_rate=0.004, max_depth=3, n_estimators=800, subsample=0.7, score=0.588419, total= 1.3min


[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  1.3min remaining:   50.5s


[CV]  learning_rate=0.004, max_depth=3, n_estimators=800, subsample=0.7, score=0.439725, total= 1.3min


[CV]  learning_rate=0.004, max_depth=3, n_estimators=800, subsample=0.7, score=0.596477, total= 1.3min


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.3min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBRegressor(base_score=100.66931812782134, booster='gbtree',
       colsample_bylevel=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=1, objective='reg:linear',
       random_state=12345, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=12345, silent=1, subsample=1),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [800], 'learning_rate': [0.004], 'max_depth': [3], 'subsample': [0.7]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(r2_score), verbose=100)

In [16]:
# best param config
print(model.best_score_)
print(model.best_params_)

0.556184897097
{'learning_rate': 0.004, 'max_depth': 3, 'n_estimators': 800, 'subsample': 0.7}


In [18]:
# generate predictions using best model
final_model = model.best_estimator_

y_pred = final_model.predict(test)
submission = pd.DataFrame({
    'id': test_ids.astype(np.int32), 
    'y': y_pred
})

In [19]:
submission.head()

Unnamed: 0,id,y
0,1,81.342201
1,2,94.54361
2,3,80.760086
3,4,79.72731
4,5,111.607407


In [20]:
submission.to_csv('submission.csv',index=False)