# gSearch

This notebook takes arbitrary input data and runs a complete grid search over XGBoost parameters. At the end, a dictionary with all optimal parameter:value pairs is returned.

In [None]:
import xgboost as xgb
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
seed = 1337

In [None]:
# Load the training features, test features and training target from pickle files.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
Xtr = pd.read_pickle('../X_train_stack.pkl')
Xte = pd.read_pickle('../X_test_stack.pkl')
y = pd.read_pickle('../y_train_.pkl')

# NOTE(review): the hold-out split and DMatrix construction below are disabled,
# yet later cells reference X_train, y_train and dtrain. Re-enable these lines
# (or define those names another way) before running those cells.
#X_train, X_test, y_train, y_test = train_test_split(Xtr, y, test_size=0.2)

#dtrain = xgb.DMatrix(X_train, label=y_train)
#dtest = xgb.DMatrix(X_test)

In [None]:
# Display the first rows of the training features as a quick sanity check.
Xtr.head()

In [None]:
def label_binday(hour_time):
    """Map an hour of the day onto a coarse part-of-day label.

    The argument is converted with ``int()`` first. Hours 0-6 give
    "early", 7-12 "morning", 13-17 "afternoon" and 18 or later "night".
    A negative hour matches no bucket and yields ``None``.
    """
    hour = int(hour_time)
    if hour < 0:
        # No bucket is defined for negative hours.
        return None
    if hour < 7:
        return "early"
    if hour < 13:
        return "morning"
    if hour < 18:
        return "afternoon"
    return "night"
# Replace the raw start/end hours with their part-of-day label in both sets.
for frame in (Xtr, Xte):
    for hour_col in ("time_start", "time_end"):
        frame[hour_col] = frame[hour_col].apply(label_binday)

# One-hot encode the labelled columns. Train and test are encoded
# independently, so a label missing from one set yields no column there.
Xtr = pd.get_dummies(Xtr, columns=["time_start", "time_end"])
Xte = pd.get_dummies(Xte, columns=["time_start", "time_end"])

In [None]:
# Remove the same three columns, in place, from the training and test features.
for frame in (Xtr, Xte):
    frame.drop(['9292', 'NA', 'funda'], axis=1, inplace=True)

In [None]:
def gSearch(X, y, params, tune_params):
    """Grid-search ``tune_params`` for an XGBoost classifier built from ``params``.

    Parameters
    ----------
    X, y
        Training features and target, passed unchanged to ``GridSearchCV.fit``.
    params : dict
        Base settings. Must contain 'learning_rate', 'n_estimator',
        'max_depth', 'min_child_weight', 'gamma', 'subsample',
        'colsample_bytree', 'objective', 'scale_pos_weight', 'seed' and
        'scoring'. 'reg_alpha' and 'reg_lambda' are forwarded to the
        classifier when present, so regularisation values tuned in an
        earlier step are not silently dropped by a later search.
    tune_params : dict
        Parameter grid handed to ``GridSearchCV`` as ``param_grid``.

    Returns
    -------
    GridSearchCV
        The fitted search object. The winning combination is available as
        ``best_params_`` and is also printed.
    """
    # 'n_estimator' (singular) is the key the callers' dictionaries use.
    model_kwargs = {
        'learning_rate': params['learning_rate'],
        'n_estimators': params['n_estimator'],
        'max_depth': params['max_depth'],
        'min_child_weight': params['min_child_weight'],
        'gamma': params['gamma'],
        'subsample': params['subsample'],
        'colsample_bytree': params['colsample_bytree'],
        'objective': params['objective'],
        'scale_pos_weight': params['scale_pos_weight'],
        'seed': params['seed'],
    }
    for optional_key in ('reg_alpha', 'reg_lambda'):
        if optional_key in params:
            model_kwargs[optional_key] = params[optional_key]
    model = xgb.XGBClassifier(**model_kwargs)

    # ``iid`` is deliberately not passed: the argument was removed from
    # GridSearchCV in scikit-learn 0.24 (passing it raises TypeError), and the
    # unweighted fold averaging requested by ``iid=False`` is what
    # scikit-learn does by default since 0.22.
    gsearch = GridSearchCV(estimator=model, param_grid=tune_params,
                           scoring=params['scoring'], n_jobs=-1, verbose=5)

    gsearch.fit(X, y)
    print(gsearch.best_params_)

    # Callers read ``best_params_`` from the fitted search object.
    return gsearch

def updatePars(base, new):
    """Copy every key/value pair of ``new`` into ``base`` and return ``base``.

    ``base`` is modified in place; keys it already holds are overwritten.
    """
    base.update(new)
    return base

In [None]:
# Baseline settings for a single XGBoost classifier. 'scoring', 'silent' and
# 'num_parallel_tree' are stored here but not passed to the classifier below.
params = {
    'objective': 'binary:logistic',
    'scoring': 'roc_auc',
    'learning_rate': 0.08,
    'max_depth': 7,
    'min_child_weight': 1.5,
    'scale_pos_weight': 1,
    'subsample': 0.5,
    'colsample_bytree': 0.4,
    'seed': 1337,
    'silent': 1,
    'num_parallel_tree': 1,
    'gamma': 0,
    'n_estimator': 700
}

model = xgb.XGBClassifier(learning_rate = params['learning_rate'], n_estimators = params['n_estimator']
                              , max_depth = params['max_depth'], min_child_weight = params['min_child_weight']
                              , gamma = params['gamma'], subsample = params['subsample']
                              , colsample_bytree = params['colsample_bytree'], objective = params['objective']
                              , scale_pos_weight = params['scale_pos_weight'], seed = params['seed'])



from sklearn.model_selection import KFold

# One probability matrix for the full test set per fold.
preds = []

kf=KFold(n_splits=3)
for i, (tr, te) in enumerate(kf.split(Xtr)):
    print(i)
    # KFold yields positional row numbers. The features are selected with
    # .iloc, so the target must be selected by position too: ``y[tr]`` is
    # label-based for a pandas Series and picks the wrong rows (or fails)
    # whenever its index is not 0..n-1.
    model.fit(Xtr.iloc[tr], np.asarray(y)[tr])
    # The held-out part ``te`` is not used; every fold model scores Xte.
    preds.append(model.predict_proba(Xte))
# Feature importances of the model fitted on the last fold.
model.feature_importances_

In [None]:
# Write every element of ``preds`` to its own numbered CSV file.
for fold_no, fold_probs in enumerate(preds):
    out_name = '9999999pred{}.csv'.format(fold_no)
    pd.DataFrame(fold_probs).to_csv(out_name, index=False)





In [None]:
# NOTE(review): ``a`` is not defined in any visible cell, so evaluating it
# raises NameError. Possibly a deliberate stop for "Run All" - confirm or remove.
a

In [None]:
# Predict class labels for the test set with the current ``model`` and write
# them to a CSV file in the home directory.
te_preds = model.predict(Xte)

pd.Series(te_preds).to_csv('~/xgb_te_preds.csv')

In [None]:
# Flatten the matrices in ``preds`` into one table row by row, keep the second
# column, round it and write it to a CSV file in the home directory.
stacked_rows = [row for fold_probs in preds for row in fold_probs]
round(pd.DataFrame(stacked_rows).iloc[:, 1]).to_csv('~/xgb_tr_preds.csv')

In [None]:
# Persist the current training and test feature frames in the working directory.
Xtr.to_pickle("X_train1.pkl")
Xte.to_pickle("X_test1.pkl")

In [None]:
# Columns that exist in the training features but not in the test features.
train_cols = set(Xtr.columns.ravel())
test_cols = set(Xte.columns.ravel())
list(train_cols - test_cols)

In [None]:
# Drop a hard-coded list of columns from the training features, in place.
# NOTE(review): presumably these are the train-only columns found by the
# set difference above - verify before re-running on new data.
Xtr.drop(['quizonaut',
 'fitbit',
 'gayboystube',
 'fundingcircle',
 'westpac',
 'discretecontacten',
 'pogo',
 'emojipedia',
 'siemens',
 'btcclicks',
 'ojooo'],
         axis=1, inplace=True)

In [None]:
# Drop one more column from the training features, in place.
Xtr.drop(["myfreeshares"],
         axis=1, inplace=True)

In [None]:
# Shape of the test features before the column drop in the next cell.
Xte.shape

In [None]:
# Drop a hard-coded list of columns from the test features, in place.
# NOTE(review): presumably these are columns that occur only in the test set -
# verify against the training columns before re-running on new data.
Xte.drop(["op-vlieland", "vlinderscrime", "demotywatory", "chartbeat", "usenet-4all", "conforama",
          "beterrekenen", "tourdefrancepro", "farmerama", "free-spider-solitaire", "dreamwidth", "mabanque",
          "annieslife", "bildkontakte", "memrise", "netvibes", "omniboxes", "ighome", "experian", "t411",
          "ipsedebruggen", "noordik", "betegy", "pornsos", "so-v", "digicert", "television-envivo"],
         axis=1, inplace=True)

In [None]:
# Shape of the test features after the column drop in the previous cell.
Xte.shape

In [None]:
import time

# Exhaustive 3-fold grid search (two candidate values per parameter) over
# seven XGBoost parameters, scored by ROC AUC and timed in whole seconds.
t0 = time.time()
print("Parameter optimization")
xgb_model = xgb.XGBClassifier(seed=2017, objective="binary:logistic")
search_grid = {
    'max_depth': [4, 7],  # 3, 5, 8
    'n_estimators': [100, 400],  # 50,100,200
    'learning_rate': [0.01, 0.3],
    'min_child_weight': [1, 1.5],  # 1, 1.5, 5
    'gamma': [0, 0.1],  # 0, 0.1
    'subsample': [0.5, 0.75],  # 0.5, 0.75, 1
    'colsample_bytree': [0.4, 0.7],  # 0.5, 0.75, 1,
}
clf = GridSearchCV(xgb_model, search_grid, verbose=3, cv=3, scoring='roc_auc', n_jobs=-1)
clf.fit(Xtr, y)

elapsed_seconds = int(time.time() - t0)
print(str(elapsed_seconds))
print("Best score:")
print(clf.best_score_)
print("Best parameters:")
print(clf.best_params_)

In [None]:
# Grid search over four ExtraTrees parameters, two candidate values each.
# Every constructor value below except nothing is left untouched: all four are
# also grid keys, so the search overrides them.
# NOTE(review): no ``scoring`` or ``cv`` is passed, so GridSearchCV uses its
# defaults, unlike the XGBoost searches in this notebook which score by
# 'roc_auc'. This also overwrites ``clf`` from the previous search.
e = ExtraTreesClassifier(max_features=0.3, min_samples_leaf=11, min_samples_split=4, n_estimators=100)
parameters = {'max_features':[0.3, 0.6], 'min_samples_leaf':[5, 10],
              'min_samples_split':[4, 7], 'n_estimators':[100, 300]}
clf = GridSearchCV(e, parameters, verbose=3, n_jobs=-1)
clf.fit(Xtr, y)

In [None]:
# Write the second column of ``preds`` to CSV without the index.
# NOTE(review): ``preds[:, 1]`` needs a 2-D array, but the K-fold cell leaves
# ``preds`` as a list and the cell assigning ``model.predict_proba(Xte)`` comes
# after this one, so this only works when that cell has been run first.
pd.Series(preds[:, 1]).to_csv('xgb_preds.csv', index=False)

In [None]:
# Class probabilities for the test set from the current ``model``; this
# replaces whatever ``preds`` held before.
preds = model.predict_proba(Xte)

In [None]:
# Write the second column of ``preds`` to the submission CSV without the index.
pd.DataFrame(preds).iloc[:, 1].to_csv('subm_xgb_aeron3.csv', index=False)

## n_estimator

In [None]:
# Set initial parameters and find optimal number of boosting rounds

# Booster-level settings for the native xgb.cv API.
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'seed': seed,
    'silent': 1,
    'num_parallel_tree': 1
}

# 4-fold CV for up to 5000 rounds with early stopping after 15 rounds without
# improvement; maximize=True matches the 'auc' metric (higher is better).
# NOTE(review): ``dtrain`` is only created in a commented-out line of the
# data-loading cell, so it must be defined before this cell can run.
res = xgb.cv(xgb_params,
             dtrain,
             num_boost_round=5000,
             nfold=4,
             seed=seed,
             stratified=False,
             early_stopping_rounds=15,
             verbose_eval=20,
             show_stdv=True,
             maximize=True)

## Hyperparameter tuning

In [None]:
# formulate initial parameters
# 'scoring' is read by the GridSearchCV calls, not by XGBoost. 'n_estimator'
# (singular) is the key the search code reads and maps onto the classifier's
# ``n_estimators`` argument. 'silent' and 'num_parallel_tree' are not passed to
# the classifier; 'num_parallel_tree' is later copied into ``xgb_params``.
params = {
    'objective': 'binary:logistic',
    'scoring': 'roc_auc',
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'scale_pos_weight': 1,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'seed': 1337,
    'silent': 1,
    'num_parallel_tree': 1,
    'gamma': 0,
    'n_estimator': 1000
}

### max_depth and min_child_weight

In [None]:
# formulate the grid: coarse sweep over max_depth (3, 5, 7) and
# min_child_weight (1, 3, 5)
tunePar = {
 'max_depth': list(range(3,8,2)),
 'min_child_weight': list(range(1,6,2))}

model = xgb.XGBClassifier(learning_rate = params['learning_rate'], n_estimators = params['n_estimator']
                          , max_depth = params['max_depth'], min_child_weight = params['min_child_weight']
                          , gamma = params['gamma'], subsample = params['subsample']
                          , colsample_bytree = params['colsample_bytree'], objective = params['objective']
                          , scale_pos_weight = params['scale_pos_weight'], seed = params['seed'])

# define gridsearch
# ``iid`` is deliberately not passed: it was removed from GridSearchCV in
# scikit-learn 0.24 (passing it raises TypeError), and the unweighted fold
# averaging requested by ``iid=False`` is the default since 0.22.
gsearch = GridSearchCV(estimator=model, param_grid=tunePar, scoring=params['scoring']
                       , n_jobs = -1, verbose = 5)

# obtain optimal parameters
gsearch.fit(X_train, y_train)

# The next cell reads the coarse optimum from ``tmp.best_params_``. This cell
# is the inlined form of ``tmp = gSearch(X_train, y_train, params, tunePar)``
# (minus the print), so expose the fitted search under that name.
tmp = gsearch




In [None]:
# Refine around the previous optimum: test each best value together with its
# two immediate neighbours (-1 and +1), then merge the winners into ``params``.
best_depth = tmp.best_params_['max_depth']
best_weight = tmp.best_params_['min_child_weight']
tunePar = {
    'max_depth': [best_depth - 1, best_depth, best_depth + 1],
    'min_child_weight': [best_weight - 1, best_weight, best_weight + 1],
}
tmp = gSearch(X_train, y_train, params, tunePar)

params = updatePars(params, tmp.best_params_)

### gamma

In [None]:
# Search gamma over 0.0, 0.1, 0.2, 0.3 and 0.4, then merge the best value
# into ``params``.
tunePar = {'gamma': [step / 10.0 for step in range(5)]}
tmp = gSearch(X_train, y_train, params, tunePar)
params = updatePars(params, tmp.best_params_)

## Update n_estimator

In [None]:
# Carry the values tuned so far over into the native-API parameter dict.
pars = ['scale_pos_weight','gamma','colsample_bytree','max_depth'
        ,'subsample','num_parallel_tree','min_child_weight']
for par in pars:
    xgb_params[par] = params[par]

# Re-run 4-fold CV (up to 750 rounds, early stopping after 15 rounds without
# improvement) with the tuned parameters.
# ``xgb_params`` evaluates with 'auc', where higher is better, so early
# stopping must maximise the metric, as the initial xgb.cv call does.
# With maximize=False the stopping rule would favour the worst round.
res = xgb.cv(xgb_params,
             dtrain,
             num_boost_round=750,
             nfold=4,
             seed=seed,
             stratified=False,
             early_stopping_rounds=15,
             verbose_eval=20,
             show_stdv=True,
             maximize=True)

### subsample and colsample_bytree 

In [None]:
# Coarse search: subsample 0.5-0.9 and colsample_bytree 0.3-0.9 in 0.1 steps.
tunePar = {
 'subsample': [i/10.0 for i in range(5,10)],
 'colsample_bytree': [i/10.0 for i in range(3,10)]
}
tmp = gSearch(X_train, y_train, params, tunePar)
params = updatePars(params, tmp.best_params_)

# Fine search in 0.05 steps from (best - 0.15) up to (best + 0.10). Each
# parameter is centred on its OWN coarse optimum; previously the
# colsample_bytree grid was built around the best subsample value.
# round() before int() guards against float products such as 56.999... being
# truncated to the wrong percent.
best_subsample_pct = int(round(tmp.best_params_['subsample'] * 100))
best_colsample_pct = int(round(tmp.best_params_['colsample_bytree'] * 100))
tunePar = {
 'subsample': [i/100.0 for i in range(best_subsample_pct - 15
                                      , best_subsample_pct + 15, 5)],
 'colsample_bytree': [i/100.0 for i in range(best_colsample_pct - 15
                                             , best_colsample_pct + 15, 5)]
}
tmp = gSearch(X_train, y_train, params, tunePar)
params = updatePars(params, tmp.best_params_)

### alpha

In [None]:
# Search five candidate values for ``reg_alpha`` and merge the best one into
# ``params``.
tunePar = {
 'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05]
}
tmp = gSearch(X_train, y_train, params, tunePar)
params = updatePars(params, tmp.best_params_)

## Results
This dictionary contains all optimal parameter values

In [None]:
# Display the parameter dictionary with all tuned values merged in.
params