#  RANDOM SEARCH CV - XGBOOST CLASSIFIER

In [None]:
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn import datasets
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
from datetime import datetime
from time import time

### GLOBAL VARIABLES

In [None]:
# Where the pickled training data is read from and where results are written.
DATAPATH = "../../../data/train_test/"
MODELPATH = "../../../models/xgboost/"

# Seed shared by the xgboost estimator, xgb.cv and the randomized search.
SEED = 47

# Search budget: sampled parameter settings, CV folds and scoring string.
NITER = 100
CV = 3
SCORE = "roc_auc"

# When True, the DMatrix is built with -9999 declared as the missing marker.
handlingnull = False

# Parallelism / hardware switches.
# NOTE(review): NTHREADS is not referenced by any cell visible here.
NJOBS = -1
NTHREADS = 4
USEGPU = False

### LOAD DATASET

In [None]:
# Load the training features from the pickle stored under DATAPATH.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
train_features = pd.read_pickle(DATAPATH+'X_train.pkl')

In [None]:
# Load the training labels pickle and keep only its 'target' entry.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
train_labels = pd.read_pickle(DATAPATH+'y_train.pkl')['target']

In [None]:
# Notebook display: dimensions of the loaded features.
train_features.shape

In [None]:
# Notebook display: dimensions of the loaded labels.
train_labels.shape

In [None]:
# Build the DMatrix that xgb.cv consumes further down.
# Without null handling the raw arrays are passed; with it, the pandas objects
# are passed and -9999 is declared as the missing-value marker.
if not handlingnull:
    xgtrain = xgb.DMatrix(train_features.values, train_labels.values)
else:
    xgtrain = xgb.DMatrix(train_features, train_labels, missing=-9999)

### SET UP HYPERPARAMETERS

[xgboost params](https://xgboost.readthedocs.io/en/latest/python/python_api.html)

In [None]:
# ======== General Parameters ======= #

# Type of model to run at each iteration: gbtree or gblinear.
booster = 'gbtree'


# ======== Booster Parameters ======== #

# Learning rate (analogous to the learning rate in GBM).
# Typical final values: 0.01-0.2
eta = [0.01]

# Minimum sum of observation weights required in a child: 1, 3, 5, 7, 9.
min_child_weight = list(range(1, 10, 2))

# Maximum depth of a tree: 3, 5, 7, 9.
max_depth = list(range(3, 10, 2))

# Minimum loss reduction required to make a split; a node is split only when
# the split gives a positive reduction in the loss function: 0.0 .. 0.4.
gamma = [tenths / 10.0 for tenths in range(0, 5)]

# Fraction of observations randomly sampled for each tree: 0.6 .. 0.9.
subsample = [tenths / 10.0 for tenths in range(6, 10)]

# Fraction of columns randomly sampled for each tree: 0.6 .. 0.9.
colsample_bytree = [tenths / 10.0 for tenths in range(6, 10)]

# L2 regularization term on weights (analogous to Ridge regression): 0.4 .. 0.9.
reg_lambda = [tenths / 10.0 for tenths in range(4, 10)]

# L1 regularization term on weights (analogous to Lasso regression).
reg_alpha = [0, 0.001, 0.005, 0.01, 0.05]

# Control the balance of positive and negative weights, useful for unbalanced classes.
# A typical value to consider: sum(negative instances) / sum(positive instances).
# Assumes the labels are encoded as 0/1 so that their sum counts the positives -- TODO confirm.
n_positive = np.sum(train_labels.values)
if n_positive == 0:
    raise ValueError('train_labels contains no positive instances; cannot compute scale_pos_weight')
# Keep the ratio as a float: truncating it with int() skews the weight and
# collapses it to 0 whenever there are fewer negatives than positives.
scale_pos_weight = float(len(train_labels) - n_positive) / float(n_positive)


# ======== Learning Task Parameters ======== #

# Loss function to be minimized. Common choices:
# - binary:logistic - logistic regression for binary classification,
#   returns predicted probability (not class)
# - multi:softmax - multiclass classification using the softmax objective,
#   returns predicted class (not probabilities); also requires num_class
#   (the number of unique classes)
# - multi:softprob - same as softmax, but returns the predicted probability
#   of each data point belonging to each class
objective = "binary:logistic"


# Metric used on the validation data. Common choices:
# - rmse - root mean square error
# - mae - mean absolute error
# - logloss - negative log-likelihood
# - error - binary classification error rate (0.5 threshold)
# - merror - multiclass classification error rate
# - mlogloss - multiclass logloss
# - auc - area under the curve
eval_metric = "auc"

### SET UP RANDOM SEARCH GRID

In [None]:
# Candidate values per hyperparameter, keyed by the XGBClassifier argument
# name; RandomizedSearchCV samples its settings from this mapping.
random_grid = dict(
    learning_rate=eta,
    min_child_weight=min_child_weight,
    max_depth=max_depth,
    gamma=gamma,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    reg_lambda=reg_lambda,
    reg_alpha=reg_alpha,
)

In [None]:
# Notebook display: show the assembled search grid.
random_grid

### FIND NUM BOOST

In [None]:
# Value passed to xgb.cv as early_stopping_rounds when looking for the number of boosting rounds.
early_stopping_rounds = 50

In [None]:
# Temporary estimator used only to derive the native parameter dict for xgb.cv.
# NOTE(review): no learning rate is set here, so xgb.cv runs at the library
# default while the search grid fixes learning_rate to eta -- confirm that
# n_estimators is meant to be tuned at a different rate.
model = XGBClassifier(
    seed=SEED,
    booster=booster,
    objective=objective,
    scale_pos_weight=scale_pos_weight,
    nthread=NJOBS,
)
xgb_param = model.get_xgb_params()
xgb_param['objective'] = objective

# Switch to the GPU histogram tree method on device 0 when requested.
if USEGPU:
    xgb_param.update(tree_method='gpu_hist', gpu_id=0)


In [None]:
# Cross-validate with early stopping to find how many boosting rounds to use.
# stratified=True keeps the class ratio in every fold, matching the stratified
# folds RandomizedSearchCV uses for a classifier when cv is an integer.
cvresult = xgb.cv(
    xgb_param,
    xgtrain,
    num_boost_round=1000,
    nfold=CV,
    stratified=True,
    metrics=eval_metric,
    early_stopping_rounds=early_stopping_rounds,
    seed=SEED,
)

In [None]:
# The number of rows in the CV result is taken as the number of boosting rounds.
# NOTE(review): presumably xgb.cv returns one row per retained round -- confirm.
n_estimators = len(cvresult)

In [None]:
# Report the number of boosting rounds derived from the CV result.
print("Best number of boosters: ", n_estimators)

### SET UP MODEL

In [None]:
# Base estimator handed to the randomized search; the grid supplies the tuned
# hyperparameters on top of these fixed settings.
# seed and booster mirror the estimator used to find n_estimators, so that
# row/column subsampling is reproducible and both stages use the same booster.
model = XGBClassifier(
    n_estimators=n_estimators,
    scale_pos_weight=scale_pos_weight,
    objective=objective,
    booster=booster,
    seed=SEED,
)

In [None]:
# Run the search estimator with the GPU histogram tree method on device 0 when requested.
if USEGPU:
    model.set_params(gpu_id=0, tree_method='gpu_hist')

In [None]:
# Configure the randomized search (fitting happens in a later cell):
# NITER settings are drawn from random_grid, each evaluated with CV-fold
# cross-validation on SCORE, using NJOBS parallel jobs.
xgboost_rsearch = RandomizedSearchCV(
    estimator=model,
    param_distributions=random_grid,
    n_iter=NITER,
    scoring=SCORE,
    cv=CV,
    n_jobs=NJOBS,
    random_state=SEED,
    verbose=2,
)


### SEARCH BEST HYPERPARAMETERS

In [None]:
# Fit the random search model and report how long it took.
start = time()
xgboost_rsearch.fit(train_features, train_labels)
elapsed_seconds = time() - start
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % (elapsed_seconds, NITER))

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  27 tasks      | elapsed: 135.2min
[Parallel(n_jobs=7)]: Done 148 tasks      | elapsed: 608.9min


#### Best estimator

In [None]:
# Show the estimator exposed by the fitted search as best_estimator_.
print(xgboost_rsearch.best_estimator_)

#### Best parameter

In [None]:
# Copy the winning parameter setting so that later additions to best_params
# (such as n_estimators) do not mutate the fitted search object's attribute.
best_params = dict(xgboost_rsearch.best_params_)

In [None]:
# n_estimators was fixed on the base model rather than searched, so add it
# explicitly to the parameters that get saved.
best_params['n_estimators'] = n_estimators

In [None]:
# Show the final parameter dictionary.
print(best_params)

In [None]:
# Report the best mean cross-validation score for the configured metric.
# The metric name is formatted into the label so no stray space is printed
# before the closing parenthesis.
print(f'Best cross validation score ({SCORE}): ', xgboost_rsearch.best_score_)

### SAVE HYPERPARAMETERS AND RESULTS

In [None]:
# Put the search's cv_results_ into a DataFrame for export.
cv_results = pd.DataFrame(xgboost_rsearch.cv_results_)

In [None]:
# Write the results as a ';'-separated CSV under MODELPATH/result/, with today's date in the file name.
cv_results.to_csv(MODELPATH + 'result/rsearch_xgboost_classifier_d' + str(datetime.now().date()) + '.csv',sep=';',index=False)

In [None]:
# Save the best-parameter dictionary under MODELPATH/hyperparameter/, with today's date in the file name.
# NOTE(review): np.save on a dict presumably stores a pickled object array, so
# loading it likely needs np.load(..., allow_pickle=True) -- confirm.
# NOTE(review): the file name is spelled 'rseach' here but 'rsearch' in the CSV
# export; check which readers depend on this name before renaming.
np.save(MODELPATH + 'hyperparameter/rseach_xgboost_classifier_bestparams_d' + str(datetime.now().date()) + '.npy', best_params)

In [None]:
# Save the best estimator object under MODELPATH/result/, with today's date in the file name.
# NOTE(review): np.save on an estimator presumably pickles it inside an object
# array; loading likely needs allow_pickle=True and a compatible xgboost
# version -- confirm.
# NOTE(review): the file name is spelled 'rseach' here but 'rsearch' in the CSV
# export; check which readers depend on this name before renaming.
np.save(MODELPATH + 'result/rseach_xgboost_classifier_best_estimator_d' + str(datetime.now().date()) + '.npy', xgboost_rsearch.best_estimator_)