# XGBOOST -  CLASSIFICATION - BAYESIAN OPTIMIZATION

In [2]:
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn import datasets
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from bayes_opt import BayesianOptimization
import pandas as pd
from datetime import datetime
from time import time

### GLOBAL VARIABLES

In [32]:
DATAPATH = 'data/train_test/'
SEED = 47
NITER = 20
CV = 3
SCORE = 'roc_auc'
handlingnull = False
NJOBS = -1

### LOAD DATASET

In [4]:
train_features = pd.read_pickle(DATAPATH+'X_train.pkl')

In [5]:
train_labels = pd.read_pickle(DATAPATH+'y_train.pkl')['target']

In [6]:
train_features.shape

(148865, 1800)

In [7]:
### create a DMatrix and handling Null values
if handlingnull:
    #train_features[np.isnan(train_features)] = -9999
    xgtrain = xgb.DMatrix(train_features, train_labels, missing=-9999)
else:
    xgtrain = xgb.DMatrix(train_features.values, train_labels.values)

### TRAIN MODEL

#### Set general hyperparameters

In [27]:
# ======== General Parameters ======= #

# Select the type of model to run at each iteration. gbtree or gblinear.
booster = 'gbtree'


# ======== Booster Parameters ======== # 

# Analogous to learning rate in GBM. 
# Typical final values to be used: 0.01-0.2
eta = 0.01


# Control the balance of positive and negative weights, useful for unbalanced classes. 
# A typical value to consider: sum(negative instances) / sum(positive instances)scale_pos_weight = 1
scale_pos_weight = int((len(train_labels) - np.sum(train_labels.values))/np.sum(train_labels.values))


# Learning Task Parameters

# This defines the loss function to be minimized. 
# - binary:logistic –logistic regression for binary classification, returns predicted probability (not class)
# - multi:softmax –multiclass classification using the softmax objective, returns predicted class (not probabilities)
#   you also need to set an additional num_class (number of classes) parameter defining the number of unique classes
# - multi:softprob –same as softmax, but returns predicted probability of each data point belonging to each class.
objective  = 'binary:logistic'


# The metric to be used for validation data.
# - rmse – root mean square error
# - mae – mean absolute error
# - logloss – negative log-likelihood
# - error – Binary classification error rate (0.5 threshold)
# - merror – Multiclass classification error rate
# - mlogloss – Multiclass logloss
# - auc: Area under the curve
eval_metric = 'auc'

[xgboost params](https://xgboost.readthedocs.io/en/latest/python/python_api.html)

#### Find num boost

In [9]:
early_stopping_rounds = 10

In [10]:
model=XGBClassifier(seed=SEED, booster=booster, objective=objective,  scale_pos_weight = scale_pos_weight, nthread=NJOBS)
xgb_param = model.get_xgb_params()
xgb_param['objective'] = objective

if USEGPU:
    xgb_param['tree_method'] = 'gpu_hist'
    xgb_param['gpu_id'] = 0


In [11]:
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round = 1000, nfold = CV, metrics = eval_metric, early_stopping_rounds = early_stopping_rounds, seed = SEED)

In [12]:
n_estimators = cvresult.shape[0]

In [13]:
print("Best number of boosters: ", n_estimators)

Best number of boosters:  265


#### Bayesian optimization hyperparameters

In [33]:
# Define search space
pds ={
    # Defines the minimum sum of weights of all observations required in a child.
    'min_child_weight':(14, 20),
    
    # A node is split only when the resulting split gives a positive reduction in the loss function. 
    # Gamma specifies the minimum loss reduction required to make a split.
    'gamma':(0, 5),
    
    # Denotes the fraction of observations to be randomly samples for each tree.
    'subsample':(0.5, 1),
    
    # Denotes the fraction of columns to be randomly samples for each tree.
    'colsample_bytree':(0.1, 1),
    
    # The maximum depth of a tree
    'max_depth': (5, 10)
}

In [34]:
# Define Objective Function
def hyp_xgb(max_depth, subsample, colsample_bytree,min_child_weight, gamma):
    params = {
    'booster': booster,
    'n_estimators': n_estimators,
    'eta': eta,
    'objective': objective,
    'eval_metric':eval_metric, # Optional --> Use eval_metric if you want to stop evaluation based on eval_metric 
    'silent': 1
     }
    
    
    params['max_depth'] = int(round(max_depth))
    params['subsample'] = max(min(subsample, 1), 0)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['min_child_weight'] = int(min_child_weight)
    params['gamma'] = max(gamma, 0)

    
    
    cv_result = xgb.cv(params, xgtrain, num_boost_round=1000,verbose_eval=False, early_stopping_rounds=10, metrics = eval_metric, maximize=True, nfold=CV)
    return  max(cv_result['test-auc-mean'])

In [35]:
optimizer = BayesianOptimization(hyp_xgb, pds, random_state=7)

In [None]:
# Optimize
optimizer.maximize(init_points=5, n_iter=NITER)

|   iter    |  target   | colsam... |   gamma   | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.8013  [0m | [0m 0.1687  [0m | [0m 3.9     [0m | [0m 7.192   [0m | [0m 18.34   [0m | [0m 0.989   [0m |


#### Best parameter

In [62]:
optimizer.max['params']

{'subsample': 0.7,
 'reg_lambda': 0.7,
 'reg_alpha': 0.05,
 'min_child_weight': 9,
 'max_depth': 9,
 'learning_rate': 0.01,
 'gamma': 0.4,
 'colsample_bytree': 0.6}

#### Best Score

#### Saving best hyperparameters

In [65]:
np.save('output/models/bayesianoptcv_xgboost_classifier_bestparams_d' + str(datetime.now().date()) + '.npy', optimizer.max['params'])