# BUILD - XGBOOST CLASSIFIER

In [None]:
import numpy as np
from sklearn import datasets
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import pandas as pd
from datetime import datetime
from time import time
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve

### GLOBAL VARIABLES

In [None]:
# --- Locations (relative to this notebook) ---
# Directory the feature/label pickles are read from.
DATAPATH = '../../../data/features/'
# Directory the tuned hyperparameters are read from and the model is saved to.
MODELPATH = '../../../models/xgboost/'

# --- Reproducibility and threading ---
# Copied into the booster parameters as 'seed'.
SEED = 47
# Copied into the booster parameters as 'nthread'.
NTHREADS = 4

# --- Data handling ---
# When True, -9999 is passed as the `missing` value while building the DMatrix.
handlingnull = False

# --- Reporting ---
# Label printed next to the training score.
SCORE = 'roc_auc'

# --- Settings not referenced by the visible cells ---
# NOTE(review): presumably leftovers from the hyperparameter-search notebook
# (iterations, CV folds, parallel jobs, GPU switch, class count) -- confirm.
NITER = 100
CV = 5
NJOBS = -1
USEGPU = False
NCLASS = 0

### LOAD DATASET

In [None]:
# Feature matrix for training, read from the pickled file under DATAPATH.
train_features = pd.read_pickle(
    DATAPATH + 'X.pkl'
)

In [None]:
# Display the dimensions of the loaded feature matrix as the cell output.
train_features.shape

In [None]:
# Training labels: only the 'target' column is kept; the double brackets keep
# the result two-dimensional (a one-column selection rather than a Series).
train_labels = pd.read_pickle(
    DATAPATH + 'y.pkl'
)[['target']]

In [None]:
### Build the training DMatrix (features + labels).
# When `handlingnull` is set, the value -9999 is declared as the marker for
# missing entries; otherwise the library default for `missing` is used.
xgtrain = (
    xgb.DMatrix(train_features.values, train_labels.values, missing=-9999)
    if handlingnull
    else xgb.DMatrix(train_features.values, train_labels.values)
)

### SET UP HYPERPARAMETERS

In [None]:
# ======== General Parameters ======= #

# Type of model to run at each boosting iteration: 'gbtree' or 'gblinear'.
booster = 'gbtree'


# ======== Booster Parameters ======== #

# Learning rate (analogous to the learning rate in GBM).
# Typical final values to be used: 0.01-0.2
# NOTE(review): this is a one-element list and the visible cells never copy it
# into model_param -- presumably a leftover search grid; confirm.
eta = [0.01]


# Controls the balance of positive and negative weights, useful for unbalanced classes.
# A typical value to consider: sum(negative instances) / sum(positive instances).
# The ratio is kept as a float: truncating it with int() distorts the weight
# (e.g. 3.9 -> 3) and collapses it to 0 whenever positives outnumber negatives.
# Assumes the 'target' column is coded 0/1, so its sum is the positive count.
n_positive = float(np.sum(train_labels.values))
n_negative = float(len(train_labels)) - n_positive
if not n_positive > 0:
    # Also triggers on NaN; fail with a clear message instead of dividing by zero.
    raise ValueError(
        'scale_pos_weight is undefined: train_labels contains no positive instances'
    )
scale_pos_weight = n_negative / n_positive


# ======== Learning Task Parameters ======== #

# Loss function to be minimized.
# - binary:logistic - logistic regression for binary classification, returns predicted probability (not class)
# - multi:softmax - multiclass classification using the softmax objective, returns predicted class (not probabilities)
#   you also need to set an additional num_class (number of classes) parameter defining the number of unique classes
# - multi:softprob - same as softmax, but returns predicted probability of each data point belonging to each class.
objective = 'binary:logistic'


# Metric to be used for validation data.
# - rmse - root mean square error
# - mae - mean absolute error
# - logloss - negative log-likelihood
# - error - binary classification error rate (0.5 threshold)
# - merror - multiclass classification error rate
# - mlogloss - multiclass logloss
# - auc - area under the curve
# NOTE(review): the visible cells never add this value to model_param -- confirm
# whether it is meant to be passed to the booster.
eval_metric = 'auc'

In [None]:
# Read the best hyperparameters found by a random search or a Bayesian
# optimization run. This step is optional.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, so only
# load files that come from a trusted source.
model_param = np.load(
    MODELPATH + 'hyperparameter/rseach_xgboost_classifier_bestparams_dYYYY-MM-DD.npy',
    allow_pickle=True,
).tolist()

In [None]:
# Display the loaded hyperparameters as the cell output for inspection.
model_param

In [None]:
# Round these two hyperparameters to the nearest whole number and store them
# back as plain Python ints.
for int_param in ('max_depth', 'min_child_weight'):
    model_param[int_param] = int(np.round(model_param[int_param], 0))


In [None]:
# Overlay the fixed settings defined earlier in the notebook on top of the
# loaded hyperparameters (existing keys with the same name are overwritten).
# NOTE(review): `eta` and `eval_metric` are defined earlier but are not applied
# here -- confirm whether that is intentional.
model_param.update({
    'seed': SEED,
    'booster': booster,
    'objective': objective,
    'scale_pos_weight': scale_pos_weight,
    'nthread': NTHREADS,
})

### TRAIN MODEL

In [None]:
# xgb.train() takes the number of boosting rounds through `num_boost_round`
# (default 10); an 'n_estimators' entry inside the parameter dict is not used by
# the native API. If the loaded hyperparameters carry 'n_estimators' (as a search
# over the scikit-learn wrapper would produce), translate it here; otherwise keep
# the library default of 10 rounds.
num_boost_round = int(np.round(model_param.get('n_estimators', 10), 0))
# Pass a copy without 'n_estimators' so model_param itself is left untouched.
booster_params = {
    name: value for name, value in model_param.items() if name != 'n_estimators'
}
model = xgb.train(
    booster_params,
    xgtrain,
    num_boost_round=num_boost_round,
    verbose_eval=False,
)

In [None]:
# Score the same DMatrix the model was trained on (in-sample predictions).
y_pred = model.predict(xgtrain)

In [None]:
# Report the ROC AUC on the training data; SCORE only supplies the printed label.
print(
    'Train score (', SCORE, '): ',
    roc_auc_score(train_labels.values, y_pred),
)

### SAVE MODEL

In [None]:
# Persist the trained booster under MODELPATH.
# NOTE(review): recent XGBoost releases recommend a '.json' or '.ubj' file
# extension for saved models -- confirm against the installed version.
model.save_model(
    MODELPATH + 'xgb_XXX.model'
)