# XGBOOST - TRAIN

In [1]:
import numpy as np
from sklearn import datasets
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import pandas as pd
from datetime import datetime
from time import time
from sklearn.metrics import balanced_accuracy_score

### GLOBAL VARIABLES

In [2]:
DATAPATH = 'data/processed/'
SEED = 47
NITER = 100
CV = 5
SCORE = 'roc_auc'
handlingnull = False
NJOBS = -1
USEGPU = False
NCLASS = 0

### LOAD DATASET

In [12]:
train_features = pd.read_pickle(DATAPATH+'X.pkl')

In [13]:
train_features.shape

(212665, 1205)

In [14]:
train_labels = pd.read_pickle(DATAPATH+'y.pkl')[['target']]

In [15]:
campaign = pd.read_pickle('data/features/campaign_quarter_001.pkl')

In [16]:
digital = pd.read_pickle('data/features/digital_features_period_001.pkl')

In [17]:
rcc = pd.read_pickle('data/features/X_rcc_features_ten_001.pkl')

In [18]:
rcc.drop(['id_persona', 'codmes'], inplace=True, axis=1)

In [19]:
train_features = train_features.join(campaign).join(rcc).join(digital)

In [20]:
train_features.shape

(212665, 1770)

In [21]:
### create a DMatrix and handling Null values
if handlingnull:
    #train_features[np.isnan(train_features)] = -9999
    xgtrain = xgb.DMatrix(train_features.values, train_labels.values, missing=-9999)
else:
    xgtrain = xgb.DMatrix(train_features.values, train_labels.values)

### TRAIN MODEL

#### Set hyperparameters

In [22]:
# ======== General Parameters ======= #

# Select the type of model to run at each iteration. gbtree or gblinear.
booster = 'gbtree'


# ======== Booster Parameters ======== # 

# Analogous to learning rate in GBM. 
# Typical final values to be used: 0.01-0.2
eta = [0.01] 


# Control the balance of positive and negative weights, useful for unbalanced classes. 
# A typical value to consider: sum(negative instances) / sum(positive instances)scale_pos_weight = 1
scale_pos_weight = int((len(train_labels) - np.sum(train_labels.values))/np.sum(train_labels.values))


# Learning Task Parameters

# This defines the loss function to be minimized. 
# - binary:logistic –logistic regression for binary classification, returns predicted probability (not class)
# - multi:softmax –multiclass classification using the softmax objective, returns predicted class (not probabilities)
#   you also need to set an additional num_class (number of classes) parameter defining the number of unique classes
# - multi:softprob –same as softmax, but returns predicted probability of each data point belonging to each class.
objective  = 'binary:logistic'


# The metric to be used for validation data.
# - rmse – root mean square error
# - mae – mean absolute error
# - logloss – negative log-likelihood
# - error – Binary classification error rate (0.5 threshold)
# - merror – Multiclass classification error rate
# - mlogloss – Multiclass logloss
# - auc: Area under the curve
eval_metric = 'auc'

In [3]:
model_param = np.load('output/hyperparameters/rseach_xgboost_classifier_bestparams_d2019-11-20.npy').tolist()

In [4]:
model_param

{'subsample': 0.6,
 'reg_lambda': 0.9,
 'reg_alpha': 0.01,
 'min_child_weight': 9,
 'max_depth': 9,
 'learning_rate': 0.01,
 'gamma': 0.0,
 'colsample_bytree': 0.6}

In [33]:
model_param['seed'] = SEED
model_param['booster'] = booster
model_param['objective'] = objective
model_param['n_estimator'] = 313
model_param['scale_pos_weight'] = scale_pos_weight
model_param['nthread'] = 8

In [34]:
model = xgb.train(model_param, xgtrain, verbose_eval=False)

[02:11:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 712 extra nodes, 0 pruned nodes, max_depth=9
[02:11:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 664 extra nodes, 0 pruned nodes, max_depth=9
[02:11:03] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 742 extra nodes, 0 pruned nodes, max_depth=9
[02:11:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 740 extra nodes, 0 pruned nodes, max_depth=9
[02:11:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 738 extra nodes, 0 pruned nodes, max_depth=9
[02:11:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 774 extra nodes, 0 pruned nodes, max_depth=9
[02:11:11] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 610 extra nodes, 0 pruned nodes, max_depth=9
[02:11:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 764 extra nodes, 0 pruned nodes, max_depth=9
[02:11:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 732 extra nodes, 0 pruned no

In [35]:
y_pred = model.predict(xgtrain)

In [36]:
from sklearn.metrics import roc_auc_score, roc_curve


In [37]:
print(SCORE,' : ', roc_auc_score(train_labels.values,y_pred))

roc_auc  :  0.8206426125225773


In [38]:
model.save_model('models/xgb_002.model')

In [39]:
np.save('models/xgb_002.features', train_features.columns.tolist())