# XGBOOST - TRAIN

In [1]:
import numpy as np
from sklearn import datasets
import lightgbm as lgb
from lightgbm import LGBMClassifier
import pandas as pd
from datetime import datetime
from time import time
from sklearn.metrics import roc_auc_score, roc_curve

### GLOBAL VARIABLES

In [2]:
DATAPATH = 'data/processed/'
SEED = 47
NITER = 100
CV = 5
SCORE = 'roc_auc'
handlingnull = False
NJOBS = -1
USEGPU = False
NCLASS = 0

### LOAD DATASET

In [3]:
train_features = pd.read_pickle(DATAPATH+'X.pkl')

In [4]:
train_features.shape

(212665, 1205)

In [5]:
train_labels = pd.read_pickle(DATAPATH+'y.pkl')[['target']]

In [6]:
campaign = pd.read_pickle('data/features/campaign_quarter_001.pkl')

In [7]:
digital = pd.read_pickle('data/features/digital_features_period_001.pkl')

In [8]:
rcc = pd.read_pickle('data/features/X_rcc_features_quarter_001.pkl')

In [9]:
rcc.drop(['id_persona', 'codmes'], inplace=True, axis=1)

In [None]:
feateng = pd.read_pickle('data/features/featureseng_train_001.pkl')

In [11]:
woe = pd.read_pickle('data/features/X_woe_features.pkl')

In [12]:
train_features = train_features.join(campaign).join(rcc).join(digital).join(feateng).join(woe)

In [13]:
train_features.shape

(212665, 1800)

In [14]:
features_eng = np.load('data/train_test/features_selected.npy')

In [15]:
train_features = train_features[features_eng]

In [16]:
train_features.shape

(212665, 800)

In [18]:
### create a DMatrix and handling Null values
if handlingnull:
    #train_features[np.isnan(train_features)] = -9999
    lgb_train = lgb.Dataset(train_features.values, train_labels.values, missing=-9999)
else:
    lgb_train = lgb.Dataset(train_features.values, train_labels.values)

### TRAIN MODEL

#### Set hyperparameters

In [22]:
# ======== General Parameters ======= #

# Select the type of model to run at each iteration. gbtree or gblinear.
boosting = 'gbdt'


# ======== Booster Parameters ======== # 

# Analogous to learning rate in GBM. 
# Typical final values to be used: 0.01-0.2
eta = 0.01 


# A node is split only when the resulting split gives a positive reduction in the loss function. 
# Gamma specifies the minimum loss reduction required to make a split.
gamma = [i/10.0 for i in range(0,5)]


# Control the balance of positive and negative weights, useful for unbalanced classes. 
# A typical value to consider: sum(negative instances) / sum(positive instances)scale_pos_weight = 1
scale_pos_weight = (len(train_labels.target) - sum(train_labels.target))/sum(train_labels.target)


# Learning Task Parameters
# This defines the loss function to be minimized. See documentation
# -  options: regression, regression_l1, huber, fair, poisson, quantile, 
# mape, gamma, tweedie, binary, multiclass, multiclassova, cross_entropy, cross_entropy_lambda,
# lambdarank, aliases: objective_type, app, application
objective  = 'binary'


# The metric to be used for validation data.
# - rmse, root square loss, aliases: root_mean_squared_error, l2_root
# - quantile, Quantile regression
# - mape, MAPE loss, aliases: mean_absolute_percentage_error
# - huber, Huber loss
# - fair, Fair loss
# - poisson, negative log-likelihood for Poisson regression
# - gamma, negative log-likelihood for Gamma regression
# - gamma_deviance, residual deviance for Gamma regression
# - tweedie, negative log-likelihood for Tweedie regression
# - ndcg, NDCG, aliases: lambdarank
# - map, MAP, aliases: mean_average_precision
# - auc, AUC
# - binary_logloss, log loss, aliases: binary
metric = 'auc'

In [23]:
model_param = np.load('output/models/bayesianoptcv_gbm_classifier_bestparams_d2019-11-28.npy',allow_pickle=True).tolist()

In [24]:
model_param['max_depth'] = int(np.round(model_param['max_depth'],0))
model_param['min_child_weight'] = int(np.round(model_param['min_child_weight'],0))


In [25]:
model_param['num_leaves'] = int(np.round(model_param['num_leaves'],0))


In [26]:
model_param['seed'] = SEED
model_param['booster'] = boosting
model_param['objective'] = objective
model_param['scale_pos_weight'] = scale_pos_weight


In [27]:
model_param['num_threads'] = NJOBS

In [28]:
model_param

{'bagging_fraction': 0.8327365806346091,
 'feature_fraction': 0.8881760200078465,
 'max_depth': 13,
 'min_child_weight': 30,
 'min_split_gain': 0.004145814603445978,
 'num_leaves': 99,
 'n_estimators': 344,
 'seed': 47,
 'booster': 'gbdt',
 'objective': 'binary',
 'scale_pos_weight': 9.227229008367798,
 'num_threads': -1}

In [35]:
lgbmodel = lgb.train(model_param, lgb_train)

In [38]:
lgbmodel.save_model('output/models/lgb_002.model')

<lightgbm.basic.Booster at 0x7fd8decbc128>

In [26]:
np.save('models/xgb_004.features', train_features.columns.tolist())