# LigthGBM CLASSIFIER - GRIDSEARCH

In [1]:
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn import datasets
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd
from datetime import datetime
from time import time

### GLOBAL VARIABLES

In [2]:
DATAPATH = '../data/train_test/'
SEED = 47
NITER = 100
CV = 5
SCORE = 'roc_auc' # chouse the score in base what you want to predict
usenull = True
NJOBS = 28

### LOAD DATASET

In [3]:
train_features = np.load(DATAPATH+'X_train.npy')

In [4]:
train_features.shape

(68893, 42)

In [5]:
train_tfidf_svd300 = np.load(DATAPATH+'X_train_tfidf_svd300.npy')

In [6]:
train_tfidf_svd300.shape

(68893, 300)

In [7]:
train_features = np.concatenate((train_features, train_tfidf_svd300), axis=1)

In [8]:
train_labels = np.load(DATAPATH+'y_train.npy')

#### Fix Null values

In [9]:
if usenull == False:
    train_features[np.isnan(train_features)] = -9999

In [10]:
# create dataset for lightgbm
lgb_train = lgb.Dataset(train_features, train_labels)

### TRAIN MODEL

In [47]:
# ======== General Parameters ======= #

# Select the type of model to run at each iteration. gbtree or gblinear.
boosting = 'gbdt'

# Learning Task Parameters
# This defines the loss function to be minimized. See documentation
# -  options: regression, regression_l1, huber, fair, poisson, quantile, 
# mape, gamma, tweedie, binary, multiclass, multiclassova, cross_entropy, cross_entropy_lambda,
# lambdarank, aliases: objective_type, app, application
objective  = 'binary'

# The metric to be used for validation data.
# - rmse, root square loss, aliases: root_mean_squared_error, l2_root
# - quantile, Quantile regression
# - mape, MAPE loss, aliases: mean_absolute_percentage_error
# - huber, Huber loss
# - fair, Fair loss
# - poisson, negative log-likelihood for Poisson regression
# - gamma, negative log-likelihood for Gamma regression
# - gamma_deviance, residual deviance for Gamma regression
# - tweedie, negative log-likelihood for Tweedie regression
# - ndcg, NDCG, aliases: lambdarank
# - map, MAP, aliases: mean_average_precision
# - auc, AUC
# - binary_logloss, log loss, aliases: binary
metric = 'auc'

# Control the balance of positive and negative weights, useful for unbalanced classes. 
# A typical value to consider: sum(negative instances) / sum(positive instances)scale_pos_weight = 1
scale_pos_weight = int((len(train_labels) - np.sum(train_labels))/np.sum(train_labels))

#### Load  hyperparameters from random search

In [21]:
lgb_params = np.load('../models/rseach_gbm_classifier_bestparams_d2019-10-30.npy', allow_pickle=True).tolist()

In [23]:
lgb_params

{'subsample': 0.8,
 'reg_lambda': 0.5,
 'reg_alpha': 0.001,
 'min_data_in_leaf': 180,
 'max_depth': 9,
 'learning_rate': 0.01,
 'gamma': 0.4,
 'colsample_bytree': 0.7,
 'bagging_freq': 20}

In [24]:
lgb_params['seed'] = SEED
lgb_params['boosting_type'] = boosting
lgb_params['objective'] = objective
lgb_params['metric'] = metric
lgb_params['num_threads'] = NJOBS
lgb_params['verbose'] = 0
lgb_params['scale_pos_weight'] = scale_pos_weight

#### Find num boost

In [25]:
early_stopping_rounds = 50

In [26]:
cvresult = lgb.cv(lgb_params, lgb_train, num_boost_round = 1000, nfold = CV, metrics = metric, early_stopping_rounds = early_stopping_rounds, seed = SEED)

In [27]:
n_estimators = pd.DataFrame(cvresult).shape[0]

In [28]:
print("Best number of estimators found: ", n_estimators)

Best number of estimators found:  999


#### Set Search hyperparameters

In [41]:
# ======== General Parameters ======= #

# Select the type of model to run at each iteration. gbtree or gblinear.
boosting = 'gbdt'


# ======== Booster Parameters ======== # 

# Analogous to learning rate in GBM. 
# Typical final values to be used: 0.01-0.2
eta = [0.01] 

# Minimal sum hessian in one leaf. Like min_data_in_leaf, it can be used to deal with over-fitting
# Default 1e-3
min_child_weight = [0.001]

# Minimal number of data in one leaf. Can be used to deal with over-fitting
# default = 20, type = int, aliases: min_data_per_leaf, min_data, min_child_samples, 
# constraints: min_data_in_leaf >= 0
min_data_in_leaf = [i for i in range(int(lgb_params['min_data_in_leaf']) - 10, int(lgb_params['min_data_in_leaf']) + 12, 4)]

# The maximum depth of a tree
max_depth = [i for i in range(int(lgb_params['max_depth']) - 1, int(lgb_params['max_depth']) + 2, 1)]

# A node is split only when the resulting split gives a positive reduction in the loss function. 
# Gamma specifies the minimum loss reduction required to make a split.
gamma = [lgb_params['gamma']]

# Denotes the fraction of observations to be randomly samples for each tree.
subsample = [i/100 for i in range(int(lgb_params['subsample']*100) - 4, int(lgb_params['subsample']*100) + 6, 3)]

# Denotes the fraction of columns to be randomly samples for each tree.
colsample_bytree = [i/100 for i in range(int(lgb_params['colsample_bytree']*100) - 4, int(lgb_params['colsample_bytree']*100) + 6, 3)]

# frequency for bagging
# 0 means disable bagging; k means perform bagging at every k iteration
# Note: to enable bagging, bagging_fraction should be set to value smaller than 1.0 as well
bagging_freq = [lgb_params['bagging_freq']]

# L2 regularization term on weights (analogous to Ridge regression)
reg_lambda = [i/100 for i in range(int(lgb_params['reg_lambda']*100) - 4, int(lgb_params['reg_lambda']*100) + 4, 4)]

# L1 regularization term on weight (analogous to Lasso regression)
reg_alpha = [lgb_params['reg_alpha']]

# Control the balance of positive and negative weights, useful for unbalanced classes. 
# A typical value to consider: sum(negative instances) / sum(positive instances)scale_pos_weight = 1
scale_pos_weight = int((len(train_labels) - np.sum(train_labels))/np.sum(train_labels))


# Learning Task Parameters
# This defines the loss function to be minimized. See documentation
# -  options: regression, regression_l1, huber, fair, poisson, quantile, 
# mape, gamma, tweedie, binary, multiclass, multiclassova, cross_entropy, cross_entropy_lambda,
# lambdarank, aliases: objective_type, app, application
objective  = 'binary'


# The metric to be used for validation data.
# - rmse, root square loss, aliases: root_mean_squared_error, l2_root
# - quantile, Quantile regression
# - mape, MAPE loss, aliases: mean_absolute_percentage_error
# - huber, Huber loss
# - fair, Fair loss
# - poisson, negative log-likelihood for Poisson regression
# - gamma, negative log-likelihood for Gamma regression
# - gamma_deviance, residual deviance for Gamma regression
# - tweedie, negative log-likelihood for Tweedie regression
# - ndcg, NDCG, aliases: lambdarank
# - map, MAP, aliases: mean_average_precision
# - auc, AUC
# - binary_logloss, log loss, aliases: binary
metric = 'auc'

[lightGBM params](https://lightgbm.readthedocs.io/en/latest/Parameters.html)

In [42]:
# Create the random grid
param_grid = {
    'learning_rate' : eta,
    'min_data_in_leaf' : min_data_in_leaf,
    'max_depth' : max_depth,
    'gamma': gamma,
    'subsample': subsample,
    'colsample_bytree' : colsample_bytree,
    'bagging_freq' : bagging_freq,
    'reg_lambda' : reg_lambda,
    'reg_alpha' : reg_alpha,
}

In [43]:
param_grid

{'learning_rate': [0.01],
 'min_data_in_leaf': [170, 174, 178, 182, 186, 190],
 'max_depth': [8, 9, 10],
 'gamma': [0.4],
 'subsample': [0.76, 0.79, 0.82, 0.85],
 'colsample_bytree': [0.66, 0.69, 0.72, 0.75],
 'bagging_freq': [20],
 'reg_lambda': [0.46, 0.5],
 'reg_alpha': [0.001]}

In [44]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
model = LGBMClassifier(n_estimators=n_estimators, scale_pos_weight=scale_pos_weight, objective=objective, metric=metric)

In [45]:
# Random search of parameters, using CV fold cross validation, 
# search across NITER different combinations, and use all available cores
gbm_gsearch = GridSearchCV(estimator = model, param_grid=param_grid, scoring=SCORE, cv = CV, verbose=2, n_jobs = NJOBS)# Fit the random search model


#### Training

In [46]:
start = time()
gbm_gsearch.fit(train_features, train_labels)
print("GridSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), NITER))

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


[Parallel(n_jobs=28)]: Using backend LokyBackend with 28 concurrent workers.
[Parallel(n_jobs=28)]: Done 106 tasks      | elapsed:  9.9min
[Parallel(n_jobs=28)]: Done 309 tasks      | elapsed: 29.5min
[Parallel(n_jobs=28)]: Done 592 tasks      | elapsed: 55.5min
[Parallel(n_jobs=28)]: Done 957 tasks      | elapsed: 88.8min
[Parallel(n_jobs=28)]: Done 1402 tasks      | elapsed: 143.4min
[Parallel(n_jobs=28)]: Done 1929 tasks      | elapsed: 206.1min
[Parallel(n_jobs=28)]: Done 2536 tasks      | elapsed: 267.2min
[Parallel(n_jobs=28)]: Done 2880 out of 2880 | elapsed: 302.4min finished


GridSearchCV took 18156.24 seconds for 100 candidates parameter settings.


#### Saving results

In [52]:
cv_results = pd.DataFrame(gbm_gsearch.cv_results_)

In [53]:
cv_results.to_csv('../models/gsearch_gbm_classifier_d' + str(datetime.now().date()) + '.csv',sep=';',index=False)

#### Best estimator

In [55]:
gbm_gsearch.best_estimator_

LGBMClassifier(bagging_freq=20, boosting_type='gbdt', class_weight=None,
               colsample_bytree=0.66, gamma=0.4, importance_type='split',
               learning_rate=0.01, max_depth=9, metric='auc',
               min_child_samples=20, min_child_weight=0.001,
               min_data_in_leaf=170, min_split_gain=0.0, n_estimators=999,
               n_jobs=-1, num_leaves=31, objective='binary', random_state=None,
               reg_alpha=0.001, reg_lambda=0.5, scale_pos_weight=2, silent=True,
               subsample=0.85, subsample_for_bin=200000, subsample_freq=0)

#### Best parameter

In [57]:
lgb_params = gbm_gsearch.best_params_

In [60]:
lgb_params['seed'] = SEED
lgb_params['boosting_type'] = boosting
lgb_params['objective'] = objective
lgb_params['metric'] = metric
lgb_params['num_threads'] = NJOBS
lgb_params['verbose'] = 0
lgb_params['scale_pos_weight'] = scale_pos_weight
lgb_params['n_estimators'] = n_estimators

In [62]:
lgb_params

{'bagging_freq': 20,
 'colsample_bytree': 0.66,
 'gamma': 0.4,
 'learning_rate': 0.01,
 'max_depth': 9,
 'min_data_in_leaf': 170,
 'reg_alpha': 0.001,
 'reg_lambda': 0.5,
 'subsample': 0.85,
 'seed': 47,
 'boosting_type': 'gbdt',
 'objective': 'binary',
 'metric': 'auc',
 'num_threads': 28,
 'verbose': 0,
 'scale_pos_weight': 2,
 'n_estimators': 999}

#### Best Score

In [59]:
print(SCORE,' : ', gbm_gsearch.best_score_)

roc_auc  :  0.7795858668857277


#### Saving best hyperparameters

In [63]:
np.save('../models/gseach_gbm_classifier_bestparams_d' + str(datetime.now().date()) + '.npy', lgb_params)

In [65]:
np.save('../models/gseach_gbm_classifier_best_estimator_d' + str(datetime.now().date()) + '.npy', gbm_gsearch.best_estimator_)