# LigthGBM - CLASSIFICATION - BAYESIAN OPTIMIZATION

In [1]:
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn import datasets
import lightgbm as lgb
from lightgbm import LGBMClassifier
from bayes_opt import BayesianOptimization
import pandas as pd
from datetime import datetime
from time import time

In [63]:
import warnings
warnings.filterwarnings('ignore')

### GLOBAL VARIABLES

In [2]:
DATAPATH = 'data/train_test/'
SEED = 47
NITER = 100
CV = 3
SCORE = 'roc_auc'
handlingnull = False
NJOBS = -1
USEGPU = False
NCLASS = 3 # number class to predict (if bivar set 0)

### LOAD DATASET

In [3]:
train_features = pd.read_pickle(DATAPATH+'X_train.pkl').values

In [4]:
train_labels = pd.read_pickle(DATAPATH+'y_train.pkl')['target'].values

In [5]:
train_features.shape

(148865, 1770)

In [6]:
train_labels.shape

(148865,)

#### Input Null values

In [7]:
if handlingnull:
    train_features[np.isnan(train_features)] = -9999

In [8]:
# create dataset for lightgbm
lgb_train = lgb.Dataset(train_features, train_labels)

In [9]:
del(train_features)

### TRAIN MODEL

#### Set Search hyperparameters

In [39]:
# ======== General Parameters ======= #

# Select the type of model to run at each iteration. gbtree or gblinear.
boosting = 'gbdt'


# ======== Booster Parameters ======== # 

# Analogous to learning rate in GBM. 
# Typical final values to be used: 0.01-0.2
eta = [0.01] 


# A node is split only when the resulting split gives a positive reduction in the loss function. 
# Gamma specifies the minimum loss reduction required to make a split.
gamma = [i/10.0 for i in range(0,5)]


# Control the balance of positive and negative weights, useful for unbalanced classes. 
# A typical value to consider: sum(negative instances) / sum(positive instances)scale_pos_weight = 1
scale_pos_weight = (len(train_labels) - sum(train_labels))/sum(train_labels)


# Learning Task Parameters
# This defines the loss function to be minimized. See documentation
# -  options: regression, regression_l1, huber, fair, poisson, quantile, 
# mape, gamma, tweedie, binary, multiclass, multiclassova, cross_entropy, cross_entropy_lambda,
# lambdarank, aliases: objective_type, app, application
objective  = 'binary'


# The metric to be used for validation data.
# - rmse, root square loss, aliases: root_mean_squared_error, l2_root
# - quantile, Quantile regression
# - mape, MAPE loss, aliases: mean_absolute_percentage_error
# - huber, Huber loss
# - fair, Fair loss
# - poisson, negative log-likelihood for Poisson regression
# - gamma, negative log-likelihood for Gamma regression
# - gamma_deviance, residual deviance for Gamma regression
# - tweedie, negative log-likelihood for Tweedie regression
# - ndcg, NDCG, aliases: lambdarank
# - map, MAP, aliases: mean_average_precision
# - auc, AUC
# - binary_logloss, log loss, aliases: binary
metric = 'auc'

[lightGBM params](https://lightgbm.readthedocs.io/en/latest/Parameters.html)


help(lgb.LGBMClassifier)

In [58]:
# Define the search space
# Domain space-- Range of hyperparameters
pds = {
    # Minimal number of data in one leaf. Can be used to deal with over-fitting
    # default = 20, type = int, aliases: min_data_per_leaf, min_data, min_child_samples, 
    'num_leaves': (20, 100),

    # Denotes the fraction of columns to be randomly samples for each tree.
    'feature_fraction': (0.1, 0.9),
    
    # Denotes the fraction of observations to be randomly samples for each tree.
    'bagging_fraction': (0.8, 1),

    # The maximum depth of a tree
    'max_depth': (9, 13 ),

    'min_split_gain': (0.001, 0.1),

    # Minimal sum hessian in one leaf. Like min_data_in_leaf, it can be used to deal with over-fitting
    # Default 1e-3
    'min_child_weight': (30, 50),
    
   

    
}

In [59]:
pds

{'num_leaves': (20, 100),
 'feature_fraction': (0.1, 0.9),
 'bagging_fraction': (0.8, 1),
 'max_depth': (9, 13),
 'min_split_gain': (0.001, 0.1),
 'min_child_weight': (30, 50)}

#### Find num boost

In [12]:
early_stopping_rounds = 50

In [27]:
# specify your configurations as a dict
lgb_params = {
    'boosting_type': boosting,
    'objective': objective,
    'metric': metric,
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'num_threads' : 8,
    'verbose': 0,
    #'num_class':  NCLASS,
    'seed' : SEED
}

In [14]:
cvresult = lgb.cv(lgb_params, lgb_train, num_boost_round = 1000, nfold = CV, metrics = metric, early_stopping_rounds = early_stopping_rounds, seed = SEED)

In [15]:
n_estimators = pd.DataFrame(cvresult).shape[0]

In [42]:
print("Best number of estimators found: ", n_estimators)

Best number of estimators found:  285


In [43]:
n_estimators = 285

#### Bayesian optimization hyperparameters

In [73]:
def hyp_lgbm(num_leaves, feature_fraction, bagging_fraction, max_depth, min_split_gain, min_child_weight):
      
    params = {'boosting_type': boosting,
              'application': objective,
              'num_iterations': n_estimators,
              'learning_rate':eta, 
              'early_stopping_round':50,
              'metric': metric} # Default parameters
    
    
    params["num_leaves"] = int(round(num_leaves))
    params['feature_fraction'] = max(min(feature_fraction, 1), 0)
    params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
    params['max_depth'] = int(round(max_depth))
    params['min_split_gain'] = min_split_gain
    params['min_child_weight'] = min_child_weight
    params['min_split_gain'] = min_split_gain
    
    
    
    cv_result = lgb.cv(params, lgb_train, nfold=CV, seed=SEED, stratified=False, verbose_eval=None, metrics = metric)
    
    # Extract the best score
    best_score = max(cv_result['auc-mean'])
    
    # Loss must be minimized
    #loss = 1 - best_score

    return best_score

In [74]:
# Surrogate model
optimizer = BayesianOptimization(hyp_lgbm,pds,random_state=7)

In [75]:
# Optimize
optimizer.maximize(init_points=5, n_iter=15)

|   iter    |  target   | baggin... | featur... | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.7949  [0m | [0m 0.8153  [0m | [0m 0.7239  [0m | [0m 10.75   [0m | [0m 44.47   [0m | [0m 0.09782 [0m | [0m 63.08   [0m |
| [0m 2       [0m | [0m 0.79    [0m | [0m 0.9002  [0m | [0m 0.1576  [0m | [0m 10.07   [0m | [0m 40.0    [0m | [0m 0.06824 [0m | [0m 84.3    [0m |
| [0m 3       [0m | [0m 0.7883  [0m | [0m 0.8762  [0m | [0m 0.1527  [0m | [0m 10.15   [0m | [0m 48.19   [0m | [0m 0.02213 [0m | [0m 56.17   [0m |
| [0m 4       [0m | [0m 0.7845  [0m | [0m 0.9862  [0m | [0m 0.1199  [0m | [0m 11.4    [0m | [0m 49.0    [0m | [0m 0.0238  [0m | [0m 63.88   [0m |
| [0m 5       [0m | [0m 0.7909  [0m | [0m 0.9818  [0m | [0m 0.2065  [0m | [0m 11.09   [0m | [0m 45.01   [0m | [0m 0.06723 [0m | [0m 57.42   

#### Best parameter

In [76]:
optimizer.max['params']

{'bagging_fraction': 0.8145882315796769,
 'feature_fraction': 0.8517987893796524,
 'max_depth': 12.957248699729796,
 'min_child_weight': 30.301446391267557,
 'min_split_gain': 0.08380172857159027,
 'num_leaves': 99.64679476791302}

#### Saving best hyperparameters

In [77]:
np.save('output/models/bayesianoptcv_gbm_classifier_bestparams_d' + str(datetime.now().date()) + '.npy', optimizer.max['params'])