# LigthGBM - RANDOM SEARCH

In [1]:
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn import datasets
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
from datetime import datetime
from time import time

### GLOBAL VARIABLES

In [22]:
DATAPATH = 'data/processed/'
SEED = 47
NITER = 100
CV = 3
SCORE = 'roc_auc'
handlingnull = False
NJOBS = -1
USEGPU = False
NCLASS = 3 # number class to predict (if bivar set 0)

### LOAD DATASET

In [18]:
X_train = pd.read_pickle(DATAPATH+'X.pkl')

In [16]:
y_train = pd.read_pickle(DATAPATH+'y.pkl')

In [5]:
train_features.shape

(148865, 1770)

In [6]:
train_labels.shape

(148865,)

In [23]:
X_test = pd.read_pickle(DATAPATH+'submission.pkl')

In [24]:
campaign = pd.read_pickle('data/features/campaign_quarter_001.pkl')

In [None]:
digital = pd.read_pickle('data/features/digital_features_period_001.pkl')

#### Input Null values

In [7]:
if handlingnull:
    train_features[np.isnan(train_features)] = -9999

In [8]:
# create dataset for lightgbm
lgb_train = lgb.Dataset(train_features, train_labels)

### TRAIN MODEL

#### Set Search hyperparameters

In [9]:
# ======== General Parameters ======= #

# Select the type of model to run at each iteration. gbtree or gblinear.
boosting = 'gbdt'


# ======== Booster Parameters ======== # 

# Analogous to learning rate in GBM. 
# Typical final values to be used: 0.01-0.2
eta = [0.01] 

# Minimal sum hessian in one leaf. Like min_data_in_leaf, it can be used to deal with over-fitting
# Default 1e-3
min_child_weight = [0.001, 0.005, 0.01, 0.05, 0.1, 1]

# Minimal number of data in one leaf. Can be used to deal with over-fitting
# default = 20, type = int, aliases: min_data_per_leaf, min_data, min_child_samples, 
# constraints: min_data_in_leaf >= 0
min_data_in_leaf = [i for i in range(20,1000,40)]

# The maximum depth of a tree
max_depth = [i for i in range(3,10,2)] 

# A node is split only when the resulting split gives a positive reduction in the loss function. 
# Gamma specifies the minimum loss reduction required to make a split.
gamma = [i/10.0 for i in range(0,5)]

# Denotes the fraction of observations to be randomly samples for each tree.
subsample = [i/10.0 for i in range(6,10)]

# Denotes the fraction of columns to be randomly samples for each tree.
colsample_bytree = [i/10.0 for i in range(6,10)]

# frequency for bagging
# 0 means disable bagging; k means perform bagging at every k iteration
# Note: to enable bagging, bagging_fraction should be set to value smaller than 1.0 as well
bagging_freq = [3, 5, 10, 20, 30]

# L2 regularization term on weights (analogous to Ridge regression)
reg_lambda = [i/10.0 for i in range(4,10)]

# L1 regularization term on weight (analogous to Lasso regression)
reg_alpha = [0, 0.001, 0.005, 0.01, 0.05]

# Control the balance of positive and negative weights, useful for unbalanced classes. 
# A typical value to consider: sum(negative instances) / sum(positive instances)scale_pos_weight = 1
scale_pos_weight = (len(train_labels) - sum(train_labels))/sum(train_labels)


# Learning Task Parameters
# This defines the loss function to be minimized. See documentation
# -  options: regression, regression_l1, huber, fair, poisson, quantile, 
# mape, gamma, tweedie, binary, multiclass, multiclassova, cross_entropy, cross_entropy_lambda,
# lambdarank, aliases: objective_type, app, application
objective  = 'binary'


# The metric to be used for validation data.
# - rmse, root square loss, aliases: root_mean_squared_error, l2_root
# - quantile, Quantile regression
# - mape, MAPE loss, aliases: mean_absolute_percentage_error
# - huber, Huber loss
# - fair, Fair loss
# - poisson, negative log-likelihood for Poisson regression
# - gamma, negative log-likelihood for Gamma regression
# - gamma_deviance, residual deviance for Gamma regression
# - tweedie, negative log-likelihood for Tweedie regression
# - ndcg, NDCG, aliases: lambdarank
# - map, MAP, aliases: mean_average_precision
# - auc, AUC
# - binary_logloss, log loss, aliases: binary
metric = 'auc'

[lightGBM params](https://lightgbm.readthedocs.io/en/latest/Parameters.html)

In [10]:
# Create the random grid
random_grid = {
    'learning_rate' : eta,
    'min_data_in_leaf' : min_data_in_leaf,
    'max_depth' : max_depth,
    'gamma': gamma,
    'subsample': subsample,
    'colsample_bytree' : colsample_bytree,
    'bagging_freq' : bagging_freq,
    'reg_lambda' : reg_lambda,
    'reg_alpha' : reg_alpha,
}

In [11]:
random_grid

{'learning_rate': [0.01],
 'min_data_in_leaf': [20,
  60,
  100,
  140,
  180,
  220,
  260,
  300,
  340,
  380,
  420,
  460,
  500,
  540,
  580,
  620,
  660,
  700,
  740,
  780,
  820,
  860,
  900,
  940,
  980],
 'max_depth': [3, 5, 7, 9],
 'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
 'subsample': [0.6, 0.7, 0.8, 0.9],
 'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
 'bagging_freq': [3, 5, 10, 20, 30],
 'reg_lambda': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05]}

#### Find num boost

In [12]:
early_stopping_rounds = 50

In [13]:
# specify your configurations as a dict
lgb_params = {
    'boosting_type': boosting,
    'objective': objective,
    'metric': metric,
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'num_threads' : 8,
    'verbose': 0,
    #'num_class':  NCLASS,
    'seed' : SEED
}

In [14]:
cvresult = lgb.cv(lgb_params, lgb_train, num_boost_round = 1000, nfold = CV, metrics = metric, early_stopping_rounds = early_stopping_rounds, seed = SEED)

In [15]:
n_estimators = pd.DataFrame(cvresult).shape[0]

In [16]:
print("Best number of estimators found: ", n_estimators)

Best number of estimators found:  285


In [20]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
model = LGBMClassifier(n_estimators=n_estimators, 
                       #num_classes=NCLASS, 
                       scale_pos_weight=scale_pos_weight, 
                       #is_unbalance = True,
                       objective=objective, metric=metric)

In [21]:
# Random search of parameters, using CV fold cross validation, 
# search across NITER different combinations, and use all available cores
gbm_rsearch = RandomizedSearchCV(estimator = model, param_distributions = random_grid, scoring=SCORE, n_iter = NITER, cv = CV, verbose=2, random_state=SEED, n_jobs = NJOBS)# Fit the random search model


#### Training

In [20]:
drop_cols = ["codmes"]
test_preds = []
train_preds = []
y_train["target"] = y_train["margen"].astype("float32")
for mes in X_train.codmes.unique():
    print("*"*10, mes, "*"*10)
    Xt = X_train[X_train.codmes != mes]
    yt = y_train.loc[Xt.index, "target"]
    Xt = Xt.drop(drop_cols, axis=1)

    Xv = X_train[X_train.codmes == mes]
    yv = y_train.loc[Xv.index, "target"]
    
    learner = LGBMRegressor(n_estimators=1000, n_jobs=-1)
    learner.fit(Xt, yt,  early_stopping_rounds=10, eval_metric="mae",
                eval_set=[(Xt, yt), (Xv.drop(drop_cols, axis=1), yv)], verbose=50)
    test_preds.append(pd.Series(learner.predict(X_test.drop(drop_cols, axis=1)),
                                index=X_test.index, name="fold_" + str(mes)))
    train_preds.append(pd.Series(learner.predict(Xv.drop(drop_cols, axis=1)),
                                index=Xv.index, name="probs"))

test_preds = pd.concat(test_preds, axis=1).mean(axis=1)
train_preds = pd.concat(train_preds)

********** 201902 **********
Training until validation scores don't improve for 10 rounds.
[50]	training's l2: 3929.36	training's l1: 19.7834	valid_1's l2: 4914.17	valid_1's l1: 19.9251
Early stopping, best iteration is:
[60]	training's l2: 3836.62	training's l1: 19.6415	valid_1's l2: 4905.82	valid_1's l1: 19.9104


NameError: name 'X_test' is not defined

#### Saving results

In [23]:
cv_results = pd.DataFrame(gbm_rsearch.cv_results_)



In [24]:
cv_results.to_csv('output/results/rsearch_gbm_classifier_d' + str(datetime.now().date()) + '.csv',sep=';',index=False)

#### Best estimator

In [25]:
gbm_rsearch.best_estimator_

LGBMClassifier(bagging_freq=10, boosting_type='gbdt', class_weight=None,
        colsample_bytree=0.9, gamma=0.2, importance_type='split',
        learning_rate=0.01, max_depth=9, metric='auc',
        min_child_samples=20, min_child_weight=0.001, min_data_in_leaf=140,
        min_split_gain=0.0, n_estimators=285, n_jobs=-1, num_leaves=31,
        objective='binary', random_state=None, reg_alpha=0.001,
        reg_lambda=0.4, scale_pos_weight=9.22705413575158, silent=True,
        subsample=0.7, subsample_for_bin=200000, subsample_freq=0)

#### Best parameter

In [26]:
gbm_rsearch.best_params_

{'subsample': 0.7,
 'reg_lambda': 0.4,
 'reg_alpha': 0.001,
 'min_data_in_leaf': 140,
 'max_depth': 9,
 'learning_rate': 0.01,
 'gamma': 0.2,
 'colsample_bytree': 0.9,
 'bagging_freq': 10}

#### Best Score

In [27]:
print(SCORE,' : ', gbm_rsearch.best_score_)

roc_auc  :  0.7941493817885061


#### Saving best hyperparameters

In [28]:
np.save('output/models/rseach_gbm_classifier_bestparams_d' + str(datetime.now().date()) + '.npy', gbm_rsearch.best_params_)

In [29]:
np.save('output/models/rseach_gbm_classifier_best_estimator_d' + str(datetime.now().date()) + '.npy', gbm_rsearch.best_estimator_)