# Búsqueda de parámetros con optimización bayesiana y LightGBM usando la librería Hyperopt.

In [None]:
# Basado en el código de:
# https://github.com/WillKoehrsen/hyperparameter-optimization/blob/master/Bayesian%20Hyperparameter%20Optimization%20of%20Gradient%20Boosting%20Machine.ipynb
# donde se puede encontrar un detalle más completo.
# Se utiliza el dataset: caravan-insurance-challenge.csv que se encuentra disponible en https://www.kaggle.com/uciml/caravan-insurance-challenge

import pandas as pd
import numpy as np
import sklearn.model_selection as KFold
import lightgbm as lgb

# Number of hyperparameter configurations the search evaluates (passed to fmin as max_evals).
MAX_EVALS = 50
# Default number of cross-validation folds used by the objective function.
N_FOLDS = 10
# Folder prefix used both for reading the dataset and for writing the trials CSV.
# NOTE(review): "TuRuTaEnTuDrive" looks like a placeholder — replace it with your own Drive path.
FILE_PATH = "/content/drive/MyDrive/TuRuTaEnTuDrive/data/"

In [None]:
from google.colab import drive

# Mount Google Drive so the dataset can be read from it (this cell only works inside Colab).
drive.mount("/content/drive/")
# FILE_PATH must point at the Drive folder that contains the file. For example, for
# /content/drive/MyDrive/MisNotebooks/Datasets/caravan-insurance-challenge.csv
# FILE_PATH would be /content/drive/MyDrive/MisNotebooks/Datasets/
file = FILE_PATH + "caravan-insurance-challenge.csv"
df = pd.read_csv(file)
# Show the first rows as a quick sanity check of the load.
df.head()

Mounted at /content/drive/


Unnamed: 0,ORIGIN,MOSTYPE,MAANTHUI,MGEMOMV,MGEMLEEF,MOSHOOFD,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,MRELSA,MRELOV,MFALLEEN,MFGEKIND,MFWEKIND,MOPLHOOG,MOPLMIDD,MOPLLAAG,MBERHOOG,MBERZELF,MBERBOER,MBERMIDD,MBERARBG,MBERARBO,MSKA,MSKB1,MSKB2,MSKC,MSKD,MHHUUR,MHKOOP,MAUT1,MAUT2,MAUT0,MZFONDS,MZPART,MINKM30,MINK3045,MINK4575,...,PPERSAUT,PBESAUT,PMOTSCO,PVRAAUT,PAANHANG,PTRACTOR,PWERKT,PBROM,PLEVEN,PPERSONG,PGEZONG,PWAOREG,PBRAND,PZEILPL,PPLEZIER,PFIETS,PINBOED,PBYSTAND,AWAPART,AWABEDR,AWALAND,APERSAUT,ABESAUT,AMOTSCO,AVRAAUT,AAANHANG,ATRACTOR,AWERKT,ABROM,ALEVEN,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND,CARAVAN
0,train,33,1,3,2,8,0,5,1,3,7,0,2,1,2,6,1,2,7,1,0,1,2,5,2,1,1,2,6,1,1,8,8,0,1,8,1,0,4,5,...,6,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,train,37,1,2,2,8,1,4,1,4,6,2,2,0,4,5,0,5,4,0,0,0,5,0,4,0,2,3,5,0,2,7,7,1,2,6,3,2,0,5,...,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,train,37,1,2,2,8,0,4,2,4,3,2,4,4,4,2,0,5,4,0,0,0,7,0,2,0,5,0,4,0,7,2,7,0,2,9,0,4,5,0,...,6,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,train,9,1,3,3,3,2,3,2,4,5,2,2,2,3,4,3,4,2,4,0,0,3,1,2,3,2,1,4,0,5,4,9,0,0,7,2,1,5,3,...,6,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,train,40,1,4,2,10,1,4,1,4,7,1,2,2,4,4,5,4,0,0,5,4,0,0,0,9,0,0,0,0,4,5,6,2,1,5,4,0,0,9,...,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [None]:
# The dataset tags every row with the split it belongs to ("train" or "test" in the
# ORIGIN column), because it was originally published for a Kaggle competition.

def _split_by_origin(frame, origin):
    """Return (feature frame, int32 target vector) for the rows tagged `origin`.

    The feature frame has the ORIGIN marker and the CARAVAN target removed;
    the target vector is a flat 1-D numpy array.
    """
    rows = frame[frame["ORIGIN"] == origin]
    target = np.array(rows['CARAVAN'].astype(np.int32)).reshape((-1,))
    return rows.drop(columns = ['ORIGIN', 'CARAVAN']), target


# Training rows and evaluation (test) rows, each with its own target vector.
df_train, target_train = _split_by_origin(df, "train")
df_eval, target_eval = _split_by_origin(df, "test")

# Plain numpy arrays, which is what the LightGBM cross-validation below is fed with.
features = np.array(df_train)
test_features = np.array(df_eval)
# Basic slice of the training targets (for a numpy array this is a view, not a copy).
labels = target_train[:]

print('Dataset training forma: ', df_train.shape)
print('Dataset testing forma: ', df_eval.shape)
df_train.head()

Dataset training forma:  (5822, 85)
Dataset testing forma:  (4000, 85)


Unnamed: 0,MOSTYPE,MAANTHUI,MGEMOMV,MGEMLEEF,MOSHOOFD,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,MRELSA,MRELOV,MFALLEEN,MFGEKIND,MFWEKIND,MOPLHOOG,MOPLMIDD,MOPLLAAG,MBERHOOG,MBERZELF,MBERBOER,MBERMIDD,MBERARBG,MBERARBO,MSKA,MSKB1,MSKB2,MSKC,MSKD,MHHUUR,MHKOOP,MAUT1,MAUT2,MAUT0,MZFONDS,MZPART,MINKM30,MINK3045,MINK4575,MINK7512,...,PWALAND,PPERSAUT,PBESAUT,PMOTSCO,PVRAAUT,PAANHANG,PTRACTOR,PWERKT,PBROM,PLEVEN,PPERSONG,PGEZONG,PWAOREG,PBRAND,PZEILPL,PPLEZIER,PFIETS,PINBOED,PBYSTAND,AWAPART,AWABEDR,AWALAND,APERSAUT,ABESAUT,AMOTSCO,AVRAAUT,AAANHANG,ATRACTOR,AWERKT,ABROM,ALEVEN,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND
0,33,1,3,2,8,0,5,1,3,7,0,2,1,2,6,1,2,7,1,0,1,2,5,2,1,1,2,6,1,1,8,8,0,1,8,1,0,4,5,0,...,0,6,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,37,1,2,2,8,1,4,1,4,6,2,2,0,4,5,0,5,4,0,0,0,5,0,4,0,2,3,5,0,2,7,7,1,2,6,3,2,0,5,2,...,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,37,1,2,2,8,0,4,2,4,3,2,4,4,4,2,0,5,4,0,0,0,7,0,2,0,5,0,4,0,7,2,7,0,2,9,0,4,5,0,0,...,0,6,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,9,1,3,3,3,2,3,2,4,5,2,2,2,3,4,3,4,2,4,0,0,3,1,2,3,2,1,4,0,5,4,9,0,0,7,2,1,5,3,0,...,0,6,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,40,1,4,2,10,1,4,1,4,7,1,2,2,4,4,5,4,0,0,5,4,0,0,0,9,0,0,0,0,4,5,6,2,1,5,4,0,0,9,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


# Armado de elementos necesarios para la optimización bayesiana

A continuación se definen las diferentes partes que se necesitan para llevar a cabo la optimización bayesiana. Estas son:


*   Función objetivo.
*   Espacio de búsqueda.
*   El algoritmo de optimización.
*   La historia de los resultados.

In [None]:
import csv
from hyperopt import STATUS_OK
from timeit import default_timer as timer

# LightGBM classifier with default settings.
# NOTE(review): `model` is not referenced again in the visible notebook — confirm it is still needed.
model = lgb.LGBMClassifier()
# LightGBM Dataset built from the training features and labels; objective() passes it to lgb.cv.
dt_train_lgb = lgb.Dataset(features, label = labels)

# Función objetivo

In [None]:
# AUC is the evaluation metric used here.
def objective(params, n_folds = N_FOLDS):
    """Hyperopt objective: cross-validate LightGBM with `params` and return its loss.

    Args:
        params: One sample from the search space. Its 'boosting_type' entry is
            the nested dict built by the space definition, holding the boosting
            type name and, optionally, a 'subsample' value.
        n_folds: Number of folds passed to lgb.cv.

    Returns:
        A dict with 'loss' (1 - best mean CV AUC), 'params' (the flattened
        parameters actually given to LightGBM), 'iteration' (value of the global
        evaluation counter), 'estimators' (boosting round with the best mean
        AUC), 'train_time' (seconds spent in lgb.cv) and 'status'.

    Side effects:
        Increments the global ITERATION counter and appends one row to
        FILE_PATH + "gbm_trials.csv".
    """
    # Evaluation counter; it must be initialised at module level before the first call.
    global ITERATION
    ITERATION += 1

    # Work on a shallow copy so the caller's dict is not rewritten in place;
    # otherwise a second call with the same dict would fail on the nested lookup below.
    params = dict(params)

    # Flatten the nested boosting_type choice. The GOSS branch of the space fixes
    # subsample at 1.0 because GOSS does not accept subsampling.
    boosting_choice = params['boosting_type']
    params['boosting_type'] = boosting_choice['boosting_type']
    params['subsample'] = boosting_choice.get('subsample', 1.0)

    # hp.quniform yields floats; these parameters must be integers.
    for parameter_name in ['num_leaves', 'subsample_for_bin', 'min_child_samples']:
        params[parameter_name] = int(params[parameter_name])

    # Run the cross-validation and time it.
    # NOTE(review): newer LightGBM releases moved early stopping to callbacks and
    # renamed the result keys — confirm against the installed version.
    start = timer()
    cv_results = lgb.cv(params, dt_train_lgb, num_boost_round = 10000, nfold = n_folds,
                        early_stopping_rounds = 100, metrics = 'auc', seed = 50)
    run_time = timer() - start

    # Best mean AUC across boosting rounds, turned into a loss to minimise.
    auc_means = cv_results['auc-mean']
    best_score = np.max(auc_means)
    loss = 1 - best_score

    # Number of boosting rounds that produced the best score (argmax is 0-based).
    n_estimators = int(np.argmax(auc_means) + 1)

    # Append this trial to the partial-results file. The context manager closes
    # (and therefore flushes) the handle on every call, so rows are on disk
    # before the file is read back; newline='' is what the csv module expects.
    out_file = FILE_PATH + "gbm_trials.csv"
    with open(out_file, 'a', newline = '') as of_connection:
        writer = csv.writer(of_connection)
        writer.writerow([loss, params, ITERATION, n_estimators, run_time])

    # Result dictionary consumed by Hyperopt.
    return {'loss': loss, 'params': params, 'iteration': ITERATION,
            'estimators': n_estimators,
            'train_time': run_time, 'status': STATUS_OK}

# Espacio de búsqueda


In [None]:
from hyperopt import hp
from hyperopt.pyll.stochastic import sample

In [None]:
# Search space definition: one probability distribution per hyperparameter.
# The wider the ranges and the more parameters included, the longer the search takes.
# Option 1: wide space.

# Each boosting type carries its own subsample setting; the GOSS branch pins it to 1.0.
boosting_type_options = [
    {'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)},
    {'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)},
    {'boosting_type': 'goss', 'subsample': 1.0},
]

space = dict(
    class_weight = hp.choice('class_weight', [None, 'balanced']),
    boosting_type = hp.choice('boosting_type', boosting_type_options),
    num_leaves = hp.quniform('num_leaves', 30, 150, 1),
    learning_rate = hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    subsample_for_bin = hp.quniform('subsample_for_bin', 20000, 300000, 20000),
    min_child_samples = hp.quniform('min_child_samples', 20, 500, 5),
    reg_alpha = hp.uniform('reg_alpha', 0.0, 1.0),
    reg_lambda = hp.uniform('reg_lambda', 0.0, 1.0),
    colsample_bytree = hp.uniform('colsample_by_tree', 0.6, 1.0),
)

In [None]:
# Option 2: a smaller search space, used for practical (run-time) reasons.
# This cell rebinds `space`, so when both option cells are run this is the space that is searched.
# Compared with option 1 it omits learning_rate, reg_alpha, reg_lambda and colsample_bytree.
space = {
    'class_weight': hp.choice('class_weight', [None, 'balanced']),
    # Each boosting type carries its own subsample setting; the GOSS branch pins it to 1.0.
    'boosting_type': hp.choice('boosting_type', [{'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)}, 
                                                 {'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)},
                                                 {'boosting_type': 'goss', 'subsample': 1.0}]),
    'num_leaves': hp.quniform('num_leaves', 30, 80, 1),
    'min_child_samples': hp.quniform('min_child_samples', 20, 500, 20),
    # NOTE(review): quniform returns round(uniform(low, high) / q) * q, so with low=20000,
    # high=30000 and q=20000 practically every draw is 20000 and this parameter is
    # effectively fixed. Confirm whether an upper bound of 300000 (as in option 1) was intended.
    'subsample_for_bin': hp.quniform('subsample_for_bin', 20000, 30000, 20000)
}

In [None]:
# Draw one example from the space and flatten it, because LightGBM does not
# accept nested dictionaries as parameter values.
x = sample(space)
nested_choice = x['boosting_type']
subsample = nested_choice.get('subsample', 1.0)
x['boosting_type'] = nested_choice['boosting_type']
x['subsample'] = subsample
x

{'boosting_type': 'dart',
 'class_weight': None,
 'min_child_samples': 220.0,
 'num_leaves': 36.0,
 'subsample': 0.628348450487278,
 'subsample_for_bin': 20000.0}

# Algoritmo de optimización

In [None]:
# Hyperopt uses the Tree-structured Parzen Estimator (TPE) as its optimization algorithm.
from hyperopt import tpe

# NOTE(review): the fmin call in this notebook passes tpe.suggest directly rather than this alias.
tpe_algorithm = tpe.suggest

# Guardar los resultados

In [None]:
from hyperopt import Trials

# Trials object that stores the dictionaries returned by the objective function.
bayes_trials = Trials()

In [None]:
# Create (or overwrite) the file that stores the partial results of each run.
out_file = FILE_PATH + "gbm_trials.csv"

# The context manager closes the handle even if the write raises;
# newline='' is what the csv module expects for files it writes.
with open(out_file, 'w', newline = '') as of_connection:
    writer = csv.writer(of_connection)
    # Header row; the columns match the rows appended by objective().
    # (For Hyperopt, "loss" and "status" are the mandatory keys of the dict returned
    # by the objective function; "status" is not written to this file.)
    writer.writerow(['loss', 'params', 'iteration', 'estimators', 'train_time'])

# Optimización bayesiana

In [None]:
from hyperopt import fmin

In [None]:
%%capture
# The cell magic above must stay on the first line of the cell; it captures this cell's output.

# At module level this `global` statement has no effect; ITERATION is simply a global name.
global ITERATION

# Reset the evaluation counter that objective() increments on every call.
ITERATION = 0

# Run the Bayesian optimization: minimise objective() over `space` with TPE,
# recording every evaluation in bayes_trials.
# NOTE(review): recent Hyperopt releases expect a numpy Generator
# (np.random.default_rng) for rstate — confirm against the installed version.
best = fmin(fn= objective,
            space = space,
            algo = tpe.suggest,
            max_evals = MAX_EVALS,
            trials = bayes_trials,
            rstate = np.random.RandomState(50))

# Resultados

In [None]:
import pandas as pd

# Load the per-trial log and rank it: lowest loss first, with a fresh 0..n-1 index
# so that row 0 is the best trial.
results = (
    pd.read_csv(FILE_PATH + "gbm_trials.csv")
      .sort_values('loss', ascending = True)
      .reset_index(drop = True)
)
results.head()

Unnamed: 0,loss,params,iteration,estimators,train_time
0,0.237688,"{'boosting_type': 'dart', 'class_weight': None...",10,102,1424.186775
1,0.237912,"{'boosting_type': 'dart', 'class_weight': None...",21,56,1692.470128
2,0.237912,"{'boosting_type': 'dart', 'class_weight': None...",29,56,1734.088844
3,0.23829,"{'boosting_type': 'dart', 'class_weight': None...",24,111,1522.923913
4,0.23829,"{'boosting_type': 'dart', 'class_weight': None...",30,111,1521.70897


In [None]:
import ast

# Row 0 of `results` is the trial with the lowest loss (the frame was sorted by loss).
top_trial = results.loc[0]
best_estimators = int(top_trial["estimators"])
# The params column holds the text form of a dict; parse it back into a real dict.
best_params = dict(ast.literal_eval(top_trial["params"]))

# Classifier configured with the best hyperparameters and the best number of boosting rounds.
best_model = lgb.LGBMClassifier(
    n_estimators = best_estimators,
    n_jobs = -1,
    objective = "binary",
    random_state = 50,
    **best_params
)

In [None]:
from sklearn.metrics import roc_auc_score 

# Train the selected model on the full training set.
best_model.fit(features, labels)

# Predicted probability of the positive class (column 1) for every test row.
preds = best_model.predict_proba(test_features)[:, 1]

# AUC on the held-out test split.
print('El modelo tiene un AUC de {:.4f} en el dataset de testing.'.format(roc_auc_score(target_eval, preds)))
# The value printed here is the 'iteration' column of the best row, i.e. the search
# iteration that produced the lowest loss, not the total number of evaluations run.
print('Se usaron {} iteraciones.'.format(results.loc[0, 'iteration']))

El modelo tiene un AUC de 0.7192 en el dataset de testing.
Se usaron 10 iteraciones.
