
### Paso a paso para automatizar la búsqueda de los hiperparámetros de un modelo de ML


#### Mi base de datos con las variables seleccionadas

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split

# Load the pre-selected feature table; the first CSV column becomes the index
datos = pd.read_csv('Datos/datos_c_seleccion.csv', index_col=0)

# Split the target column off from the predictors
labels = datos['target']
features = datos.drop(columns='target')

# Hold out a fixed 6000 rows as the test split (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=6000,
    random_state=42,
)
print('Train shape: ', X_train.shape)
print('Test shape: ', X_test.shape)


Train shape:  (54231, 36)
Test shape:  (6000, 36)


In [2]:
# Build the LightGBM datasets
train_set = lgb.Dataset(X_train, label = y_train)
# The held-out features must be paired with their own labels (y_test, not y_train).
# `reference = train_set` declares this as validation data for the training set,
# as the LightGBM Dataset documentation asks for validation datasets.
test_set = lgb.Dataset(X_test, label = y_test, reference = train_set)

### Optimización Bayesiana

Debemos especificar una función para optimizar y sus correspondientes parámetros, con sus valores límite.

Generalmente la optimización consiste en minimizar un valor; como nuestra métrica es AUC ROC, donde cuanto más alto es mejor, la función objetivo retornará 1 - AUC ROC CV.

### 1- Nuestra función objetivo es optimizar los valores de los hiperparámetros de LGBM


In [3]:
import csv
from hyperopt import STATUS_OK
from timeit import default_timer as timer

def objective(hiperparemetros):
    """Objective function for Gradient Boosting Machine hyperparameter optimization.

    Cross-validates LightGBM with the sampled hyperparameters, appends one row
    to `OUT_FILE` on every call and returns the loss to be minimized.

    Relies on the module-level names `ITERATION`, `N_FOLDS`, `OUT_FILE` and
    `train_set` being defined before the first call.

    Parameters
    ----------
    hiperparemetros : dict
        One sample of the search space. Its 'boosting_type' entry must be a
        nested dict holding 'boosting_type' and, optionally, 'subsample'.
        The dict passed in is not modified.

    Returns
    -------
    dict
        'loss' (1 - last mean CV AUC), 'hiperparemetros' (the flattened
        parameters actually used, plus 'n_estimators'), 'iteration',
        'train_time' (seconds spent in cross validation) and 'status'.
    """
    # Keep track of evals
    global ITERATION
    ITERATION += 1

    # Work on a shallow copy so the caller's sample is left untouched; mutating it
    # in place made a second call with the same dict fail on the nested lookup below.
    params = dict(hiperparemetros)

    # Using early stopping to find number of trees trained
    params.pop('n_estimators', None)

    # Flatten the nested boosting choice into top-level keys
    boosting_choice = params['boosting_type']
    params['subsample'] = boosting_choice.get('subsample', 1.0)
    params['boosting_type'] = boosting_choice['boosting_type']

    # Make sure parameters that need to be integers are integers
    for parameter_name in ['num_leaves', 'subsample_for_bin', 'min_child_samples']:
        params[parameter_name] = int(params[parameter_name])

    # Perform n_folds cross validation, timing only the CV call
    start = timer()
    cv_results = lgb.cv(params, train_set, num_boost_round = 10000, nfold = N_FOLDS,
                        early_stopping_rounds = 100, metrics = 'auc', seed = 50)
    run_time = timer() - start

    # The last entry of the AUC history is used as the score.
    # NOTE(review): this assumes early stopping truncates the history at the
    # best round - confirm for the installed LightGBM version.
    auc_history = cv_results['auc-mean']
    best_score = auc_history[-1]

    # Loss must be minimized
    loss = 1 - best_score

    # Number of boosting rounds kept by cross validation
    params['n_estimators'] = len(auc_history)

    # Append this evaluation to the csv file; `with` closes the handle even if the
    # write fails, and newline='' is what the csv module expects for its files.
    with open(OUT_FILE, 'a', newline='') as of_connection:
        csv.writer(of_connection).writerow(
            [loss, params, ITERATION, run_time, best_score])

    # Dictionary with information for evaluation
    return {'loss': loss, 'hiperparemetros': params, 'iteration': ITERATION,
            'train_time': run_time, 'status': STATUS_OK}

### 2- Parámetros que debemos calibrar


In [4]:
# Instantiate a classifier only to list its parameters and their current values,
# as a reference for what can be tuned.
model = lgb.LGBMClassifier(random_state=50)
hiperparemetros = model.get_params()
hiperparemetros

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': 50,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

### 3- Valores posibles de los parámetros para evaluar

In [5]:
# Define the search space
from hyperopt import hp

# 'subsample' is nested under the boosting choice so it is only sampled for
# 'gbdt'; 'goss' is pinned to 1.0. The 'dart' option is left disabled.
boosting_options = [
    {'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)},
    # {'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)},
    {'boosting_type': 'goss', 'subsample': 1.0},
]

# Search space: one hyperopt expression per hyperparameter. The quniform entries
# yield floats, so integer-valued parameters must be cast before use.
space = dict(
    boosting_type=hp.choice('boosting_type', boosting_options),
    num_leaves=hp.quniform('num_leaves', 20, 150, 1),
    # Log-uniform between 0.01 and 0.5
    learning_rate=hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
    subsample_for_bin=hp.quniform('subsample_for_bin', 20000, 300000, 20000),
    min_child_samples=hp.quniform('min_child_samples', 20, 500, 5),
    reg_alpha=hp.uniform('reg_alpha', 0.0, 1.0),
    reg_lambda=hp.uniform('reg_lambda', 0.0, 1.0),
    colsample_bytree=hp.uniform('colsample_by_tree', 0.6, 1.0),
    is_unbalance=hp.choice('is_unbalance', [True, False]),
    # Flagged as very important by the author.
    # NOTE(review): class_weight is listed among the LGBMClassifier parameters;
    # confirm that lgb.cv also honours it - TODO.
    class_weight=hp.choice('class_weight', ['balanced', None]),
)

### 4-Optimización

In [6]:
from hyperopt import tpe
# Suggestion algorithm from hyperopt's `tpe` module, kept under a descriptive name.
# NOTE(review): the fmin call in this notebook passes tpe.suggest directly, so
# this alias appears unused.
tpe_algorithm = tpe.suggest
from hyperopt import fmin

In [7]:
from hyperopt import Trials

# Record the outcome of every evaluation
trials = Trials()

# Results file; the objective function appends one row to it per evaluation
OUT_FILE = 'Datos/bayes_test.csv'

# Reset the evaluation counter
ITERATION = 0

# (Re)create the file containing only the column names. `with` guarantees the
# handle is closed even if the write fails, and newline='' is what the csv
# module expects for files it writes.
headers = ['loss', 'hyperparameters', 'iteration', 'runtime', 'score']
with open(OUT_FILE, 'w', newline='') as of_connection:
    writer = csv.writer(of_connection)
    writer.writerow(headers)

In [8]:
# Search budget, number of CV folds and the evaluation counter.
# N_FOLDS and ITERATION are read by `objective` as module-level names.
MAX_EVALS, N_FOLDS = 10, 5
ITERATION = 0

# Minimise the objective over the search space, recording every run in `trials`
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    trials=trials,
    max_evals=MAX_EVALS,
)

best

100%|█████████████████████████████████████████████████| 10/10 [02:05<00:00, 14.87s/it, best loss: 0.037083527045228104]


{'boosting_type': 0,
 'class_weight': 0,
 'colsample_by_tree': 0.9327992276257075,
 'gdbt_subsample': 0.8372297281400392,
 'is_unbalance': 1,
 'learning_rate': 0.020972552144714853,
 'min_child_samples': 110.0,
 'num_leaves': 124.0,
 'reg_alpha': 0.678907006302082,
 'reg_lambda': 0.08624711694400033,
 'subsample_for_bin': 220000.0}

In [9]:
# Load the per-evaluation log that the objective function appended to OUT_FILE
results = pd.read_csv(OUT_FILE)


In [10]:
# Sort with the highest score on top and rebuild the index, so that row 0 is
# the best evaluation
results=results.sort_values('score', ascending = False).reset_index(drop = True)

In [11]:
# Best evaluation (row 0 after sorting by score); the value is the dict's text
# form as it was written to the CSV
results.loc[0, 'hyperparameters']

"{'boosting_type': 'gbdt', 'class_weight': 'balanced', 'colsample_bytree': 0.9327992276257075, 'is_unbalance': False, 'learning_rate': 0.020972552144714853, 'min_child_samples': 110, 'num_leaves': 124, 'reg_alpha': 0.678907006302082, 'reg_lambda': 0.08624711694400033, 'subsample_for_bin': 220000, 'subsample': 0.8372297281400392, 'n_estimators': 99}"

In [12]:
# Second-best evaluation (row 1 after sorting by score)
results.loc[1, 'hyperparameters']

"{'boosting_type': 'gbdt', 'class_weight': 'balanced', 'colsample_bytree': 0.7900995784745264, 'is_unbalance': True, 'learning_rate': 0.0343034148498665, 'min_child_samples': 250, 'num_leaves': 91, 'reg_alpha': 0.2659708831339006, 'reg_lambda': 0.5710140558171175, 'subsample_for_bin': 160000, 'subsample': 0.9358923747737681, 'n_estimators': 62}"