# Optimización de hiperparámetros

In [2]:
import pandas as pd
import polars as pl
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

import lightgbm as lgb

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice, plot_contour

from time import time

import pickle

In [3]:
# Copy the undersampled dataset parquet from the bucket directory to /home/eanegrin/datasets/.
!gsutil cp /home/eanegrin/buckets/b1/datasets/competencia_03_fe_v06_undersampled.parquet /home/eanegrin/datasets/

Copying file:///home/eanegrin/buckets/b1/datasets/competencia_03_fe_v06_undersampled.parquet...
- [1 files][552.0 MiB/552.0 MiB]                                                
Operation completed over 1 objects/552.0 MiB.                                    


In [4]:
# Root directory of the project files.
# (A local Windows setup used 'C:/Eugenio/Maestria/DMEyF/' instead.)
base_path = '/home/eanegrin/buckets/b1/'

dataset_path = base_path + 'datasets/'
modelos_path = base_path + 'modelos/'
db_path = base_path + 'db/'
dataset_file = 'competencia_03_fe_v06_undersampled.parquet'

# Directory the parquet file is copied to by the `gsutil cp` cell; naming it
# here keeps every path the notebook reads from in a single place.
local_dataset_path = '/home/eanegrin/datasets/'

# Payoffs used by the custom gain metric: gain for each row weighted 1.00002
# and cost charged for every other row.
ganancia_acierto = 273000
costo_estimulo = 7000

# Random seeds available to the notebook.
semillas = [122219, 109279, 400391, 401537, 999961]

# Read the copied file; swap in `dataset_path` to read from under base_path instead.
data = pd.read_parquet(local_dataset_path + dataset_file)

In [5]:
# The undersampled dataset already excludes the "bad" months and the months
# whose ternary class is incomplete, so only the testing month (202107) is
# left to remove.
data = data.loc[data['foto_mes'].ne(202107)]

# Show the months that remain.
data['foto_mes'].unique()

array([202007, 202008, 202009, 202010, 202011, 202012, 202101, 202102,
       202103, 202104, 202105, 202106])

In [6]:
# (rows, columns) of the dataset after dropping the testing month.
data.shape

(211679, 1152)

In [7]:
# Assign a weight to every row according to its ternary class:
# 'BAJA+2' -> 1.00002, 'BAJA+1' -> 1.00001, anything else -> 1.0.
data['clase_peso'] = np.select(
    [data['clase_ternaria'] == 'BAJA+2', data['clase_ternaria'] == 'BAJA+1'],
    [1.00002, 1.00001],
    default=1.0,
)

In [8]:
# Binary target and per-row weights, taken from the frame as-is.
y_train_binaria = data['clase_binaria']
w_train = data['clase_peso']

# Predictors: every column except the two targets and the weight.
X_train = data.drop(columns=['clase_ternaria', 'clase_peso', 'clase_binaria'])

Para evaluar la calidad del modelo, crearemos nuestra propia función de evaluación que calcule la ganancia. La razón de incluir los pesos es precisamente para poder implementar esta función de evaluación de manera adecuada. Al combinar las clases *BAJA+1* y *BAJA+2* en una sola, necesitamos una forma de diferenciarlas, y es aquí donde entra en juego el *weight*. Este parámetro nos permitirá distinguir entre ambas clases al momento de evaluarlas dentro del algoritmo.


In [9]:
def lgb_gan_eval(y_pred, data):
    """Custom LightGBM metric: maximum cumulative gain along the score ranking.

    Rows are told apart by their weight: a row weighted exactly 1.00002 earns
    ``ganancia_acierto`` and every row weighted below 1.00002 costs
    ``costo_estimulo``. Rows are ordered by descending ``y_pred`` and the
    per-row payoffs are accumulated in that order.

    Parameters
    ----------
    y_pred : array-like
        Predicted scores, one per row.
    data : object exposing ``get_weight()``
        Dataset whose weights identify each row's class.

    Returns
    -------
    tuple
        ``('gan_eval', best_cumulative_gain, True)``; the final ``True`` marks
        the metric as higher-is-better.
    """
    pesos = data.get_weight()

    # Payoff of contacting each row, before any ordering is applied.
    acierto = np.where(pesos == 1.00002, ganancia_acierto, 0)
    costo = np.where(pesos < 1.00002, costo_estimulo, 0)
    payoff = acierto - costo

    # Highest scores first, then the running total over that ranking.
    orden_desc = np.argsort(y_pred)[::-1]
    curva = np.cumsum(payoff[orden_desc])

    return 'gan_eval', curva.max(), True

# Optimización

In [10]:
def objective(trial):
    """Optuna objective: cross-validated gain of LightGBM for one trial.

    Samples the hyperparameters from ``trial``, runs a stratified ``lgb.cv``
    with early stopping on the custom ``gan_eval`` metric (``lgb_gan_eval``)
    and records the best boosting round in the trial's ``best_iter`` user
    attribute.

    Reads the notebook-level ``X_train``, ``y_train_binaria``, ``w_train`` and
    ``semillas``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters and to store ``best_iter``.

    Returns
    -------
    float
        Best mean validation gain across the folds, multiplied by the number
        of folds.
    """
    nfold = 5

    # No trailing commas on these lines: `x = f(),` binds a one-element tuple,
    # not the sampled number.
    num_leaves = trial.suggest_int('num_leaves', 8, 200)
    learning_rate = trial.suggest_float('learning_rate', 0.005, 0.05)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 64, 2000)
    feature_fraction = trial.suggest_float('feature_fraction', 0.1, 1.0)
    bagging_fraction = trial.suggest_float('bagging_fraction', 0.1, 1.0)

    params = {
        'objective': 'binary',
        'metric': 'custom',
        'boosting_type': 'gbdt',
        'first_metric_only': True,
        'boost_from_average': True,
        'feature_pre_filter': False,
        'max_bin': 31,
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
        'min_data_in_leaf': min_data_in_leaf,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        # LightGBM only applies bagging_fraction when bagging_freq is non-zero;
        # without it the sampled bagging_fraction has no effect on the model.
        'bagging_freq': 1,
        'seed': semillas[1],
        'verbose': -1
    }

    train_data = lgb.Dataset(X_train,
                             label=y_train_binaria,
                             weight=w_train)

    # Stop a trial once the metric has not improved for 200 rounds.
    early_stopping_cb = lgb.early_stopping(stopping_rounds=200)

    cv_results = lgb.cv(
        params,
        train_data,
        num_boost_round=2000,
        callbacks=[early_stopping_cb],
        feval=lgb_gan_eval,
        stratified=True,
        nfold=nfold,
        seed=semillas[1]
    )

    gan_mean = cv_results['valid gan_eval-mean']
    max_gan = max(gan_mean)
    best_iter = gan_mean.index(max_gan) + 1  # rounds are 1-based

    # Keep the best boosting round so it can be read back from the study.
    trial.set_user_attr("best_iter", best_iter)

    # Each validation fold holds about 1/nfold of the rows, so scale the mean
    # fold gain by nfold. Whether the value is maximised or minimised is
    # decided when the study is created.
    return max_gan * nfold

In [11]:
# SQLite file under db_path used as the Optuna storage backend.
storage_name = f"sqlite:///{db_path}optimization_lgbm.db"
study_name = "competencia3_lgbm_v06"  # UPDATE (presumably: change the name for each new experiment)

# Create the study, or reopen it if one with this name already exists in the storage.
study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    direction="maximize",
    load_if_exists=True,
)

[I 2024-12-04 03:17:36,218] A new study created in RDB with name: competencia3_lgbm_v06


In [None]:
# Run 100 trials of `objective` on the study.
study.optimize(objective, n_trials=100)

Analizamos los resultados como de costumbre.

In [None]:
# Objective value of every trial, in the order the trials were run.
plot_optimization_history(study)

In [None]:
# Relative importance of each hyperparameter for the objective value.
plot_param_importances(study)

In [None]:
# Objective value against each hyperparameter, one panel per hyperparameter.
plot_slice(study)

In [None]:
# Contour plots of the objective value over the hyperparameters of the study.
plot_contour(study)

In [None]:
# Same contour plot restricted to num_leaves and min_data_in_leaf.
plot_contour(study, params=['num_leaves','min_data_in_leaf'])

In [None]:
# Hyperparameter values of the best trial in the study.
study.best_trial.params

In [None]:
# Boosting round stored by `objective` under the "best_iter" user attribute of the best trial.
best_iter = study.best_trial.user_attrs["best_iter"]
best_iter