In [1]:
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt

import optuna
import lightgbm as lgb

import pickle
import os

In [2]:
# !gsutil cp /home/eanegrin/buckets/b1/datasets/competencia_02_fe_v01.parquet /home/eanegrin/datasets/

In [2]:
# Project root. Swap in the commented line below to run against the cloud bucket instead.
base_path = 'C:/Eugenio/Maestria/DMEyF/'
# base_path = '/home/eanegrin/buckets/b1/'

# Folders hanging from the root; each keeps its trailing slash so file names can be appended directly.
dataset_path = f'{base_path}datasets/'
modelos_path = f'{base_path}modelos/'
db_path = f'{base_path}db/'

# Parquet file (inside dataset_path) used as training data.
dataset_file = 'competencia_03_fe_v08_undersampled.parquet'

# Amounts used by lgb_gan_eval: gain added per weight-1.00002 row, cost subtracted per remaining row.
ganancia_acierto = 273000
costo_estimulo = 7000

# One final model is trained per seed.
semillas = [
    1149, 4836, 9443, 7483, 3492,
    6287, 830, 3294, 5932, 13,
]

In [3]:
# Load the training dataset from the configured dataset folder.
# data = pd.read_parquet('/home/eanegrin/datasets/' + dataset_file)
data = pd.read_parquet(f'{dataset_path}{dataset_file}')

In [4]:
# Per-row weight that doubles as a class marker: 1.00002 for 'BAJA+2', 1.00001 for 'BAJA+1'
# and 1.0 for every other row. lgb_gan_eval later compares weights against these exact values.
es_baja2 = (data['clase_ternaria'] == 'BAJA+2').to_numpy()
es_baja1 = (data['clase_ternaria'] == 'BAJA+1').to_numpy()
data['clase_peso'] = np.select([es_baja2, es_baja1], [1.00002, 1.00001], default=1.0)

In [5]:
# Binary target: 0 when clase_ternaria is exactly 'CONTINUA', 1 for anything else
# (both 'BAJA+1' and 'BAJA+2'; a missing clase_ternaria also compares unequal and becomes 1).
# The earlier zero-initialisation was dropped: np.where already fills every row.
data['clase_binaria'] = np.where(data['clase_ternaria'] == 'CONTINUA', 0, 1)

In [6]:
# Split the frame into model inputs: features, binary label and per-row weights.
# The three target/weight columns are removed so they cannot be used as features.
columnas_no_features = ['clase_ternaria', 'clase_peso', 'clase_binaria']
X_train = data.drop(columns=columnas_no_features)
w_train = data['clase_peso']
y_train_binaria = data['clase_binaria']

In [7]:
def lgb_gan_eval(y_pred, data):
    """Custom gain metric: best cumulative gain over rows ranked by predicted score.

    Rows are identified through their weights: a weight of exactly 1.00002
    contributes ``ganancia_acierto``; any weight below 1.00002 subtracts
    ``costo_estimulo``. Rows are sorted by ``y_pred`` in descending order, the
    per-row gains are accumulated, and the maximum of that running total is
    the metric value.

    Args:
        y_pred: predicted scores, one per row of ``data``.
        data: object exposing ``get_weight()`` (a LightGBM Dataset when used
            as a custom eval function).

    Returns:
        Tuple ``('gan_eval', max_cumulative_gain, True)``; the trailing True
        is the higher-is-better flag of LightGBM's custom-metric convention.
    """
    pesos = data.get_weight()

    # Kept as two separate terms so a weight above 1.00002 contributes nothing either way.
    aciertos = np.where(pesos == 1.00002, ganancia_acierto, 0)
    costos = np.where(pesos < 1.00002, costo_estimulo, 0)
    ganancia_por_fila = aciertos - costos

    # Highest predicted score first.
    orden_descendente = np.argsort(y_pred)[::-1]
    ganancia_acumulada = np.cumsum(ganancia_por_fila[orden_descendente])

    return 'gan_eval', np.max(ganancia_acumulada), True

# Entrenamiento

In [8]:
# The study lives in a SQLite file under db_path. With load_if_exists=True an existing
# study of the same name is reopened instead of a new one being created.
study_name = "competencia3_lgbm_v08" # UPDATE
storage_name = f"sqlite:///{db_path}optimization_lgbm_v08.db"

study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    direction="maximize",
    load_if_exists=True,
)

[I 2024-12-05 09:50:20,710] Using an existing study with name 'competencia3_lgbm_v08' instead of creating a new one.


In [9]:
# Collect the trials recorded in the study into a DataFrame; the trailing expression
# shows its shape as the cell output.
resultados = study.trials_dataframe()
resultados.shape

(15, 12)

In [10]:
# Display the hyperparameters of the study's current best trial.
study.best_trial.params

{'num_leaves': 248,
 'learning_rate': 0.0498907204475266,
 'min_data_in_leaf': 121,
 'feature_fraction': 0.42739407657827894,
 'bagging_fraction': 0.4653942981227891}

Reentrenamos los modelos individuales con la totalidad de los datos:

In [None]:
version = 'v008' # UPDATE
carpeta_ensamble = 'v010' # UPDATE: folder that stores the models the ensemble is going to use

# Single definition of the output directory, shared by makedirs and every save below.
salida_path = modelos_path + carpeta_ensamble
os.makedirs(salida_path, exist_ok=True)

# Read the best trial once. Each `study.best_trial` access goes back to the study storage,
# so one snapshot avoids repeated lookups and guarantees that best_iter and the
# hyperparameters come from the same trial for every seed.
best_trial = study.best_trial
best_params = best_trial.params
best_iter = best_trial.user_attrs["best_iter"]
print(f"Mejor cantidad de árboles para el mejor model {best_iter}")

# Retrain on the whole training set, one model per seed; only 'seed' differs between models.
for counter, semilla in enumerate(semillas, start=1):

    print(f'{counter}. Train del modelo {version} con semilla {semilla}')

    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'first_metric_only': True,
        'boost_from_average': True,
        'feature_pre_filter': False,
        'max_bin': 31,
        'num_leaves': best_params['num_leaves'],
        'learning_rate': best_params['learning_rate'],
        'min_data_in_leaf': best_params['min_data_in_leaf'],
        'feature_fraction': best_params['feature_fraction'],
        # NOTE(review): LightGBM documents that bagging_fraction only takes effect when
        # bagging_freq is non-zero, and bagging_freq is not set here. Left unchanged so the
        # final models match this cell's original setup; confirm against the Optuna objective.
        'bagging_fraction': best_params['bagging_fraction'],
        'seed': semilla,
        'verbose': -1
    }

    # A fresh Dataset is built for every seed, exactly as before.
    # NOTE(review): presumably a shared Dataset would be binned under the first seed's
    # params, so it is deliberately not hoisted out of the loop; confirm before reusing one.
    train_data = lgb.Dataset(X_train,
                             label=y_train_binaria,
                             weight=w_train)

    # No validation set: the number of rounds is fixed to the best trial's best_iter.
    model = lgb.train(params,
                      train_data,
                      num_boost_round=best_iter)

    # "_final" marks the version trained on the entire available dataset.
    model.save_model(f'{salida_path}/lgb_competencia3_{version}_s{semilla}_final.txt')

Mejor cantidad de árboles para el mejor model 1909
1. Train del modelo v008 con semilla 1149
2. Train del modelo v008 con semilla 4836
3. Train del modelo v008 con semilla 9443
4. Train del modelo v008 con semilla 7483
5. Train del modelo v008 con semilla 3492
6. Train del modelo v008 con semilla 6287
7. Train del modelo v008 con semilla 830
8. Train del modelo v008 con semilla 3294
9. Train del modelo v008 con semilla 5932
10. Train del modelo v008 con semilla 13
