In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import polars as pl

from sklearn.tree import DecisionTreeClassifier, plot_tree,  _tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
import lightgbm as lgb
from sklearn.impute import SimpleImputer
import gc
from joblib import Parallel, delayed
from pathlib import Path
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice, plot_contour

from time import time
import os
import pickle

In [None]:
# print(os.path.getsize("data/df_confe.csv") / (1024**3), "GB")


2.5967229744419456 GB


In [3]:
# Leer CSV completo
data = pl.read_csv("data/competencia_01_fe_sinslope.csv")

#print(data.shape)        # filas, columnas
#print(data.head())


In [2]:
# Filtrar los meses deseados
#df_filtrado = data.filter(
#    (data["foto_mes"] == 202103) | (data["foto_mes"] == 202104)
#)

# Guardar en CSV
#df_filtrado.write_csv("df_confe_ma.csv")

In [2]:
SEMILLAS = [550007, 550019, 550031, 550033, 550047]

mes_train = 202103, 202102, 202101
mes_test = 202104
mes_kaggle = 202106
ganancia_acierto = 780000
costo_estimulo = 20000
# =====================

In [4]:
# Filtrar solo el mes de kaggle
df_kaggle = data.filter(pl.col("foto_mes") == mes_kaggle)

In [6]:
df_kaggle.shape

(164313, 803)

In [5]:
#guRDAR csv kaggle
df_kaggle.write_csv("data/df_kaggle.csv")

In [None]:
# Filtrar solo el mes de train
#df_train = data.filter(pl.col("foto_mes").is_in(mes_train))

# Filtrar solo el mes de test
#df_test = data.filter(pl.col("foto_mes") == mes_test)


In [None]:
#guardar df_train en csv
#df_train.write_csv("data/df_train_01_02_03.csv")
#df_test.write_csv("data/df_test_04.csv")   

In [3]:
df_train = pd.read_csv("data/df_train_01_02_03.csv")

In [4]:
#tiene mi df_train la columna "clase_ternaria"?
print("clase_ternaria" in df_train.columns)

True


In [5]:
df_train['clase_peso'] = 1.0

df_train.loc[df_train['clase_ternaria'] == 'BAJA+2', 'clase_peso'] = 1.00002
df_train.loc[df_train['clase_ternaria'] == 'BAJA+1', 'clase_peso'] = 1.00001

#Optimización con LightGBM

In [6]:

import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Separar X e y
X = df_train.drop(["clase_ternaria", "clase_peso"], axis=1)  # ✅ Sacamos también clase_peso
y = df_train["clase_ternaria"]
pesos = df_train["clase_peso"]  # ✅ Guardamos los pesos

# Binarizar y
y_binaria = (y != "CONTINUA").astype(int)

# Split 70/30 (ahora incluimos los pesos)
X_train, X_val, y_train, y_val, pesos_train, pesos_val = train_test_split(
    X, y_binaria, pesos,  # ✅ Separamos X, y Y pesos
    train_size=0.7,
    random_state=42,
    stratify=y_binaria
)

print("Train:", X_train.shape, y_train.shape, pesos_train.shape)
print("Validation:", X_val.shape, y_val.shape, pesos_val.shape)

# Ahora en el Dataset:
train_data = lgb.Dataset(X_train,
                          label=y_train,
                          weight=pesos_train  # ✅ Usamos los pesos del train
                          )

Train: (340753, 802) (340753,) (340753,)
Validation: (146038, 802) (146038,) (146038,)


In [8]:
def ganancia_prob(y_pred, data):
  weight = data.get_weight()
  ganancia = np.where(weight == 1.00002, ganancia_acierto, 0) - np.where(weight < 1.00002, costo_estimulo, 0)
  ganancia = ganancia[np.argsort(y_pred)[::-1]]
  ganancia = np.cumsum(ganancia)  # ✅ Bien
  return 'gan_eval', np.max(ganancia), True
  


In [13]:
def objective(trial):

    num_leaves = trial.suggest_int('num_leaves', 8, 100)
    learning_rate = trial.suggest_float('learning_rate', 0.005, 0.1) # mas bajo, más iteraciones necesita
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 300, 1000)
    feature_fraction = trial.suggest_float('feature_fraction', 0.1, 1.0)
    bagging_fraction = trial.suggest_float('bagging_fraction', 0.1, 1.0),

    params = {
        'objective': 'binary',
        'metric': 'custom',
        'boosting_type': 'gbdt',
        'first_metric_only': True,
        'boost_from_average': True,
        'feature_pre_filter': False,
        'max_bin': 31,
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
        'min_data_in_leaf': min_data_in_leaf,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'seed': SEMILLAS[0],
        'verbose': -1
    }
    train_data = lgb.Dataset(X_train,
                              label=y_train, # eligir la clase
                              weight=pesos_train
                              )
    cv_results = lgb.cv(
        params,
        train_data,
        num_boost_round=100, # modificar, subit y subir... y descomentar la línea inferior
        # early_stopping_rounds= int(50 + 5 / learning_rate),
        feval=ganancia_prob,
        stratified=True,
        nfold=3,
        seed=SEMILLAS[0],
        callbacks=[
                lgb.early_stopping(stopping_rounds=int(50 + 5/learning_rate), verbose=False),
                lgb.log_evaluation(period=200),
                ]
    )
    max_gan = max(cv_results['valid gan_eval-mean'])
    best_iter = cv_results['valid gan_eval-mean'].index(max_gan) + 1

    # Guardamos cual es la mejor iteración del modelo
    trial.set_user_attr("best_iter", best_iter)

    return max_gan * 3
# Al final de objective()
gc.collect()

#guardar el archivo en mi carpeta data
storage_name = "sqlite:///data/optimization_lgbm.db"

study_name = "exp_301_lgbm"

study = optuna.create_study(
    direction="maximize",
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,
)

[I 2025-10-09 16:41:37,005] Using an existing study with name 'exp_301_lgbm' instead of creating a new one.


In [14]:
study.optimize(objective, n_trials=30) # subir subir

[I 2025-10-09 16:43:25,676] Trial 15 finished with value: 824520000.0 and parameters: {'num_leaves': 13, 'learning_rate': 0.032922995753811045, 'min_data_in_leaf': 964, 'feature_fraction': 0.2632501067072054, 'bagging_fraction': 0.28635017212034686}. Best is trial 15 with value: 824520000.0.
[I 2025-10-09 16:45:32,227] Trial 16 finished with value: 1027440000.0 and parameters: {'num_leaves': 100, 'learning_rate': 0.07720399027065392, 'min_data_in_leaf': 658, 'feature_fraction': 0.39718138159387195, 'bagging_fraction': 0.3601895736680899}. Best is trial 16 with value: 1027440000.0.
[I 2025-10-09 16:47:35,837] Trial 17 finished with value: 791800000.0 and parameters: {'num_leaves': 49, 'learning_rate': 0.007962520110203843, 'min_data_in_leaf': 895, 'feature_fraction': 0.4706483103904118, 'bagging_fraction': 0.4298206201719591}. Best is trial 16 with value: 1027440000.0.
[I 2025-10-09 16:49:25,185] Trial 18 finished with value: 834300000.0 and parameters: {'num_leaves': 12, 'learning_rate

In [18]:
import json
# Obtener los mejores 10 trials
best_trials = study.trials_dataframe().sort_values('value', ascending=False).head(10)

# Opción 1: Guardar como lista de diccionarios (más completo)
mejores_params = []
for i, trial in enumerate(study.best_trials[:10] if len(study.best_trials) >= 10 else study.best_trials):
    mejores_params.append({
        'rank': i + 1,
        'trial_number': trial.number,
        'value': trial.value,
        'params': trial.params,
        'best_iter': trial.user_attrs.get('best_iter', None)
    })

# Guardar en JSON
with open('data/mejores_hiperparametros.json', 'w', encoding='utf-8') as f:
    json.dump(mejores_params, f, indent=4, ensure_ascii=False)

