In [1]:
import pandas as pd
import polars as pl
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import lightgbm as lgb

import optuna

import os

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Copy the feature-engineered parquet from the buckets/b1 path to local disk;
# the configuration cell reads it back from /home/eanegrin/datasets/.
!gsutil cp /home/eanegrin/buckets/b1/datasets/competencia_02_fe_v01.parquet /home/eanegrin/datasets/

Copying file:///home/eanegrin/buckets/b1/datasets/competencia_02_fe_v01.parquet...
- [1 files][  7.1 GiB/  7.1 GiB]                                                
Operation completed over 1 objects/7.1 GiB.                                      


In [3]:
# --- Locations inside the bucket ---------------------------------------
base_path = '/home/eanegrin/buckets/b1/'
dataset_path, modelos_path, db_path = (
    base_path + subdir for subdir in ('datasets/', 'modelos/', 'db/')
)
dataset_file = 'competencia_02_fe_v01.parquet'

# --- Amounts used by the gain metric (see lgb_gan_eval) ----------------
ganancia_acierto = 273000
costo_estimulo = 7000

# --- Seeds (add your own) ----------------------------------------------
semillas = [122219, 109279, 400391, 401537, 999961]

# The dataset is read from the local copy; to read it straight from the
# bucket instead, load dataset_path + dataset_file.
data = pd.read_parquet(f'/home/eanegrin/datasets/{dataset_file}')

In [4]:
# Training window: 202006..202012 plus 202101..202105 (202106 is left out for testing).
meses_train = [*range(202006, 202013), *range(202101, 202106)]

# Keep only the rows whose foto_mes falls inside the training window.
data = data.loc[data['foto_mes'].isin(meses_train)]
data.shape

(1924019, 678)

In [5]:
# Assign a weight to every class. Every row starts at 1.0; the two BAJA
# classes get values marginally above it, which lgb_gan_eval later reads
# back to tell the classes apart.
data['clase_peso'] = 1.0

for etiqueta, peso in (('BAJA+2', 1.00002), ('BAJA+1', 1.00001)):
    data.loc[data['clase_ternaria'] == etiqueta, 'clase_peso'] = peso

In [6]:
# Binary target: 1 for either churn class (BAJA+1 / BAJA+2), 0 for everything else.
# The positive classes are listed explicitly instead of testing "is not CONTINUA",
# so a row with a missing clase_ternaria is not silently labelled as a churner;
# this matches clase_peso, which leaves such rows at the default weight of 1.0.
# No zero pre-initialisation is needed: the column is fully assigned here.
data['clase_binaria'] = data['clase_ternaria'].isin(['BAJA+1', 'BAJA+2']).astype(int)

In [7]:
# Features are every column except the original label and the two derived columns.
X_train = data.drop(columns=['clase_ternaria', 'clase_peso', 'clase_binaria'])
w_train = data['clase_peso']
y_train_binaria = data['clase_binaria']  # both BAJA classes merged into the positive label

In [8]:
# LightGBM dataset: features, binary label and the per-row class weights.
train_data = lgb.Dataset(data=X_train, label=y_train_binaria, weight=w_train)

In [9]:
def lgb_gan_eval(y_pred, data):
    """Custom LightGBM metric: best cumulative gain along the score ranking.

    Rows are ordered by descending ``y_pred``. A row whose weight is exactly
    1.00002 adds ``ganancia_acierto``; a row whose weight is below 1.00002
    subtracts ``costo_estimulo``. The metric is the maximum of the running
    total over that ordering.

    Args:
        y_pred: predicted scores, one per row.
        data: object exposing ``get_weight()`` for the rows being evaluated.

    Returns:
        Tuple ``('gan_eval', best_gain, True)``; the trailing ``True`` marks
        the metric as higher-is-better.
    """
    pesos = data.get_weight()

    # Per-row contribution: gain for the 1.00002-weighted rows, cost for lighter rows.
    aciertos = np.where(pesos == 1.00002, ganancia_acierto, 0)
    costos = np.where(pesos < 1.00002, costo_estimulo, 0)

    # Walk the rows from highest to lowest score, accumulating the gain.
    orden_desc = np.argsort(y_pred)[::-1]
    curva = np.cumsum((aciertos - costos)[orden_desc])

    return 'gan_eval', np.max(curva), True

In [10]:
# Load the existing Optuna study that holds the hyperparameter search.
# load_study raises if the study is not in the database, so a wrong
# study_name or db path fails here instead of silently creating an empty
# study that only breaks later on `best_trial`.

storage_name = "sqlite:///" + db_path + "optimization_lgbm.db"
study_name = "competencia2_lgbm_v01" # UPDATE

study = optuna.load_study(study_name=study_name, storage=storage_name)

# Number of boosting rounds recorded for the best trial under the "best_iter" user attribute.
best_iter = study.best_trial.user_attrs["best_iter"]

[I 2024-11-14 13:07:39,576] Using an existing study with name 'competencia2_lgbm_v01' instead of creating a new one.


In [None]:
# Cross-validated gain for each seed, using the best hyperparameters found by Optuna.

# The best trial is looked up once: its hyperparameters are the same for every
# seed, so there is no reason to query the study on each iteration.
best_trial = study.best_trial
tuned = best_trial.params

# Everything except the seed is loop-invariant.
# NOTE(review): no bagging_freq is set; per the LightGBM parameter docs,
# bagging_fraction only takes effect when bagging_freq > 0 — confirm this
# matches the setup used during the Optuna search.
base_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'first_metric_only': True,
    'boost_from_average': True,
    'feature_pre_filter': False,
    'max_bin': 31,
    'num_leaves': tuned['num_leaves'],
    'learning_rate': tuned['learning_rate'],
    'min_data_in_leaf': tuned['min_data_in_leaf'],
    'feature_fraction': tuned['feature_fraction'],
    'bagging_fraction': tuned['bagging_fraction'],
    'verbose': 0,
}

cv_rows = []

for semilla in semillas:

    params = {**base_params, 'seed': semilla}

    # No early stopping: the number of rounds is fixed to the best iteration
    # recorded by the Optuna study.
    results = lgb.cv(
        params,
        train_data,
        num_boost_round=best_iter,
        feval=lgb_gan_eval,
        stratified=True,
        nfold=5,
        seed=semilla
    )

    # Keep the metric values of the last boosting iteration.
    final_mean = results['valid gan_eval-mean'][-1]
    final_std = results['valid gan_eval-stdv'][-1]

    cv_rows.append({
        'model': 'v01',  # UPDATE
        'semilla': semilla,
        'mean_cv_score': final_mean,
        'std_cv_score': final_std,
        # Derived from the training window so it cannot drift from meses_train.
        'n_months_train': len(meses_train),
    })

cv_results = pd.DataFrame(cv_rows)

In [None]:
# Append this run's cross-validation results to the shared CSV of experiments.
output_path = base_path + 'exp/competencia_2/cv_results.csv'

# Create the experiment folder if it is missing, so the CV results computed
# above are not lost to a failed write.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

if os.path.exists(output_path):
    # Earlier runs exist: keep them and add the new rows at the end.
    existing_results = pd.read_csv(output_path)
    results = pd.concat([existing_results, cv_results], ignore_index=True)
else:
    results = cv_results

results.to_csv(output_path, index=False)