In [1]:
import os

import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
import polars as pl

In [2]:
# base_path = '/content/drive/MyDrive/DMEyF/2024/'
base_path = 'C:/Eugenio/Maestria/DMEyF/'
# base_path = '/home/eanegrin/buckets/b1/'

dataset_path = base_path + 'datasets/'
modelos_path = base_path + 'modelos/'
db_path = base_path + 'db/'
dataset_file = 'competencia_02_fe_v01.parquet'

ganancia_acierto = 273000
costo_estimulo = 7000

# agregue sus semillas
semillas = [122219, 109279, 400391, 401537, 999961]

In [3]:
data = pl.scan_parquet(dataset_path + dataset_file).filter(pl.col("foto_mes") == 202108).collect()

In [4]:
# Convert the polars DataFrame to a pandas DataFrame; the cells below use
# the pandas API (.loc, .drop, .iloc).
data = data.to_pandas()

In [5]:
# Assign a weight to every row according to its class: rows default to 1.0
# and the two BAJA classes get a marginally larger weight.
pesos_por_clase = {
    'BAJA+2': 1.00002,
    'BAJA+1': 1.00001,
}

data['clase_peso'] = 1.0
for clase, peso in pesos_por_clase.items():
    data.loc[data['clase_ternaria'] == clase, 'clase_peso'] = peso

In [6]:
# Binary target: 1 for 'BAJA+2', 0 for everything else.
# (The previous zero-initialization was a dead store: np.where already
# produces a value for every row, so the column was fully overwritten.)
data['clase_binaria'] = np.where(data['clase_ternaria'] == 'BAJA+2', 1, 0)

In [7]:
# Feature frame: every column except the ternary class, the row weight and
# the binary target.
columnas_no_features = ['clase_ternaria', 'clase_peso', 'clase_binaria']
X_test = data.drop(columns=columnas_no_features)

### Predicciones para Kaggle

In [8]:
# Try 20 cut-off points for each seed (100 submissions in total with 5 seeds).

version = 'v014'  # UPDATE for every new experiment version

# File name of the saved model for each seed.
modelos = [
    f'lgb_competencia2_{version}_s{semilla}_final.txt'
    for semilla in semillas
]

# 20 evenly spaced cut-offs between 10000 and 13000, truncated to integers.
puntos_corte = np.linspace(start=10000, stop=13000, num=20).astype(int)

In [9]:
# Sanity check: show the model file names built from `version` and `semillas`.
modelos

['lgb_competencia2_v014_s122219_final.txt',
 'lgb_competencia2_v014_s109279_final.txt',
 'lgb_competencia2_v014_s400391_final.txt',
 'lgb_competencia2_v014_s401537_final.txt',
 'lgb_competencia2_v014_s999961_final.txt']

In [10]:
# Sanity check: show the folder the saved models are loaded from.
modelos_path

'C:/Eugenio/Maestria/DMEyF/modelos/'

In [11]:
# Build the Kaggle submission files: for every saved model (one per seed),
# score the 202108 rows, rank the clients from highest to lowest prediction
# and write one CSV per cut-off flagging the top `corte` clients.

# The feature frame is identical for every model, so it is built once here
# instead of being dropped/copied again on each iteration.
X_test = data.drop(columns=['clase_ternaria', 'clase_peso', 'clase_binaria'])

# Only the client id is needed in the output, so the wide feature frame is
# never re-sorted or mutated (this also avoids assigning columns on a slice).
clientes = X_test['numero_de_cliente'].to_numpy()

# DataFrame.to_csv does not create missing folders; make sure the target exists.
output_dir = base_path + f'exp/competencia_2/{version}/'
os.makedirs(output_dir, exist_ok=True)

counter = 0  # running submission number, used in the output file name

for modelo in modelos:

    # load the saved model
    model = lgb.Booster(model_file=f'{modelos_path}{version}/{modelo}')

    # predict for August (foto_mes 202108)
    y_pred_lgm = model.predict(X_test)

    # row positions ordered from highest to lowest prediction
    idx = np.argsort(y_pred_lgm)[::-1]
    clientes_ordenados = clientes[idx]

    for corte in puntos_corte:

        # flag the first `corte` clients of the ranking, leave the rest at 0
        envios = np.zeros(len(clientes_ordenados), dtype=int)
        envios[:corte] = 1

        output = pd.DataFrame({
            'numero_de_cliente': clientes_ordenados,
            'Predicted': envios,
        })

        counter += 1
        file_name = f'results_{version}_{counter}.csv'
        output_path = output_dir + file_name

        output.to_csv(output_path, index=False)

        # log which model / cut-off each numbered file corresponds to
        print(f'{output_path} --- modelo: {modelo}, corte: {corte}')

C:/Eugenio/Maestria/DMEyF/exp/competencia_2/v014/results_v014_1.csv --- modelo: lgb_competencia2_v014_s122219_final.txt, corte: 10000
C:/Eugenio/Maestria/DMEyF/exp/competencia_2/v014/results_v014_2.csv --- modelo: lgb_competencia2_v014_s122219_final.txt, corte: 10157
C:/Eugenio/Maestria/DMEyF/exp/competencia_2/v014/results_v014_3.csv --- modelo: lgb_competencia2_v014_s122219_final.txt, corte: 10315
C:/Eugenio/Maestria/DMEyF/exp/competencia_2/v014/results_v014_4.csv --- modelo: lgb_competencia2_v014_s122219_final.txt, corte: 10473
C:/Eugenio/Maestria/DMEyF/exp/competencia_2/v014/results_v014_5.csv --- modelo: lgb_competencia2_v014_s122219_final.txt, corte: 10631
C:/Eugenio/Maestria/DMEyF/exp/competencia_2/v014/results_v014_6.csv --- modelo: lgb_competencia2_v014_s122219_final.txt, corte: 10789
C:/Eugenio/Maestria/DMEyF/exp/competencia_2/v014/results_v014_7.csv --- modelo: lgb_competencia2_v014_s122219_final.txt, corte: 10947
C:/Eugenio/Maestria/DMEyF/exp/competencia_2/v014/results_v014_