In [21]:
import pandas as pd
import polars as pl
import numpy as np
import lightgbm as lgb
import optuna

In [2]:
# Project root. Keep exactly one assignment active; the commented lines are
# alternative roots used when the notebook runs in other environments.
# base_path = '/content/drive/MyDrive/DMEyF/2024/'
# base_path = '/home/eanegrin/buckets/b1/'
base_path = 'C:/Eugenio/Maestria/DMEyF/'

# Sub-folders derived from the root (plain string concatenation, so
# base_path must end with a slash).
db_path = base_path + 'db/'
modelos_path = base_path + 'modelos/'
dataset_path = base_path + 'datasets/'

# Feature-engineered dataset read by the loading cells.
dataset_file = 'competencia_02_fe_v01.parquet'  # change this when running on the VM

# Business constants used by the gain metric (lgb_gan_eval): amount added for
# every row weighted as BAJA+2 and amount subtracted for every other row.
ganancia_acierto = 273_000
costo_estimulo = 7_000

# Random seeds; add your own here. Only the first one is used for training.
semillas = [122219, 109279, 400391, 401537, 999961]

In [24]:
# Lazily scan the parquet file and materialise only the rows whose
# foto_mes equals 202108 (the month that gets scored below).
data = (
    pl.scan_parquet(dataset_path + dataset_file)
    .filter(pl.col("foto_mes") == 202108)
    .collect()
)

In [25]:
# Convert the polars frame to a pandas DataFrame.
# The isinstance guard keeps the cell re-runnable: `data` is rebound to the
# pandas frame, which has no `.to_pandas()`, so an unguarded second run
# would raise AttributeError.
if isinstance(data, pl.DataFrame):
    data = data.to_pandas()

In [27]:
# Assign per-row weights by class: 1.00002 for 'BAJA+2', 1.00001 for 'BAJA+1'
# and 1.0 for everything else. The tiny offsets let lgb_gan_eval recover the
# class from the weight vector alone.
data['clase_peso'] = np.select(
    [data['clase_ternaria'] == 'BAJA+2', data['clase_ternaria'] == 'BAJA+1'],
    [1.00002, 1.00001],
    default=1.0,
)

In [28]:
# Binary targets derived from clase_ternaria:
#   clase_binaria1 -> 1 only for 'BAJA+2', 0 otherwise
#   clase_binaria2 -> 0 only for 'CONTINUA', 1 otherwise
data['clase_binaria1'] = np.where(data['clase_ternaria'].eq('BAJA+2'), 1, 0)
data['clase_binaria2'] = np.where(data['clase_ternaria'].ne('CONTINUA'), 1, 0)

In [29]:
# Feature matrix for scoring: every column except the target and the
# helper columns derived from it.
X_test = data.drop(
    columns=['clase_ternaria', 'clase_peso', 'clase_binaria1', 'clase_binaria2']
)

In [8]:
# Load the pre-trained booster from disk.
# Note: the retraining cell at the end of the notebook rebinds `model`, so
# re-run this cell before scoring if that cell has been executed.
model = lgb.Booster(
    model_file=modelos_path + 'lgb_competencia2_v001.txt',
)

In [30]:
# Number of top-scored customers that will be flagged with Predicted = 1.
corte = 10_500

In [31]:
# Score the test month with the current booster.
# A later cell appends 'pred_lgm' and 'Predicted' to X_test; dropping them
# here (errors='ignore' makes this a no-op on a fresh frame) keeps this cell
# re-runnable without feeding non-feature columns to the model.
# The scores follow the current row order of X_test, so rebuild X_test first
# if it has already been sorted by the ranking cell.
y_pred_lgm = model.predict(
    X_test.drop(columns=['pred_lgm', 'Predicted'], errors='ignore')
)

In [33]:
# This cell is not idempotent: it permutes X_test, while y_pred_lgm stays in
# the order X_test had when it was scored. Running it a second time would
# assign scores positionally to the wrong customers, so fail loudly instead.
if 'pred_lgm' in X_test.columns or 'Predicted' in X_test.columns:
    raise RuntimeError(
        "X_test already contains 'pred_lgm'/'Predicted'; rebuild X_test and "
        "recompute y_pred_lgm before running this cell again."
    )
if len(y_pred_lgm) != len(X_test):
    raise ValueError(
        f"y_pred_lgm has {len(y_pred_lgm)} rows but X_test has {len(X_test)}."
    )

# Attach the model score to every row of the test set (positional assignment).
X_test['pred_lgm'] = y_pred_lgm

# Order rows from highest to lowest score to decide who receives the stimulus.
# .iloc is positional; the index is reset first so that the index shown
# afterwards is the row position in the unsorted frame. The explicit copy
# avoids SettingWithCopyWarning when the new column is added below.
idx = np.argsort(y_pred_lgm)[::-1]
X_test = X_test.reset_index(drop=True).iloc[idx].copy()

# 1 for the first `corte` rows of the ranking (they receive the stimulus),
# 0 for all the others. Assigning a numpy array is positional, which is what
# is needed here because the index is no longer ordered.
envios = np.zeros(len(X_test), dtype=int)
envios[:corte] = 1
X_test['Predicted'] = envios

# Do NOT replace the above with a label-based slice such as
# X_test.loc[:corte - 1, 'Predicted'] = 1: after sorting, index labels are
# not in ranking order, so a label slice would not select the top rows.

In [34]:
# Quick visual inspection: customer id, stimulus flag and model score,
# in ranking order.
check = X_test.loc[:, ['numero_de_cliente', 'Predicted', 'pred_lgm']]
check

Unnamed: 0,numero_de_cliente,Predicted,pred_lgm
25508,800789349,1,9.580700e-01
144674,779077357,1,9.558021e-01
4981,991536140,1,9.144224e-01
59654,1031640918,1,9.055210e-01
7010,262440814,1,9.046682e-01
...,...,...,...
162718,1017606395,0,1.512209e-10
136315,710336817,0,9.352578e-11
128067,762560913,0,8.171502e-11
1508,849451684,0,7.404692e-11


In [35]:
# Destination of the results file, built under the project root.
file_name = 'results_v01_02.csv'
output_path = ''.join((base_path, 'exp/competencia_2/', file_name))

# Only the customer id and the 0/1 flag are written, without the index.
output = X_test[['numero_de_cliente', 'Predicted']]
output.to_csv(output_path, index=False)

In [None]:
# Sanity check of the frame that was written to disk.
# A bare `output.shape` that is not the last expression of the cell is never
# displayed, so print it explicitly and preview only the first rows.
print(output.shape)
output.head()

(165442, 2)

In [14]:
# Something broke at some point; as a check, retrain the model using only June (foto_mes == 202106).

In [15]:
# Lazily scan the parquet file and materialise only the rows whose
# foto_mes equals 202106 (the month used for retraining).
# Note: this rebinds `data`, replacing the frame loaded earlier in the notebook.
data = (
    pl.scan_parquet(dataset_path + dataset_file)
    .filter(pl.col("foto_mes") == 202106)
    .collect()
)

In [16]:
# Convert the polars frame to a pandas DataFrame.
# The isinstance guard keeps the cell re-runnable: once `data` is a pandas
# frame it has no `.to_pandas()`, so an unguarded second run would raise
# AttributeError.
if isinstance(data, pl.DataFrame):
    data = data.to_pandas()

In [17]:
# Assign per-row weights by class: 1.00002 for 'BAJA+2', 1.00001 for 'BAJA+1'
# and 1.0 for everything else. lgb_gan_eval reads these weights back to tell
# the classes apart.
data['clase_peso'] = np.select(
    [data['clase_ternaria'] == 'BAJA+2', data['clase_ternaria'] == 'BAJA+1'],
    [1.00002, 1.00001],
    default=1.0,
)

In [18]:
# Binary targets derived from clase_ternaria:
#   clase_binaria1 -> 1 only for 'BAJA+2', 0 otherwise
#   clase_binaria2 -> 0 only for 'CONTINUA', 1 otherwise
data['clase_binaria1'] = np.where(data['clase_ternaria'].eq('BAJA+2'), 1, 0)
data['clase_binaria2'] = np.where(data['clase_ternaria'].ne('CONTINUA'), 1, 0)

In [19]:
# Per-row weights and the two candidate binary targets.
w_train = data['clase_peso']
y_train_binaria1 = data['clase_binaria1']  # 1 only for BAJA+2
y_train_binaria2 = data['clase_binaria2']  # 1 for everything that is not CONTINUA

# Training features: every column except the target and its derived helpers.
X_train = data.drop(
    columns=['clase_ternaria', 'clase_peso', 'clase_binaria1', 'clase_binaria2']
)

In [20]:
def lgb_gan_eval(y_pred, data):
    """Custom evaluation metric: best cumulative gain over the score ranking.

    Each row's gain is derived from its weight: rows weighted exactly 1.00002
    contribute ``+ganancia_acierto`` and rows weighted below 1.00002
    contribute ``-costo_estimulo``. Rows are ordered by descending ``y_pred``
    and the maximum of the running total is reported.

    Parameters:
        y_pred: predicted scores, one per row of ``data``.
        data: object exposing ``get_weight()`` that returns the row weights
            (a ``lgb.Dataset`` when used as a LightGBM ``feval``).

    Returns:
        Tuple ``('gan_eval', max_cumulative_gain, True)``; the final ``True``
        marks the metric as higher-is-better.
    """
    weight = data.get_weight()

    # Per-row gain, decided purely from the weight encoding of the classes.
    premio = np.where(weight == 1.00002, ganancia_acierto, 0)
    costo = np.where(weight < 1.00002, costo_estimulo, 0)
    por_fila = premio - costo

    # Walk the rows from highest to lowest score, accumulating the gain.
    ranking = np.argsort(y_pred)[::-1]
    acumulada = np.cumsum(por_fila[ranking])

    return 'gan_eval', np.max(acumulada), True

In [22]:
# SQLite database holding the hyper-parameter search and the study to read.
storage_name = "sqlite:///" + db_path + "optimization_lgbm.db"
study_name = "competencia2_lgbm_v01"  # UPDATE when switching to another study

# This notebook only consumes the results of a finished search, so load the
# existing study instead of calling create_study(load_if_exists=True): with a
# wrong name or path that call would silently create an empty study in the
# database and the error would only surface later at `study.best_trial`.
# load_study raises immediately when the study does not exist.
study = optuna.load_study(
    study_name=study_name,
    storage=storage_name,
)

[I 2024-11-04 18:09:38,996] Using an existing study with name 'competencia2_lgbm_v01' instead of creating a new one.


In [23]:
# Fetch the winning trial once and reuse it for every lookup below.
best_trial = study.best_trial
best_iter = best_trial.user_attrs["best_iter"]
print(f"Mejor cantidad de árboles para el mejor model {best_iter}")

# Hyper-parameters copied verbatim from the best trial of the study.
tuned_names = (
    'num_leaves',
    'learning_rate',
    'min_data_in_leaf',
    'feature_fraction',
    'bagging_fraction',
)

params = {
    # Fixed settings.
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'first_metric_only': True,
    'boost_from_average': True,
    'feature_pre_filter': False,
    'max_bin': 31,
    # Tuned settings.
    **{name: best_trial.params[name] for name in tuned_names},
    # Reproducibility and logging.
    'seed': semillas[0],
    'verbose': 0,
}

# Training set: target is clase_binaria2, rows weighted by clase_peso.
train_data = lgb.Dataset(X_train, label=y_train_binaria2, weight=w_train)

# Train for exactly the number of rounds stored with the best trial.
# This rebinds `model`, replacing the booster loaded from disk earlier.
model = lgb.train(params, train_data, num_boost_round=best_iter)

Mejor cantidad de árboles para el mejor model 997
