In [None]:
import pandas as pd
import polars as pl
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

import lightgbm as lgb

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice, plot_contour

from time import time

import pickle

In [None]:
# Root folder of the working bucket; datasets, models and the Optuna DB live under it.
# For a local run, point base_path at the local project folder instead, e.g.
# 'C:/Users/Cristian Burich/Desktop/MA/segundo/eyf/' (the sub-folders are the same).
base_path = '/home/cburich_pymnts/buckets/b1/'
dataset_path = f'{base_path}datasets/'
modelos_path = f'{base_path}modelos/'
db_path = f'{base_path}db/'

# Parquet file to load (TODO: confirm whether the version without "U" should be used).
dataset_file = 'competencia_03_fe_k300.parquet'

# Amount added for each positive case and amount subtracted for each negative case
# by the profit functions defined later in this notebook.
ganancia_acierto = 273000
costo_estimulo = 7000

# Add your own seeds here.
semillas = [165229, 165211, 165203, 165237, 165247]

# An alternative dataset location used previously was '/home/eanegrin/datasets/'.
data = pd.read_parquet(f'{dataset_path}{dataset_file}')

In [None]:
# Drop three columns from the loaded frame (the reason is not stated in this notebook).
# Re-running this cell without reloading `data` raises KeyError, because the
# columns are already gone after the first run.
columnas_descartadas = ['clase_ternaria_1', 'tmobile_app', 'cmobile_app_trx']
data = data.drop(columns=columnas_descartadas)

In [None]:
# Binary targets derived from clase_ternaria; rows without a class stay NaN.
#   clase_binaria1: 1 only for 'BAJA+2', 0 for any other known class.
#   clase_binaria2: 0 only for 'CONTINUA', 1 for any other known class.
ternaria = data['clase_ternaria']
sin_clase = ternaria.isna()

# np.select takes the first matching condition, so the class test has priority
# over the missing-value test, exactly like a nested np.where.
data['clase_binaria1'] = np.select([ternaria == 'BAJA+2', sin_clase], [1, np.nan], default=0)
data['clase_binaria2'] = np.select([ternaria == 'CONTINUA', sin_clase], [0, np.nan], default=1)

In [None]:
# Assign a weight to every row according to its class.
# 'BAJA+2' -> 1.00002, 'BAJA+1' -> 1.00001, anything else (including missing) -> 1.0.
clase = data['clase_ternaria']

data['clase_peso'] = np.select(
    [clase == 'BAJA+2', clase == 'BAJA+1'],
    [1.00002, 1.00001],
    default=1.0,
)

# Binary flag: 1 for 'BAJA+2', 0 for everything else (rows with no class included).
# NOTE(review): this column is derived from the label, so it must never reach the
# model as a feature.
data['clase_binaria'] = np.where(clase == 'BAJA+2', 1, 0)

In [None]:
def lgb_gan_eval(y_pred, data):
    """Evaluation metric: best cumulative profit along the score ranking.

    Each row is worth ``ganancia_acierto`` when its weight equals 1.00002 and
    costs ``costo_estimulo`` when its weight is below 1.00002. Rows are ordered
    by descending ``y_pred`` and the maximum of the running profit is reported.

    Parameters
    ----------
    y_pred : array-like
        Predicted scores, one per row.
    data : object exposing ``get_weight()``
        Presumably the ``lgb.Dataset`` being evaluated; only its weights are read.

    Returns
    -------
    tuple
        ``('gan_eval', max_cumulative_profit, True)``.
    """
    pesos = data.get_weight()

    # The row weight doubles as a class marker: it tells gains from costs.
    aciertos = np.where(pesos == 1.00002, ganancia_acierto, 0)
    costos = np.where(pesos < 1.00002, costo_estimulo, 0)
    ganancia_por_fila = aciertos - costos

    # Highest score first, then accumulate the profit along the ranking.
    orden_desc = np.argsort(y_pred)[::-1]
    ganancia_acumulada = np.cumsum(ganancia_por_fila[orden_desc])

    return 'gan_eval', np.max(ganancia_acumulada), True

In [None]:
# Training periods in YYYYMM format, built one contiguous run at a time.
# 202006 is absent from the list (the reason is not stated in this notebook).
meses_train = [
    *range(201906, 201913),  # 201906 .. 201912
    *range(202001, 202006),  # 202001 .. 202005
    *range(202007, 202013),  # 202007 .. 202012
    *range(202101, 202106),  # 202101 .. 202105
]

In [None]:

# Columns that must not be fed to the model: the raw label, the sample weight and
# every target derived from the label. 'clase_binaria' is listed because it is
# computed directly from clase_ternaria; leaving it among the features leaks the
# target into both training and test matrices.
columnas_no_features = ['clase_ternaria', 'clase_peso',
                        'clase_binaria', 'clase_binaria1', 'clase_binaria2']

# Period held out for evaluation.
mes_test = 202107

train_data = data[data['foto_mes'].isin(meses_train)]
test_data = data[data['foto_mes'] == mes_test]

# Training features, the two binary targets and the per-row weights.
X_train = train_data.drop(columns=columnas_no_features)
y_train_binaria1 = train_data['clase_binaria1']
y_train_binaria2 = train_data['clase_binaria2']
w_train = train_data['clase_peso']

# Test features, the strict binary target, the original class and the weights.
X_test = test_data.drop(columns=columnas_no_features)
y_test_binaria1 = test_data['clase_binaria1']
y_test_class = test_data['clase_ternaria']
w_test = test_data['clase_peso']

In [None]:
# Display the (rows, columns) shape of the training feature matrix.
X_train.shape

### Rescatamos los resultados de la optimización

In [None]:
# Connect to the Optuna study stored in the SQLite file under db_path.
# With load_if_exists=True an existing study with this name is returned as-is;
# if none exists, a new empty study is created instead.
storage_name = f"sqlite:///{db_path}optimization_lgbm.db"
study_name = "competencia3_lgbm_k300"  # UPDATE

study_config = {
    "study_name": study_name,
    "storage": storage_name,
    "direction": "maximize",
    "load_if_exists": True,
}
study = optuna.create_study(**study_config)

In [None]:
# Fetch the best trial once and reuse it for every lookup below.
best_trial = study.best_trial

# Number of boosting rounds stored by the optimization for the best trial.
best_iter = best_trial.user_attrs["best_iter"]
print(f"Mejor cantidad de árboles para el mejor model {best_iter}")

# Hyperparameters taken verbatim from the best trial.
hiperparametros_optimizados = (
    'num_leaves',
    'learning_rate',
    'min_data_in_leaf',
    'feature_fraction',
    'bagging_fraction',
)

# Fixed settings first, then the tuned values, then seed and verbosity.
# NOTE(review): no 'bagging_freq' is set here; per the LightGBM docs
# bagging_fraction only takes effect when bagging_freq is non-zero — confirm
# this matches the configuration used during optimization.
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'first_metric_only': True,
    'boost_from_average': True,
    'feature_pre_filter': False,
    'max_bin': 31,
    **{nombre: best_trial.params[nombre] for nombre in hiperparametros_optimizados},
    'seed': semillas[0],
    'verbose': 0,
}



In [None]:
# Wrap the training matrix for LightGBM, using the broad binary target
# (clase_binaria2) as label and clase_peso as per-row weight.
# NOTE: this rebinds `train_data` from the training DataFrame to an lgb.Dataset.
train_data = lgb.Dataset(data=X_train, label=y_train_binaria2, weight=w_train)

# Fit a single model with the tuned parameters and number of rounds.
model = lgb.train(params=params, train_set=train_data, num_boost_round=best_iter)

In [None]:
# Display the LightGBM parameter dict currently in use.
params

In [None]:
# Pair every training column with the importance reported by the fitted model.
feature_names = X_train.columns.tolist()
importances = model.feature_importance()

# Most important features first.
importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values('importance', ascending=False)
)

# Show only the features with non-zero importance.
importance_df.loc[importance_df['importance'] > 0]

### Predecimos con 30 semillas en el modelo

In [None]:
import random

# Fixed starting seed so the generated list of seeds is reproducible.
initial_seed = 165229
random.seed(initial_seed)

# Draw 30 seeds, each an integer between 0 and 200000 (both ends included).
semillero = []
for _ in range(30):
    semillero.append(random.randint(0, 200000))

print(semillero)

In [None]:
def ganancia_prob(y_pred, y_true, prop = 1, threshold = 0.025):
    """Profit obtained by targeting every row whose score reaches a cutoff.

    Parameters
    ----------
    y_pred : array-like
        Predicted scores, one per row.
    y_true : array-like
        True labels; 1 earns ``ganancia_acierto``, 0 costs ``costo_estimulo``,
        any other value (NaN included) contributes nothing.
    prop : float, default 1
        Divisor applied to the total, e.g. to scale up a result computed on a
        fraction of the data.
    threshold : float, default 0.025
        Rows with ``y_pred >= threshold`` are the ones targeted.

    Returns
    -------
    float
        Sum of the per-row profit over the targeted rows, divided by ``prop``.
    """
    # Coerce to arrays so plain lists behave like ndarrays / Series.
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)

    ganancia = np.where(y_true == 1, ganancia_acierto, 0) - np.where(y_true == 0, costo_estimulo, 0)
    return ganancia[y_pred >= threshold].sum() / prop


In [None]:
# Train one model per seed and record the profit each one obtains on the test set.
ganancia_results = []

for seed in semillero:
    # Per-run copy: the shared `params` dict keeps its configured seed, so
    # re-running the earlier training/display cells is not affected by this loop.
    params_semilla = {**params, 'seed': seed}

    model = lgb.train(params_semilla,
                      train_data,
                      num_boost_round=best_iter)

    y_pred_lgm = model.predict(X_test)

    # Profit on the test period, measured against the strict binary target.
    ganancia = ganancia_prob(y_pred_lgm, y_test_binaria1)
    ganancia_results.append(ganancia)
    print(f"Ganancia LGBM con seed {seed}: {ganancia}")

In [None]:
# Mean profit across all the seeded runs collected in ganancia_results.
average_ganancia = np.asarray(ganancia_results).mean()
print(f"Ganancia promedio: {average_ganancia}")

In [None]:
# Histogram of the profit obtained by each seeded run.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(ganancia_results, bins=20, edgecolor='black')
# Optional: ax.set_xlim(9e7, 10.5e7) to pin the x-range when comparing experiments.
ax.set_title('Histograma de Ganancias - Baseline')
ax.set_xlabel('Ganancia')
ax.set_ylabel('Frecuencia')
ax.grid(axis='y', alpha=0.75)
plt.show()