In [None]:
# Install the notebook's dependencies with the %pip magic, one package per line.
# The modelling stack below is pinned to exact versions.
%pip install scikit-learn==1.3.2
%pip install seaborn==0.13.1
%pip install numpy==1.26.4
%pip install matplotlib==3.7.1
%pip install pandas==2.1.4
%pip install lightgbm==4.4.0
%pip install optuna==3.6.1
# NOTE(review): the three packages below are not pinned, so the installed version can
# differ between runs — consider pinning them like the ones above.
%pip install python-dotenv
%pip install plotly
%pip install ipython

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

import lightgbm as lgb

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice, plot_contour

from time import time

from datetime import datetime

from dotenv import load_dotenv

import pickle

import os

In [None]:
# Display the current local time formatted as YYYY-MM-DD_HH-MM-SS (presumably a progress marker for the run).
datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [None]:
# Load variables from a .env file into the process environment.
load_dotenv()

# The dataset location is read from the environment instead of being hard-coded.
dataset_path = os.getenv('DATASET_PATH')
dataset_file = os.getenv('DATASET_FILE')

# Fail early with a readable message: os.getenv returns None for an unset variable,
# which would otherwise surface later as an opaque TypeError on `None + None`.
variables_faltantes = [
    nombre
    for nombre, valor in (('DATASET_PATH', dataset_path), ('DATASET_FILE', dataset_file))
    if valor is None
]
if variables_faltantes:
    raise RuntimeError(
        "Missing environment variable(s): " + ", ".join(variables_faltantes)
        + ". Define them in the environment or in the .env file."
    )

# Business constants used by the custom gain metric (lgb_gan_eval).
ganancia_acierto = 273000
costo_estimulo = 7000

# Seeds; only semillas[0] is used by the cells visible in this notebook.
semillas = [945787, 945799, 945809, 945811, 945817]

# Month (foto_mes value) held out as the test set.
mes_test = 202109

# The two parts are concatenated as-is, so dataset_path must already end with a path separator.
data = pd.read_csv(dataset_path + dataset_file)

In [None]:
# Display the current local time again, after the CSV has been read (presumably to gauge load time).
datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [None]:
# Disabled preprocessing, kept for reference. It used to live inside a bare
# triple-quoted string, which is an evaluated-and-discarded expression rather than
# a comment; real comments make the intent explicit.
#
# data.drop([
#     'cseguro_vida_lag1',
#     'minversion2_lag1',
#     'cinversion2_lag1',
#     'minversion1_dolares_lag1',
#     'vmr_mpagominimo_delta1',
#     'Unnamed: 0'
# ], axis=1, inplace=True, errors='ignore')
#
# # Set clase_ternaria to NaN where foto_mes equals mes_test
# data.loc[data['foto_mes'] == mes_test, 'clase_ternaria'] = np.nan

# Drop the 'Unnamed: 0' column when it exists; errors='ignore' makes this a no-op
# otherwise. Rebinding instead of inplace=True keeps the cell safely re-runnable.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Preview the first rows of the loaded dataset.
data.head()

In [None]:
# Per-row weight that doubles as a class marker: every row starts at 1.0 and the two
# BAJA classes get a slightly larger value that lgb_gan_eval later matches against.
data['clase_peso'] = 1.0

for clase, peso in {'BAJA+2': 1.00002, 'BAJA+1': 1.00001}.items():
    data.loc[data['clase_ternaria'] == clase, 'clase_peso'] = peso

In [None]:
# Two alternative binary targets derived from clase_ternaria:
#   clase_binaria1 -> 1 only for 'BAJA+2'
#   clase_binaria2 -> 1 for everything that is not 'CONTINUA' (missing values included)
clase = data['clase_ternaria']
data['clase_binaria1'] = clase.eq('BAJA+2').astype(int)
data['clase_binaria2'] = clase.ne('CONTINUA').astype(int)

In [None]:
# Collect the distinct values present in clase_ternaria (missing values, if any, are included by unique()).
valores_unicos = data['clase_ternaria'].unique()

In [None]:
# Print the distinct clase_ternaria values as a sanity check of the labels.
print(valores_unicos)

In [None]:
# Preview the first rows again, now including the clase_peso / clase_binaria* columns.
data.head()

In [None]:
# mes_train = 202104

# foto_mes values look like YYYYMM integers (e.g. 202109), so "previous month" cannot
# be a plain `- 1` when mes_test falls in January: 202201 - 1 == 202200, not 202112.
anio_test, mes_del_anio = divmod(mes_test, 100)
mes_anterior = (anio_test - 1) * 100 + 12 if mes_del_anio == 1 else mes_test - 1

# Train on every month except the test month and the month right before it.
train_data = data[~data['foto_mes'].isin([mes_test, mes_anterior])]
# train_data = data[data['foto_mes'].isin([202101, 202102, 202103])]
# train_data = data[data['foto_mes'] == mes_train]  # for competencia_01

test_data = data[data['foto_mes'] == mes_test]

# Label and weight columns that must not be fed to the model as features.
columnas_no_features = ['clase_ternaria', 'clase_peso', 'clase_binaria1', 'clase_binaria2']

X_train = train_data.drop(columnas_no_features, axis=1)
y_train_binaria1 = train_data['clase_binaria1']
y_train_binaria2 = train_data['clase_binaria2']
w_train = train_data['clase_peso']

X_test = test_data.drop(columnas_no_features, axis=1)
y_test_binaria1 = test_data['clase_binaria1']
y_test_class = test_data['clase_ternaria']
w_test = test_data['clase_peso']

In [None]:
# Preview the first rows of the test month (foto_mes == mes_test).
test_data.head()

In [None]:
# Preview the first rows of the training subset.
train_data.head()

In [None]:
#imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
#Xif = imp_mean.fit_transform(X_test)

In [None]:
def lgb_gan_eval(y_pred, data):
    """Custom LightGBM evaluation function: best achievable cumulative gain.

    Rows are identified through their weight: a weight of exactly 1.00002 earns
    `ganancia_acierto`, and every row with a weight below 1.00002 costs
    `costo_estimulo`. Rows are ranked by predicted score, highest first, and the
    metric is the maximum of the running gain over that ranking.

    Args:
        y_pred: predicted scores, one per row.
        data: object exposing get_weight() (the dataset passed in by LightGBM).

    Returns:
        Tuple (metric name, metric value, higher-is-better flag).
    """
    pesos = data.get_weight()

    premio = np.where(pesos == 1.00002, ganancia_acierto, 0)
    costo = np.where(pesos < 1.00002, costo_estimulo, 0)
    ganancia_por_fila = premio - costo

    # Highest predicted score first.
    orden_descendente = np.argsort(y_pred)[::-1]
    ganancia_acumulada = np.cumsum(ganancia_por_fila[orden_descendente])

    return 'gan_eval', np.max(ganancia_acumulada), True

# Baseline model parameters used by the two exploratory cross-validation runs.
# NOTE(review): 'gan_eval' is not a built-in LightGBM metric name; the value is
# actually produced by the feval callback (lgb_gan_eval). The Optuna objective in
# this notebook uses 'custom' for the same purpose — confirm which is intended.
params = dict(
    objective='binary',
    metric='gan_eval',
    boosting_type='gbdt',
    max_bin=31,
    num_leaves=31,
    learning_rate=0.01,
    feature_fraction=0.3,
    bagging_fraction=0.7,
    verbose=0,
)

In [None]:
# One LightGBM dataset per candidate binary target; features and weights are shared.
train_data1, train_data2 = [
    lgb.Dataset(X_train, label=etiquetas, weight=w_train)
    for etiquetas in (y_train_binaria1, y_train_binaria2)
]

In [None]:
# Run the same 5-fold cross-validation for each binary target so that their gain
# curves can be compared; only the dataset differs between the two calls.
opciones_cv = dict(
    num_boost_round=150,
    feval=lgb_gan_eval,
    nfold=5,
    seed=semillas[0],
)

cv_results1 = lgb.cv(params, train_data1, **opciones_cv)
cv_results2 = lgb.cv(params, train_data2, **opciones_cv)

In [None]:
# Display the current local time after the two cross-validation runs (presumably to gauge their duration).
datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [None]:
# Mean validation gain per boosting round for each binary target.
ganancia_binaria1 = cv_results1['valid gan_eval-mean']
ganancia_binaria2 = cv_results2['valid gan_eval-mean']

df_ganancias = pd.DataFrame({
    'binaria1': ganancia_binaria1,
    'binaria2': ganancia_binaria2,
    'Iteracion': range(1, len(ganancia_binaria1) + 1),
})

# Scale the gains by 5, which matches the nfold=5 used in the cross-validation
# (presumably to bring per-fold gains up to the size of the whole training set).
for columna in ('binaria1', 'binaria2'):
    df_ganancias[columna] = df_ganancias[columna] * 5

plt.figure(figsize=(10, 6))
for columna, etiqueta in (('binaria1', 'binaria 1'), ('binaria2', 'binaria 2')):
    sns.lineplot(x='Iteracion', y=columna, data=df_ganancias, label=etiqueta)
plt.title('Comparación de las Ganancias de las 2 clases binarias')
plt.xlabel('Iteración')
plt.ylabel('Ganancia')
plt.legend()
plt.show()


In [None]:

def objective(trial):
    """Optuna objective: cross-validated gain of a LightGBM model.

    Samples five hyperparameters from the trial, runs a stratified 5-fold
    lgb.cv on the clase_binaria2 target with lgb_gan_eval as the metric, and
    records the best boosting round in the trial's user attributes.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The best mean validation 'gan_eval' over the boosting rounds, multiplied
        by the number of folds (presumably to approximate the gain on the full
        training set).
    """
    n_folds = 5

    # Plain scalars. These assignments previously ended with a trailing comma, which
    # silently wrapped every value in a 1-tuple; LightGBM happened to accept that, but
    # any arithmetic on the values (e.g. `5 / learning_rate`) would have failed.
    num_leaves = trial.suggest_int('num_leaves', 8, 100)
    learning_rate = trial.suggest_float('learning_rate', 0.005, 0.3)  # lower values need more iterations
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 1, 1000)
    feature_fraction = trial.suggest_float('feature_fraction', 0.1, 1.0)
    bagging_fraction = trial.suggest_float('bagging_fraction', 0.1, 1.0)

    # Local names differ from the notebook-level `params` / `train_data` on purpose,
    # so the objective never appears to touch them.
    lgb_params = {
        'objective': 'binary',
        'metric': 'custom',
        'boosting_type': 'gbdt',
        'first_metric_only': True,
        'boost_from_average': True,
        'feature_pre_filter': False,
        'max_bin': 31,
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
        'min_data_in_leaf': min_data_in_leaf,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'seed': semillas[0],
        'verbose': -1
    }

    dtrain = lgb.Dataset(X_train,
                         label=y_train_binaria2,  # choose the target class here
                         weight=w_train)
    cv_results = lgb.cv(
        lgb_params,
        dtrain,
        num_boost_round=100,  # raise this progressively, then enable the line below
        # early_stopping_rounds= int(50 + 5 / learning_rate),
        feval=lgb_gan_eval,
        stratified=True,
        nfold=n_folds,
        seed=semillas[0]
    )
    ganancias = cv_results['valid gan_eval-mean']
    max_gan = max(ganancias)
    best_iter = ganancias.index(max_gan) + 1  # boosting rounds are 1-based

    # Store the best iteration so the final model can be trained with it.
    trial.set_user_attr("best_iter", best_iter)

    return max_gan * n_folds


# now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
now = '2024-11-25_17-48-22'  # fixed timestamp so that an already existing database is reused

study_name = "exp_301_lgbm"
# SQLite file placed under dataset_path; the pieces are concatenated without separators.
storage_name = "".join(["sqlite:///", dataset_path, "optimization_lgbm", now, ".db"])

print('storage_name', storage_name)

# load_if_exists=True resumes the study stored under this name instead of failing.
study = optuna.create_study(
    storage=storage_name,
    study_name=study_name,
    direction="maximize",
    load_if_exists=True,
)

In [None]:
# Run the hyperparameter search. study.optimize() returns None, so there is nothing
# to assign or to display; results are read from `study` afterwards.
study.optimize(objective, n_trials=150)  # raise to 500 when running on gcloud

In [None]:
# Display the current local time after the Optuna search (presumably to gauge its duration).
datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [None]:
# Objective value per trial; uses the name imported at the top of the notebook,
# like the other Optuna plots below.
plot_optimization_history(study)

In [None]:
# Show Optuna's hyperparameter-importance plot for the study.
plot_param_importances(study)

In [None]:
# Only the last expression of a cell is rendered automatically, so the slice plot
# has to be shown explicitly; otherwise its figure is built and silently discarded.
plot_slice(study).show()

# Interaction between the two tree-size hyperparameters.
plot_contour(study, params=['num_leaves', 'min_data_in_leaf'])

In [None]:
# Retrain on the full training set with the best hyperparameters found by Optuna.
mejor_trial = study.best_trial
best_iter = mejor_trial.user_attrs["best_iter"]
print(f"Mejor cantidad de árboles para el mejor model {best_iter}")

hiperparametros = mejor_trial.params

# NOTE: this rebinds the notebook-level `params` (baseline parameters) and
# `train_data` (training DataFrame); re-run the earlier cells before reusing them.
params = dict(
    objective='binary',
    boosting_type='gbdt',
    first_metric_only=True,
    boost_from_average=True,
    feature_pre_filter=False,
    max_bin=31,
    num_leaves=hiperparametros['num_leaves'],
    learning_rate=hiperparametros['learning_rate'],
    min_data_in_leaf=hiperparametros['min_data_in_leaf'],
    feature_fraction=hiperparametros['feature_fraction'],
    bagging_fraction=hiperparametros['bagging_fraction'],
    seed=semillas[0],
    verbose=0,
)

train_data = lgb.Dataset(X_train, label=y_train_binaria2, weight=w_train)

# Train for exactly the number of rounds that was best during cross-validation.
model = lgb.train(params, train_data, num_boost_round=best_iter)


In [None]:
# Table of feature importances from the trained model, most important first;
# the last line displays only the features with a non-zero importance.
feature_names = X_train.columns.tolist()
importances = model.feature_importance()

importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values('importance', ascending=False)
)
importance_df[importance_df['importance'] > 0]

In [None]:
# Opcional: guardo modelo como txt

# model.save_model(modelos_path + 'lgb_first.txt')
# model = lgb.Booster(model_file=modelos_path + 'lgb_first.txt')

In [None]:
# Score the test month with the final model; the bare name on the last line displays the predictions.
y_pred_lgm = model.predict(X_test)
y_pred_lgm

In [None]:
# Columns this cell appends to X_test.
columnas_resultado = ['probabilidad', 'prediccion']

# Predict on the feature columns only. Without this guard, re-running the cell would
# pass the two result columns added below to the model and fail, because the number
# of features would no longer match the one used for training.
y_pred_prob = model.predict(X_test.drop(columns=columnas_resultado, errors='ignore'))

# Convert the scores into binary predictions using a 0.025 threshold.
threshold = 0.025

# Try other thresholds here.
y_pred_binary = (y_pred_prob >= threshold).astype(int)

# Attach the scores and the binary predictions to the test frame.
X_test['probabilidad'] = y_pred_prob
X_test['prediccion'] = y_pred_binary

# How many rows fall on each side of the threshold.
X_test['prediccion'].value_counts()

In [None]:
# Keep only the customer id and the prediction, renaming the latter to 'Predicted'.
# Chaining a non-inplace rename avoids the SettingWithCopyWarning that
# rename(inplace=True) raises on a frame derived from X_test.
result_df = (
    X_test[['numero_de_cliente', 'prediccion']]
    .rename(columns={'prediccion': 'Predicted'})
)

In [None]:
# Timestamp the output file so that successive runs do not overwrite each other.
# NOTE: this rebinds `now`, which the study-creation cell also sets.
now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# The pieces are concatenated without separators, under dataset_path.
output_file = "".join([dataset_path, "resultados_predicciones", now, ".csv"])

print('output_file', output_file)

# Write the predictions as CSV, without the index column.
result_df.to_csv(output_file, index=False)

In [None]:
# Display the current local time at the end of the notebook run.
datetime.now().strftime("%Y-%m-%d_%H-%M-%S")