In [None]:
# Install the notebook's dependencies into the kernel's environment.
# The analysis/modeling packages below are pinned to exact versions.
%pip install scikit-learn==1.3.2
%pip install seaborn==0.13.1
%pip install numpy==1.26.4
%pip install matplotlib==3.7.1
%pip install pandas==2.1.4
%pip install lightgbm==4.4.0
%pip install optuna==3.6.1
# NOTE(review): the remaining packages carry no version pin, so the installed
# versions may differ between runs -- consider pinning them as well.
%pip install python-dotenv
%pip install plotly
%pip install ipython
%pip install openpyxl

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

import lightgbm as lgb

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice, plot_contour

from time import time

from datetime import datetime

from dotenv import load_dotenv

import pickle

import os

In [None]:
# Display the current timestamp as the cell output (presumably a manual checkpoint for timing the run).
datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [None]:
load_dotenv()


def _leer_env(nombre):
    """Return the value of environment variable `nombre`.

    Raises:
        RuntimeError: if the variable is not set, so a missing setting is
            reported by name instead of as an obscure TypeError further down.
    """
    valor = os.getenv(nombre)
    if valor is None:
        raise RuntimeError(f"Falta la variable de entorno requerida: {nombre}")
    return valor


# Run configuration is read from environment variables (load_dotenv() above is
# called first so values from a .env file can be picked up).
dataset_path = _leer_env('DATASET_PATH')  # used as a raw string prefix, so it must end with a path separator
dataset_file = _leer_env('DATASET_FILE')
mes_test = int(_leer_env('MES_TEST'))     # foto_mes value held out as the test month
trials = int(_leer_env('TRIALS'))         # number of Optuna trials to run

# Amounts used by the gain metric: the first is credited for rows weighted as
# BAJA+2, the second is charged for every other contacted row.
ganancia_acierto = 273000
costo_estimulo = 7000
semillas = [945787,945799,945809,945811,945817]

data = pd.read_csv(dataset_path + dataset_file)

In [None]:
# Display the current timestamp after loading the dataset (presumably a manual timing checkpoint).
datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [None]:
# Payroll amount relative to the customer's age.
data['mpayroll_sobre_edad'] = data['mpayroll'] / data['cliente_edad']

# Combined Master + Visa features: (field suffix, row-wise aggregation).
# The list order is the order in which the new `vm_*` columns are appended.
agregaciones_tarjetas = [
    ('mfinanciacion_limite', 'sum'),
    ('Fvencimiento', 'min'),
    ('Finiciomora', 'min'),
    ('msaldototal', 'sum'),
    ('msaldopesos', 'sum'),
    ('msaldodolares', 'sum'),
    ('mconsumospesos', 'sum'),
    ('mconsumosdolares', 'sum'),
    ('mlimitecompra', 'sum'),
    ('madelantopesos', 'sum'),
    ('madelantodolares', 'sum'),
    ('fultimo_cierre', 'max'),
    ('mpagado', 'sum'),
    ('mpagospesos', 'sum'),
    ('mpagosdolares', 'sum'),
    ('fechaalta', 'max'),
    ('mconsumototal', 'sum'),
    ('cconsumos', 'sum'),
    ('cadelantosefectivo', 'sum'),
    ('mpagominimo', 'sum'),
]
for campo, agregacion in agregaciones_tarjetas:
    par_tarjetas = data[[f'Master_{campo}', f'Visa_{campo}']]
    # skipna=True: a missing value on one card does not blank out the combined feature.
    data[f'vm_{campo}'] = getattr(par_tarjetas, agregacion)(axis=1, skipna=True)

# Ratio features: (new column, numerator column, denominator column).
ratios_tarjetas = [
    ('vmr_Master_mlimitecompra', 'Master_mlimitecompra', 'vm_mlimitecompra'),
    ('vmr_Visa_mlimitecompra', 'Visa_mlimitecompra', 'vm_mlimitecompra'),
    ('vmr_msaldototal', 'vm_msaldototal', 'vm_mlimitecompra'),
    ('vmr_msaldopesos', 'vm_msaldopesos', 'vm_mlimitecompra'),
    ('vmr_msaldopesos2', 'vm_msaldopesos', 'vm_msaldototal'),
    ('vmr_msaldodolares', 'vm_msaldodolares', 'vm_mlimitecompra'),
    ('vmr_msaldodolares2', 'vm_msaldodolares', 'vm_msaldototal'),
    ('vmr_mconsumospesos', 'vm_mconsumospesos', 'vm_mlimitecompra'),
    ('vmr_mconsumosdolares', 'vm_mconsumosdolares', 'vm_mlimitecompra'),
    ('vmr_madelantopesos', 'vm_madelantopesos', 'vm_mlimitecompra'),
    ('vmr_madelantodolares', 'vm_madelantodolares', 'vm_mlimitecompra'),
    ('vmr_mpagado', 'vm_mpagado', 'vm_mlimitecompra'),
    ('vmr_mpagospesos', 'vm_mpagospesos', 'vm_mlimitecompra'),
    ('vmr_mpagosdolares', 'vm_mpagosdolares', 'vm_mlimitecompra'),
    ('vmr_mconsumototal', 'vm_mconsumototal', 'vm_mlimitecompra'),
    ('vmr_mpagominimo', 'vm_mpagominimo', 'vm_mlimitecompra'),
]
for columna_nueva, numerador, denominador in ratios_tarjetas:
    data[columna_nueva] = data[numerador] / data[denominador]

# The divisions above can produce +/-inf (zero denominators); turn those into NaN
# in the numeric columns only.
numeric_cols = data.select_dtypes(include=[np.number])
infinitos_qty = np.isinf(numeric_cols).sum().sum()
if infinitos_qty > 0:
    print(f"ATENCIÓN: Hay {infinitos_qty} valores infinitos en tu dataset. Serán pasados a NaN.")
    sin_infinitos = numeric_cols.replace([np.inf, -np.inf], np.nan)
    data[numeric_cols.columns] = sin_infinitos



In [None]:
def _contar_nulos(df_mes):
    """Return the number of null values in each column of one month's rows."""
    return df_mes.isnull().sum()


# One row per foto_mes, one column per dataset column, values = null counts.
null_count_by_month = data.groupby('foto_mes').apply(_contar_nulos)

In [None]:
# Filtrar columnas con al menos un valor distinto de 0
filtered_columns = null_count_by_month.loc[:, (null_count_by_month != 0).any(axis=0)]

In [None]:
from scipy.stats import median_abs_deviation

# Per-column median and median absolute deviation (MAD) of the monthly null counts.
median_values = filtered_columns.median()
mad_values = filtered_columns.apply(median_abs_deviation)

# Keep the columns where at least one month deviates from the column median
# by more than 3 times the column's MAD.
desvio_absoluto = (filtered_columns - median_values).abs()
es_atipico = desvio_absoluto > (3 * mad_values)
columns_with_outliers = filtered_columns.loc[:, es_atipico.any(axis=0)]

In [None]:
# Export the flagged columns' monthly null counts to an Excel file in the current working directory.
# NOTE(review): index=False leaves the row index out of the file; the rows come from a
# groupby on foto_mes, so the month labels are presumably lost in the export -- confirm this is intended.
columns_with_outliers.to_excel('columns_with_outliers.xlsx', index=False)

In [None]:
# Print the current working directory (relative output paths such as the Excel export are written there).
print("Directorio actual:", os.getcwd())


In [None]:
# The triple-quoted text below is an unused string literal, i.e. a disabled variant of
# this cell (it would drop several lag/delta columns and blank the test month's target).
'''
data.drop([
    'cseguro_vida_lag1',
    'minversion2_lag1',
    'cinversion2_lag1',
    'minversion1_dolares_lag1',
    'vmr_mpagominimo_delta1',
    'Unnamed: 0'
], axis=1, inplace=True, errors='ignore')

# Asigno nan a la columna clase_ternaria si foto_mes es igual a mes_test
data.loc[data['foto_mes'] == mes_test, 'clase_ternaria'] = np.nan
'''
# Active step: remove the 'Unnamed: 0' column if it is present (errors='ignore'
# makes this a no-op when it is not).
data.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [None]:
# Preview the first rows of the dataset after feature engineering and column cleanup.
data.head()

In [None]:
# Per-row weight that also encodes the ternary class: 1.00002 for BAJA+2,
# 1.00001 for BAJA+1 and 1.0 for everything else. The gain metric reads the
# class back from these weights.
data['clase_peso'] = np.select(
    [data['clase_ternaria'] == 'BAJA+2', data['clase_ternaria'] == 'BAJA+1'],
    [1.00002, 1.00001],
    default=1.0,
)

In [None]:
# Two binary encodings of the ternary target:
#   clase_binaria1 -> 1 only for BAJA+2
#   clase_binaria2 -> 1 for anything that is not CONTINUA (missing labels included)
es_baja2 = data['clase_ternaria'] == 'BAJA+2'
no_continua = data['clase_ternaria'] != 'CONTINUA'
data['clase_binaria1'] = es_baja2.astype(int)
data['clase_binaria2'] = no_continua.astype(int)

In [None]:
# Sanity check: list the distinct values present in the ternary target.
valores_unicos = pd.unique(data['clase_ternaria'])
print(valores_unicos)

In [None]:
# Preview the first rows again, now including the weight and binary target columns.
data.head()

In [None]:
# Defino dataset de train y dataset de test

train_data = data[(data['foto_mes'] != mes_test) & (data['foto_mes'] != mes_test - 1)]
#train_data = data[data['foto_mes'].isin([202101, 202102, 202103])]

test_data = data[data['foto_mes'] == mes_test]

In [None]:
"""
# Inicia drifting

train_null_percentage = train_data.isnull().mean() * 100
test_null_percentage = test_data.isnull().mean() * 100

comparison_df = pd.DataFrame({'Train Null Percentage': train_null_percentage, 'Score Null Percentage': test_null_percentage})
comparison_df['diff_nulls'] = (comparison_df['Score Null Percentage'] - comparison_df['Train Null Percentage']).abs()

comparison_df_sorted = comparison_df.sort_values('diff_nulls', ascending=False)

comparison_df_sorted
"""

In [None]:
"""
trial_null_df_sorted = comparison_df.sort_values('Train Null Percentage', ascending=False)

trial_null_df_sorted
"""

In [None]:
"""
train_zero_percentage = (train_data == 0).mean() * 100
score_zero_percentage = (test_data == 0).mean() * 100

comparison_df_zero = pd.DataFrame({'Train Zero Percentage': train_zero_percentage, 'Score Zero Percentage': score_zero_percentage})

comparison_df_zero['diff_zero_percentage'] = (comparison_df_zero['Score Zero Percentage'] - comparison_df_zero['Train Zero Percentage']).abs()
diff_zero_percentage_sorted = comparison_df_zero.sort_values('diff_zero_percentage',ascending=False)
diff_zero_percentage_sorted
"""

In [None]:
"""
diff_trial_zero_percentage_sorted = comparison_df_zero.sort_values('Train Zero Percentage', ascending=False)
diff_trial_zero_percentage_sorted
"""

In [None]:
"""
columnas_a_eliminar = list(set(
    comparison_df_sorted[comparison_df_sorted['diff_nulls'] > 5].index
).union(
    diff_zero_percentage_sorted[diff_zero_percentage_sorted['diff_zero_percentage'] > 5].index
))


columnas_a_saltar = {'clase_binaria2', 'clase_binaria1','clase_ternaria','foto_mes'}
columnas_a_eliminar = [col for col in columnas_a_eliminar if col not in columnas_a_saltar]


train_data = train_data.drop(columns=columnas_a_eliminar)
test_data = test_data.drop(columns=columnas_a_eliminar)
"""

In [None]:
"""
columnas_a_eliminar = list(set(
    trial_null_df_sorted[trial_null_df_sorted['Train Null Percentage'] > 90].index
).union(
    diff_trial_zero_percentage_sorted[diff_trial_zero_percentage_sorted["Train Zero Percentage"] > 90].index
))

columnas_a_eliminar = [col for col in columnas_a_eliminar if col not in columnas_a_saltar]

train_data = train_data.drop(columns=columnas_a_eliminar)
test_data = test_data.drop(columns=columnas_a_eliminar)

# Termina drifting
"""

In [None]:
# Target and weight columns are not features, so they are removed from the feature matrices.
columnas_no_predictoras = ['clase_ternaria', 'clase_peso', 'clase_binaria1', 'clase_binaria2']

# Training features, both binary targets and the per-row weights.
X_train = train_data.drop(columns=columnas_no_predictoras)
y_train_binaria1, y_train_binaria2 = train_data['clase_binaria1'], train_data['clase_binaria2']
w_train = train_data['clase_peso']

# Same split for the held-out test month.
X_test = test_data.drop(columns=columnas_no_predictoras)
y_test_binaria1, y_test_class = test_data['clase_binaria1'], test_data['clase_ternaria']
w_test = test_data['clase_peso']

In [None]:
#imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
#Xif = imp_mean.fit_transform(X_test)

In [None]:
def lgb_gan_eval(y_pred, data):
    """Custom LightGBM evaluation function: best achievable gain.

    The true class is read from the row weights: a weight of 1.00002 marks a
    BAJA+2 row and earns `ganancia_acierto`; any weight below 1.00002 costs
    `costo_estimulo`. Rows are ranked by predicted score (highest first) and
    the metric is the maximum of the cumulative gain along that ranking.

    Args:
        y_pred: array of predicted scores, one per row.
        data: object exposing `get_weight()` (the LightGBM Dataset being evaluated).

    Returns:
        Tuple `('gan_eval', max_cumulative_gain, True)`; the final `True`
        tells LightGBM that higher values are better.
    """
    pesos = data.get_weight()
    ingreso = np.where(pesos == 1.00002, ganancia_acierto, 0)
    costo = np.where(pesos < 1.00002, costo_estimulo, 0)
    ganancia_por_fila = ingreso - costo

    # Rank rows from highest to lowest score and accumulate the gain in that order.
    orden_descendente = np.argsort(y_pred)[::-1]
    ganancia_acumulada = np.cumsum(ganancia_por_fila[orden_descendente])

    return 'gan_eval', np.max(ganancia_acumulada), True

# Baseline model parameters used for the cross-validation comparison of the two binary targets.
params = {
    'objective': 'binary',
    # The gain is computed by the `feval` function passed to lgb.cv, so no
    # built-in metric is requested. 'custom' is LightGBM's documented alias for
    # "no built-in metric"; the previous value 'gan_eval' is not a LightGBM metric name.
    'metric': 'custom',
    'boosting_type': 'gbdt',
    'max_bin': 31,
    'num_leaves': 31,
    'learning_rate': 0.01,
    'feature_fraction': 0.3,
    # NOTE(review): LightGBM's docs say bagging is only enabled when `bagging_freq`
    # is non-zero; it is not set here, so this value may have no effect -- confirm intent.
    'bagging_fraction': 0.7,
    'verbose': 0
}

In [None]:
# One LightGBM dataset per binary target; features and weights are shared.
train_data1, train_data2 = (
    lgb.Dataset(X_train, label=etiquetas, weight=w_train)
    for etiquetas in (y_train_binaria1, y_train_binaria2)
)

In [None]:
# Both targets are cross-validated with exactly the same settings so that the
# resulting gain curves are comparable.
opciones_cv = dict(
    num_boost_round=150,
    feval=lgb_gan_eval,
    nfold=5,
    seed=semillas[0],
)

cv_results1 = lgb.cv(params, train_data1, **opciones_cv)
cv_results2 = lgb.cv(params, train_data2, **opciones_cv)

In [None]:
# Display the current timestamp after the cross-validation runs (presumably a manual timing checkpoint).
datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [None]:
ganancias_cv1 = cv_results1['valid gan_eval-mean']
ganancias_cv2 = cv_results2['valid gan_eval-mean']

df_ganancias = pd.DataFrame({
    'binaria1': ganancias_cv1,
    'binaria2': ganancias_cv2,
    'Iteracion': range(1, len(ganancias_cv1) + 1),
})

# Rescale the per-fold mean gains by 5, the number of folds used in the cv calls.
columnas_ganancia = ['binaria1', 'binaria2']
df_ganancias[columnas_ganancia] = df_ganancias[columnas_ganancia] * 5

# One line per binary target, gain against boosting iteration.
fig, ax = plt.subplots(figsize=(10, 6))
for columna, etiqueta in (('binaria1', 'binaria 1'), ('binaria2', 'binaria 2')):
    sns.lineplot(x='Iteracion', y=columna, data=df_ganancias, label=etiqueta, ax=ax)
ax.set_title('Comparación de las Ganancias de las 2 clases binarias')
ax.set_xlabel('Iteración')
ax.set_ylabel('Ganancia')
ax.legend()
plt.show()


In [None]:

def objective(trial):
    """Optuna objective: cross-validated gain of a LightGBM model for one trial.

    Samples the hyperparameters from `trial`, runs a 5-fold stratified
    `lgb.cv` on the `clase_binaria2` target with the custom gain metric, and
    stores the best boosting iteration in the trial's user attributes under
    "best_iter".

    Args:
        trial: Optuna trial used to sample hyperparameters.

    Returns:
        The best mean validation gain across iterations, multiplied by 5
        (the number of folds).
    """
    # Sampled hyperparameters. These lines must NOT end with a comma: a trailing
    # comma would turn each value into a 1-element tuple instead of a number.
    num_leaves = trial.suggest_int('num_leaves', 8, 100)
    learning_rate = trial.suggest_float('learning_rate', 0.005, 0.3)  # the lower it is, the more iterations are needed
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 1, 1000)
    feature_fraction = trial.suggest_float('feature_fraction', 0.1, 1.0)
    # NOTE(review): LightGBM's docs say bagging is only enabled when `bagging_freq`
    # is non-zero; it is not set below, so this search dimension may have no effect.
    bagging_fraction = trial.suggest_float('bagging_fraction', 0.1, 1.0)

    params = {
        'objective': 'binary',
        'metric': 'custom',
        'boosting_type': 'gbdt',
        'first_metric_only': True,
        'boost_from_average': True,
        'feature_pre_filter': False,
        'max_bin': 31,
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
        'min_data_in_leaf': min_data_in_leaf,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'seed': semillas[0],
        'verbose': -1
    }

    dtrain = lgb.Dataset(X_train,
                         label=y_train_binaria2,  # choose the target class here
                         weight=w_train)
    cv_results = lgb.cv(
        params,
        dtrain,
        # Raise this for a real search. To add early stopping in LightGBM 4.x pass
        # callbacks=[lgb.early_stopping(int(50 + 5 / learning_rate))]; the old
        # `early_stopping_rounds` argument is no longer accepted by lgb.cv.
        num_boost_round=100,
        feval=lgb_gan_eval,
        stratified=True,
        nfold=5,
        seed=semillas[0]
    )
    ganancias_medias = cv_results['valid gan_eval-mean']
    max_gan = max(ganancias_medias)
    best_iter = ganancias_medias.index(max_gan) + 1  # iterations are 1-based

    # Remember the best iteration so the final model can be trained with it.
    trial.set_user_attr("best_iter", best_iter)

    return max_gan * 5


now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# The SQLite file name includes the timestamp, so each execution of this cell
# points at a different database file under dataset_path.
study_name = "exp_301_lgbm"
storage_name = f"sqlite:///{dataset_path}optimization_lgbm{now}.db"

print('storage_name', storage_name)

study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    direction="maximize",
    load_if_exists=True,
)

In [None]:
# Run the hyperparameter search. `study.optimize` returns None (results live in
# `study`), so its return value is not stored or displayed.
study.optimize(objective, n_trials=trials)  # set TRIALS to 500 when running on gcloud

In [None]:
# Display the current timestamp after the Optuna search (presumably a manual timing checkpoint).
datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [None]:
# Objective value of each trial over the course of the study.
plot_optimization_history(study)

In [None]:
# Relative importance of each hyperparameter for the objective value.
plot_param_importances(study)

In [None]:
# A notebook cell only renders its last expression, so each figure is shown
# explicitly; otherwise the slice plot would be built and silently discarded.
plot_slice(study).show()

plot_contour(study, params=['num_leaves', 'min_data_in_leaf']).show()

In [None]:
mejor_trial = study.best_trial
mejores_params = mejor_trial.params

# Number of boosting rounds that gave the best gain for the best trial.
best_iter = mejor_trial.user_attrs["best_iter"]
print(f"Mejor cantidad de árboles para el mejor model {best_iter}")

# Final model parameters: fixed settings plus the tuned values of the best trial.
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'first_metric_only': True,
    'boost_from_average': True,
    'feature_pre_filter': False,
    'max_bin': 31,
    'num_leaves': mejores_params['num_leaves'],
    'learning_rate': mejores_params['learning_rate'],
    'min_data_in_leaf': mejores_params['min_data_in_leaf'],
    'feature_fraction': mejores_params['feature_fraction'],
    'bagging_fraction': mejores_params['bagging_fraction'],
    'seed': semillas[0],
    'verbose': 0
}

# Use a dedicated name for the LightGBM dataset: rebinding `train_data` here
# would overwrite the training DataFrame and break earlier cells on re-run.
train_data_final = lgb.Dataset(X_train,
                               label=y_train_binaria2,
                               weight=w_train)

model = lgb.train(params,
                  train_data_final,
                  num_boost_round=best_iter)


In [None]:
# Importance of every training feature according to the fitted model, highest
# first; only features with non-zero importance are displayed.
feature_names = X_train.columns.tolist()
importances = model.feature_importance()
importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values('importance', ascending=False)
)
importance_df.query('importance > 0')

In [None]:
# Optional: save the model as a text file and load it back (`modelos_path` is not defined in this notebook and must be set first)

# model.save_model(modelos_path + 'lgb_first.txt')
# model = lgb.Booster(model_file=modelos_path + 'lgb_first.txt')

In [None]:
# Predict with exactly the training feature columns. A later cell adds
# prediction columns to X_test, so passing X_test as-is would fail on re-run
# because the feature count would no longer match the model.
y_pred_lgm = model.predict(X_test[X_train.columns])
y_pred_lgm

In [None]:
# Predicted probabilities for the test month. Only the training feature columns
# are passed to the model: this cell adds two columns to X_test below, so
# predicting on X_test as-is would fail when the cell is run a second time.
y_pred_prob = model.predict(X_test[X_train.columns])

# Convert probabilities to binary predictions using a threshold of 0.025.
threshold = 0.025
# threshold = 0.01

# Try different thresholds here.
y_pred_binary = (y_pred_prob >= threshold).astype(int)

# Attach the probabilities and the binary predictions to the test DataFrame.
X_test['probabilidad'] = y_pred_prob
X_test['prediccion'] = y_pred_binary

X_test.prediccion.value_counts()

In [None]:
# Filtrar el DataFrame para quedarte solo con 'numero_de_cliente' y 'prediccion'
result_df = X_test[['numero_de_cliente', 'prediccion']]

# Renombrar la columna 'prediccion' a 'Predicted' si es necesario
result_df.rename(columns={'prediccion': 'Predicted'}, inplace=True)

In [None]:
# Especificar la ruta completa del archivo donde deseas guardar el DataFrame
now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

output_file = dataset_path + "resultados_predicciones" + now + ".csv"

print('output_file', output_file)

# Guardar el DataFrame como un archivo CSV en la ruta especificada
result_df.to_csv(output_file, index=False)

In [None]:

# Predictions together with their probabilities, one row per customer.
result_df_prob = X_test[['numero_de_cliente', 'prediccion', 'probabilidad']]

# Timestamped output path under dataset_path. It gets its own variable so the
# DataFrame above is not overwritten by the path string.
now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_file_prob = dataset_path + "resultados_predicciones_prob" + now + ".csv"

print('output_file_prob', output_file_prob)

# Write the predictions with probabilities as CSV, without the DataFrame index.
result_df_prob.to_csv(output_file_prob, index=False)




In [None]:
# Display the current timestamp at the end of the notebook (presumably a manual timing checkpoint).
datetime.now().strftime("%Y-%m-%d_%H-%M-%S")