In [1]:
import pandas as pd
import polars as pl
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

import lightgbm as lgb

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice, plot_contour

from time import time

import pickle
import os

In [2]:
# !gsutil cp /home/eanegrin/buckets/b1/datasets/competencia_02_fe_v01.parquet /home/eanegrin/datasets/

In [2]:
# Root folder of the project. Alternative roots kept from earlier runs:
#   '/content/drive/MyDrive/DMEyF/2024/'
#   '/home/eanegrin/buckets/b1/'
base_path = 'C:/Eugenio/Maestria/DMEyF/'

# Sub-folders are plain string concatenations on base_path (which ends in '/').
dataset_path, modelos_path, db_path = (
    base_path + carpeta for carpeta in ('datasets/', 'modelos/', 'db/')
)
dataset_file = 'competencia_03_fe_v08_undersampled.parquet'

# Constants read by lgb_gan_eval: the first is added for rows whose weight is
# exactly 1.00002, the second is subtracted for rows whose weight is below it.
ganancia_acierto = 273_000
costo_estimulo = 7_000

# Seeds passed to LightGBM as 'seed'; one model is trained per seed further down.
semillas = [122219, 109279, 400391, 401537, 999961]

In [3]:
# Load the dataset named by dataset_file from dataset_path into a pandas DataFrame.
# The commented-out line reads the same file from an alternative location.
# data = pd.read_parquet('/home/eanegrin/datasets/' + dataset_file)
data = pd.read_parquet(dataset_path + dataset_file)

In [4]:
# The undersampled dataset already excludes the "bad" months and the months whose
# clase_ternaria is incomplete; the only thing left to exclude is the testing month.
#
# .copy() makes the filtered frame an independent object: the following cells add
# columns to `data`, and doing that on the direct result of a boolean filter is
# what triggers pandas' SettingWithCopyWarning.
data = data[data['foto_mes'] != 202107].copy()

# Display the months that remain for training.
data['foto_mes'].unique()

array([201905, 201906, 201907, 201908, 201909, 201910, 201911, 201912,
       202001, 202002, 202003, 202004, 202005, 202007, 202008, 202009,
       202010, 202011, 202012, 202101, 202102, 202103, 202104, 202105,
       202106])

In [5]:
# Assign a weight to every row according to its class: BAJA+2 -> 1.00002,
# BAJA+1 -> 1.00001, anything else -> 1.0. lgb_gan_eval compares the weights
# against 1.00002, so the weight doubles as a marker of the original class.
clase = data['clase_ternaria']
data['clase_peso'] = np.where(
    clase == 'BAJA+2',
    1.00002,
    np.where(clase == 'BAJA+1', 1.00001, 1.0),
)

In [6]:
# Binary target: 1 for every row whose clase_ternaria is not 'CONTINUA', 0 otherwise.
data['clase_binaria'] = np.where(data['clase_ternaria'] != 'CONTINUA', 1, 0)

In [7]:
# Feature matrix: every column except the target/weight helper columns.
# NOTE(review): the feature-importance output further down ranks numero_de_cliente
# and foto_mes highest; they look like an identifier and the period, so confirm
# they are meant to be model features.
columnas_auxiliares = ['clase_ternaria', 'clase_peso', 'clase_binaria']
X_train = data.drop(columns=columnas_auxiliares)

# Binary label (both BAJA classes collapsed into 1) and the per-row weights.
y_train_binaria = data['clase_binaria']
w_train = data['clase_peso']

In [8]:
def lgb_gan_eval(y_pred, data):
    """Return the maximum cumulative gain over rows ranked by predicted score.

    Each row contributes ``ganancia_acierto`` when its weight equals 1.00002 and
    ``-costo_estimulo`` when its weight is below 1.00002. Rows are ordered by
    ``y_pred`` from highest to lowest, the contributions are accumulated, and the
    largest running total is reported.

    Parameters
    ----------
    y_pred : array-like
        Predicted scores, one per row.
    data : object exposing ``get_weight()``
        Source of the per-row weights (same length as ``y_pred``).

    Returns
    -------
    tuple
        ``('gan_eval', max_cumulative_gain, True)``.
    """
    pesos = data.get_weight()

    # Per-row contribution, derived only from the weight value.
    aciertos = np.where(pesos == 1.00002, ganancia_acierto, 0)
    costos = np.where(pesos < 1.00002, costo_estimulo, 0)
    aporte = aciertos - costos

    # Highest predictions first, then running total of the contributions.
    orden_descendente = np.argsort(y_pred)[::-1]
    acumulada = np.cumsum(aporte[orden_descendente])

    return 'gan_eval', np.max(acumulada), True

# Entrenamiento

Cargamos el study de optuna que optimizamos en el script anterior

In [None]:
# Location (SQLite file under db_path) and name of the study produced by the
# optimization script.
storage_name = "sqlite:///" + db_path + "optimization_lgbm_v08.db"
study_name = "competencia3_lgbm_v08" # UPDATE

# load_study only opens an existing study. Unlike create_study(load_if_exists=True),
# a wrong study_name or storage path raises here instead of silently creating a
# new, empty study whose best_trial lookup would fail further down.
study = optuna.load_study(
    study_name=study_name,
    storage=storage_name,
)

[I 2024-12-04 17:04:07,195] Using an existing study with name 'competencia3_lgbm_v08' instead of creating a new one.


In [12]:
# Export the study's trials as a DataFrame and display its (rows, columns) shape.
resultados = study.trials_dataframe()
resultados.shape

(14, 12)

In [13]:
# Display the best trial; its params and user_attrs["best_iter"] are read by the training cells.
study.best_trial

FrozenTrial(number=10, state=1, values=[3911985000.0], datetime_start=datetime.datetime(2024, 12, 4, 16, 9, 23, 229065), datetime_complete=datetime.datetime(2024, 12, 4, 16, 25, 46, 734614), params={'num_leaves': 248, 'learning_rate': 0.0498907204475266, 'min_data_in_leaf': 121, 'feature_fraction': 0.42739407657827894, 'bagging_fraction': 0.4653942981227891}, user_attrs={'best_iter': 1909}, system_attrs={}, intermediate_values={}, distributions={'num_leaves': IntDistribution(high=250, log=False, low=50, step=1), 'learning_rate': FloatDistribution(high=0.05, log=False, low=0.005, step=None), 'min_data_in_leaf': IntDistribution(high=3000, log=False, low=100, step=1), 'feature_fraction': FloatDistribution(high=0.6, log=False, low=0.4, step=None), 'bagging_fraction': FloatDistribution(high=1.0, log=False, low=0.1, step=None)}, trial_id=1562, value=None)

Entrenamos un modelo solo para ver feature importance:

In [14]:
# Fetch the best trial once and reuse it for the tree count and the tuned values.
mejor_trial = study.best_trial
best_iter = mejor_trial.user_attrs["best_iter"]
print(f"Mejor cantidad de árboles para el mejor model {best_iter}")

# Hyperparameters taken from the study, in the order they appear in `params`.
# NOTE(review): bagging_fraction is passed without bagging_freq; per the LightGBM
# parameter docs bagging needs a non-zero bagging_freq to be active - confirm this
# matches the optimization script.
hiperparametros_optimizados = {
    nombre: mejor_trial.params[nombre]
    for nombre in (
        'num_leaves',
        'learning_rate',
        'min_data_in_leaf',
        'feature_fraction',
        'bagging_fraction',
    )
}

params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'first_metric_only': True,
    'boost_from_average': True,
    'feature_pre_filter': False,
    'max_bin': 31,
    **hiperparametros_optimizados,
    'seed': semillas[0],  # only the first seed is used for this exploratory model
    'verbose': -1,
}

# Train on the full training frame for exactly best_iter rounds (no validation set).
train_data = lgb.Dataset(X_train, label=y_train_binaria, weight=w_train)
model = lgb.train(params, train_data, num_boost_round=best_iter)


Mejor cantidad de árboles para el mejor model 1909


In [15]:
# Display the parameter dict that was passed to lgb.train in the previous cell.
params

{'objective': 'binary',
 'boosting_type': 'gbdt',
 'first_metric_only': True,
 'boost_from_average': True,
 'feature_pre_filter': False,
 'max_bin': 31,
 'num_leaves': 248,
 'learning_rate': 0.0498907204475266,
 'min_data_in_leaf': 121,
 'feature_fraction': 0.42739407657827894,
 'bagging_fraction': 0.4653942981227891,
 'seed': 122219,
 'verbose': -1}

Variables más importantes:

In [16]:
# Importance of every feature of the model trained above (default importance type
# of Booster.feature_importance()), paired with the training column names and
# sorted from most to least important.
importances = model.feature_importance()
feature_names = X_train.columns.tolist()
importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': importances}
).sort_values('importance', ascending=False)

# To inspect only the features that were actually used:
# importance_df[importance_df['importance'] > 0]
importance_df

Unnamed: 0,feature,importance
0,numero_de_cliente,3881
1,foto_mes,2878
4,cliente_edad,2833
496,cliente_edad_avg_lag_234,2168
104,ctrx_quarter,2092
...,...,...
691,cprestamos_hipotecarios_delta_lag_234,0
670,mcuenta_corriente_adicional_delta_lag_234,0
14,mcuenta_corriente_adicional,0
533,minversion1_dolares_avg_lag_234,0


In [17]:
# Names of the 100 most important features, in descending order of importance.
importance_df['feature'].head(100).tolist()

['numero_de_cliente',
 'foto_mes',
 'cliente_edad',
 'cliente_edad_avg_lag_234',
 'ctrx_quarter',
 'cliente_edad_avg_lag_567',
 'mcuentas_saldo',
 'cliente_edad_lag1',
 'ctrx_quarter_delta_lag_567',
 'ctrx_quarter_delta_lag_234',
 'mcuenta_corriente_avg_lag_234',
 'Visa_Fvencimiento',
 'Visa_Fvencimiento_avg_lag_234',
 'cliente_antiguedad',
 'ctrx_quarter_avg_lag_234',
 'mcuenta_corriente_avg_lag_567',
 'Visa_Fvencimiento_avg_lag_101112',
 'mcuenta_corriente',
 'mcuenta_corriente_delta_lag_234',
 'mrentabilidad_annual_avg_lag_234',
 'Visa_Fvencimiento_avg_lag_567',
 'Master_Fvencimiento',
 'mrentabilidad_annual_delta_lag_567',
 'mpasivos_margen',
 'Visa_Fvencimiento_lag1',
 'mcuenta_corriente_delta_lag_567',
 'mrentabilidad_annual_avg_lag_567',
 'mrentabilidad_annual_avg_lag_101112',
 'mrentabilidad_annual_delta_lag_101112',
 'ctrx_quarter_lag1',
 'mcuenta_corriente_avg_lag_101112',
 'mcaja_ahorro',
 'mcaja_ahorro_avg_lag_234',
 'ctrx_quarter_delta_lag_101112',
 'ctrx_quarter_avg_lag_5

### Entrenamos con la totalidad de las semillas y guardamos los modelos

In [18]:
version = 'v008' # UPDATE

# Create the folder where this version's models are stored.
os.makedirs(modelos_path + version, exist_ok=True)

# Fetch the best trial once; it provides the tree count and the tuned values.
mejor_trial = study.best_trial
best_iter = mejor_trial.user_attrs["best_iter"]
print(f"Mejor cantidad de árboles para el mejor model {best_iter}")

# Everything in the parameter dict that does not depend on the seed.
parametros_fijos = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'first_metric_only': True,
    'boost_from_average': True,
    'feature_pre_filter': False,
    'max_bin': 31,
}
hiperparametros_optimizados = {
    nombre: mejor_trial.params[nombre]
    for nombre in (
        'num_leaves',
        'learning_rate',
        'min_data_in_leaf',
        'feature_fraction',
        'bagging_fraction',
    )
}

# One model per seed, each saved as a text file under modelos_path/version.
for semilla in semillas:

    print(f'Train del modelo {version} con semilla {semilla}')

    params = {
        **parametros_fijos,
        **hiperparametros_optimizados,
        'seed': semilla,
        'verbose': -1,
    }

    # The Dataset is rebuilt for every seed, as in the original loop.
    # NOTE(review): deliberately not hoisted - the seed may take part in Dataset
    # construction, so reusing one Dataset across seeds could change results.
    train_data = lgb.Dataset(X_train, label=y_train_binaria, weight=w_train)

    model = lgb.train(params, train_data, num_boost_round=best_iter)

    model.save_model(modelos_path + f'{version}/lgb_competencia3_{version}_s{semilla}.txt')

Mejor cantidad de árboles para el mejor model 1909
Train del modelo v008 con semilla 122219
Train del modelo v008 con semilla 109279
Train del modelo v008 con semilla 400391
Train del modelo v008 con semilla 401537
Train del modelo v008 con semilla 999961
