## A. Configuración General.

In [1]:
%pip install duckdb

Note: you may need to restart the kernel to use updated packages.


In [2]:
#1. Librerías.
%run "../librerias.ipynb"

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#3. Constantes.
%run "../constantes.ipynb"

dataset_preprocesado = dataset_file_fe_all_1

cantidad_meses_train = "all"
ventana = 1

mes_train = mes_train_all_menos_1
mes_test = mes_test
mes_validation = mes_validation

In [4]:
#4. Funciones
%run "../funciones.ipynb"

In [5]:
#5. Lectura de datos.
data = pd.read_parquet(dataset_preprocesado)

In [6]:
#6. Pequeño pre-procesamiento sobre los datos.
#i. Cambio tipos de datos (Me lo toma como tipo de dato "object"...)
data['ctrx_quarter_normalizado'] = data['ctrx_quarter_normalizado'].astype(float)
#ii. Elimino columnas de último momento por Data Concept.
columnas_de_interes_prestamos = data.filter(like='prestamos_personales').columns
data.drop(columnas_de_interes_prestamos,axis=1,inplace=True)
#iii. Pesos y reclusterización.
data['clase_peso'] = 1.0

data.loc[data['clase_ternaria'] == 'BAJA+2', 'clase_peso'] = 1.00002
data.loc[data['clase_ternaria'] == 'BAJA+1', 'clase_peso'] = 1.00001

data['clase_binaria1'] = np.where(data['clase_ternaria'] == 'BAJA+2', 1, 0)
data['clase_binaria2'] = np.where(data['clase_ternaria'] == 'CONTINUA', 0, 1)


In [7]:
#iv. Divido entre Train y Test.
train_data = data[data['foto_mes'].isin(mes_train)]
validation_data = data[data['foto_mes'] == mes_validation]
test_data = data[data['foto_mes'] == mes_test]

In [8]:
del data
import gc
gc.collect()

0

In [9]:

#b. Datos para entrenar todo el modelo final para Kaggle.
X_train = train_data.drop(['clase_ternaria', 'clase_peso','clase_binaria2', 'clase_binaria1'], axis=1)
y_train_binaria2 = train_data['clase_binaria2']
w_train = train_data['clase_peso']

#c. Datos de Test (a predecir).
X_test = test_data.drop(['clase_ternaria', 'clase_peso','clase_binaria2', 'clase_binaria1'], axis=1)
y_test_binaria2 = test_data['clase_binaria2']


#c. Datos de Validation (a validar).
w_validation= validation_data['clase_peso']
X_validation = validation_data.drop(['clase_ternaria', 'clase_peso','clase_binaria2', 'clase_binaria1'], axis=1)
y_validation_binaria2 = validation_data['clase_binaria2']

In [10]:
# X_train['ctrx_quarter_normalizado'] = X_train['ctrx_quarter_normalizado'].astype(float)
# X_test['ctrx_quarter_normalizado'] = X_test['ctrx_quarter_normalizado'].astype(float)
# X_validation['ctrx_quarter_normalizado'] = X_validation['ctrx_quarter_normalizado'].astype(float)



In [10]:
del train_data
del test_data
del validation_data
gc.collect()


0

In [64]:
import optuna
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import warnings

warnings.filterwarnings("ignore", message="Found `num_boost_round` in params. Will use it instead of argument")

def optimize_month(X_train_mes, y_train_mes):
    def objective(trial):
        # Hiperparámetros específicos a optimizar
        lgb_params = {
            'objective': 'binary',
            'metric': 'custom',
            'boosting_type': 'rf',
            'first_metric_only': True,
            'boost_from_average': True,
            'feature_pre_filter': False,
            'max_depth': -1,
            'num_threads': -1,
            'verbose': -1,
            'num_boost_round': trial.suggest_int('num_boost_round', 20, 500),
            'max_bin': trial.suggest_int('max_bin', 20, 200),
            'num_leaves': trial.suggest_int('num_leaves', 8, 300),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 1000),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.35),
        }

        # Validación cruzada en los datos del mes
        model = lgb.LGBMClassifier(**lgb_params, random_state=42)
        model.fit(X_train_mes, y_train_mes)
        val_pred = model.predict(X_validation)
        return ganancia_prob(val_pred, y_validation_binaria2)

    # Optimizar los hiperparámetros para este mes
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=500)
    return study.best_params

best_params_per_month = {}

# Iterar sobre los meses y optimizar cada uno
meses = X_train['foto_mes'].unique()
for mes in sorted(meses):
    print(f"Optimizando para el mes {mes}...")
    X_train_mes = X_train[X_train['foto_mes'] == mes]
    y_train_mes = y_train_binaria2[X_train['foto_mes'] == mes]
    best_params_per_month[mes] = optimize_month(X_train_mes, y_train_mes)



Optimizando para el mes 202101...


[I 2024-11-15 23:21:10,147] A new study created in memory with name: no-name-e4cb35a1-8ec2-4e78-b058-f1edca7819bd
[I 2024-11-15 23:21:28,250] Trial 0 finished with value: 227794000.0 and parameters: {'num_boost_round': 191, 'max_bin': 164, 'num_leaves': 78, 'min_data_in_leaf': 96, 'feature_fraction': 0.6876420268690289, 'bagging_fraction': 0.1984378541686857, 'learning_rate': 0.3021093945729224}. Best is trial 0 with value: 227794000.0.
[I 2024-11-15 23:21:35,254] Trial 1 finished with value: 229761000.0 and parameters: {'num_boost_round': 90, 'max_bin': 34, 'num_leaves': 17, 'min_data_in_leaf': 135, 'feature_fraction': 0.3811154262363632, 'bagging_fraction': 0.3660708220496687, 'learning_rate': 0.18720505394482206}. Best is trial 1 with value: 229761000.0.
[I 2024-11-15 23:21:46,845] Trial 2 finished with value: 232134000.0 and parameters: {'num_boost_round': 185, 'max_bin': 56, 'num_leaves': 74, 'min_data_in_leaf': 374, 'feature_fraction': 0.27871418714009855, 'bagging_fraction': 0.4

Optimizando para el mes 202102...


[I 2024-11-16 02:53:42,751] A new study created in memory with name: no-name-a3334a08-a898-44cd-9f74-dfb4e4ece786
[I 2024-11-16 02:53:52,314] Trial 0 finished with value: 255339000.0 and parameters: {'num_boost_round': 62, 'max_bin': 162, 'num_leaves': 272, 'min_data_in_leaf': 488, 'feature_fraction': 0.2204882501056088, 'bagging_fraction': 0.8955807698017053, 'learning_rate': 0.035741505832234725}. Best is trial 0 with value: 255339000.0.
[I 2024-11-16 02:54:26,735] Trial 1 finished with value: 239890000.0 and parameters: {'num_boost_round': 297, 'max_bin': 197, 'num_leaves': 192, 'min_data_in_leaf': 969, 'feature_fraction': 0.5412073907387418, 'bagging_fraction': 0.16705727840237583, 'learning_rate': 0.06126751180277117}. Best is trial 0 with value: 255339000.0.
[I 2024-11-16 02:55:41,338] Trial 2 finished with value: 237552000.0 and parameters: {'num_boost_round': 394, 'max_bin': 138, 'num_leaves': 271, 'min_data_in_leaf': 468, 'feature_fraction': 0.9165507384308607, 'bagging_fracti

Optimizando para el mes 202103...


[I 2024-11-16 06:24:57,646] A new study created in memory with name: no-name-f08ccd91-22b3-4c1b-9970-36397156b735
[I 2024-11-16 06:25:14,760] Trial 0 finished with value: 249592000.0 and parameters: {'num_boost_round': 269, 'max_bin': 124, 'num_leaves': 64, 'min_data_in_leaf': 858, 'feature_fraction': 0.1540215389167772, 'bagging_fraction': 0.21162590959767555, 'learning_rate': 0.3209431375111563}. Best is trial 0 with value: 249592000.0.
[I 2024-11-16 06:25:45,050] Trial 1 finished with value: 285922000.0 and parameters: {'num_boost_round': 252, 'max_bin': 172, 'num_leaves': 255, 'min_data_in_leaf': 34, 'feature_fraction': 0.30549343872135504, 'bagging_fraction': 0.9867852177304357, 'learning_rate': 0.22988216806599523}. Best is trial 1 with value: 285922000.0.
[I 2024-11-16 06:26:16,899] Trial 2 finished with value: 254303000.0 and parameters: {'num_boost_round': 204, 'max_bin': 93, 'num_leaves': 103, 'min_data_in_leaf': 948, 'feature_fraction': 0.8629388749452074, 'bagging_fraction'

In [91]:

lgb_models = {}
# Entrenar modelos mensuales con los mejores hiperparámetros
# Y crear las predicciones para agregar al Meta Model
oof_predictions = np.zeros((X_validation.shape[0], len(meses)))

for i, (mes, params) in enumerate(best_params_per_month.items()):
    X_train_mes = X_train[X_train['foto_mes'] == mes]
    y_train_mes = y_train_binaria2[X_train['foto_mes'] == mes]
    params.update({
        'objective': 'binary',
        'metric': 'custom',
        'boosting_type': 'rf',
        'first_metric_only': True,
        'boost_from_average': True,
        'feature_pre_filter': False,
        'max_depth': -1,
        'num_threads': -1,
        'verbose': -1,
    })
    model = lgb.LGBMClassifier(**params, random_state=i)
    model.fit(X_train_mes, y_train_mes)
    oof_predictions[:, i] = model.predict_proba(X_validation)[:, 1]
    lgb_models[mes] = model


In [92]:
def objective_meta(trial):
    # Hiperparámetros específicos a optimizar

    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    stopping_rounds = int(50 + 5 / learning_rate)
    meta_params = {
        'objective': 'binary',
        'metric': 'custom',
        'boosting_type': 'rf',
        'first_metric_only': True,
        'boost_from_average': True,
        'feature_pre_filter': False,
        'max_depth': -1,
        'num_threads': -1,
        'verbose': -1,
        'num_boost_round': trial.suggest_int('num_boost_round', 20, 300),
        'max_bin': trial.suggest_int('max_bin', 20, 200),
        'num_leaves': trial.suggest_int('num_leaves', 8, 100),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 1000),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'learning_rate': learning_rate,
        'stopping_rounds': stopping_rounds,
    }
    N_FOLDS = 5
    N_ITER = 1  # Cantidad de iteraciones con diferentes semillas
    scores_iter = []

    for i in range(N_ITER):  # Cambiamos la semilla en cada iteración
        skf_meta = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=i)
        scores_meta = []

        for train_idx, val_idx in skf_meta.split(meta_X, meta_y):
            meta_X_train, meta_X_val = meta_X.iloc[train_idx], meta_X.iloc[val_idx]
            meta_y_train, meta_y_val = meta_y.iloc[train_idx], meta_y.iloc[val_idx]

            # Configura el modelo con la semilla actual
            meta_model = lgb.LGBMClassifier(**meta_params, random_state=i)
            meta_model.fit(meta_X_train, meta_y_train)
            
            val_pred = meta_model.predict(meta_X_val)
            max_ganancia = ganancia_prob(val_pred, meta_y_val)
            scores_meta.append(max_ganancia)
        
        # Promedio de ganancias para los K-Folds en la iteración actual
        iter_avg_score = np.mean(scores_meta) * N_FOLDS
        scores_iter.append(iter_avg_score)

    # Promedio de ganancias para las N iteraciones con distintas semillas
    return np.mean(scores_iter)

# Crear dataset para el meta-modelo
meta_features = pd.DataFrame(oof_predictions, columns=[f'model_mes_{m}' for m in meses])
meta_features = pd.concat([meta_features, X_validation.reset_index(drop=True)], axis=1)
meta_X = meta_features
meta_y = y_validation_binaria2

# no-name-3a3c49eb-5af8-4d2d-82a4-33825140e7a6
# [I 2024-11-16 13:21:13,038] Trial 129 finished with value: 309176000.0 and parameters: {'learning_rate': 0.2804332312296757, 'num_boost_round': 210, 'max_bin': 20, 'num_leaves': 86, 'min_data_in_leaf': 94, 'feature_fraction': 0.4499833274696736, 'bagging_fraction': 0.7410741485338036}. Best is trial 129 with value: 309176000.0.




study = optuna.create_study(direction="maximize")
study.optimize(objective_meta, n_trials=500)

[I 2024-11-16 13:59:12,210] A new study created in memory with name: no-name-0cb6e59d-92aa-49f2-8e7f-ea855b7937de
[I 2024-11-16 14:00:29,582] Trial 0 finished with value: 281036000.0 and parameters: {'learning_rate': 0.04034333154687938, 'num_boost_round': 209, 'max_bin': 38, 'num_leaves': 68, 'min_data_in_leaf': 95, 'feature_fraction': 0.9513066324220012, 'bagging_fraction': 0.7415473294003818}. Best is trial 0 with value: 281036000.0.
[I 2024-11-16 14:01:09,796] Trial 1 finished with value: 293797000.0 and parameters: {'learning_rate': 0.14366212225317718, 'num_boost_round': 232, 'max_bin': 83, 'num_leaves': 10, 'min_data_in_leaf': 276, 'feature_fraction': 0.6400946214804508, 'bagging_fraction': 0.8393684717081372}. Best is trial 1 with value: 293797000.0.
[I 2024-11-16 14:02:06,765] Trial 2 finished with value: 293468000.0 and parameters: {'learning_rate': 0.06793312539108738, 'num_boost_round': 139, 'max_bin': 65, 'num_leaves': 47, 'min_data_in_leaf': 625, 'feature_fraction': 0.552

In [93]:
meta_params = study.best_params
meta_params.update({
    'stopping_rounds': int(50 + 5 / meta_params['learning_rate']),
    'objective': 'binary',
    'metric': 'custom',
    'boosting_type': 'rf',
    'first_metric_only': True,
    'boost_from_average': True,
    'feature_pre_filter': False,
    'max_depth': -1,
    'num_threads': -1,
    'verbose': -1,
})
meta_model = lgb.LGBMClassifier(**meta_params, random_state=i)
meta_model.fit(meta_X, meta_y)

In [95]:
meta_X

Unnamed: 0,model_mes_202101,model_mes_202102,model_mes_202103,numero_de_cliente,foto_mes,active_quarter,cliente_vip,internet,cliente_edad,cliente_antiguedad,...,avg_historico_tc_proporcion_consumo_pesos,ratio_actual_tc_proporcion_consumo_dolares,sumcum_tc_proporcion_consumo_dolares,avg_historico_tc_proporcion_consumo_dolares,ratio_actual_tc_proporcion_adelanto_pesos,sumcum_tc_proporcion_adelanto_pesos,avg_historico_tc_proporcion_adelanto_pesos,ratio_actual_tc_proporcion_adelanto_dolares,sumcum_tc_proporcion_adelanto_dolares,avg_historico_tc_proporcion_adelanto_dolares
0,0.149946,0.286219,0.087754,698422069,202104,1,0,0,59,174,...,1.0,,0.000000,0.000000,,0.0,0.0,,0.0,0.0
1,0.003436,0.004263,0.004737,698456125,202104,1,0,0,41,174,...,1.0,,0.000000,0.000000,,0.0,0.0,,0.0,0.0
2,0.003903,0.013701,0.004928,698522052,202104,1,0,0,77,173,...,1.0,0.970376,0.000311,0.000156,,0.0,0.0,,0.0,0.0
3,0.008744,0.010283,0.008774,698641284,202104,1,0,0,65,47,...,1.0,0.961282,0.000096,0.000048,,0.0,0.0,,0.0,0.0
4,0.003436,0.004263,0.004737,698928834,202104,1,0,0,50,155,...,1.0,1.069129,0.000056,0.000028,,0.0,0.0,,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164085,0.003436,0.004263,0.004737,593239068,202104,1,0,0,39,48,...,1.0,,0.000000,0.000000,,0.0,0.0,,0.0,0.0
164086,0.003913,0.004753,0.005880,593326361,202104,1,0,0,67,200,...,1.0,1.230039,0.000245,0.000123,,0.0,0.0,,0.0,0.0
164087,0.015909,0.009333,0.017204,593445968,202104,1,0,0,63,42,...,0.0,,0.000000,0.000000,,0.0,0.0,,0.0,0.0
164088,0.057582,0.070559,0.031965,593764876,202104,1,0,0,50,134,...,0.0,,0.000000,0.000000,,0.0,0.0,,0.0,0.0


In [97]:
# X_test.drop(['Probabilidad'], axis=1, inplace=True)
# # # #

In [98]:
oof_predictions = np.zeros((X_test.shape[0], len(meses)))

# Entrenar modelos mensuales y generar predicciones sobre el último mes
for i, mes in enumerate(meses):
    # Guardar las probabilidades predichas para el meta-modelo usando el último mes
    oof_predictions[:, i] = lgb_models[mes].predict_proba(X_test)[:, 1]
    
# Crear dataset para el meta-modelo
meta_features = pd.DataFrame(oof_predictions, columns=[f'model_mes_{m}' for m in meses])
meta_features = pd.concat([meta_features, X_test.reset_index(drop=True)], axis=1)

predicciones = meta_model.predict(meta_features)

In [11]:
import optuna
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import warnings

warnings.filterwarnings("ignore", message="Found `num_boost_round` in params. Will use it instead of argument")

def objective(trial):
    # Hiperparámetros de LightGBM (para todos los modelos mensuales)
    learning_rate = trial.suggest_float('learning_rate', 0.005, 0.3)
    stopping_rounds = int(50 + 5 / learning_rate)

    lgb_params = {
        'objective': 'binary',
        'metric': 'custom',
        'boosting_type': 'rf',
        'first_metric_only': True,
        'boost_from_average': True,
        'feature_pre_filter': False,
        'max_depth': -1,
        'num_threads': -1,
        'verbose': -1,
        'num_boost_round': trial.suggest_int('num_boost_round', 20, 300),
        'max_bin': trial.suggest_int('max_bin', 20, 200),
        'num_leaves': trial.suggest_int('num_leaves', 8, 300),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 1000),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'learning_rate': learning_rate,
        'stopping_rounds': stopping_rounds,
    }

    learning_rate_meta = trial.suggest_float('learning_rate_meta', 0.005, 0.3)
    stopping_rounds_meta = int(50 + 5 / learning_rate_meta)

    meta_params = {
        'objective': 'binary',
        'metric': 'custom',
        'boosting_type': 'rf',
        'first_metric_only': True,
        'boost_from_average': True,
        'feature_pre_filter': False,
        'max_depth': -1,
        'num_threads': -1,
        'verbose': -1,
        'num_boost_round': trial.suggest_int('num_boost_round_meta', 20, 300),
        'max_bin': trial.suggest_int('max_bin_meta', 20, 200),
        'num_leaves': trial.suggest_int('num_leaves_meta', 8, 300),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf_meta', 10, 1000),
        'feature_fraction': trial.suggest_float('feature_fraction_meta', 0.1, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction_meta', 0.1, 1.0),
        'learning_rate': learning_rate_meta,
        'stopping_rounds': stopping_rounds_meta,
    }
    mes_column = X_train['foto_mes']
    # Obtener los meses únicos en el dataset
    previous_months = sorted(mes_column.unique())
    oof_predictions = np.zeros((X_validation.shape[0], len(previous_months)))

    # Entrenar modelos mensuales y generar predicciones sobre el último mes
    for i, mes in enumerate(previous_months):
        X_train_mes = X_train[mes_column == mes]
        y_train_mes = y_train_binaria2[mes_column == mes]

        # Entrenar el modelo mensual de LightGBM con los mismos hiperparámetros
        lgb_model = lgb.LGBMClassifier(**lgb_params, random_state=i)
        lgb_model.fit(X_train_mes, y_train_mes)
        
        # Guardar las probabilidades predichas para el meta-modelo usando el último mes
        oof_predictions[:, i] = lgb_model.predict_proba(X_validation)[:, 1]
        
    # Crear dataset para el meta-modelo
    meta_features = pd.DataFrame(oof_predictions, columns=[f'model_mes_{m}' for m in previous_months])
    meta_features = pd.concat([meta_features, X_validation.reset_index(drop=True)], axis=1)

    # Validación cruzada para el meta-modelo
    meta_X = meta_features
    meta_y = y_validation_binaria2
    N_FOLDS = 5
    N_ITER = 5  # Cantidad de iteraciones con diferentes semillas
    scores_iter = []

    for i in range(N_ITER):  # Cambiamos la semilla en cada iteración
        skf_meta = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=i)
        scores_meta = []

        for train_idx, val_idx in skf_meta.split(meta_X, meta_y):
            meta_X_train, meta_X_val = meta_X.iloc[train_idx], meta_X.iloc[val_idx]
            meta_y_train, meta_y_val = meta_y.iloc[train_idx], meta_y.iloc[val_idx]

            # Configura el modelo con la semilla actual
            meta_model = lgb.LGBMClassifier(**meta_params, random_state=i)
            meta_model.fit(meta_X_train, meta_y_train)
            
            val_pred = meta_model.predict(meta_X_val)
            max_ganancia = ganancia_prob(val_pred, meta_y_val)
            scores_meta.append(max_ganancia)
        
        # Promedio de ganancias para los K-Folds en la iteración actual
        iter_avg_score = np.mean(scores_meta) * N_FOLDS
        scores_iter.append(iter_avg_score)

    # Promedio de ganancias para las N iteraciones con distintas semillas
    return np.mean(scores_iter)

# Ejecutar la optimización para los modelos mensuales y el meta-modelo
storage_name = "sqlite:///" + db_path + "optimization_lgbm.db"
study_name = "exp_104_lgbm_stack"
# study_name = "exp_102_lgbm_stack"
# [I 2024-11-12 15:04:17,182] Trial 421 finished with value: 309197000.0 and parameters: {'num_boost_round': 73, 'max_bin': 185, 'num_leaves': 97, 'min_data_in_leaf': 125, 'feature_fraction': 0.1207285984798519, 'bagging_fraction': 0.3959168275347215, 'learning_rate': 0.17828141037608225, 'learning_rate_meta': 0.1632216311367999, 'num_boost_round_meta': 269, 'max_bin_meta': 20, 'num_leaves_meta': 49, 'min_data_in_leaf_meta': 330, 'feature_fraction_meta': 0.6563868462458232, 'bagging_fraction_meta': 0.9009839465133691}. Best is trial 421 with value: 309197000.0.


# study_name = "exp_103_lgbm_stack"
# [I 2024-11-14 15:36:40,901] Trial 293 finished with value: 304386600.0 and parameters: {'num_boost_round': 91, 'max_bin': 105, 'num_leaves': 85, 'min_data_in_leaf': 176, 'feature_fraction': 0.11980908738871937, 'bagging_fraction': 0.4074846861963536, 'learning_rate': 0.25862515180921425, 'learning_rate_meta': 0.23031916930546562, 'num_boost_round_meta': 121, 'max_bin_meta': 20, 'num_leaves_meta': 83, 'min_data_in_leaf_meta': 233, 'feature_fraction_meta': 0.5316681085625028, 'bagging_fraction_meta': 0.6560750890593606}. Best is trial 293 with value: 304386600.0.

study = optuna.create_study(
    direction="maximize",
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,
)
study.optimize(objective, n_trials=1000)
best_params = study.best_params

# Mostrar los mejores hiperparámetros obtenidos
print("Mejores hiperparámetros de LightGBM:", {k: best_params[k] for k in best_params if not k.startswith('meta_')})
print("Mejores hiperparámetros del meta-modelo:", {k: best_params[k] for k in best_params if k.startswith('meta_')})


[I 2024-11-17 17:10:02,710] Using an existing study with name 'exp_104_lgbm_stack' instead of creating a new one.
[I 2024-11-17 17:17:30,564] Trial 135 finished with value: 306017600.0 and parameters: {'learning_rate': 0.18085608645399648, 'num_boost_round': 218, 'max_bin': 178, 'num_leaves': 159, 'min_data_in_leaf': 203, 'feature_fraction': 0.2703936830674434, 'bagging_fraction': 0.6125013640383035, 'learning_rate_meta': 0.1178684713170288, 'num_boost_round_meta': 174, 'max_bin_meta': 20, 'num_leaves_meta': 242, 'min_data_in_leaf_meta': 362, 'feature_fraction_meta': 0.6232183406552168, 'bagging_fraction_meta': 0.33431513136619473}. Best is trial 114 with value: 309412600.0.
[I 2024-11-17 17:24:48,126] Trial 136 finished with value: 305841200.0 and parameters: {'learning_rate': 0.18772479788411978, 'num_boost_round': 159, 'max_bin': 172, 'num_leaves': 153, 'min_data_in_leaf': 205, 'feature_fraction': 0.27967549939232833, 'bagging_fraction': 0.6154505272057765, 'learning_rate_meta': 0.1

KeyboardInterrupt: 

In [None]:
#2. Voy a realizar un estudio de Optuna para encontrar los mejores parámetros.
#i. Creo la base de datos donde guardar los resultados.0.14367932630824398
storage_name = "sqlite:///" + db_path + "optimization_lgbm.db"

study_name = f"exp_lgbm_{cantidad_meses_train}_{ventana}_undersampling" # Primer dígito cuantos meses para atrás desde 06/21, segundo dígito número data drifting.

#ii. Creo el estudio.
study = optuna.create_study(
    direction="maximize",
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,
)

#iii. Corro el estudio.
study.optimize(objective, n_trials=100)

In [10]:
#4. Visualizo los resultados del estudio, para modificar los rangos de análisis.

In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
plot_param_importances(study)

In [None]:
plot_slice(study)

In [None]:
plot_contour(study)

In [None]:
plot_contour(study, params=['num_leaves','min_data_in_leaf'] )

In [12]:
study.best_trial.params

{'learning_rate': 0.20876098557002484,
 'num_boost_round': 175,
 'max_bin': 184,
 'num_leaves': 220,
 'min_data_in_leaf': 101,
 'feature_fraction': 0.21617106076308432,
 'bagging_fraction': 0.41306986027731496,
 'learning_rate_meta': 0.04229140546525002,
 'num_boost_round_meta': 130,
 'max_bin_meta': 20,
 'num_leaves_meta': 288,
 'min_data_in_leaf_meta': 226,
 'feature_fraction_meta': 0.7608592937685094,
 'bagging_fraction_meta': 0.2762981261185856}

In [19]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import warnings

# study.best_trial.params
warnings.filterwarnings("ignore", message="Found `num_boost_round` in params. Will use it instead of argument")

params = study.best_params
meta_params = {k[:-5]: v for k, v in params.items() if k.endswith("_meta")}
lgb_params = {k: v for k, v in params.items() if not k.endswith("_meta")}

meta_params.update({
    'stopping_rounds': int(50 + 5 / meta_params['learning_rate']),
    'objective': 'binary',
    'metric': 'custom',
    'boosting_type': 'rf',
    'first_metric_only': True,
    'boost_from_average': True,
    'feature_pre_filter': False,
    'max_depth': -1,
    'num_threads': -1,
    'verbose': -1,
})

lgb_params.update({
    'stopping_rounds': int(50 + 5 / lgb_params['learning_rate']),
    'objective': 'binary',
    'metric': 'custom',
    'boosting_type': 'rf',
    'first_metric_only': True,
    'boost_from_average': True,
    'feature_pre_filter': False,
    'max_depth': -1,
    'num_threads': -1,
    'verbose': -1,
})

mes_column = X_train['foto_mes']
# Obtener los meses únicos en el dataset
previous_months = sorted(mes_column.unique())
oof_predictions = np.zeros((X_validation.shape[0], len(previous_months)))
lgb_models = []

# Entrenar modelos mensuales y generar predicciones sobre el último mes
for i, mes in enumerate(previous_months):
    X_train_mes = X_train[mes_column == mes]
    y_train_mes = y_train_binaria2[mes_column == mes]

    # Entrenar el modelo mensual de LightGBM con los mismos hiperparámetros
    lgb_model = lgb.LGBMClassifier(**lgb_params, random_state=i)
    lgb_model.fit(X_train_mes, y_train_mes)
    
    # Guardar las probabilidades predichas para el meta-modelo usando el último mes
    oof_predictions[:, i] = lgb_model.predict_proba(X_validation)[:, 1]
    lgb_models.append(lgb_model)

# Crear dataset para el meta-modelo
meta_features = pd.DataFrame(oof_predictions, columns=[f'model_mes_{m}' for m in previous_months])
meta_features = pd.concat([meta_features, X_validation.reset_index(drop=True)], axis=1)

# Validación cruzada para el meta-modelo
meta_X = meta_features
meta_y = y_validation_binaria2

meta_model = lgb.LGBMClassifier(**meta_params, random_state=i)
meta_model.fit(meta_X, meta_y)


In [20]:
#ii. Dataframe.

for lgb_model in lgb_models:
    # Graficar feature importace de cada mes
    importance = lgb_model.booster_.feature_importance()
    features = lgb_model.booster_.feature_name()
    importance_df = pd.DataFrame({'Feature': features, 'Importance': importance})
    importance_df = importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)
    print(importance_df.head(10))
    
#a. Extract feature importance and feature names.
importance = meta_model.booster_.feature_importance()
features = meta_model.booster_.feature_name()

#b. Create a dataframe for better visualization.
importance_df = pd.DataFrame({'Feature': features, 'Importance': importance})

#c. Sort by importance in descending order.
importance_df = importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)

#d. show.
importance_df.head(30)

                       Feature  Importance
0               mcuentas_saldo         878
1  sumcum_mrentabilidad_annual         865
2              mactivos_margen         756
3         mrentabilidad_annual         750
4  m_promedio_comisiones_otras         690
5            numero_de_cliente         687
6                 cliente_edad         683
7                mrentabilidad         650
8            tc_fechalta_menor         622
9           cliente_antiguedad         615
                              Feature  Importance
0         ratio_actual_mcuentas_saldo         640
1                mrentabilidad_annual         583
2        ratio_actual_mactivos_margen         530
3         sumcum_mrentabilidad_annual         522
4  avg_historico_mrentabilidad_annual         516
5   ratio_actual_mrentabilidad_annual         483
6            sumcum_mcuenta_corriente         453
7      ratio_actual_mcuenta_corriente         451
8                      Visa_fechaalta         426
9        mrentabilidad_annu

Unnamed: 0,Feature,Importance
0,model_mes_202101,777
1,numero_de_cliente,534
2,cliente_edad,498
3,model_mes_202103,492
4,model_mes_202102,410
5,cliente_antiguedad,346
6,ratio_actual_mcuenta_corriente,286
7,mrentabilidad,269
8,ratio_actual_mcuentas_saldo,240
9,mactivos_margen,239


In [22]:
X_test.drop(['Probabilidad'], axis=1, inplace=True)


In [23]:
oof_predictions = np.zeros((X_test.shape[0], len(previous_months)))

# Entrenar modelos mensuales y generar predicciones sobre el último mes
for i, mes in enumerate(previous_months):
    # Guardar las probabilidades predichas para el meta-modelo usando el último mes
    oof_predictions[:, i] = lgb_models[i].predict_proba(X_test)[:, 1]
    
# Crear dataset para el meta-modelo
meta_features = pd.DataFrame(oof_predictions, columns=[f'model_mes_{m}' for m in previous_months])
meta_features = pd.concat([meta_features, X_test.reset_index(drop=True)], axis=1)

predicciones = meta_model.predict(meta_features)

In [16]:
predicciones.sum()

7902

In [37]:
exp_path

'/home/rayser/Documents/Maestria/DM EyF/exp/'

In [24]:
X_test['Probabilidad'] = predicciones
#iii. Ordenamos a los clientes por probabilidad de ser "BAJA" de forma descendente.
tb_entrega = X_test.sort_values(by='Probabilidad', ascending=False)
#iv. Genero una lista de distintos cortes candidatos, para enviar a Kaggle.
cortes = range(9000,14000,200)
#v. Generamos las distintas predicciones de clases a partir de los distintos cortes posibles.
num_subida_kaggle = 1
i = 0
for envios in cortes:
    #1. Le ponemos clase 1 ("BAJA") a los primeros "envios" con mayor probabilidad.
    tb_entrega['Predicted'] = 0
    tb_entrega.iloc[:envios, tb_entrega.columns.get_loc('Predicted')] = 1
    resultados = tb_entrega[["numero_de_cliente", 'Predicted']].reset_index(drop=True)
    
    print("Cantidad de clientes {}".format(envios))
    #2. Guardamos el archivo para Kaggle.
    nombre_archivo = "K_OH_6_6_00{}.csv".format(num_subida_kaggle) #-X meses, con df de -X meses, número de intento.
    ruta_archivo= "{}/{}".format(exp_path,nombre_archivo)
    resultados.to_csv(ruta_archivo, index=False)
    
    num_subida_kaggle += 1
    i += 1

Cantidad de clientes 9000
Cantidad de clientes 9200
Cantidad de clientes 9400
Cantidad de clientes 9600
Cantidad de clientes 9800
Cantidad de clientes 10000
Cantidad de clientes 10200
Cantidad de clientes 10400
Cantidad de clientes 10600
Cantidad de clientes 10800
Cantidad de clientes 11000
Cantidad de clientes 11200
Cantidad de clientes 11400
Cantidad de clientes 11600
Cantidad de clientes 11800
Cantidad de clientes 12000
Cantidad de clientes 12200
Cantidad de clientes 12400
Cantidad de clientes 12600
Cantidad de clientes 12800
Cantidad de clientes 13000
Cantidad de clientes 13200
Cantidad de clientes 13400
Cantidad de clientes 13600
Cantidad de clientes 13800


In [None]:
#7. Guardamos el modelo.
# Primer dígito cuantos meses para atrás desde 06/21, segundo dígito número data drifting, tercer dígito número de entrenamiento.
model_lgb.save_model(modelos_path + 'lgbm_{}_{}_undersampling.txt'.format(cantidad_meses_train,ventana))

In [None]:
####################################################################################################
####################################################################################################
####################################################################################################
####################################################################################################
############## Primera subida orientativa a Kaggle previo medir semillas, etc. #####################
####################################################################################################
####################################################################################################
####################################################################################################
####################################################################################################

In [None]:
#8. Volvemos a leer el modelo.
model_lgb = lgb.Booster(model_file= modelos_path + 'lgbm_{}_{}_undersampling.txt'.format(cantidad_meses_train,ventana))

In [37]:
#a. Importo librería.
from kaggle.api.kaggle_api_extended import KaggleApi
#b. Configura el API de Kaggle
api = KaggleApi()
api.authenticate()

In [None]:
#L. Predecimos Agosto.
#i. Predecimos propiamente dicho.
predicciones = model_lgb.predict(X_test)
#ii. Le pegamos la probabilidad de ser "BAJA" a cada cliente.
X_test['Probabilidad'] = predicciones
#iii. Ordenamos a los clientes por probabilidad de ser "BAJA" de forma descendente.
tb_entrega = X_test.sort_values(by='Probabilidad', ascending=False)
#iv. Genero una lista de distintos cortes candidatos, para enviar a Kaggle.
cortes = range(9000,14000,200)
#v. Generamos las distintas predicciones de clases a partir de los distintos cortes posibles.
num_subida_kaggle = 1
i = 0
for envios in cortes:
    #1. Le ponemos clase 1 ("BAJA") a los primeros "envios" con mayor probabilidad.
    tb_entrega['Predicted'] = 0
    tb_entrega.iloc[:envios, tb_entrega.columns.get_loc('Predicted')] = 1
    resultados = tb_entrega[["numero_de_cliente", 'Predicted']].reset_index(drop=True)
    
    print("Cantidad de clientes {}".format(envios))
    #2. Guardamos el archivo para Kaggle.
    nombre_archivo = "K_OH_6_6_00{}.csv".format(num_subida_kaggle) #-X meses, con df de -X meses, número de intento.
    ruta_archivo= "{}/{}".format(exp_path,nombre_archivo)
    resultados.to_csv(ruta_archivo, index=False)
    
    num_subida_kaggle += 1
    i += 1
    
    #3. Envío a Kaggle.
    #a. Defino los parámetros claves.
    mensaje = f'Archivo {nombre_archivo}.LGBM meses train {cantidad_meses_train} con undersampling, DF {ventana}, punto_corte: {envios}.'
    competencia = 'dm-ey-f-2024-segunda'
    #c. Subo la Submission.
    while i<=15:
        print(i)
        api.competition_submit(file_name=ruta_archivo, message=mensaje, competition=competencia)
        print("Submission successful!")
        break
    else:
        print("Esperamos 30 segundos...")
        time.sleep(30)
        api.competition_submit(file_name=ruta_archivo, message=mensaje, competition=competencia)
        print("Submission successful!")
        i= 0