## A. Configuración General.

In [2]:
#1. Librerías.
%run "../librerias.ipynb"

Matplotlib is building the font cache; this may take a moment.


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#3. Constants.
# Runs the shared constants notebook inside this kernel; the names read below
# (dataset_file_fe_all_1, mes_train_all_menos_1, mes_test) are presumably defined there.
%run "../constantes.ipynb"

# Parquet file with the feature-engineered dataset loaded by this notebook.
dataset_con_fe = dataset_file_fe_all_1

# Experiment identifiers: in the cells of this notebook they are only used to build
# the Optuna study name, the model file name and the Kaggle submission names/messages.
cantidad_meses_train = "all"
ventana = 1

# Months used for training and the single month to predict.
mes_train = mes_train_all_menos_1
# NOTE(review): self-assignment is a no-op; its only effect is a NameError if mes_test was never defined.
mes_test = mes_test

In [4]:
#4. Funciones
%run "../funciones.ipynb"

In [5]:
#5. Data loading.
# Reads the feature-engineered parquet file chosen in the constants cell into a DataFrame.
data = pd.read_parquet(dataset_con_fe)

In [6]:
#6. Light pre-processing of the data.
#i. Cast to float: the column is read back with dtype "object".
data['ctrx_quarter_normalizado'] = data['ctrx_quarter_normalizado'].astype(float)

#ii. Late removal (data-concept decision) of every column whose name contains 'prestamos_personales'.
columnas_de_interes_prestamos = data.filter(like='prestamos_personales').columns
data.drop(columnas_de_interes_prestamos,axis=1,inplace=True)

#iii. Sample weights and binary target.
# Every row starts with weight 1.0; the two BAJA classes get a marginally larger, distinct weight.
# NOTE(review): presumably the custom metric (lgb_gan_eval) uses these exact weights to tell
# the classes apart — confirm in funciones.ipynb before changing the values.
data['clase_peso'] = 1.0
data.loc[data['clase_ternaria'] == 'BAJA+2', 'clase_peso'] = 1.00002
data.loc[data['clase_ternaria'] == 'BAJA+1', 'clase_peso'] = 1.00001

# Binary target: 0 for 'CONTINUA', 1 for every other value (missing labels included).
# The former `data['clase_binaria2'] = 0` initialisation was a dead store and has been removed.
data['clase_binaria2'] = np.where(data['clase_ternaria'] == 'CONTINUA', 0, 1)

In [7]:
#iv. Train / test split by month (foto_mes).
# Boolean masks: rows whose month is one of the training months, and rows of the test month.
es_mes_train = data['foto_mes'].isin(mes_train)
es_mes_test = data['foto_mes'] == mes_test

train_data = data[es_mes_train]
test_data = data[es_mes_test]

In [8]:
#v. Undersampling of the majority class, month by month.
#a. Split the training rows by binary class.
es_continua = train_data['clase_binaria2'] == 0
continua_train = train_data[es_continua]
baja_train = train_data[train_data['clase_binaria2'] == 1]

#b. Majority-class rows of each month, in the order the months first appear.
continua_por_mes = (
    continua_train[continua_train['foto_mes'] == mes]
    for mes in continua_train['foto_mes'].unique()
)

#c/d. Keep 30% of each month's majority-class rows (sampled without replacement,
#     same seed for every month) and stack all months into a single DataFrame.
continua_undersampleados = pd.concat([
    resample(continua_mes,
             replace=False,
             n_samples=int(len(continua_mes) * 0.3),
             random_state=semillas[0])
    for continua_mes in continua_por_mes
])

#e. Undersampled majority class plus the complete minority class.
train_undersampleado = pd.concat([continua_undersampleados, baja_train])

#vi. Split into X, y and weights after the undersampling.
# Target, weight and original label columns: never passed to the model as features.
columnas_no_predictoras = ['clase_ternaria', 'clase_peso', 'clase_binaria2']

#a. Undersampled data, used by the Optuna objective.
X_train_undersampleado = train_undersampleado.drop(columns=columnas_no_predictoras)
y_train_binaria2_undersampleado = train_undersampleado['clase_binaria2']
w_train_undersampleado = train_undersampleado['clase_peso']

#b. Full training data, used to fit the final model for Kaggle.
X_train = train_data.drop(columns=columnas_no_predictoras)
y_train_binaria2 = train_data['clase_binaria2']
w_train = train_data['clase_peso']

#c. Test data (rows to predict).
X_test = test_data.drop(columns=columnas_no_predictoras)

## B. Train con `cantidad_meses_train` meses, DF (data drifting) `ventana` y ratios incluidos.

In [9]:
#1. Hyperparameter optimisation function.
def objective(trial):
    """Optuna objective: cross-validated gain of a LightGBM binary model.

    Draws five hyperparameters from ``trial``, runs a stratified ``lgb.cv`` on
    the undersampled training set (``X_train_undersampleado`` with its labels
    and weights) scored with the custom ``lgb_gan_eval`` metric, and stores the
    1-based iteration with the highest mean validation gain in the trial's
    ``best_iter`` user attribute.

    Returns:
        The highest mean validation gain multiplied by the number of folds.
    """
    nfold = 5

    # Search space (the suggestion order is kept stable on purpose).
    # A lower learning rate needs more boosting rounds.
    espacio = {
        'num_leaves': trial.suggest_int('num_leaves', 10, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 15, 900),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
    }

    # Fixed settings plus the sampled ones.
    # NOTE(review): no 'bagging_freq' is set; per the LightGBM parameter docs bagging is
    # disabled while bagging_freq is 0 (its default), so 'bagging_fraction' may have no
    # effect on the model — confirm, and change it together with the final-training cell.
    params = {
        'objective': 'binary',
        'metric': 'custom',
        'boosting_type': 'gbdt',
        'first_metric_only': True,
        'boost_from_average': True,
        'feature_pre_filter': False,
        'max_bin': 31,
        **espacio,
        'seed': semillas[0],
        'verbose': -1
    }

    # LightGBM dataset built from the undersampled rows, with per-row weights.
    train_data_ob = lgb.Dataset(X_train_undersampleado,
                                label=y_train_binaria2_undersampleado,
                                weight=w_train_undersampleado)

    # Early-stopping patience grows as the learning rate shrinks.
    paciencia = int(50 + 5 / espacio['learning_rate'])

    # Cross-validation. Raise num_boost_round if trials keep ending without early stopping.
    cv_results = lgb.cv(
        params,
        train_data_ob,
        num_boost_round=1000,
        callbacks=[lgb.early_stopping(paciencia)],
        feval=lgb_gan_eval,
        stratified=True,
        nfold=nfold,
        seed=semillas[0]
    )

    # Highest mean validation gain and the (1-based) iteration where it first occurs.
    ganancias = cv_results['valid gan_eval-mean']
    max_gan = max(ganancias)
    best_iter = ganancias.index(max_gan) + 1

    # Keep the best iteration so the final model can be trained with that many trees.
    trial.set_user_attr("best_iter", best_iter)

    # Presumably rescales the per-fold gain to the whole training set — confirm.
    return max_gan * nfold

In [None]:
#2. Optuna study to search for the best hyperparameters.
#i. SQLite database where the trials are persisted.
storage_name = f"sqlite:///{db_path}optimization_lgbm.db"

# Study name: first field is the number of training months, second the data-drifting number.
study_name = "exp_lgbm_{}_{}_undersampling".format(cantidad_meses_train, ventana)

#ii. Create the study, or reuse the stored one with the same name if it already exists.
study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    direction="maximize",
    load_if_exists=True,
)

#iii. Run the study: each execution of this cell requests 100 more trials.
study.optimize(objective, n_trials=100)

[I 2024-11-09 02:30:39,257] A new study created in RDB with name: exp_lgbm_all_1_undersampling


Training until validation scores don't improve for 88 rounds
Early stopping, best iteration is:
[223]	cv_agg's valid gan_eval: 6.48021e+08 + 9.7715e+06


[I 2024-11-09 02:54:31,809] Trial 0 finished with value: 3240104000.0 and parameters: {'num_leaves': 121, 'learning_rate': 0.12918106155967501, 'min_data_in_leaf': 639, 'feature_fraction': 0.6391425543002685, 'bagging_fraction': 0.3842259584712716}. Best is trial 0 with value: 3240104000.0.


Training until validation scores don't improve for 70 rounds
Early stopping, best iteration is:
[37]	cv_agg's valid gan_eval: 6.13962e+08 + 1.22272e+07


[I 2024-11-09 03:01:26,333] Trial 1 finished with value: 3069808000.0 and parameters: {'num_leaves': 71, 'learning_rate': 0.24950053842993855, 'min_data_in_leaf': 27, 'feature_fraction': 0.18653538911882572, 'bagging_fraction': 0.5503657943132991}. Best is trial 0 with value: 3240104000.0.


Training until validation scores don't improve for 77 rounds
Early stopping, best iteration is:
[322]	cv_agg's valid gan_eval: 6.34862e+08 + 1.19849e+07


[I 2024-11-09 03:20:35,384] Trial 2 finished with value: 3174311000.0 and parameters: {'num_leaves': 23, 'learning_rate': 0.18284450146142223, 'min_data_in_leaf': 230, 'feature_fraction': 0.18576488490863347, 'bagging_fraction': 0.7951671685232008}. Best is trial 0 with value: 3240104000.0.


Training until validation scores don't improve for 123 rounds
Did not meet early stopping. Best iteration is:
[941]	cv_agg's valid gan_eval: 6.54342e+08 + 9.10444e+06


[I 2024-11-09 04:10:15,193] Trial 3 finished with value: 3271709000.0 and parameters: {'num_leaves': 87, 'learning_rate': 0.06781176296668011, 'min_data_in_leaf': 423, 'feature_fraction': 0.2132086990466358, 'bagging_fraction': 0.49070828021424895}. Best is trial 3 with value: 3271709000.0.


Training until validation scores don't improve for 208 rounds
Did not meet early stopping. Best iteration is:
[999]	cv_agg's valid gan_eval: 6.48861e+08 + 1.01979e+07


[I 2024-11-09 04:40:09,496] Trial 4 finished with value: 3244304000.0 and parameters: {'num_leaves': 56, 'learning_rate': 0.03161803802695924, 'min_data_in_leaf': 611, 'feature_fraction': 0.8647587207465398, 'bagging_fraction': 0.7016375547962735}. Best is trial 3 with value: 3271709000.0.


Training until validation scores don't improve for 83 rounds
Early stopping, best iteration is:
[629]	cv_agg's valid gan_eval: 6.40688e+08 + 1.07401e+07


[I 2024-11-09 05:10:37,693] Trial 5 finished with value: 3203438000.0 and parameters: {'num_leaves': 25, 'learning_rate': 0.14851219181147207, 'min_data_in_leaf': 73, 'feature_fraction': 0.14594034763443645, 'bagging_fraction': 0.820823688518626}. Best is trial 3 with value: 3271709000.0.


Training until validation scores don't improve for 71 rounds
Early stopping, best iteration is:
[776]	cv_agg's valid gan_eval: 6.42162e+08 + 8.62964e+06


[I 2024-11-09 05:41:38,080] Trial 6 finished with value: 3210809000.0 and parameters: {'num_leaves': 185, 'learning_rate': 0.23522180625107134, 'min_data_in_leaf': 431, 'feature_fraction': 0.8851453985072449, 'bagging_fraction': 0.967031139562891}. Best is trial 3 with value: 3271709000.0.


Training until validation scores don't improve for 70 rounds
Early stopping, best iteration is:
[154]	cv_agg's valid gan_eval: 6.28769e+08 + 1.41111e+07


[I 2024-11-09 05:57:48,370] Trial 7 finished with value: 3143847000.0 and parameters: {'num_leaves': 34, 'learning_rate': 0.24768949582241864, 'min_data_in_leaf': 432, 'feature_fraction': 0.49586726411361004, 'bagging_fraction': 0.35022211884453835}. Best is trial 3 with value: 3271709000.0.


Training until validation scores don't improve for 137 rounds
Early stopping, best iteration is:
[337]	cv_agg's valid gan_eval: 6.54567e+08 + 1.05398e+07


[I 2024-11-09 06:26:42,242] Trial 8 finished with value: 3272836000.0 and parameters: {'num_leaves': 153, 'learning_rate': 0.05730360736219985, 'min_data_in_leaf': 516, 'feature_fraction': 0.595216310261227, 'bagging_fraction': 0.8792768688362574}. Best is trial 8 with value: 3272836000.0.


Training until validation scores don't improve for 98 rounds
Early stopping, best iteration is:
[544]	cv_agg's valid gan_eval: 6.49373e+08 + 1.01037e+07


[I 2024-11-09 06:54:11,011] Trial 9 finished with value: 3246866000.0 and parameters: {'num_leaves': 95, 'learning_rate': 0.10334791863401153, 'min_data_in_leaf': 562, 'feature_fraction': 0.11745300319230206, 'bagging_fraction': 0.5637348477974211}. Best is trial 8 with value: 3272836000.0.


Training until validation scores don't improve for 365 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid gan_eval: 6.57364e+08 + 8.21621e+06


[I 2024-11-09 08:11:00,082] Trial 10 finished with value: 3286822000.0 and parameters: {'num_leaves': 150, 'learning_rate': 0.015851835888398745, 'min_data_in_leaf': 844, 'feature_fraction': 0.5169155702487678, 'bagging_fraction': 0.12209372740396313}. Best is trial 10 with value: 3286822000.0.


Training until validation scores don't improve for 851 rounds
Did not meet early stopping. Best iteration is:
[999]	cv_agg's valid gan_eval: 6.48703e+08 + 9.01471e+06


[I 2024-11-09 09:29:44,205] Trial 11 finished with value: 3243513000.0 and parameters: {'num_leaves': 151, 'learning_rate': 0.006240105856715656, 'min_data_in_leaf': 898, 'feature_fraction': 0.5048526795279558, 'bagging_fraction': 0.1080366492091992}. Best is trial 10 with value: 3286822000.0.


Training until validation scores don't improve for 137 rounds
Early stopping, best iteration is:
[649]	cv_agg's valid gan_eval: 6.5492e+08 + 9.11653e+06


[I 2024-11-09 10:06:39,792] Trial 12 finished with value: 3274600000.0 and parameters: {'num_leaves': 140, 'learning_rate': 0.05685146451447202, 'min_data_in_leaf': 858, 'feature_fraction': 0.6807123086312123, 'bagging_fraction': 0.11958793250016339}. Best is trial 10 with value: 3286822000.0.


Training until validation scores don't improve for 110 rounds
Early stopping, best iteration is:
[224]	cv_agg's valid gan_eval: 6.51776e+08 + 9.26667e+06


[I 2024-11-09 10:24:03,080] Trial 13 finished with value: 3258878000.0 and parameters: {'num_leaves': 137, 'learning_rate': 0.08332370417639486, 'min_data_in_leaf': 899, 'feature_fraction': 0.7318738465756347, 'bagging_fraction': 0.11002290900279688}. Best is trial 10 with value: 3286822000.0.


Training until validation scores don't improve for 250 rounds
Did not meet early stopping. Best iteration is:
[968]	cv_agg's valid gan_eval: 6.6079e+08 + 8.49933e+06


[I 2024-11-09 11:34:09,776] Trial 14 finished with value: 3303951000.0 and parameters: {'num_leaves': 184, 'learning_rate': 0.024899492479589086, 'min_data_in_leaf': 737, 'feature_fraction': 0.37633533264500285, 'bagging_fraction': 0.2153762669812679}. Best is trial 14 with value: 3303951000.0.


Training until validation scores don't improve for 632 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid gan_eval: 6.55404e+08 + 1.0107e+07


[I 2024-11-09 12:43:48,924] Trial 15 finished with value: 3277022000.0 and parameters: {'num_leaves': 197, 'learning_rate': 0.008582498094602847, 'min_data_in_leaf': 745, 'feature_fraction': 0.3588880717376752, 'bagging_fraction': 0.26455924338572034}. Best is trial 14 with value: 3303951000.0.


Training until validation scores don't improve for 76 rounds
Early stopping, best iteration is:
[107]	cv_agg's valid gan_eval: 6.39045e+08 + 1.06633e+07


[I 2024-11-09 12:57:42,995] Trial 16 finished with value: 3195227000.0 and parameters: {'num_leaves': 173, 'learning_rate': 0.19135092454147729, 'min_data_in_leaf': 740, 'feature_fraction': 0.36921855176629004, 'bagging_fraction': 0.23586546563252436}. Best is trial 14 with value: 3303951000.0.


Training until validation scores don't improve for 95 rounds
Early stopping, best iteration is:
[204]	cv_agg's valid gan_eval: 6.51266e+08 + 1.09867e+07


[I 2024-11-09 13:18:55,761] Trial 17 finished with value: 3256330000.0 and parameters: {'num_leaves': 170, 'learning_rate': 0.10915106134709467, 'min_data_in_leaf': 734, 'feature_fraction': 0.3817366087463956, 'bagging_fraction': 0.2455865958777283}. Best is trial 14 with value: 3303951000.0.


Training until validation scores don't improve for 67 rounds
Early stopping, best iteration is:
[98]	cv_agg's valid gan_eval: 6.26006e+08 + 8.12419e+06


[I 2024-11-09 13:31:50,118] Trial 18 finished with value: 3130029000.0 and parameters: {'num_leaves': 118, 'learning_rate': 0.28364329779722103, 'min_data_in_leaf': 795, 'feature_fraction': 0.4604140144984927, 'bagging_fraction': 0.4099418168623718}. Best is trial 14 with value: 3303951000.0.


Training until validation scores don't improve for 216 rounds
Did not meet early stopping. Best iteration is:
[939]	cv_agg's valid gan_eval: 6.59604e+08 + 9.72069e+06


[I 2024-11-09 14:31:53,973] Trial 19 finished with value: 3298022000.0 and parameters: {'num_leaves': 196, 'learning_rate': 0.030088816847513786, 'min_data_in_leaf': 310, 'feature_fraction': 0.29124847959394673, 'bagging_fraction': 0.2970912303528168}. Best is trial 14 with value: 3303951000.0.


Training until validation scores don't improve for 167 rounds
Did not meet early stopping. Best iteration is:
[930]	cv_agg's valid gan_eval: 6.58004e+08 + 1.04451e+07


[I 2024-11-09 15:31:00,664] Trial 20 finished with value: 3290021000.0 and parameters: {'num_leaves': 198, 'learning_rate': 0.04251265638167803, 'min_data_in_leaf': 304, 'feature_fraction': 0.28080406350560255, 'bagging_fraction': 0.6537810264305447}. Best is trial 14 with value: 3303951000.0.


Training until validation scores don't improve for 167 rounds
Early stopping, best iteration is:
[682]	cv_agg's valid gan_eval: 6.57299e+08 + 7.52499e+06


[I 2024-11-09 16:20:47,431] Trial 21 finished with value: 3286493000.0 and parameters: {'num_leaves': 191, 'learning_rate': 0.04244147379266566, 'min_data_in_leaf': 301, 'feature_fraction': 0.2775734642872071, 'bagging_fraction': 0.5906300770171131}. Best is trial 14 with value: 3303951000.0.


Training until validation scores don't improve for 105 rounds
Early stopping, best iteration is:
[470]	cv_agg's valid gan_eval: 6.53089e+08 + 8.08136e+06


[I 2024-11-09 16:54:49,584] Trial 22 finished with value: 3265444000.0 and parameters: {'num_leaves': 176, 'learning_rate': 0.08982531989507798, 'min_data_in_leaf': 304, 'feature_fraction': 0.2842783895112754, 'bagging_fraction': 0.6633761717668057}. Best is trial 14 with value: 3303951000.0.


Training until validation scores don't improve for 185 rounds
Did not meet early stopping. Best iteration is:
[879]	cv_agg's valid gan_eval: 6.59179e+08 + 9.94284e+06


[I 2024-11-09 17:54:32,564] Trial 23 finished with value: 3295894000.0 and parameters: {'num_leaves': 196, 'learning_rate': 0.03701027056977342, 'min_data_in_leaf': 185, 'feature_fraction': 0.2994561506017316, 'bagging_fraction': 0.4723791547166918}. Best is trial 14 with value: 3303951000.0.


Training until validation scores don't improve for 226 rounds
Did not meet early stopping. Best iteration is:
[975]	cv_agg's valid gan_eval: 6.57777e+08 + 8.03475e+06


[I 2024-11-09 19:02:18,356] Trial 24 finished with value: 3288887000.0 and parameters: {'num_leaves': 172, 'learning_rate': 0.028382547012044075, 'min_data_in_leaf': 160, 'feature_fraction': 0.4168545180634128, 'bagging_fraction': 0.3063796195532489}. Best is trial 14 with value: 3303951000.0.


Training until validation scores don't improve for 116 rounds
Did not meet early stopping. Best iteration is:
[995]	cv_agg's valid gan_eval: 6.56187e+08 + 1.02712e+07


[I 2024-11-09 20:02:47,046] Trial 25 finished with value: 3280935000.0 and parameters: {'num_leaves': 200, 'learning_rate': 0.07477914118302852, 'min_data_in_leaf': 188, 'feature_fraction': 0.31169550130346235, 'bagging_fraction': 0.46506665369170813}. Best is trial 14 with value: 3303951000.0.


Training until validation scores don't improve for 89 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid gan_eval: 6.56832e+08 + 1.19821e+07


[I 2024-11-09 21:10:23,676] Trial 26 finished with value: 3284162000.0 and parameters: {'num_leaves': 162, 'learning_rate': 0.12564215532140358, 'min_data_in_leaf': 344, 'feature_fraction': 0.42958109145698237, 'bagging_fraction': 0.20540050444001784}. Best is trial 14 with value: 3303951000.0.


Training until validation scores don't improve for 162 rounds
Did not meet early stopping. Best iteration is:
[977]	cv_agg's valid gan_eval: 6.5813e+08 + 7.72732e+06


[I 2024-11-09 22:06:48,660] Trial 27 finished with value: 3290651000.0 and parameters: {'num_leaves': 183, 'learning_rate': 0.044272968270519034, 'min_data_in_leaf': 151, 'feature_fraction': 0.2427752497270366, 'bagging_fraction': 0.44925140426522275}. Best is trial 14 with value: 3303951000.0.


Training until validation scores don't improve for 77 rounds
Early stopping, best iteration is:
[238]	cv_agg's valid gan_eval: 6.38497e+08 + 8.26628e+06


[I 2024-11-09 22:19:59,883] Trial 28 finished with value: 3192483000.0 and parameters: {'num_leaves': 129, 'learning_rate': 0.18285025171265484, 'min_data_in_leaf': 371, 'feature_fraction': 0.9682282307464434, 'bagging_fraction': 0.32350728661133754}. Best is trial 14 with value: 3303951000.0.


Training until validation scores don't improve for 83 rounds


In [None]:
#4. Visualizo los resultados del estudio, para modificar los rangos de análisis.

In [None]:
# Optuna's optimization-history plot for the study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Hyperparameter-importance plot for the study (function presumably imported from optuna.visualization in librerias.ipynb).
plot_param_importances(study)

In [None]:
# Slice plot of the study, used to decide how to narrow the search ranges.
plot_slice(study)

In [None]:
# Contour plot of the study over all tuned hyperparameters.
plot_contour(study)

In [None]:
# Contour plot restricted to the num_leaves / min_data_in_leaf pair.
plot_contour(study, params=['num_leaves','min_data_in_leaf'] )

In [None]:
# Hyperparameters of the best trial found so far.
study.best_trial.params

In [None]:
#5. Train the final model on all the training data with the best trial's settings.
mejor_trial = study.best_trial

# Number of trees: the best iteration recorded by the objective for that trial.
best_iter = mejor_trial.user_attrs["best_iter"]
print(f"Mejor cantidad de árboles para el mejor model {best_iter}")

# Hyperparameters copied from the best trial; everything else is fixed.
hiperparametros_tuneados = ('num_leaves', 'learning_rate', 'min_data_in_leaf',
                            'feature_fraction', 'bagging_fraction')

# NOTE(review): no 'bagging_freq' is set; per the LightGBM parameter docs bagging is
# disabled while bagging_freq is 0 (its default), so 'bagging_fraction' may have no
# effect — confirm, and keep this dict in sync with the Optuna objective.
# NOTE(review): the settings were tuned on the undersampled data but the model below is
# fitted on the full training set — confirm this is intended.
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'first_metric_only': True,
    'boost_from_average': True,
    'feature_pre_filter': False,
    'max_bin': 31,
    **{nombre: mejor_trial.params[nombre] for nombre in hiperparametros_tuneados},
    'seed': semillas[0],
    'verbose': 0
}

# Full (non-undersampled) training set with its per-row weights.
train_data_modelo = lgb.Dataset(X_train, label=y_train_binaria2, weight=w_train)

model_lgb = lgb.train(params, train_data_modelo, num_boost_round=best_iter)

In [None]:
#6. Inspect the features that matter most to the model.
#i. Plot (tall figure so that many feature names fit).
lgb.plot_importance(model_lgb, figsize=(10, 20))
plt.show()

In [None]:
#ii. DataFrame.
# One row per feature with the model's importance value, most important first,
# re-indexed from 0.
importance_df = (
    pd.DataFrame({'Feature': model_lgb.feature_name(),
                  'Importance': model_lgb.feature_importance()})
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
)

# Show the top 100 features.
importance_df.head(100)

In [None]:
#7. Save the model.
# File name: first field is the number of training months, second the data-drifting number.
ruta_modelo_guardado = f"{modelos_path}lgbm_{cantidad_meses_train}_{ventana}_undersampling.txt"
model_lgb.save_model(ruta_modelo_guardado)

In [None]:
####################################################################################################
####################################################################################################
####################################################################################################
####################################################################################################
############## Primera subida orientativa a Kaggle previo medir semillas, etc. #####################
####################################################################################################
####################################################################################################
####################################################################################################
####################################################################################################

In [None]:
#8. Load the saved model back from disk.
ruta_modelo_leido = f"{modelos_path}lgbm_{cantidad_meses_train}_{ventana}_undersampling.txt"
model_lgb = lgb.Booster(model_file=ruta_modelo_leido)

In [None]:
#a. Import the Kaggle API client.
from kaggle.api.kaggle_api_extended import KaggleApi
#b. Create the Kaggle API client and authenticate it.
# NOTE(review): credentials are presumably read from the local Kaggle configuration
# (nothing is passed here) — confirm, and never hardcode them in the notebook.
api = KaggleApi()
api.authenticate()

In [None]:
#L. Predict the test month (original note: August) and upload one submission per cut-off.
#i. Score every test row.
predicciones = model_lgb.predict(X_test)

#ii/iii. Attach the score to each client and sort from most to least likely to be "BAJA".
# X_test itself is left untouched (assign returns a new frame): adding the column in place
# made this cell fail on a second run, because predict() then received an extra feature.
tb_entrega = (
    X_test
    .assign(Probabilidad=predicciones)
    .sort_values(by='Probabilidad', ascending=False)
)

#iv. Candidate cut-offs (number of clients flagged as "BAJA") to send to Kaggle.
cortes = range(9000,14000,200)

# Loop-invariant settings.
competencia = 'dm-ey-f-2024-segunda'
subidas_por_tanda = 15   # uploads sent back-to-back before pausing
espera_segundos = 30     # pause length; presumably to respect Kaggle's rate limit — confirm

#v. Build and upload one submission file per cut-off.
num_subida_kaggle = 1
i = 0  # uploads since the last pause
for envios in cortes:
    #1. Class 1 ("BAJA") for the `envios` rows with the highest probability, 0 for the rest.
    tb_entrega['Predicted'] = 0
    tb_entrega.iloc[:envios, tb_entrega.columns.get_loc('Predicted')] = 1
    resultados = tb_entrega[["numero_de_cliente", 'Predicted']].reset_index(drop=True)

    print("Cantidad de clientes {}".format(envios))

    #2. Write the file for Kaggle.
    # NOTE(review): '00{}' does not zero-pad consistently (attempt 10 becomes '0010');
    # left unchanged so existing file names keep matching.
    nombre_archivo = "K_OH_{}_{}_00{}.csv".format(cantidad_meses_train,ventana,num_subida_kaggle)
    ruta_archivo= "{}/{}".format(exp_path,nombre_archivo)
    resultados.to_csv(ruta_archivo, index=False)

    num_subida_kaggle += 1
    i += 1

    #3. Upload to Kaggle.
    mensaje = f'Archivo {nombre_archivo}.LGBM meses train {cantidad_meses_train} con undersampling, DF {ventana}, punto_corte: {envios}.'

    # Plain if/else replacing the former `while ... break ... else` construct (same behaviour):
    # the first `subidas_por_tanda` uploads go out immediately, the next one waits and
    # restarts the counter.
    if i > subidas_por_tanda:
        print("Esperamos 30 segundos...")
        time.sleep(espera_segundos)
        i = 0
    else:
        print(i)

    api.competition_submit(file_name=ruta_archivo, message=mensaje, competition=competencia)
    print("Submission successful!")

In [None]:
print("Terminó")