# Modelo

La optimización la hago con el dataset undersampleado porque, si no, tarda una eternidad.

En el próximo script entreno, ahí sí, con la totalidad de los datos.

In [1]:
import pandas as pd
import polars as pl
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

import lightgbm as lgb

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice, plot_contour

from time import time

import pickle

In [None]:
# Disabled shell step: copies the undersampled parquet from the bucket path to a
# local datasets folder. Requires `gsutil` to be available on the machine.
# !gsutil cp /home/eanegrin/buckets/b1/datasets/competencia_02_inflacion_adj_ipc_undersampled.parquet /home/eanegrin/datasets/

'gsutil' is not recognized as an internal or external command,
operable program or batch file.


In [2]:
# --- Paths -----------------------------------------------------------------
# Project root for the current machine. Roots used in other environments:
#   base_path = '/content/drive/MyDrive/DMEyF/2024/'
#   base_path = '/home/eanegrin/buckets/b1/'
base_path = 'C:/Eugenio/Maestria/DMEyF/'

dataset_path = f'{base_path}datasets/'
modelos_path = f'{base_path}modelos/'
db_path = f'{base_path}db/'
dataset_file = 'competencia_02_inflacion_adj_ipc_undersampled_exp.parquet'

# --- Business constants used by the gain metric -----------------------------
ganancia_acierto = 273000  # amount credited per row whose weight marks it as BAJA+2
costo_estimulo = 7000      # amount charged per row with a weight below the BAJA+2 weight

# --- Random seeds (add your own) ---------------------------------------------
semillas = [122219, 109279, 400391, 401537, 999961]

# --- Load the dataset --------------------------------------------------------
# Alternative source when the file was copied to the VM's local disk:
#   data = pd.read_parquet('/home/eanegrin/datasets/' + dataset_file)
data = pd.read_parquet(f'{dataset_path}{dataset_file}')

In [3]:
# Sanity check: (rows, columns) of the dataset as loaded from parquet
data.shape

(410019, 353)

In [4]:
# Fix the dtypes of these 2 columns, which were stored as strings in the parquet file

data[['tmobile_app', 'cmobile_app_trx']] = data[['tmobile_app', 'cmobile_app_trx']].astype('float')

In [5]:
# Training window, one year per row. 202106 is deliberately left out for testing.
meses_train = [
    201906, 201907, 201908, 201909, 201910, 201911, 201912,
    202001, 202002, 202003, 202004, 202005, 202006, 202007, 202008, 202009, 202010, 202011, 202012,
    202101, 202102, 202103, 202104, 202105,
]

# Keep only the rows whose period (foto_mes) belongs to the training window
data = data.loc[data['foto_mes'].isin(meses_train)]
data.shape

(391550, 353)

In [6]:
# Assign a weight to each class. The weights are almost identical on purpose:
# they act as a tag that lets the custom gain metric tell BAJA+2 apart from
# BAJA+1 once both are merged into a single positive class.
data['clase_peso'] = np.select(
    [data['clase_ternaria'] == 'BAJA+2', data['clase_ternaria'] == 'BAJA+1'],
    [1.00002, 1.00001],
    default=1.0,
)

In [7]:
# Binary target: 0 for CONTINUA, 1 for everything else (BAJA+1 and BAJA+2 together)
data['clase_binaria'] = np.where(data['clase_ternaria'].eq('CONTINUA'), 0, 1)

In [8]:
# Per-row weights and binary label (both BAJA classes merged into 1)
w_train = data['clase_peso']
y_train_binaria = data['clase_binaria']

# Feature matrix: everything except the target columns and the weight column
X_train = data.drop(columns=['clase_ternaria', 'clase_peso', 'clase_binaria'])

Para evaluar la calidad del modelo, crearemos nuestra propia función de evaluación que calcule la ganancia. La razón de incluir los pesos es precisamente para poder implementar esta función de evaluación de manera adecuada. Al combinar las clases *BAJA+1* y *BAJA+2* en una sola, necesitamos una forma de diferenciarlas, y es aquí donde entra en juego el *weight*. Este parámetro nos permitirá distinguir entre ambas clases al momento de evaluarlas dentro del algoritmo.


In [9]:
def lgb_gan_eval(y_pred, data):
    """Custom LightGBM metric: best cumulative gain over the score ranking.

    Rows are classified through their weight rather than their label:
    a weight equal to 1.00002 earns `ganancia_acierto`, and any weight
    strictly below 1.00002 costs `costo_estimulo`.

    Parameters
    ----------
    y_pred : array-like
        Predicted scores, one per row of `data`.
    data : object exposing `get_weight()`
        Dataset being evaluated; its weights identify each row's class.

    Returns
    -------
    tuple
        ('gan_eval', maximum cumulative gain, True), where True means
        that higher values are better.
    """
    row_weights = data.get_weight()

    # Per-row payoff if the row is targeted
    reward = np.where(row_weights == 1.00002, ganancia_acierto, 0)
    penalty = np.where(row_weights < 1.00002, costo_estimulo, 0)
    row_gain = reward - penalty

    # Rank rows from highest to lowest score and accumulate the gain along the ranking
    descending_order = np.argsort(y_pred)[::-1]
    cumulative_gain = np.cumsum(row_gain[descending_order])

    return 'gan_eval', np.max(cumulative_gain), True

# Optimizacion

In [18]:
def objective(trial):
    """Optuna objective: cross-validated gain of a LightGBM binary classifier.

    Samples a set of hyperparameters from `trial`, runs a stratified k-fold
    `lgb.cv` with the custom `lgb_gan_eval` metric and early stopping, stores
    the best boosting round in the trial's user attributes ("best_iter") and
    returns the gain to be maximized.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters and to store "best_iter".

    Returns
    -------
    float
        Highest mean validation gain across boosting rounds, multiplied by
        the number of folds.
    """
    n_folds = 5

    # Sample hyperparameters. No trailing commas here: a trailing comma after
    # the call would silently wrap each sampled value in a 1-tuple.
    num_leaves = trial.suggest_int('num_leaves', 8, 150)
    learning_rate = trial.suggest_float('learning_rate', 0.005, 0.05)  # the lower it is, the more iterations are needed
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 1, 2000)
    feature_fraction = trial.suggest_float('feature_fraction', 0.1, 1.0)
    # NOTE(review): LightGBM only applies bagging_fraction when bagging_freq > 0,
    # and bagging_freq is not set below, so this parameter appears to have no
    # effect on training. Confirm, and add 'bagging_freq' if bagging is intended.
    bagging_fraction = trial.suggest_float('bagging_fraction', 0.1, 1.0)

    params = {
        'objective': 'binary',
        'metric': 'custom',  # the only metric is the one supplied through feval
        'boosting_type': 'gbdt',
        'first_metric_only': True,
        'boost_from_average': True,
        'feature_pre_filter': False,
        'max_bin': 31,
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
        'min_data_in_leaf': min_data_in_leaf,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'seed': semillas[0],
        'verbose': -1
    }

    train_data = lgb.Dataset(X_train,
                             label=y_train_binaria,  # binary target (both BAJA classes merged)
                             weight=w_train)

    # Early stopping through a callback.
    # TODO: consider passing min_delta with the gain difference considered irrelevant.
    early_stopping_cb = lgb.early_stopping(stopping_rounds=50)

    cv_results = lgb.cv(
        params,
        train_data,
        # NOTE(review): the logged trials report "Did not meet early stopping"
        # with best iterations close to 1000, so this cap looks binding.
        # Consider a much higher value (e.g. 10,000 or more).
        num_boost_round=1000,
        callbacks=[early_stopping_cb],
        feval=lgb_gan_eval,
        stratified=True,
        nfold=n_folds,
        seed=semillas[0]
    )

    mean_gains = cv_results['valid gan_eval-mean']
    max_gan = max(mean_gains)
    best_iter = mean_gains.index(max_gan) + 1  # rounds are 1-based

    # Store the best iteration so it can be read back from the study
    trial.set_user_attr("best_iter", best_iter)

    # Mean per-fold gain times the number of folds; whether this is maximized
    # or minimized is decided when the study is created.
    return max_gan * n_folds

In [19]:
# Study name: marked "UPDATE" by the author — presumably to be changed per experiment
study_name = "experimento_ipc_v02"

# SQLite database that stores the trials
storage_name = f"sqlite:///{db_path}optimization_experimento_ipc.db"

# Reuse the study if it already exists in the storage; otherwise create it
study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    direction="maximize",
    load_if_exists=True,
)

[I 2024-11-20 15:27:00,429] Using an existing study with name 'experimento_ipc_v02' instead of creating a new one.


In [20]:
# Run 50 trials of `objective`; each trial is recorded in the study's storage
study.optimize(objective, n_trials=50)

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[980]	cv_agg's valid gan_eval: 6.55228e+08 + 5.82237e+06


[I 2024-11-20 15:30:40,575] Trial 1 finished with value: 3276140000.0 and parameters: {'num_leaves': 70, 'learning_rate': 0.019535613904928098, 'min_data_in_leaf': 86, 'feature_fraction': 0.5748671597555829, 'bagging_fraction': 0.6029672848223565}. Best is trial 1 with value: 3276140000.0.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[998]	cv_agg's valid gan_eval: 6.70435e+08 + 7.21188e+06


[I 2024-11-20 15:34:40,426] Trial 2 finished with value: 3352174000.0 and parameters: {'num_leaves': 111, 'learning_rate': 0.04031561089897204, 'min_data_in_leaf': 1572, 'feature_fraction': 0.35570399512329076, 'bagging_fraction': 0.6888488565823928}. Best is trial 2 with value: 3352174000.0.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	cv_agg's valid gan_eval: 6.67911e+08 + 6.58451e+06


[I 2024-11-20 15:39:11,657] Trial 3 finished with value: 3339553000.0 and parameters: {'num_leaves': 109, 'learning_rate': 0.03330992565391703, 'min_data_in_leaf': 596, 'feature_fraction': 0.5784151838196891, 'bagging_fraction': 0.3161258449467295}. Best is trial 2 with value: 3352174000.0.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[998]	cv_agg's valid gan_eval: 6.53768e+08 + 4.95599e+06


[I 2024-11-20 15:43:29,256] Trial 4 finished with value: 3268839000.0 and parameters: {'num_leaves': 106, 'learning_rate': 0.012859159778941951, 'min_data_in_leaf': 1733, 'feature_fraction': 0.8743987202499898, 'bagging_fraction': 0.5916197325333693}. Best is trial 2 with value: 3352174000.0.


Training until validation scores don't improve for 50 rounds


[I 2024-11-20 15:45:59,969] Trial 5 finished with value: 3213728000.0 and parameters: {'num_leaves': 26, 'learning_rate': 0.014593340238881002, 'min_data_in_leaf': 153, 'feature_fraction': 0.12886003584370653, 'bagging_fraction': 0.84296660619227}. Best is trial 2 with value: 3352174000.0.


Did not meet early stopping. Best iteration is:
[996]	cv_agg's valid gan_eval: 6.42746e+08 + 5.93084e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 15:48:32,603] Trial 6 finished with value: 3220819000.0 and parameters: {'num_leaves': 11, 'learning_rate': 0.04015261345281075, 'min_data_in_leaf': 1774, 'feature_fraction': 0.2010029501661364, 'bagging_fraction': 0.16590072125469585}. Best is trial 2 with value: 3352174000.0.


Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid gan_eval: 6.44164e+08 + 4.00607e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 15:51:37,603] Trial 7 finished with value: 3215443000.0 and parameters: {'num_leaves': 31, 'learning_rate': 0.010534059651263628, 'min_data_in_leaf': 1719, 'feature_fraction': 0.32008006565115515, 'bagging_fraction': 0.7246247383814621}. Best is trial 2 with value: 3352174000.0.


Did not meet early stopping. Best iteration is:
[996]	cv_agg's valid gan_eval: 6.43089e+08 + 5.10971e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 15:54:58,143] Trial 8 finished with value: 3203858000.0 and parameters: {'num_leaves': 31, 'learning_rate': 0.009094913202717332, 'min_data_in_leaf': 1074, 'feature_fraction': 0.515868174354819, 'bagging_fraction': 0.4072205934113252}. Best is trial 2 with value: 3352174000.0.


Did not meet early stopping. Best iteration is:
[992]	cv_agg's valid gan_eval: 6.40772e+08 + 5.34144e+06
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	cv_agg's valid gan_eval: 6.55001e+08 + 5.52652e+06


[I 2024-11-20 15:58:30,056] Trial 9 finished with value: 3275006000.0 and parameters: {'num_leaves': 121, 'learning_rate': 0.011951753341556487, 'min_data_in_leaf': 1600, 'feature_fraction': 0.6989937169233078, 'bagging_fraction': 0.4584258398690093}. Best is trial 2 with value: 3352174000.0.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	cv_agg's valid gan_eval: 6.70016e+08 + 5.85925e+06


[I 2024-11-20 16:01:10,712] Trial 10 finished with value: 3350081000.0 and parameters: {'num_leaves': 99, 'learning_rate': 0.04734567999008882, 'min_data_in_leaf': 603, 'feature_fraction': 0.9678500462424783, 'bagging_fraction': 0.7556993177848513}. Best is trial 2 with value: 3352174000.0.


Training until validation scores don't improve for 50 rounds


[I 2024-11-20 16:04:45,416] Trial 11 finished with value: 3342332000.0 and parameters: {'num_leaves': 140, 'learning_rate': 0.0274266445384027, 'min_data_in_leaf': 1233, 'feature_fraction': 0.32010457107598844, 'bagging_fraction': 0.9927383812793777}. Best is trial 2 with value: 3352174000.0.


Did not meet early stopping. Best iteration is:
[999]	cv_agg's valid gan_eval: 6.68466e+08 + 7.52718e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 16:07:20,721] Trial 12 finished with value: 3340211000.0 and parameters: {'num_leaves': 78, 'learning_rate': 0.047495585142107825, 'min_data_in_leaf': 669, 'feature_fraction': 0.9860950144768189, 'bagging_fraction': 0.8092648769032543}. Best is trial 2 with value: 3352174000.0.


Did not meet early stopping. Best iteration is:
[999]	cv_agg's valid gan_eval: 6.68042e+08 + 8.08997e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 16:10:10,419] Trial 13 finished with value: 3343739000.0 and parameters: {'num_leaves': 91, 'learning_rate': 0.04797512626415096, 'min_data_in_leaf': 680, 'feature_fraction': 0.4121932019296003, 'bagging_fraction': 0.6974033539869757}. Best is trial 2 with value: 3352174000.0.


Did not meet early stopping. Best iteration is:
[992]	cv_agg's valid gan_eval: 6.68748e+08 + 7.84102e+06
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[998]	cv_agg's valid gan_eval: 6.71983e+08 + 6.87966e+06


[I 2024-11-20 16:13:29,369] Trial 14 finished with value: 3359916000.0 and parameters: {'num_leaves': 137, 'learning_rate': 0.03930964605368338, 'min_data_in_leaf': 1406, 'feature_fraction': 0.7900337949227298, 'bagging_fraction': 0.9892131837765276}. Best is trial 14 with value: 3359916000.0.


Training until validation scores don't improve for 50 rounds


[I 2024-11-20 16:17:08,072] Trial 15 finished with value: 3362863000.0 and parameters: {'num_leaves': 149, 'learning_rate': 0.03769210830273531, 'min_data_in_leaf': 1353, 'feature_fraction': 0.7606323858696207, 'bagging_fraction': 0.9501974841691402}. Best is trial 15 with value: 3362863000.0.


Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid gan_eval: 6.72573e+08 + 7.29211e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 16:21:17,477] Trial 16 finished with value: 3356500000.0 and parameters: {'num_leaves': 150, 'learning_rate': 0.03242295935140018, 'min_data_in_leaf': 1283, 'feature_fraction': 0.7511604866669245, 'bagging_fraction': 0.9800403944113422}. Best is trial 15 with value: 3362863000.0.


Did not meet early stopping. Best iteration is:
[999]	cv_agg's valid gan_eval: 6.713e+08 + 7.86917e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 16:25:54,688] Trial 17 finished with value: 3359811000.0 and parameters: {'num_leaves': 132, 'learning_rate': 0.037319201322000506, 'min_data_in_leaf': 1386, 'feature_fraction': 0.7833490452042631, 'bagging_fraction': 0.9124720488842647}. Best is trial 15 with value: 3362863000.0.


Did not meet early stopping. Best iteration is:
[991]	cv_agg's valid gan_eval: 6.71962e+08 + 7.35184e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 16:28:28,159] Trial 18 finished with value: 3295012000.0 and parameters: {'num_leaves': 61, 'learning_rate': 0.027128040816506983, 'min_data_in_leaf': 966, 'feature_fraction': 0.6780675587122433, 'bagging_fraction': 0.8731043702445965}. Best is trial 15 with value: 3362863000.0.


Did not meet early stopping. Best iteration is:
[979]	cv_agg's valid gan_eval: 6.59002e+08 + 5.59535e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 16:31:40,398] Trial 19 finished with value: 3319659000.0 and parameters: {'num_leaves': 129, 'learning_rate': 0.020201601044999296, 'min_data_in_leaf': 955, 'feature_fraction': 0.860639605705739, 'bagging_fraction': 0.9191666823057157}. Best is trial 15 with value: 3362863000.0.


Did not meet early stopping. Best iteration is:
[998]	cv_agg's valid gan_eval: 6.63932e+08 + 5.98763e+06
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[983]	cv_agg's valid gan_eval: 6.70984e+08 + 6.83425e+06


[I 2024-11-20 16:35:13,644] Trial 20 finished with value: 3354918000.0 and parameters: {'num_leaves': 150, 'learning_rate': 0.0423480625211588, 'min_data_in_leaf': 1900, 'feature_fraction': 0.8470266222249666, 'bagging_fraction': 0.9982056451644055}. Best is trial 15 with value: 3362863000.0.


Training until validation scores don't improve for 50 rounds


[I 2024-11-20 16:38:41,903] Trial 21 finished with value: 3291617000.0 and parameters: {'num_leaves': 54, 'learning_rate': 0.03273269204476162, 'min_data_in_leaf': 1989, 'feature_fraction': 0.6369564838551885, 'bagging_fraction': 0.15734436022361498}. Best is trial 15 with value: 3362863000.0.


Did not meet early stopping. Best iteration is:
[995]	cv_agg's valid gan_eval: 6.58323e+08 + 4.96599e+06
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid gan_eval: 6.7138e+08 + 7.15178e+06


[I 2024-11-20 16:42:14,504] Trial 22 finished with value: 3356899000.0 and parameters: {'num_leaves': 131, 'learning_rate': 0.03656266742293644, 'min_data_in_leaf': 1414, 'feature_fraction': 0.7742373732651283, 'bagging_fraction': 0.9149282760688835}. Best is trial 15 with value: 3362863000.0.


Training until validation scores don't improve for 50 rounds


[I 2024-11-20 16:46:09,484] Trial 23 finished with value: 3370654000.0 and parameters: {'num_leaves': 136, 'learning_rate': 0.04300513208682082, 'min_data_in_leaf': 1433, 'feature_fraction': 0.800417468570876, 'bagging_fraction': 0.7821716747796461}. Best is trial 23 with value: 3370654000.0.


Did not meet early stopping. Best iteration is:
[991]	cv_agg's valid gan_eval: 6.74131e+08 + 6.71687e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 16:50:06,122] Trial 24 finished with value: 3366727000.0 and parameters: {'num_leaves': 141, 'learning_rate': 0.043512480578399015, 'min_data_in_leaf': 1484, 'feature_fraction': 0.9291299653519416, 'bagging_fraction': 0.7821264556587133}. Best is trial 23 with value: 3370654000.0.


Did not meet early stopping. Best iteration is:
[991]	cv_agg's valid gan_eval: 6.73345e+08 + 7.00551e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 16:53:24,679] Trial 25 finished with value: 3352482000.0 and parameters: {'num_leaves': 117, 'learning_rate': 0.04383154789186061, 'min_data_in_leaf': 1163, 'feature_fraction': 0.9244114191136484, 'bagging_fraction': 0.7873873922318195}. Best is trial 23 with value: 3370654000.0.


Did not meet early stopping. Best iteration is:
[996]	cv_agg's valid gan_eval: 6.70496e+08 + 7.60162e+06
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[998]	cv_agg's valid gan_eval: 6.71479e+08 + 7.15824e+06


[I 2024-11-20 16:57:12,494] Trial 26 finished with value: 3357396000.0 and parameters: {'num_leaves': 121, 'learning_rate': 0.044318147627817384, 'min_data_in_leaf': 1565, 'feature_fraction': 0.9096973859072205, 'bagging_fraction': 0.6439072406540386}. Best is trial 23 with value: 3370654000.0.


Training until validation scores don't improve for 50 rounds


[I 2024-11-20 17:01:44,430] Trial 27 finished with value: 3352335000.0 and parameters: {'num_leaves': 147, 'learning_rate': 0.03546611179062849, 'min_data_in_leaf': 848, 'feature_fraction': 0.826910943149352, 'bagging_fraction': 0.5405680537999519}. Best is trial 23 with value: 3370654000.0.


Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid gan_eval: 6.70467e+08 + 7.10435e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 17:05:43,172] Trial 28 finished with value: 3351082000.0 and parameters: {'num_leaves': 92, 'learning_rate': 0.04967893715985176, 'min_data_in_leaf': 1449, 'feature_fraction': 0.7122778988372293, 'bagging_fraction': 0.8357213182583687}. Best is trial 23 with value: 3370654000.0.


Did not meet early stopping. Best iteration is:
[981]	cv_agg's valid gan_eval: 6.70216e+08 + 6.88864e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 17:10:06,498] Trial 29 finished with value: 3348856000.0 and parameters: {'num_leaves': 136, 'learning_rate': 0.030496089810568797, 'min_data_in_leaf': 1148, 'feature_fraction': 0.47983469127187856, 'bagging_fraction': 0.5083240653331764}. Best is trial 23 with value: 3370654000.0.


Did not meet early stopping. Best iteration is:
[994]	cv_agg's valid gan_eval: 6.69771e+08 + 6.77462e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 17:14:06,837] Trial 30 finished with value: 3323194000.0 and parameters: {'num_leaves': 124, 'learning_rate': 0.021746497198162296, 'min_data_in_leaf': 1284, 'feature_fraction': 0.6040866836602778, 'bagging_fraction': 0.6183661684962536}. Best is trial 23 with value: 3370654000.0.


Did not meet early stopping. Best iteration is:
[999]	cv_agg's valid gan_eval: 6.64639e+08 + 6.13012e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 17:17:20,400] Trial 31 finished with value: 3361820000.0 and parameters: {'num_leaves': 142, 'learning_rate': 0.044590459954147364, 'min_data_in_leaf': 280, 'feature_fraction': 0.9324832784015794, 'bagging_fraction': 0.7692020120013923}. Best is trial 23 with value: 3370654000.0.


Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid gan_eval: 6.72364e+08 + 7.72317e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 17:19:47,295] Trial 32 finished with value: 3350725000.0 and parameters: {'num_leaves': 143, 'learning_rate': 0.04524951039308893, 'min_data_in_leaf': 134, 'feature_fraction': 0.9382461888355386, 'bagging_fraction': 0.7479263447802083}. Best is trial 23 with value: 3370654000.0.


Did not meet early stopping. Best iteration is:
[995]	cv_agg's valid gan_eval: 6.70145e+08 + 7.96975e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 17:24:06,741] Trial 33 finished with value: 3360539000.0 and parameters: {'num_leaves': 141, 'learning_rate': 0.04064625574153185, 'min_data_in_leaf': 296, 'feature_fraction': 0.9061253352478278, 'bagging_fraction': 0.6703857213896768}. Best is trial 23 with value: 3370654000.0.


Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid gan_eval: 6.72108e+08 + 7.77316e+06
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	cv_agg's valid gan_eval: 6.69456e+08 + 7.13773e+06


[I 2024-11-20 17:27:22,928] Trial 34 finished with value: 3347281000.0 and parameters: {'num_leaves': 112, 'learning_rate': 0.04231310307780516, 'min_data_in_leaf': 339, 'feature_fraction': 0.8164124014027679, 'bagging_fraction': 0.8639358512241377}. Best is trial 23 with value: 3370654000.0.


Training until validation scores don't improve for 50 rounds


[I 2024-11-20 17:32:01,293] Trial 35 finished with value: 3360168000.0 and parameters: {'num_leaves': 127, 'learning_rate': 0.04570954963012986, 'min_data_in_leaf': 1597, 'feature_fraction': 0.9790039338362623, 'bagging_fraction': 0.7950595856345702}. Best is trial 23 with value: 3370654000.0.


Did not meet early stopping. Best iteration is:
[993]	cv_agg's valid gan_eval: 6.72034e+08 + 7.93936e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 17:35:08,171] Trial 36 finished with value: 3340351000.0 and parameters: {'num_leaves': 102, 'learning_rate': 0.038452191063430355, 'min_data_in_leaf': 1503, 'feature_fraction': 0.8699203151229075, 'bagging_fraction': 0.9304589128241244}. Best is trial 23 with value: 3370654000.0.


Did not meet early stopping. Best iteration is:
[979]	cv_agg's valid gan_eval: 6.6807e+08 + 6.15706e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 17:37:45,340] Trial 37 finished with value: 3352286000.0 and parameters: {'num_leaves': 113, 'learning_rate': 0.042514676322478476, 'min_data_in_leaf': 848, 'feature_fraction': 0.8946244774388563, 'bagging_fraction': 0.27298458921624597}. Best is trial 23 with value: 3370654000.0.


Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid gan_eval: 6.70457e+08 + 8.0476e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 17:42:10,358] Trial 38 finished with value: 3224025000.0 and parameters: {'num_leaves': 143, 'learning_rate': 0.006055645514251588, 'min_data_in_leaf': 1711, 'feature_fraction': 0.7230045579722503, 'bagging_fraction': 0.58635703097044}. Best is trial 23 with value: 3370654000.0.


Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid gan_eval: 6.44805e+08 + 5.10542e+06
Training until validation scores don't improve for 50 rounds


[I 2024-11-20 17:44:47,552] Trial 39 finished with value: 3341051000.0 and parameters: {'num_leaves': 133, 'learning_rate': 0.040688680744381533, 'min_data_in_leaf': 28, 'feature_fraction': 0.6291859945172456, 'bagging_fraction': 0.7018444838186395}. Best is trial 23 with value: 3370654000.0.


Did not meet early stopping. Best iteration is:
[995]	cv_agg's valid gan_eval: 6.6821e+08 + 6.71905e+06
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[972]	cv_agg's valid gan_eval: 6.73134e+08 + 6.22136e+06


[I 2024-11-20 17:49:10,651] Trial 40 finished with value: 3365670000.0 and parameters: {'num_leaves': 124, 'learning_rate': 0.049542816744732966, 'min_data_in_leaf': 1069, 'feature_fraction': 0.9454191554259131, 'bagging_fraction': 0.7677377823764359}. Best is trial 23 with value: 3370654000.0.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	cv_agg's valid gan_eval: 6.68987e+08 + 7.61503e+06


[I 2024-11-20 17:53:09,723] Trial 41 finished with value: 3344936000.0 and parameters: {'num_leaves': 119, 'learning_rate': 0.03468892846831168, 'min_data_in_leaf': 1094, 'feature_fraction': 0.663310873679285, 'bagging_fraction': 0.874225189987311}. Best is trial 23 with value: 3370654000.0.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[964]	cv_agg's valid gan_eval: 6.72956e+08 + 6.57126e+06


[I 2024-11-20 17:57:30,976] Trial 42 finished with value: 3364781000.0 and parameters: {'num_leaves': 138, 'learning_rate': 0.048980401390669934, 'min_data_in_leaf': 1333, 'feature_fraction': 0.992338540577175, 'bagging_fraction': 0.7580848209503402}. Best is trial 23 with value: 3370654000.0.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[982]	cv_agg's valid gan_eval: 6.75021e+08 + 7.19761e+06


[I 2024-11-20 18:01:29,704] Trial 43 finished with value: 3375106000.0 and parameters: {'num_leaves': 150, 'learning_rate': 0.049929102931299305, 'min_data_in_leaf': 1337, 'feature_fraction': 0.9913667120903564, 'bagging_fraction': 0.8195431189437536}. Best is trial 43 with value: 3375106000.0.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid gan_eval: 6.74391e+08 + 6.99328e+06


[I 2024-11-20 18:04:59,178] Trial 44 finished with value: 3371956000.0 and parameters: {'num_leaves': 135, 'learning_rate': 0.04995393978280137, 'min_data_in_leaf': 1201, 'feature_fraction': 0.977153010442073, 'bagging_fraction': 0.7289435153395362}. Best is trial 43 with value: 3375106000.0.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[978]	cv_agg's valid gan_eval: 6.72382e+08 + 6.74413e+06


[I 2024-11-20 18:08:18,461] Trial 45 finished with value: 3361911000.0 and parameters: {'num_leaves': 125, 'learning_rate': 0.04997816737132934, 'min_data_in_leaf': 1193, 'feature_fraction': 0.9568846292975896, 'bagging_fraction': 0.8211930093350331}. Best is trial 43 with value: 3375106000.0.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[994]	cv_agg's valid gan_eval: 6.70855e+08 + 5.70974e+06


[I 2024-11-20 18:11:14,744] Trial 46 finished with value: 3354274000.0 and parameters: {'num_leaves': 108, 'learning_rate': 0.047138808967144855, 'min_data_in_leaf': 867, 'feature_fraction': 0.9943344065454085, 'bagging_fraction': 0.727517408773284}. Best is trial 43 with value: 3375106000.0.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[986]	cv_agg's valid gan_eval: 6.7227e+08 + 7.01943e+06


[I 2024-11-20 18:15:18,757] Trial 47 finished with value: 3361351000.0 and parameters: {'num_leaves': 135, 'learning_rate': 0.046297750979740614, 'min_data_in_leaf': 1519, 'feature_fraction': 0.8786352149969026, 'bagging_fraction': 0.6358949225144592}. Best is trial 43 with value: 3375106000.0.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[996]	cv_agg's valid gan_eval: 6.69668e+08 + 6.76155e+06


[I 2024-11-20 18:18:10,329] Trial 48 finished with value: 3348338000.0 and parameters: {'num_leaves': 97, 'learning_rate': 0.047840528901023735, 'min_data_in_leaf': 1027, 'feature_fraction': 0.952148405882902, 'bagging_fraction': 0.676369520663652}. Best is trial 43 with value: 3375106000.0.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid gan_eval: 6.47268e+08 + 5.19534e+06


[I 2024-11-20 18:21:02,509] Trial 49 finished with value: 3236338000.0 and parameters: {'num_leaves': 40, 'learning_rate': 0.015577052751234267, 'min_data_in_leaf': 1651, 'feature_fraction': 0.8202484409111204, 'bagging_fraction': 0.7251158334488714}. Best is trial 43 with value: 3375106000.0.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	cv_agg's valid gan_eval: 6.67377e+08 + 6.33569e+06


[I 2024-11-20 18:23:50,119] Trial 50 finished with value: 3336886000.0 and parameters: {'num_leaves': 85, 'learning_rate': 0.049891112480013794, 'min_data_in_leaf': 1259, 'feature_fraction': 0.16116938828133726, 'bagging_fraction': 0.845301442121202}. Best is trial 43 with value: 3375106000.0.


Analizamos los resultados as usual

In [21]:
# Objective value of every trial, in execution order
plot_optimization_history(study)

In [22]:
# Relative importance of each hyperparameter for the objective value
plot_param_importances(study)

El **learning rate** es un parámetro que tiene que ir acompañado por más árboles.

In [23]:
# Objective value against each hyperparameter, one panel per parameter
plot_slice(study)

In [24]:
# Contour plots of the objective for the study's hyperparameters
plot_contour(study)

In [25]:
# Contour plot restricted to the num_leaves / min_data_in_leaf pair
plot_contour(study, params=['num_leaves','min_data_in_leaf'])

In [26]:
# Hyperparameters of the trial with the highest objective value
study.best_trial.params

{'num_leaves': 150,
 'learning_rate': 0.049929102931299305,
 'min_data_in_leaf': 1337,
 'feature_fraction': 0.9913667120903564,
 'bagging_fraction': 0.8195431189437536}

In [27]:
# Best boosting round of the best trial, as stored by `objective` via set_user_attr
best_iter = study.best_trial.user_attrs["best_iter"]
best_iter

982