In [1]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np
import mlflow

In [2]:
train = pd.read_csv('../data/train.csv')

In [3]:
X = train.drop(columns=[
    'ID', 'target1', 'target2', 'target3', 'target4', 'target5',
    'target6', 'target7', 'target8', 'target9', 'target10', 'target11'
])

y = train.drop(columns=[
    'ID', 'feature2', 'feature3', 'feature4', 'feature5', 'feature6',
    'feature7', 'feature8', 'feature9', 'feature10', 'feature11',
    'feature12', 'feature13', 'feature14', 'feature15'
])


In [4]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Hyper-param optmization

In [5]:
import optuna
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.multioutput import MultiOutputRegressor
import mlflow
from lightgbm import LGBMRegressor

# Parâmetros fixos dos modelos
params_xgboost = {
    'n_estimators': 1923,
    'learning_rate': 0.0176491247294085,
    'max_depth': 7,
    'min_child_weight': 1,
    'subsample': 0.7565637963621075,
    'reg_alpha': 2.4180671613496878e-06,
    'reg_lambda': 4.3213853733170415,
    'gamma': 0.0017610181848471591,
    'random_state': 42,
    'n_jobs': 1,
    'tree_method': 'hist',
    'device': 'cuda'
}

params_rf = {
    'n_estimators': 1294,
    'max_depth': 30,
    'min_samples_split': 3,
    'min_samples_leaf': 2,
    'max_features': 'log2',
    'bootstrap': False,
    'random_state': 42,
    'n_jobs': 1
}

params_lightGBM = {
    'max_depth': 6, 
    'num_leaves': 42, 
    'n_estimators': 2829, 
    'learning_rate': 0.05884206632618343, 
    'min_child_samples': 1, 
    'subsample': 0.7776450799851002, 
    'colsample_bytree': 0.9954751604555587, 
    'reg_alpha': 0.0024050306501921114, 
    'reg_lambda': 1.1406302521871943, 
    'min_split_gain': 0.0733413175114044,
    'verbose': -1  
    }


kf = KFold(n_splits=10, shuffle=True, random_state=42)

def objective(trial):
    with mlflow.start_run(run_name=f'Trial-{trial.number}', nested=True):
        # Sugere três pesos brutos entre 0 e 1
        w1 = trial.suggest_float('w_xgb_raw', 0.0, 1.0, step=0.05)
        w2 = trial.suggest_float('w_rf_raw',  0.0, 1.0, step = 0.05)
        w3 = trial.suggest_float('w_gbm_raw', 0.0, 1.0, step=0.05)
        
        # Normaliza para somar 1
        total = w1 + w2 + w3
        w_xgb = w1 / total
        w_rf  = w2 / total
        w_gbm = w3 / total
        
        mlflow.log_param("w_xgb", w_xgb)
        mlflow.log_param("w_rf",  w_rf)
        mlflow.log_param("w_mlp", w_gbm)

        fold_rmse = []
        for train_idx, val_idx in kf.split(X):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            # 1) XGBoost
            model_xgb = xgb.XGBRegressor(**params_xgboost)
            model_xgb.fit(X_train, y_train)
            y_pred_xgb = model_xgb.predict(X_val)
            
            # 2) Random Forest
            model_rf = RandomForestRegressor(**params_rf)
            model_rf.fit(X_train, y_train)
            y_pred_rf = model_rf.predict(X_val)
            
            # 3) LightGBM
            base = LGBMRegressor(**params_lightGBM)
            model_lightGBM = MultiOutputRegressor(base, n_jobs=1)
            model_lightGBM.fit(X_train, y_train)
            y_pred_gbm = model_lightGBM.predict(X_val)
            
            # Ensemble ponderado
            y_pred_ens = (w_xgb * y_pred_xgb
                          + w_rf  * y_pred_rf
                          + w_gbm *  y_pred_gbm )
            
            rmse = root_mean_squared_error(y_val, y_pred_ens)
            fold_rmse.append(rmse)

        mean_rmse = np.mean(fold_rmse)
        mlflow.log_metric("rmse", mean_rmse)
        return mean_rmse

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(
        n_startup_trials=20,
        n_warmup_steps=10,
        interval_steps=5
    )
)

mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('DSWA - Competicao3')
with mlflow.start_run(run_name='Ensemble weight optimization'):
    study.optimize(objective, n_trials=50)
    
    print("Melhores hiperparâmetros:")
    print(study.best_params)
    print(f"Melhor RMSE: {study.best_value:.4f}")

    # Loga os melhores parâmetros e métrica no MLflow
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_rmse", study.best_value)


[I 2025-07-28 20:11:18,784] A new study created in memory with name: no-name-0fb42c4e-f26b-4299-8817-6f8e3228f014
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
[I 2025-07-28 20:22:34,951] Trial 0 finished with value: 0.4666400179651461 and parameters: {'w_xgb_raw': 0.35000000000000003, 'w_rf_raw': 0.9500000000000001, 'w_gbm_raw': 0.75}. Best is trial 0 with value: 0.4666400179651461.


🏃 View run Trial-0 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/d7a96a8ac1304e079ffe4f5e96c84434
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 20:29:22,411] Trial 1 finished with value: 0.4353504131707629 and parameters: {'w_xgb_raw': 0.6000000000000001, 'w_rf_raw': 0.15000000000000002, 'w_gbm_raw': 0.15000000000000002}. Best is trial 1 with value: 0.4353504131707629.


🏃 View run Trial-1 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/24c7e70532064c139ca17a6bda24b810
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 20:36:08,048] Trial 2 finished with value: 0.4787282432160508 and parameters: {'w_xgb_raw': 0.05, 'w_rf_raw': 0.9, 'w_gbm_raw': 0.6000000000000001}. Best is trial 1 with value: 0.4353504131707629.


🏃 View run Trial-2 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/e1e156d289db4e71bbeebca24eb2467a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 20:42:54,630] Trial 3 finished with value: 0.47705992146778636 and parameters: {'w_xgb_raw': 0.7000000000000001, 'w_rf_raw': 0.0, 'w_gbm_raw': 1.0}. Best is trial 1 with value: 0.4353504131707629.


🏃 View run Trial-3 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/42012ce1e4934c48a182a7daa2e7900b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 20:49:40,269] Trial 4 finished with value: 0.43252631900564575 and parameters: {'w_xgb_raw': 0.8500000000000001, 'w_rf_raw': 0.2, 'w_gbm_raw': 0.15000000000000002}. Best is trial 4 with value: 0.43252631900564575.


🏃 View run Trial-4 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/dc23907c8d4544659da30079666b956e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 20:56:27,068] Trial 5 finished with value: 0.4804521700860521 and parameters: {'w_xgb_raw': 0.15000000000000002, 'w_rf_raw': 0.30000000000000004, 'w_gbm_raw': 0.55}. Best is trial 4 with value: 0.43252631900564575.


🏃 View run Trial-5 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/1e2d5f3c7d5744d3a24594c56c7e4f5b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 21:03:13,591] Trial 6 finished with value: 0.4634807612977604 and parameters: {'w_xgb_raw': 0.45, 'w_rf_raw': 0.30000000000000004, 'w_gbm_raw': 0.6000000000000001}. Best is trial 4 with value: 0.43252631900564575.


🏃 View run Trial-6 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/273638eabbd046b49680d9d8a88d8c39
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 21:10:02,477] Trial 7 finished with value: 0.47515803930476075 and parameters: {'w_xgb_raw': 0.1, 'w_rf_raw': 0.30000000000000004, 'w_gbm_raw': 0.35000000000000003}. Best is trial 4 with value: 0.43252631900564575.


🏃 View run Trial-7 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/64716f00356c431c94534bb1ceb7e957
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 21:16:50,095] Trial 8 finished with value: 0.4484794185432509 and parameters: {'w_xgb_raw': 0.45, 'w_rf_raw': 0.8, 'w_gbm_raw': 0.2}. Best is trial 4 with value: 0.43252631900564575.


🏃 View run Trial-8 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/42e302b4cafb4d9a881ef84a22a791a7
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 21:23:36,797] Trial 9 finished with value: 0.43773483198170365 and parameters: {'w_xgb_raw': 0.5, 'w_rf_raw': 0.6000000000000001, 'w_gbm_raw': 0.0}. Best is trial 4 with value: 0.43252631900564575.


🏃 View run Trial-9 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/21b19baadfe24f3f8ba59aefdeb0f3f6
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 21:30:21,402] Trial 10 finished with value: 0.4288645539949135 and parameters: {'w_xgb_raw': 1.0, 'w_rf_raw': 0.55, 'w_gbm_raw': 0.0}. Best is trial 10 with value: 0.4288645539949135.


🏃 View run Trial-10 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/99d905350d2540bbb9b23d57e3f20b1f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 21:37:08,033] Trial 11 finished with value: 0.4288645539949135 and parameters: {'w_xgb_raw': 1.0, 'w_rf_raw': 0.55, 'w_gbm_raw': 0.0}. Best is trial 10 with value: 0.4288645539949135.


🏃 View run Trial-11 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/c9632b5c250b4828a743a939f77a0768
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 21:43:55,141] Trial 12 finished with value: 0.4288645539949135 and parameters: {'w_xgb_raw': 1.0, 'w_rf_raw': 0.55, 'w_gbm_raw': 0.0}. Best is trial 10 with value: 0.4288645539949135.


🏃 View run Trial-12 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/98fcb44f800548c09f40f5e956841687
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 21:50:42,184] Trial 13 finished with value: 0.4393131059607939 and parameters: {'w_xgb_raw': 1.0, 'w_rf_raw': 0.65, 'w_gbm_raw': 0.35000000000000003}. Best is trial 10 with value: 0.4288645539949135.


🏃 View run Trial-13 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/7b7d990de6da49a8b1ca7eda44df398f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[W 2025-07-28 21:51:57,537] Trial 14 failed with parameters: {'w_xgb_raw': 0.8, 'w_rf_raw': 0.45, 'w_gbm_raw': 0.35000000000000003} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\Guilherme\AppData\Local\Programs\Python\Python311\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\Guilherme\AppData\Local\Temp\ipykernel_35448\186583367.py", line 80, in objective
    model_xgb.fit(X_train, y_train)
  File "c:\Users\Guilherme\AppData\Local\Programs\Python\Python311\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\Guilherme\AppData\Local\Programs\Python\Python311\Lib\site-packages\xgboost\sklearn.py", line 1247, in fit
    self._Booster = train(
                    ^^^^^^
  File "c:\Users\Guilherme\AppData\Local\Programs\Python\Python311\Lib\site-packages\x

🏃 View run Trial-14 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/cf2e610966e54dd6926b87294285a8e3
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732
🏃 View run Ensemble weight optimization at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/31aad1a36640456390831b698bf0a3ad
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


KeyboardInterrupt: 