In [23]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np
import mlflow

In [24]:
train = pd.read_csv('../data/train.csv')

In [25]:
X = train.drop(columns=[
    'ID', 'target1', 'target2', 'target3', 'target4', 'target5',
    'target6', 'target7', 'target8', 'target9', 'target10', 'target11'
])

y = train.drop(columns=[
    'ID', 'feature2', 'feature3', 'feature4', 'feature5', 'feature6',
    'feature7', 'feature8', 'feature9', 'feature10', 'feature11',
    'feature12', 'feature13', 'feature14', 'feature15'
])


In [26]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Hyper-param optmization

In [27]:
import optuna
from ngboost import NGBoost
from ngboost.learners import default_tree_learner
from ngboost.distns import Normal
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold
import mlflow
import numpy as np
import warnings
from sklearn.metrics import root_mean_squared_error

warnings.filterwarnings('ignore')

def objective(trial):
    # Desliga todos os logs do MLflow também no terminal
    mlflow.set_experiment('DSWA - Competicao3')
    with mlflow.start_run(run_name=f'Trial-{trial.number}', nested=True):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'minibatch_frac': trial.suggest_float('minibatch_frac', 0.5, 1.0),
            'col_sample': trial.suggest_float('col_sample', 0.5, 1.0),
            'tol': trial.suggest_float('tol', 1e-5, 1e-3, log=True),
            'random_state': 42,
            'verbose': False   # <-- desliga o print de progresso do NGBoost
        }
        mlflow.log_params(params)

        base_ngb = NGBoost(
            Base=default_tree_learner,
            Dist=Normal,
            **params
        )
        model = MultiOutputRegressor(base_ngb, n_jobs=1)

        kf = KFold(n_splits=10, shuffle=True, random_state=42)
        rmses = []

        for train_idx, val_idx in kf.split(X):
            X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
            y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]

            model.fit(X_tr, y_tr)           # sem verbose aqui
            y_pred = model.predict(X_va)
            rmses.append(root_mean_squared_error(y_va, y_pred))

        rmse_mean = np.mean(rmses)
        mlflow.log_metric('rmse', rmse_mean)
        return rmse_mean


In [28]:
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(
        n_startup_trials=20,
        n_warmup_steps=10,
        interval_steps=5
    )
)

mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('DSWA - Competicao3')
with mlflow.start_run(run_name='NGBoost optimization'):
    study.optimize(objective, n_trials=50)
    
    print("Melhores hiperparâmetros:")
    print(study.best_params)
    print(f"Melhor RMSE: {study.best_value:.4f}")

    # Loga os melhores parâmetros e métrica no MLflow
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_rmse", study.best_value)


[I 2025-07-28 15:19:42,002] A new study created in memory with name: no-name-7a9fe583-7989-479f-bbce-448970d885b7
[I 2025-07-28 15:22:02,494] Trial 0 finished with value: 0.5292486240232609 and parameters: {'n_estimators': 218, 'learning_rate': 0.2536999076681772, 'minibatch_frac': 0.8659969709057025, 'col_sample': 0.7993292420985183, 'tol': 2.0513382630874486e-05}. Best is trial 0 with value: 0.5292486240232609.


🏃 View run Trial-0 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/ecad738fe99c4e07a8d4b7ae8762e4b7
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 15:23:19,954] Trial 1 finished with value: 0.5218989706677872 and parameters: {'n_estimators': 120, 'learning_rate': 0.012184186502221764, 'minibatch_frac': 0.9330880728874675, 'col_sample': 0.8005575058716043, 'tol': 0.0002607024758370766}. Best is trial 1 with value: 0.5218989706677872.


🏃 View run Trial-1 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/3521caaeff1d4c188353925a41169786
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 15:23:55,349] Trial 2 finished with value: 0.5329458347695042 and parameters: {'n_estimators': 59, 'learning_rate': 0.2708160864249968, 'minibatch_frac': 0.9162213204002109, 'col_sample': 0.6061695553391381, 'tol': 2.3102018878452926e-05}. Best is trial 1 with value: 0.5218989706677872.


🏃 View run Trial-2 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/78f9a6347a9840adbc8d3f473b796fe0
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 15:25:15,043] Trial 3 finished with value: 0.49106512749401343 and parameters: {'n_estimators': 132, 'learning_rate': 0.028145092716060652, 'minibatch_frac': 0.762378215816119, 'col_sample': 0.7159725093210578, 'tol': 3.8234752246751835e-05}. Best is trial 3 with value: 0.49106512749401343.


🏃 View run Trial-3 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/a307e46587b4485a8b1bf1a8de0b2b33
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[W 2025-07-28 15:26:38,707] Trial 4 failed with parameters: {'n_estimators': 325, 'learning_rate': 0.01607123851203988, 'minibatch_frac': 0.6460723242676091, 'col_sample': 0.6831809216468459, 'tol': 8.168455894760161e-05} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\Guilherme\AppData\Local\Programs\Python\Python311\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\Guilherme\AppData\Local\Temp\ipykernel_34212\38541763.py", line 43, in objective
    model.fit(X_tr, y_tr)           # sem verbose aqui
    ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Guilherme\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Guilherme\AppData\Local\Programs\Python\Python311\Lib\site-packa

🏃 View run Trial-4 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/2f67b91ee94a4d9b930e880d48dea949
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732
🏃 View run NGBoost optimization at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/a61f73a317d84e1c8bfef3e2ee1a5098
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


KeyboardInterrupt: 