In [1]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np
import mlflow

In [2]:
train = pd.read_csv('../data/train.csv')

In [3]:
X = train.drop(columns=[
    'ID', 'target1', 'target2', 'target3', 'target4', 'target5',
    'target6', 'target7', 'target8', 'target9', 'target10', 'target11'
])

y = train.drop(columns=[
    'ID', 'feature2', 'feature3', 'feature4', 'feature5', 'feature6',
    'feature7', 'feature8', 'feature9', 'feature10', 'feature11',
    'feature12', 'feature13', 'feature14', 'feature15'
])


In [4]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Hyper-param optmization

In [5]:
import optuna
from catboost import CatBoostRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import cross_val_score, KFold
import mlflow
import numpy as np
import warnings
from sklearn.metrics import root_mean_squared_error


warnings.filterwarnings('ignore', message='.*Falling back to prediction using DMatrix.*')

def objective(trial):
    with mlflow.start_run(run_name=f'Trial - {trial.number}', nested=True):
        # Hiperparâmetros a serem tunados
        params = {
            'iterations': trial.suggest_int('iterations', 500, 5000),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
            'depth': trial.suggest_int('depth', 4, 12),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
            'random_strength': trial.suggest_float('random_strength', 0.0, 1.0),
            'border_count': trial.suggest_int('border_count', 32, 255),
            'task_type': 'GPU',            # usa GPU se disponível
            'logging_level': 'Silent',    # sem log no terminal
            'random_seed': 42,
            'devices': '0',
        }

        mlflow.log_params(params)

        # CatBoostRegressor não suporta multi-target nativo, então usamos wrapper
        base_model = CatBoostRegressor(**params)
        model = MultiOutputRegressor(base_model, n_jobs=1)


        kf = KFold(n_splits=10, shuffle=True, random_state=42)
        fold_rmse = []

        for fold, (train_index, val_index) in enumerate(kf.split(X), start=1):
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]

            model.fit(X_train, y_train)

            y_pred = model.predict(X_val)
            rmse = root_mean_squared_error(y_val, y_pred)
            fold_rmse.append(rmse)

        rmse = np.mean(fold_rmse)
        mlflow.log_metric("rmse", rmse)
        return rmse

# Para rodar:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=50)


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(
        n_startup_trials=20,
        n_warmup_steps=10,
        interval_steps=5
    )
)

mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('DSWA - Competicao3')
with mlflow.start_run(run_name='CatBoost optimization'):
    study.optimize(objective, n_trials=50)
    
    print("Melhores hiperparâmetros:")
    print(study.best_params)
    print(f"Melhor RMSE: {study.best_value:.4f}")

    # Loga os melhores parâmetros e métrica no MLflow
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_rmse", study.best_value)


[I 2025-07-27 23:49:34,143] A new study created in memory with name: no-name-33a16e2a-4094-489e-a1e5-ce79a0bf4172
[I 2025-07-28 00:42:51,548] Trial 0 finished with value: 0.5278360300069197 and parameters: {'iterations': 2185, 'learning_rate': 0.22648248189516848, 'depth': 10, 'l2_leaf_reg': 0.24810409748678125, 'bagging_temperature': 0.15601864044243652, 'random_strength': 0.15599452033620265, 'border_count': 45}. Best is trial 0 with value: 0.5278360300069197.


🏃 View run Trial - 0 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/59c61e5ed4e44a118eaa7f4df8558576
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 03:37:05,989] Trial 1 finished with value: 0.5077707698333646 and parameters: {'iterations': 4398, 'learning_rate': 0.030834348179355788, 'depth': 10, 'l2_leaf_reg': 0.0012087541473056963, 'bagging_temperature': 0.9699098521619943, 'random_strength': 0.8324426408004217, 'border_count': 79}. Best is trial 1 with value: 0.5077707698333646.


🏃 View run Trial - 1 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/b82a53ac84f2486f883258a3e9ee304c
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 03:59:13,383] Trial 2 finished with value: 0.5575205265466364 and parameters: {'iterations': 1318, 'learning_rate': 0.002846526357761094, 'depth': 6, 'l2_leaf_reg': 0.12561043700013558, 'bagging_temperature': 0.43194501864211576, 'random_strength': 0.2912291401980419, 'border_count': 169}. Best is trial 1 with value: 0.5077707698333646.


🏃 View run Trial - 2 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/be37514d850e43188fa8c055e4ca2ad2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 04:22:16,437] Trial 3 finished with value: 0.5431111352547316 and parameters: {'iterations': 1127, 'learning_rate': 0.005292705365436975, 'depth': 7, 'l2_leaf_reg': 0.06672367170464207, 'bagging_temperature': 0.7851759613930136, 'random_strength': 0.19967378215835974, 'border_count': 147}. Best is trial 1 with value: 0.5077707698333646.


🏃 View run Trial - 3 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/7fdacc8762c847dd91b6644244b4eb3e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 06:49:20,072] Trial 4 finished with value: 0.5243420348956439 and parameters: {'iterations': 3166, 'learning_rate': 0.0013033567475147442, 'depth': 9, 'l2_leaf_reg': 0.004809461967501573, 'bagging_temperature': 0.06505159298527952, 'random_strength': 0.9488855372533332, 'border_count': 248}. Best is trial 1 with value: 0.5077707698333646.


🏃 View run Trial - 4 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/4ca52ea266f34846a6898d26af363012
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 07:41:32,065] Trial 5 finished with value: 0.5513704242113059 and parameters: {'iterations': 4138, 'learning_rate': 0.0056828375585122656, 'depth': 4, 'l2_leaf_reg': 0.5456725485601477, 'bagging_temperature': 0.4401524937396013, 'random_strength': 0.12203823484477883, 'border_count': 142}. Best is trial 1 with value: 0.5077707698333646.


🏃 View run Trial - 5 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/f54b7e89b8b140ec97c5b131eeb0f389
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 07:51:32,093] Trial 6 finished with value: 0.5524862920682476 and parameters: {'iterations': 654, 'learning_rate': 0.1788532743297921, 'depth': 6, 'l2_leaf_reg': 0.4467752817973907, 'bagging_temperature': 0.31171107608941095, 'random_strength': 0.5200680211778108, 'border_count': 154}. Best is trial 1 with value: 0.5077707698333646.


🏃 View run Trial - 6 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/9ecffac8844644d9a757bd0fad7c7480
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 09:46:47,981] Trial 7 finished with value: 0.5195966174329069 and parameters: {'iterations': 1332, 'learning_rate': 0.25221951700214285, 'depth': 10, 'l2_leaf_reg': 5.727904470799623, 'bagging_temperature': 0.8948273504276488, 'random_strength': 0.5978999788110851, 'border_count': 238}. Best is trial 1 with value: 0.5077707698333646.


🏃 View run Trial - 7 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/1bdf1b0c91e64c7da2680f2577dd3276
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 09:58:15,931] Trial 8 finished with value: 0.5733723730628851 and parameters: {'iterations': 898, 'learning_rate': 0.0030582523213789677, 'depth': 4, 'l2_leaf_reg': 0.02001342062287998, 'bagging_temperature': 0.388677289689482, 'random_strength': 0.2713490317738959, 'border_count': 217}. Best is trial 1 with value: 0.5077707698333646.


🏃 View run Trial - 8 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/cb14e3cbd6f84797ad4b5d0b3378373e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[I 2025-07-28 11:02:03,676] Trial 9 finished with value: 0.5178050980031159 and parameters: {'iterations': 2105, 'learning_rate': 0.0049648810171066555, 'depth': 8, 'l2_leaf_reg': 0.0036618192203924276, 'bagging_temperature': 0.8021969807540397, 'random_strength': 0.07455064367977082, 'border_count': 253}. Best is trial 1 with value: 0.5077707698333646.


🏃 View run Trial - 9 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/68522b309e554af58f9506e639446398
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


[W 2025-07-28 13:39:24,103] Trial 10 failed with parameters: {'iterations': 4905, 'learning_rate': 0.04200891212415249, 'depth': 12, 'l2_leaf_reg': 0.0011799062523159302, 'bagging_temperature': 0.6518665507781786, 'random_strength': 0.907664795282518, 'border_count': 50} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "c:\Users\Guilherme\AppData\Local\Programs\Python\Python311\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\Guilherme\AppData\Local\Temp\ipykernel_35824\4031322228.py", line 44, in objective
    model.fit(X_train, y_train)
  File "c:\Users\Guilherme\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Guilherme\AppData\Local\Programs\Python\Python311\Lib\site-

🏃 View run Trial - 10 at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/e89544a3bac345b2bd52476e5b68bd3a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732
🏃 View run CatBoost optimization at: http://127.0.0.1:5000/#/experiments/962208812792484732/runs/2edd6129ad8e4fff97a17706c8e2d35c
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/962208812792484732


KeyboardInterrupt: 