In [7]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np
import mlflow

In [8]:
train = pd.read_csv('data/train.csv')

In [9]:
X = train.drop(columns=[
    'ID', 'target1', 'target2', 'target3', 'target4', 'target5',
    'target6', 'target7', 'target8', 'target9', 'target10', 'target11'
])

y = train.drop(columns=[
    'ID', 'feature2', 'feature3', 'feature4', 'feature5', 'feature6',
    'feature7', 'feature8', 'feature9', 'feature10', 'feature11',
    'feature12', 'feature13', 'feature14', 'feature15'
])


In [10]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Hyper-param optmization

In [11]:
import optuna
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np

import warnings
warnings.filterwarnings('ignore', message='.*Falling back to prediction using DMatrix.*')

def objective(trial):
    with mlflow.start_run(run_name=f'Trial - {trial.number}', nested=True):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
            'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
            'random_state': 42,
            'n_jobs': 1,
            'tree_method': 'hist',       # para usar GPU corretamente em XGBoost >= 2.0
            'device': 'cuda'             # ativa a GPU
        }

        mlflow.log_params(params)

        model = xgb.XGBRegressor(**params)

        rmse_scorer = make_scorer(mean_squared_error, greater_is_better=False, squared=False)

        kf = KFold(n_splits=10, shuffle=True, random_state=42)
        # Retorna o MSE médio negativo (Optuna minimiza)
        print("Ate aqui chega!\n")
        scores = cross_val_score(
            model, X, y,
            scoring= 'neg_root_mean_squared_error',
            cv=kf
        )
        print("Saiu do cross!\n")

        rmse = -scores.mean()

        mlflow.log_metric("rmse", rmse)

        return rmse


In [None]:
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(
        n_startup_trials=20,
        n_warmup_steps=10,
        interval_steps=5
    )
)

mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('DSWA - Competicao3')
with mlflow.start_run(run_name='XGBoost optimization'):
    study.optimize(objective, n_trials=50)
    
    print("Melhores hiperparâmetros:")
    print(study.best_params)
    print(f"Melhor RMSE: {study.best_value:.4f}")

    # Loga os melhores parâmetros e métrica no MLflow
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_rmse", study.best_value)


In [14]:
# Método 1: Usando trials_dataframe() [MAIS COMPLETO]
df_trials = study.trials_dataframe()
df_best = df_trials.nsmallest(10, 'value')  # 10 menores valores (minimize)
    
print("Método 1: DataFrame completo")
print("-" * 40)
for i, (idx, row) in enumerate(df_best.iterrows(), 1):
    print(f"#{i} - Trial {row['number']:3d} | RMSE: {row['value']:.6f}")
        # Mostra apenas alguns parâmetros principais
    params_str = " | ".join([f"{k.replace('params_', '')}: {v}" 
                            for k, v in row.items() 
                            if k.startswith('params_') and pd.notnull(v)][:3])
    print(f"     Params: {params_str}...")
    print()
    
print("\n" + "-"*60)

Método 1: DataFrame completo
----------------------------------------
#1 - Trial  45 | RMSE: 0.430616
     Params: gamma: 0.0017610181848471591 | learning_rate: 0.0176491247294085 | max_depth: 7...

#2 - Trial  31 | RMSE: 0.431177
     Params: gamma: 0.00280025701803661 | learning_rate: 0.010175958490020839 | max_depth: 7...

#3 - Trial  10 | RMSE: 0.431233
     Params: gamma: 0.0021620523265932262 | learning_rate: 0.010206070557576998 | max_depth: 7...

#4 - Trial  13 | RMSE: 0.431320
     Params: gamma: 0.0036889263417112447 | learning_rate: 0.01121012420535718 | max_depth: 6...

#5 - Trial  21 | RMSE: 0.431352
     Params: gamma: 0.002941833408211296 | learning_rate: 0.01073186868662129 | max_depth: 8...

#6 - Trial  11 | RMSE: 0.431765
     Params: gamma: 0.0018504237648254234 | learning_rate: 0.010233524192808546 | max_depth: 7...

#7 - Trial  12 | RMSE: 0.431959
     Params: gamma: 0.003050995871074158 | learning_rate: 0.010269786859910212 | max_depth: 7...

#8 - Trial  41 | RMSE