In [None]:
!pip install optuna



In [2]:
import pandas as pd
# Sanity check: confirm the blind-test CSV can be read from the Colab filesystem
# and display its first rows as the cell output.
pd.read_csv('/content/blind_test_data.csv').head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19
0,676.867615,32.518822,254.825875,502.26851,609.469688,497.624266,105.246239,269.045539,150.177005,312.64986,765.296227,0.237996,660.030637,147.059794,40.232132,464.424834,121.147466,68.284243,26.96987,314.461582
1,628.695228,426.163933,347.07028,431.106903,915.527507,301.699534,1.666992,306.733041,104.234252,63.24207,467.009734,6.608084,600.994184,43.619815,48.153926,457.256565,49.163652,85.511662,33.500538,819.537877
2,131.765943,323.839669,245.399775,181.814398,710.179159,59.117377,312.622788,687.965027,109.803179,381.1695,700.532108,1.82237,736.306092,138.759029,36.915389,436.174065,10.037994,62.631938,6.211169,341.361374
3,160.970195,489.712029,70.482159,309.486269,888.030604,412.655666,216.124989,47.415477,104.139145,326.462385,378.446187,1.686895,485.144327,143.668518,27.168148,309.715497,149.661493,66.415878,15.001753,539.087409
4,419.907137,216.625219,487.88786,253.704462,323.226862,65.744463,271.811469,527.726782,129.805782,168.429679,637.944633,0.948507,365.946758,72.337904,36.232169,302.772338,186.944884,106.514846,3.443809,364.341969


In [4]:
import warnings, time, json
warnings.filterwarnings("ignore")

import numpy as np, pandas as pd, joblib
from pathlib import Path
from datetime import datetime, timezone

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from xgboost import XGBRegressor
import optuna
from optuna.samplers import TPESampler

# ----------------- Config variables -----------------
# Input locations (Colab working directory) and output artefacts.
DATA_DIR        = Path("/content")
TRAIN_PATH      = DATA_DIR / "training_data.csv"      # read below; must contain "feature_*" columns and "target"
BLIND_PATH      = DATA_DIR / "blind_test_data.csv"    # optional: predictions are only written if this file exists
MODEL_DIR       = Path("model"); MODEL_DIR.mkdir(exist_ok=True, parents=True)  # created eagerly at import/config time
MODEL_OUT       = MODEL_DIR / "model.pkl"             # joblib dump of {"model", "estimator", "features"}
MANIFEST_OUT    = MODEL_DIR / "manifest.json"         # written by the manifest cell
PRED_OUT        = Path("target_pred.csv")             # blind-set predictions (relative to the current working directory)

SEED            = 777   # shared seed: KFold splits, MI ranking, Optuna TPE samplers and the tree models
N_OUTER         = 4     # outer folds (performance estimation)
N_INNER         = 3     # inner folds (hyper-parameter tuning inside each outer-train split)

# Optuna budget. Each (outer fold, model) pair gets its own study, so both the trial count
# and the timeout below apply per study, not to the whole run.
N_TRIALS_HGBR   = 30
N_TRIALS_XGB    = 32
N_TRIALS_RF     = 32
N_TRIALS_ENET   = 32
TIMEOUT_SECS    = 420
# NOTE(review): the objectives visible in this notebook never call trial.report()/should_prune(),
# so this pruner presumably has no effect on the studies — confirm before relying on it.
PRUNER          = optuna.pruners.MedianPruner(n_startup_trials=4)

# XGBoost
XGB_TREE_METHOD = "hist"
XGB_N_JOBS      = 1

# Feature selection
CORR_TH         = 0.98                            # |correlation| at or above which the later column of a pair is dropped
KEEP_RATIOS     = [1.0, 0.75, 0.5, 0.35, 0.25]    # candidate fractions of features kept by MI ranking (tuned by Optuna)
MIN_FEATS       = 8                               # lower bound on kept features; also the fallback threshold for the prefilter

# NOTE(review): `rng` is not referenced anywhere else in the visible code.
rng = np.random.default_rng(SEED)

# utils functions
def rmse(y, p):
    """Return the root-mean-squared error between targets ``y`` and predictions ``p`` as a Python float."""
    mse = mean_squared_error(y, p)
    return float(np.sqrt(mse))

def drop_constant_and_correlated(df_tr: pd.DataFrame, corr_th=0.99) -> list:
    """Return the column names that survive a constant / near-duplicate filter.

    Two passes are applied, in this order:

    1. Columns whose standard deviation is not strictly positive are removed
       (this also removes columns whose std is NaN).
    2. Among the remaining columns, a column is removed when its absolute
       correlation with *any earlier column* (in column order) is >= ``corr_th``.
       The first column of a correlated group is therefore the one that is kept.

    Parameters
    ----------
    df_tr : pd.DataFrame
        Training features; only this frame is used to decide what to drop.
    corr_th : float, default 0.99
        Absolute-correlation threshold for the second pass.

    Returns
    -------
    list
        Surviving column names, in their original order.
    """
    # Pass 1: drop constant columns.
    non_constant = []
    for name in df_tr.columns:
        if df_tr[name].std(skipna=True) > 0.0:
            non_constant.append(name)
    df = df_tr[non_constant]

    # Pass 2: drop collinear columns. Only the strict upper triangle is inspected so
    # each pair is looked at once and a column is only compared with the ones before it.
    abs_corr = df.corr(numeric_only=True).abs()
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    upper = abs_corr.where(above_diagonal)
    is_redundant = (upper >= corr_th).any(axis=0).to_numpy()
    redundant = set(upper.columns[is_redundant])

    return [name for name in df.columns if name not in redundant]

def mi_topk(X: np.ndarray, y: np.ndarray, cols: list, keep_ratio: float, min_feats=MIN_FEATS) -> list:
    """
    Select features by mutual information (MI) with the target.

    NaNs in a copy of ``X`` are replaced by the per-column median before scoring
    (``X`` itself is left untouched). Features are then ranked by decreasing MI and the
    top ``k`` names are returned, where ``k = ceil(len(cols) * keep_ratio)`` raised to at
    least ``min_feats`` and capped at ``len(cols)``.

    ``cols[i]`` is taken to be the name of column ``i`` of ``X``.
    """
    X_filled = X.copy()
    col_medians = np.nanmedian(X_filled, axis=0)

    # Median-impute missing cells so MI can be computed.
    nan_pos = np.nonzero(np.isnan(X_filled))
    if nan_pos[0].size > 0:
        X_filled[nan_pos] = col_medians[nan_pos[1]]

    scores = mutual_info_regression(X_filled, y, random_state=SEED)
    ranking = np.argsort(scores)[::-1]  # best feature first

    n_keep = int(np.ceil(len(cols) * keep_ratio))
    n_keep = min(max(min_feats, n_keep), len(cols))
    return [cols[j] for j in ranking[:n_keep]]

def suggest_params_hgbr(trial):
    """Sample one HistGradientBoostingRegressor configuration from an Optuna ``trial``.

    Returns a dict of estimator keyword arguments plus a ``keep_ratio`` entry, which is
    not an estimator argument: it is the fraction of features kept by MI selection.
    The suggestions are requested in a fixed order so seeded samplers stay reproducible.
    """
    space = {}
    space["max_iter"] = trial.suggest_int("max_iter", 300, 1000)
    space["learning_rate"] = trial.suggest_float("learning_rate", 1e-3, 2e-1, log=True)
    space["max_leaf_nodes"] = trial.suggest_int("max_leaf_nodes", 31, 255)
    space["max_depth"] = trial.suggest_categorical("max_depth", [None, 6, 8, 10, 12])
    space["min_samples_leaf"] = trial.suggest_int("min_samples_leaf", 5, 30)
    space["l2_regularization"] = trial.suggest_float("l2_regularization", 1e-6, 1e-1, log=True)
    space["max_bins"] = trial.suggest_categorical("max_bins", [128, 255])
    space["keep_ratio"] = trial.suggest_categorical("keep_ratio", KEEP_RATIOS)
    return space

def suggest_params_xgb(trial):
    """Sample one XGBRegressor configuration from an Optuna ``trial``.

    Returns a dict of estimator keyword arguments plus a ``keep_ratio`` entry, which is
    not an estimator argument: it is the fraction of features kept by MI selection.
    The suggestions are requested in a fixed order so seeded samplers stay reproducible.
    """
    space = {}
    space["n_estimators"] = trial.suggest_int("n_estimators", 400, 1600)
    space["learning_rate"] = trial.suggest_float("learning_rate", 1e-3, 2e-1, log=True)
    space["max_depth"] = trial.suggest_int("max_depth", 3, 7)
    space["subsample"] = trial.suggest_float("subsample", 0.7, 1.0)
    space["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.7, 1.0)
    space["reg_lambda"] = trial.suggest_float("reg_lambda", 1e-3, 3.0, log=True)
    space["reg_alpha"] = trial.suggest_float("reg_alpha", 1e-6, 1e-1, log=True)
    space["keep_ratio"] = trial.suggest_categorical("keep_ratio", KEEP_RATIOS)
    return space

def suggest_params_rf(trial):
    """Sample one RandomForestRegressor configuration from an Optuna ``trial``.

    Returns a dict of estimator keyword arguments plus a ``keep_ratio`` entry, which is
    not an estimator argument: it is the fraction of features kept by MI selection.
    The suggestions are requested in a fixed order so seeded samplers stay reproducible.
    """
    space = {}
    space["n_estimators"] = trial.suggest_int("n_estimators", 300, 900)
    space["max_depth"] = trial.suggest_categorical("max_depth", [None, 6, 8, 10, 12, 14])
    space["min_samples_leaf"] = trial.suggest_int("min_samples_leaf", 1, 12)
    space["max_features"] = trial.suggest_float("max_features", 0.4, 1.0)
    space["bootstrap"] = trial.suggest_categorical("bootstrap", [True, False])
    space["keep_ratio"] = trial.suggest_categorical("keep_ratio", KEEP_RATIOS)
    return space

def suggest_params_enet(trial):
    """Sample one ElasticNet configuration from an Optuna ``trial``.

    Returns ``alpha``, ``l1_ratio`` and ``max_iter`` for the estimator plus a
    ``keep_ratio`` entry, which is the fraction of features kept by MI selection.
    The suggestions are requested in a fixed order so seeded samplers stay reproducible.
    """
    space = {}
    space["alpha"] = trial.suggest_float("alpha", 1e-4, 1e1, log=True)
    # Blend of both penalties: 0 = Ridge, 1 = Lasso (the pure extremes are excluded).
    space["l1_ratio"] = trial.suggest_float("l1_ratio", 0.05, 0.95)
    space["max_iter"] = trial.suggest_int("max_iter", 2000, 6000)
    space["keep_ratio"] = trial.suggest_categorical("keep_ratio", KEEP_RATIOS)
    return space

def _build_estimator(model_name, model_params):
    """Return an unfitted regressor for ``model_name`` configured with ``model_params``.

    ``model_params`` must not contain ``keep_ratio``. Raises ``ValueError`` for an
    unknown ``model_name``.
    """
    if model_name == "xgb":
        return XGBRegressor(random_state=SEED, tree_method=XGB_TREE_METHOD, n_jobs=XGB_N_JOBS,
                            eval_metric="rmse", **model_params)
    if model_name == "hgbr":
        return HistGradientBoostingRegressor(random_state=SEED, early_stopping=False, **model_params)
    if model_name == "rf":
        return RandomForestRegressor(random_state=SEED, n_jobs=1, **model_params)
    if model_name == "enet":
        # ElasticNet is scale-sensitive, so it is always wrapped with a StandardScaler.
        return Pipeline([
            ("scaler", StandardScaler(with_mean=True, with_std=True)),
            ("model", ElasticNet(alpha=model_params["alpha"], l1_ratio=model_params["l1_ratio"],
                                 max_iter=model_params["max_iter"], fit_intercept=True, random_state=None))
        ])
    raise ValueError("modelo no soportado")


def inner_cv_rmse(params, model_name, X_tr_full, y_tr_full, cols_prefiltered):
    """Mean validation RMSE of one hyper-parameter set over the inner K-fold split.

    For each of the ``N_INNER`` folds the MI feature selection is recomputed on that
    fold's training part only (so the validation part never influences which features
    are kept), the model is fitted on the selected columns and scored on the held-out part.

    Parameters
    ----------
    params : dict
        Estimator keyword arguments plus a mandatory ``"keep_ratio"`` entry.
        The dict is read but not modified.
    model_name : str
        One of ``"xgb"``, ``"hgbr"``, ``"rf"``, ``"enet"``.
    X_tr_full, y_tr_full : np.ndarray
        Outer-train features (already prefiltered) and targets.
    cols_prefiltered : list
        Column names matching the columns of ``X_tr_full``.

    Returns
    -------
    float
        Average RMSE across the inner folds.

    Raises
    ------
    KeyError
        If ``params`` has no ``"keep_ratio"`` entry.
    ValueError
        If ``model_name`` is not supported.
    """
    # Work on a copy: the caller's dict stays intact even if a fit raises midway.
    keep_ratio = params["keep_ratio"]
    model_params = {name: value for name, value in params.items() if name != "keep_ratio"}

    kf = KFold(n_splits=N_INNER, shuffle=True, random_state=SEED)
    scores = []

    for tri, vai in kf.split(X_tr_full):
        # Build the estimator first so an unsupported model fails before the costly MI step.
        mdl = _build_estimator(model_name, model_params)

        X_tr, X_va = X_tr_full[tri], X_tr_full[vai]
        y_tr, y_va = y_tr_full[tri], y_tr_full[vai]

        # MI selection fitted on the inner-train part only.
        cols_sel = mi_topk(X_tr, y_tr, cols_prefiltered, keep_ratio)
        sel_idx = [cols_prefiltered.index(c) for c in cols_sel]

        mdl.fit(X_tr[:, sel_idx], y_tr)
        yhat = mdl.predict(X_va[:, sel_idx])
        scores.append(rmse(y_va, yhat))

    return float(np.mean(scores))

def choose_params_of_best_fold(params_list, fold_scores):
    """Return the entry of ``params_list`` whose fold has the lowest score (first one on ties)."""
    best_pos = int(np.argmin(fold_scores))
    return params_list[best_pos]

def pct(x):
    """Format the fraction ``x`` as a percentage string with two decimals, e.g. ``0.1234 -> '12.34%'``."""
    return format(100 * x, ".2f") + "%"

# ----------------- Load data -----------------

# Read the training table; every column whose name starts with "feature_" is a predictor.
df = pd.read_csv(TRAIN_PATH)
feature_cols_all = [c for c in df.columns if c.startswith("feature_")]

# Features and target as float32 arrays (the "target" column must exist in the CSV).
X_all = df[feature_cols_all].to_numpy(dtype=np.float32)
y_all = df["target"].to_numpy(dtype=np.float32)

# ----------------- Nested CV with 4 models -----------------
# Outer splitter: each outer test fold is used only to score the model tuned on the rest.
outer = KFold(n_splits=N_OUTER, shuffle=True, random_state=SEED)

# Per-model accumulators (h=HGBR, x=XGB, r=RandomForest, e=ElasticNet):
#   oof_*                 out-of-fold predictions, one slot per training row
#   fold_rmse_*           outer-test RMSE of each fold
#   best_params_*_by_fold best Optuna parameters found in each fold (including keep_ratio)
#   sel_cols_*_by_fold    feature names selected in each fold
oof_h = np.zeros_like(y_all, dtype=float); fold_rmse_h=[]; best_params_h_by_fold=[]; sel_cols_h_by_fold=[]
oof_x = np.zeros_like(y_all, dtype=float); fold_rmse_x=[]; best_params_x_by_fold=[]; sel_cols_x_by_fold=[]
oof_r = np.zeros_like(y_all, dtype=float); fold_rmse_r=[]; best_params_r_by_fold=[]; sel_cols_r_by_fold=[]
oof_e = np.zeros_like(y_all, dtype=float); fold_rmse_e=[]; best_params_e_by_fold=[]; sel_cols_e_by_fold=[]

# NOTE(review): the "4x3" in this message is hard-coded and will not follow N_OUTER / N_INNER.
print(">>> Nested CV (4x3): feature selection and tuning per model")
t0 = time.time()

# Outer loop: for every outer fold, prefilter features on the outer-train part, then for each
# of the four model families (1) tune hyper-parameters + keep_ratio with an Optuna study scored
# by inner CV, (2) redo MI selection on the whole outer-train part with the winning keep_ratio,
# (3) refit on outer-train and (4) score on the untouched outer-test part.
for k,(tr_idx, te_idx) in enumerate(outer.split(X_all),1):
    X_tr0, X_te0 = X_all[tr_idx], X_all[te_idx]
    y_tr0, y_te0 = y_all[tr_idx], y_all[te_idx]

    # Filter constant / highly correlated columns, decided on the outer-train rows only.
    df_tr = pd.DataFrame(X_tr0, columns=feature_cols_all)
    cols_after_pref = drop_constant_and_correlated(df_tr, corr_th=CORR_TH)
    # Fallback: if the filter leaves fewer than MIN_FEATS columns, keep all of them.
    if len(cols_after_pref) < MIN_FEATS:
        cols_after_pref = feature_cols_all
    sel_idx_pref = [feature_cols_all.index(c) for c in cols_after_pref]
    X_tr_pref, X_te_pref = X_tr0[:, sel_idx_pref], X_te0[:, sel_idx_pref]

    # Fit for 1st model: HGBR
    # The objective closes over this fold's outer-train data; it is used right away by the study below.
    def obj_h(trial):
        params = suggest_params_hgbr(trial)
        return inner_cv_rmse(params, "hgbr", X_tr_pref, y_tr0, cols_after_pref)

    # A fresh TPESampler with the same seed is created for every study, so the first
    # suggested configurations repeat across folds and only their scores differ.
    st_h = optuna.create_study(direction="minimize", sampler=TPESampler(seed=SEED), pruner=PRUNER)
    st_h.optimize(obj_h, n_trials=N_TRIALS_HGBR, timeout=TIMEOUT_SECS, show_progress_bar=False)
    hp_h = st_h.best_params; best_params_h_by_fold.append(hp_h.copy())
    # Re-run MI selection on the full outer-train split with the tuned keep_ratio.
    cols_h = mi_topk(X_tr_pref, y_tr0, cols_after_pref, keep_ratio=hp_h["keep_ratio"])
    sel_cols_h_by_fold.append(cols_h)
    idx_h = [cols_after_pref.index(c) for c in cols_h]
    # keep_ratio is a selection knob, not an estimator argument, so it is stripped here.
    # (The comprehension's own `k` is local to it and does not overwrite the fold counter `k`.)
    h = HistGradientBoostingRegressor(random_state=SEED, early_stopping=False,
                                      **{k:v for k,v in hp_h.items() if k!="keep_ratio"})
    h.fit(X_tr_pref[:, idx_h], y_tr0)
    pred_h = h.predict(X_te_pref[:, idx_h])
    oof_h[te_idx] = pred_h; fold_rmse_h.append(rmse(y_te0, pred_h))

    # Fit for 2nd model: XGB (same tune -> select -> refit -> score sequence)
    def obj_x(trial):
        params = suggest_params_xgb(trial)
        return inner_cv_rmse(params, "xgb", X_tr_pref, y_tr0, cols_after_pref)

    st_x = optuna.create_study(direction="minimize", sampler=TPESampler(seed=SEED), pruner=PRUNER)
    st_x.optimize(obj_x, n_trials=N_TRIALS_XGB, timeout=TIMEOUT_SECS, show_progress_bar=False)
    hp_x = st_x.best_params; best_params_x_by_fold.append(hp_x.copy())
    cols_x = mi_topk(X_tr_pref, y_tr0, cols_after_pref, keep_ratio=hp_x["keep_ratio"])
    sel_cols_x_by_fold.append(cols_x)
    idx_x = [cols_after_pref.index(c) for c in cols_x]
    x = XGBRegressor(random_state=SEED, tree_method=XGB_TREE_METHOD, n_jobs=XGB_N_JOBS,
                     eval_metric="rmse",
                     **{k:v for k,v in hp_x.items() if k!="keep_ratio"})
    x.fit(X_tr_pref[:, idx_x], y_tr0)
    pred_x = x.predict(X_te_pref[:, idx_x])
    oof_x[te_idx] = pred_x; fold_rmse_x.append(rmse(y_te0, pred_x))

    # Fit for 3rd model: RandomForest (same sequence)
    def obj_r(trial):
        params = suggest_params_rf(trial)
        return inner_cv_rmse(params, "rf", X_tr_pref, y_tr0, cols_after_pref)

    st_r = optuna.create_study(direction="minimize", sampler=TPESampler(seed=SEED), pruner=PRUNER)
    st_r.optimize(obj_r, n_trials=N_TRIALS_RF, timeout=TIMEOUT_SECS, show_progress_bar=False)
    hp_r = st_r.best_params; best_params_r_by_fold.append(hp_r.copy())
    cols_r = mi_topk(X_tr_pref, y_tr0, cols_after_pref, keep_ratio=hp_r["keep_ratio"])
    sel_cols_r_by_fold.append(cols_r)
    idx_r = [cols_after_pref.index(c) for c in cols_r]
    r = RandomForestRegressor(random_state=SEED, n_jobs=1, **{k:v for k,v in hp_r.items() if k!="keep_ratio"})
    r.fit(X_tr_pref[:, idx_r], y_tr0)
    pred_r = r.predict(X_te_pref[:, idx_r])
    oof_r[te_idx] = pred_r; fold_rmse_r.append(rmse(y_te0, pred_r))

    # Fit for 4th model: ElasticNet with Scaler (same sequence; the scaler is fitted inside the pipeline)
    def obj_e(trial):
        params = suggest_params_enet(trial)
        return inner_cv_rmse(params, "enet", X_tr_pref, y_tr0, cols_after_pref)

    st_e = optuna.create_study(direction="minimize", sampler=TPESampler(seed=SEED), pruner=PRUNER)
    st_e.optimize(obj_e, n_trials=N_TRIALS_ENET, timeout=TIMEOUT_SECS, show_progress_bar=False)
    hp_e = st_e.best_params; best_params_e_by_fold.append(hp_e.copy())
    cols_e = mi_topk(X_tr_pref, y_tr0, cols_after_pref, keep_ratio=hp_e["keep_ratio"])
    sel_cols_e_by_fold.append(cols_e)
    idx_e = [cols_after_pref.index(c) for c in cols_e]
    e = Pipeline([
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        ("model", ElasticNet(alpha=hp_e["alpha"], l1_ratio=hp_e["l1_ratio"],
                             max_iter=hp_e["max_iter"], fit_intercept=True))
    ])
    e.fit(X_tr_pref[:, idx_e], y_tr0)
    pred_e = e.predict(X_te_pref[:, idx_e])
    oof_e[te_idx] = pred_e; fold_rmse_e.append(rmse(y_te0, pred_e))

    # Per-fold report: outer-test RMSE and number of selected features for each model.
    print(f"[Fold {k}/{N_OUTER}] RMSE HGBR={fold_rmse_h[-1]:.4f} (#{len(cols_h)}) | "
          f"XGB={fold_rmse_x[-1]:.4f} (#{len(cols_x)}) | "
          f"RF={fold_rmse_r[-1]:.4f} (#{len(cols_r)}) | "
          f"ENet={fold_rmse_e[-1]:.4f} (#{len(cols_e)})")

# Wall-clock seconds spent in the nested CV. NOTE(review): not used elsewhere in the visible code.
elapsed = time.time()-t0

# OOF per model
def oof_summary(name, oof_vec, fold_scores, sel_cols_list):
    """Summarise one model's nested-CV results.

    Returns a dict with:
      - ``name``: the label passed in;
      - ``oof_rmse``: RMSE of ``oof_vec`` against the global ``y_all``;
      - ``mean`` / ``std``: mean and (population, ``np.std``) standard deviation of ``fold_scores``;
      - ``median_feats``: median number of selected features per fold, truncated to int,
        or ``None`` when ``sel_cols_list`` is empty.
    """
    summary = {"name": name}
    summary["oof_rmse"] = rmse(y_all, oof_vec)
    summary["mean"] = float(np.mean(fold_scores))
    summary["std"] = float(np.std(fold_scores))
    if sel_cols_list:
        feat_counts = [len(cols) for cols in sel_cols_list]
        summary["median_feats"] = int(np.median(feat_counts))
    else:
        summary["median_feats"] = None
    return summary

# One summary dict per model family, built from the out-of-fold predictions and fold scores.
sum_h = oof_summary("HGBR", oof_h, fold_rmse_h, sel_cols_h_by_fold)
sum_x = oof_summary("XGB",  oof_x, fold_rmse_x, sel_cols_x_by_fold)
sum_r = oof_summary("RF",   oof_r, fold_rmse_r, sel_cols_r_by_fold)
sum_e = oof_summary("ENet", oof_e, fold_rmse_e, sel_cols_e_by_fold)

# Report: pooled OOF RMSE, mean ± std of the per-fold RMSEs and median feature count.
print("\n >>>>> OOF Summary <<<<<")
for s in [sum_h, sum_x, sum_r, sum_e]:
    print(f"{s['name']:5s} OOF RMSE: {s['oof_rmse']:.6f} | mean={s['mean']:.6f} ± {s['std']:.6f} | "
          f"median #feats={s['median_feats']}")

# Final model: provisional winner is the family with the lowest pooled OOF RMSE.
candidates = [sum_h, sum_x, sum_r, sum_e]
best = min(candidates, key=lambda d: d["oof_rmse"])

# Selection of parsimonious model in case of similar performance
def rel_gap(a, b):
    """Relative gap between two scores: ``|a - b| / min(a, b)``.

    When the smaller score is not strictly positive the ratio is undefined. In that case
    equal scores give ``0.0`` and different scores give ``inf``, so a perfect score (0) is
    never reported as "practically tied" with a worse one.
    """
    smaller = min(a, b)
    if smaller > 0:
        return abs(a - b) / smaller
    return 0.0 if a == b else float("inf")
# Runner-up by pooled OOF RMSE (index 1 of the ascending sort).
second = sorted(candidates, key=lambda d: d["oof_rmse"])[1]
# Tie-break: if winner and runner-up differ by less than 0.5% in relative terms,
# prefer whichever used fewer features (by per-fold median).
if rel_gap(best["oof_rmse"], second["oof_rmse"]) < 0.005:
    # Select the one with fewer features
    if best["median_feats"] is not None and second["median_feats"] is not None:
        if second["median_feats"] < best["median_feats"]:
            best = second

print("\n >>>>> FINAL MODEL <<<<<")
print(f"Selected: {best['name']} with OOF RMSE={best['oof_rmse']:.6f}")

# Fit and persist the final model.
# Step 1: repeat the constant/correlation prefilter, this time on ALL training rows,
# with the same MIN_FEATS fallback used inside the outer loop.
df_all = pd.DataFrame(X_all, columns=feature_cols_all)
cols_global_pref = drop_constant_and_correlated(df_all, corr_th=CORR_TH)
if len(cols_global_pref) < MIN_FEATS: cols_global_pref = feature_cols_all
idx_pref_all = [feature_cols_all.index(c) for c in cols_global_pref]
X_pref_all = X_all[:, idx_pref_all]

# Pick the hyper-parameters to reuse for the final model.
def choose_params(params_by_fold, fold_scores):
    """Return the parameter dict of the fold with the lowest score, or ``{}`` if there are none.

    NOTE(review): the scores passed in by this notebook are outer-test RMSEs, so the choice
    is made on the same folds that are reported as the performance estimate.
    """
    if not params_by_fold:
        return {}
    winner = int(np.argmin(fold_scores))
    return params_by_fold[winner]

# Hyper-parameters of the best-scoring outer fold, per model family ({} if no fold was recorded).
hp_h_final = choose_params(best_params_h_by_fold, fold_rmse_h)
hp_x_final = choose_params(best_params_x_by_fold, fold_rmse_x)
hp_r_final = choose_params(best_params_r_by_fold, fold_rmse_r)
hp_e_final = choose_params(best_params_e_by_fold, fold_rmse_e)

# Global MI selection on all training rows, using the keep_ratio of each model's winning fold
# (falls back to every prefiltered column when no parameters are available).
# All four lists are computed here, although only the selected model's list is used afterwards.
cols_h_final = mi_topk(X_pref_all, y_all, cols_global_pref, hp_h_final.get("keep_ratio", 1.0)) if hp_h_final else cols_global_pref
cols_x_final = mi_topk(X_pref_all, y_all, cols_global_pref, hp_x_final.get("keep_ratio", 1.0)) if hp_x_final else cols_global_pref
cols_r_final = mi_topk(X_pref_all, y_all, cols_global_pref, hp_r_final.get("keep_ratio", 1.0)) if hp_r_final else cols_global_pref
cols_e_final = mi_topk(X_pref_all, y_all, cols_global_pref, hp_e_final.get("keep_ratio", 1.0)) if hp_e_final else cols_global_pref

# Refit the selected model family on ALL training rows, restricted to its final feature list.
# Every branch produces the same three names used later: `estimator`, `features_final`
# and `model_dict` (the object that gets pickled).
chosen_name = best["name"]
if chosen_name == "HGBR":
    Xf = X_pref_all[:, [cols_global_pref.index(c) for c in cols_h_final]]
    # keep_ratio is not an estimator argument, so it is filtered out of the kwargs.
    estimator = HistGradientBoostingRegressor(random_state=SEED, early_stopping=False,
                                              **{k:v for k,v in hp_h_final.items() if k!="keep_ratio"})
    estimator.fit(Xf, y_all)
    features_final = cols_h_final
    model_dict = {"model":"HGBR","estimator":estimator,"features":features_final}

elif chosen_name == "XGB":
    Xf = X_pref_all[:, [cols_global_pref.index(c) for c in cols_x_final]]
    estimator = XGBRegressor(random_state=SEED, tree_method=XGB_TREE_METHOD, n_jobs=XGB_N_JOBS,
                             eval_metric="rmse",
                             **{k:v for k,v in hp_x_final.items() if k!="keep_ratio"})
    estimator.fit(Xf, y_all)
    features_final = cols_x_final
    model_dict = {"model":"XGB","estimator":estimator,"features":features_final}

elif chosen_name == "RF":
    Xf = X_pref_all[:, [cols_global_pref.index(c) for c in cols_r_final]]
    estimator = RandomForestRegressor(random_state=SEED, n_jobs=1,
                                      **{k:v for k,v in hp_r_final.items() if k!="keep_ratio"})
    estimator.fit(Xf, y_all)
    features_final = cols_r_final
    model_dict = {"model":"RF","estimator":estimator,"features":features_final}

else:  # Elastic Net (any name other than HGBR / XGB / RF lands here)
    Xf = X_pref_all[:, [cols_global_pref.index(c) for c in cols_e_final]]
    # The scaler is part of the pipeline, so it is persisted and reapplied at predict time.
    estimator = Pipeline([
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        ("model", ElasticNet(alpha=hp_e_final["alpha"], l1_ratio=hp_e_final["l1_ratio"],
                             max_iter=hp_e_final["max_iter"], fit_intercept=True))
    ])
    estimator.fit(Xf, y_all)
    features_final = cols_e_final
    model_dict = {"model":"ENet","estimator":estimator,"features":features_final}

# Persist the fitted estimator together with its model label and the ordered feature names
# it expects as input columns.
joblib.dump(model_dict, MODEL_OUT)
print(f"\n Saved: {MODEL_OUT.resolve()}")

# Blind-set predictions, only if the blind CSV exists.
if BLIND_PATH.exists():
    blind = pd.read_csv(BLIND_PATH)
    # Every feature the final model was trained on must be present in the blind file.
    miss = [c for c in features_final if c not in blind.columns]
    if miss:
        # Only warns (showing at most 8 names); no prediction file is written in this case.
        print(f"[WARN] Faltan columnas en blind: {miss[:8]}{'...' if len(miss)>8 else ''}")
    else:
        # Same column order and dtype (float32) as used for training.
        Xb = blind[features_final].to_numpy(dtype=np.float32)
        # Reload the artefact from disk so the persisted file itself is what gets exercised.
        m = joblib.load(MODEL_OUT)
        yb = m["estimator"].predict(Xb) if isinstance(m, dict) else m.predict(Xb)
        pd.DataFrame({"target_pred": yb}).to_csv(PRED_OUT, index=False)
        print(f"Predicciones: {PRED_OUT.resolve()}")



[I 2025-10-15 02:28:49,580] A new study created in memory with name: no-name-a7f0fb79-56ec-4c8a-80d7-779fe5512d97


>>> Nested CV (4x3): feature selection and tuning per model


[I 2025-10-15 02:28:54,217] Trial 0 finished with value: 2.9522488910438214 and parameters: {'max_iter': 407, 'learning_rate': 0.004962855477056064, 'max_leaf_nodes': 44, 'max_depth': 8, 'min_samples_leaf': 11, 'l2_regularization': 0.0016601466766433721, 'max_bins': 128, 'keep_ratio': 0.5}. Best is trial 0 with value: 2.9522488910438214.
[I 2025-10-15 02:28:57,319] Trial 1 finished with value: 2.444512528290961 and parameters: {'max_iter': 687, 'learning_rate': 0.004155804010335765, 'max_leaf_nodes': 114, 'max_depth': 12, 'min_samples_leaf': 27, 'l2_regularization': 0.0012932846459544705, 'max_bins': 128, 'keep_ratio': 1.0}. Best is trial 1 with value: 2.444512528290961.
[I 2025-10-15 02:28:59,320] Trial 2 finished with value: 3.700695309116584 and parameters: {'max_iter': 673, 'learning_rate': 0.0010730669610487867, 'max_leaf_nodes': 148, 'max_depth': None, 'min_samples_leaf': 23, 'l2_regularization': 0.00020772432462140656, 'max_bins': 128, 'keep_ratio': 0.35}. Best is trial 1 with v

[Fold 1/4] RMSE HGBR=2.7693 (#15) | XGB=2.0748 (#20) | RF=2.5999 (#20) | ENet=2.8950 (#20)


[I 2025-10-15 02:38:15,389] Trial 0 finished with value: 2.719064854006253 and parameters: {'max_iter': 407, 'learning_rate': 0.004962855477056064, 'max_leaf_nodes': 44, 'max_depth': 8, 'min_samples_leaf': 11, 'l2_regularization': 0.0016601466766433721, 'max_bins': 128, 'keep_ratio': 0.5}. Best is trial 0 with value: 2.719064854006253.
[I 2025-10-15 02:38:17,566] Trial 1 finished with value: 2.4869302730934186 and parameters: {'max_iter': 687, 'learning_rate': 0.004155804010335765, 'max_leaf_nodes': 114, 'max_depth': 12, 'min_samples_leaf': 27, 'l2_regularization': 0.0012932846459544705, 'max_bins': 128, 'keep_ratio': 1.0}. Best is trial 1 with value: 2.4869302730934186.
[I 2025-10-15 02:38:19,889] Trial 2 finished with value: 3.742966135548915 and parameters: {'max_iter': 673, 'learning_rate': 0.0010730669610487867, 'max_leaf_nodes': 148, 'max_depth': None, 'min_samples_leaf': 23, 'l2_regularization': 0.00020772432462140656, 'max_bins': 128, 'keep_ratio': 0.35}. Best is trial 1 with v

[Fold 2/4] RMSE HGBR=2.2099 (#15) | XGB=2.0916 (#20) | RF=2.7461 (#15) | ENet=3.2031 (#15)


[I 2025-10-15 02:46:35,457] Trial 0 finished with value: 2.948068788638865 and parameters: {'max_iter': 407, 'learning_rate': 0.004962855477056064, 'max_leaf_nodes': 44, 'max_depth': 8, 'min_samples_leaf': 11, 'l2_regularization': 0.0016601466766433721, 'max_bins': 128, 'keep_ratio': 0.5}. Best is trial 0 with value: 2.948068788638865.
[I 2025-10-15 02:46:37,735] Trial 1 finished with value: 2.5886901811453953 and parameters: {'max_iter': 687, 'learning_rate': 0.004155804010335765, 'max_leaf_nodes': 114, 'max_depth': 12, 'min_samples_leaf': 27, 'l2_regularization': 0.0012932846459544705, 'max_bins': 128, 'keep_ratio': 1.0}. Best is trial 1 with value: 2.5886901811453953.
[I 2025-10-15 02:46:39,739] Trial 2 finished with value: 3.7678978719065754 and parameters: {'max_iter': 673, 'learning_rate': 0.0010730669610487867, 'max_leaf_nodes': 148, 'max_depth': None, 'min_samples_leaf': 23, 'l2_regularization': 0.00020772432462140656, 'max_bins': 128, 'keep_ratio': 0.35}. Best is trial 1 with 

[Fold 3/4] RMSE HGBR=1.8228 (#20) | XGB=1.7280 (#20) | RF=2.3242 (#20) | ENet=2.4309 (#20)


[I 2025-10-15 02:56:01,229] Trial 0 finished with value: 2.814809540168507 and parameters: {'max_iter': 407, 'learning_rate': 0.004962855477056064, 'max_leaf_nodes': 44, 'max_depth': 8, 'min_samples_leaf': 11, 'l2_regularization': 0.0016601466766433721, 'max_bins': 128, 'keep_ratio': 0.5}. Best is trial 0 with value: 2.814809540168507.
[I 2025-10-15 02:56:03,689] Trial 1 finished with value: 2.4853761179740452 and parameters: {'max_iter': 687, 'learning_rate': 0.004155804010335765, 'max_leaf_nodes': 114, 'max_depth': 12, 'min_samples_leaf': 27, 'l2_regularization': 0.0012932846459544705, 'max_bins': 128, 'keep_ratio': 1.0}. Best is trial 1 with value: 2.4853761179740452.
[I 2025-10-15 02:56:06,282] Trial 2 finished with value: 3.7109237558025003 and parameters: {'max_iter': 673, 'learning_rate': 0.0010730669610487867, 'max_leaf_nodes': 148, 'max_depth': None, 'min_samples_leaf': 23, 'l2_regularization': 0.00020772432462140656, 'max_bins': 128, 'keep_ratio': 0.35}. Best is trial 1 with 

[Fold 4/4] RMSE HGBR=2.1630 (#15) | XGB=2.1934 (#15) | RF=2.5168 (#15) | ENet=2.7244 (#15)

 >>>>> OOF Summary <<<<<
HGBR  OOF RMSE: 2.266829 | mean=2.241262 ± 0.339502 | median #feats=15
XGB   OOF RMSE: 2.029570 | mean=2.021950 ± 0.175696 | median #feats=20
RF    OOF RMSE: 2.551320 | mean=2.546760 ± 0.152473 | median #feats=17
ENet  OOF RMSE: 2.827183 | mean=2.813322 ± 0.279609 | median #feats=17

 >>>>> FINAL MODEL <<<<<
Selected: XGB with OOF RMSE=2.029570

 Saved: /content/model/model.pkl
Predicciones: /content/target_pred.csv


In [None]:
# Generate the model manifest (manifest.json) describing the persisted model.
# Requires the training cell to have run: it reads chosen_name, features_final and best.

# Take ONE timestamp so "model_version" and "created_utc" always describe the same instant
# (two separate now() calls can fall on either side of a second boundary).
now_utc = datetime.now(timezone.utc)

manifest = {
  "manifest_version": "1.0",
  "model_version": now_utc.strftime("%Y%m%d-%H%M%S"),
  "model_type": chosen_name,
  "features": features_final,
  "metrics": {
    "oof_rmse": round(best["oof_rmse"], 6),
    "cv_scheme": f"nested {N_OUTER}x{N_INNER}",
    "notes": "OOF = Out-of-Fold RMSE (outer test)."
  },
  "training": {
    "created_utc": now_utc.strftime("%Y-%m-%dT%H:%M:%SZ"),
    "code_ref": "notebooks/multivariate_regression_training.ipynb",
    "seed": SEED,
    "optuna": {
      "sampler": "TPE",
      "trials": {"XGB": N_TRIALS_XGB, "HGBR": N_TRIALS_HGBR, "RF": N_TRIALS_RF, "ENet": N_TRIALS_ENET},
      "timeout_sec": TIMEOUT_SECS
    }
  }
}

# Serialise once so the file on disk and the printed copy are guaranteed to be identical.
manifest_json = json.dumps(manifest, indent=2)
MANIFEST_OUT.write_text(manifest_json, encoding="utf-8")
print("\n=== MODEL MANIFEST (copy-paste) ===")
print(manifest_json)

print("\n=== FEATURES SELECCIONADAS (final) ===")
print(f"Total: {len(features_final)}")
print(features_final[:50])
