In [None]:
# === BLOQUE – Optimización de hiperparámetros XGBoost con Optuna ===

# 1) Instalar Optuna (si no lo tienes)
# %pip install optuna --quiet

import optuna
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
import os

# 2) Load and preprocess (same as in the previous cells)
BASE_OUTPUTS = r'C:\Users\Elisabeth\Desktop\MAESTRIA_AUSTRAL\Labo_III\labo3-2025v\entrega_final\output'
FEATURES_DIR = r'C:\Users\Elisabeth\Desktop\MAESTRIA_AUSTRAL\Labo_III\labo3-2025v\entrega_final\features'
os.makedirs(BASE_OUTPUTS, exist_ok=True)
df = pd.read_pickle(os.path.join(FEATURES_DIR, "dataset_features_product_id.pkl"))

# Build the target only when the pickle does not already carry one:
# `tn` two rows ahead within each product (i.e. two periods ahead, provided
# every product has exactly one row per period -- TODO confirm).
if 'target' not in df.columns:
    df = df.sort_values(['product_id','periodo']).reset_index(drop=True)
    df['target'] = df.groupby('product_id')['tn'].shift(-2)
# Rows without a known target cannot be used for training or evaluation.
df = df[df['target'].notna()].reset_index(drop=True)
# Train on log1p(target); negatives are clipped to 0 so the log is defined.
df['target_log'] = np.log1p(df['target'].clip(lower=0))

# Numeric features: every numeric column that is not an id, a raw label,
# or one of the explicitly excluded columns below.
exclude = ['share_total','product_id','n_customers','cat2','cat3',
           'period','periodo','prod_start','target','target_log',
           'tn','brand','cust_request_qty','cust_request_tn',
           'plan_precios_cuidados','sku_size']
features = [c for c in df.columns if c not in exclude and pd.api.types.is_numeric_dtype(df[c])]

# Temporal train/test split: the last two labelled periods are held out.
max_ord = df['period_ordinal'].max()
train = df[df['period_ordinal'] <= max_ord - 2]
# TimeSeriesSplit cuts folds by ROW ORDER, so the rows must be chronological.
# Without this sort the frame is ordered by product (or arbitrarily), and each
# "time series" fold would validate on other products from the same periods
# instead of on later periods. A stable sort keeps the within-period order.
train = train.sort_values('period_ordinal', kind='mergesort')
X_train = train[features]
y_train = train['target_log']

# 3) Definición de la función objetivo para Optuna
def objective(trial):
    """Optuna objective: mean cross-validated MAE of an XGBoost regressor.

    Samples a hyperparameter set from ``trial``, trains one booster per
    ``TimeSeriesSplit`` fold of the module-level ``X_train`` / ``y_train``
    (with early stopping on the fold's validation part) and scores it with
    MAE on the original target scale (predictions and labels are passed
    through ``expm1`` because the model is fit on ``log1p(target)``).

    Side effect: stores the rounded mean of the per-fold best round counts in
    the trial user attribute ``'best_num_boost_round'`` so a later refit can
    reuse it.

    Args:
        trial: the ``optuna`` trial used to sample hyperparameters.

    Returns:
        The mean MAE over the 5 folds (lower is better).
    """
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'mae',
        'tree_method': 'hist',
        'booster': 'gbtree',
        'max_depth': trial.suggest_int('max_depth', 8, 8),
        # suggest_float replaces the deprecated suggest_uniform/suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.06, log=True),
        'subsample': trial.suggest_float('subsample', 0.75, 0.85),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.9, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 8),
        # NOTE: no fixed 'eta' here. 'eta' is an alias of 'learning_rate' in
        # XGBoost, so hard-coding it would conflict with the tuned value above.
        'seed': 42,
        'verbosity': 0,
    }

    # Result of a previous run, kept for reference:
    #   best params: {'max_depth': 8, 'learning_rate': 0.057714246298605575,
    #                 'subsample': 0.7743395223702895,
    #                 'colsample_bytree': 0.9690773222034035,
    #                 'reg_alpha': 0.005001769829321724,
    #                 'reg_lambda': 9.046521832898346e-07, 'min_child_weight': 5}
    #   best CV MAE: 5.6827
    tss = TimeSeriesSplit(n_splits=5)
    maes = []
    best_rounds = []
    for train_idx, valid_idx in tss.split(X_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]
        dtr = xgb.DMatrix(X_tr, label=y_tr)
        dval = xgb.DMatrix(X_val, label=y_val)
        bst = xgb.train(
            params,
            dtr,
            num_boost_round=1000,
            # Early stopping watches the LAST entry of `evals`, i.e. 'valid'.
            evals=[(dtr, 'train'), (dval, 'valid')],
            early_stopping_rounds=50,
            verbose_eval=False
        )
        # xgb.train returns the booster from the last round, not the best one;
        # predict explicitly with the trees up to the best iteration.
        n_best = bst.best_iteration + 1
        best_rounds.append(n_best)
        pred = bst.predict(dval, iteration_range=(0, n_best))
        # Score on the original scale (inverse of log1p).
        maes.append(mean_absolute_error(np.expm1(y_val), np.expm1(pred)))

    # Heuristic round count for a refit on the full training set.
    trial.set_user_attr('best_num_boost_round', int(round(np.mean(best_rounds))))
    return np.mean(maes)

# 4) Run the search
# A seeded sampler makes the sequence of sampled hyperparameters reproducible.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=5, show_progress_bar=True)

print("🏆 Mejores parámetros:", study.best_params)
print(f"🏆 Mejor MAE CV: {study.best_value:.4f}")

# 5) Retrain on the whole training set with the best hyperparameters
best_params = study.best_params.copy()
# Add the fixed (non-tuned) settings. 'eta' is deliberately NOT set: it is an
# alias of 'learning_rate' and would conflict with / override the tuned value.
best_params.update({
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    'tree_method': 'hist',
    'booster': 'gbtree',
    'seed': 42,
    'verbosity': 0
})

# Number of boosting rounds: reuse the value recorded by the objective during
# CV when it is available, otherwise fall back to the fixed budget of 1000.
final_rounds = study.best_trial.user_attrs.get('best_num_boost_round', 1000)

dtrain_full = xgb.DMatrix(X_train, label=y_train)
# No early stopping here: the only eval set is the training data itself, whose
# MAE keeps decreasing, so early stopping would never be a meaningful signal.
# The eval set is kept purely for progress logging.
final_bst = xgb.train(
    best_params,
    dtrain_full,
    num_boost_round=final_rounds,
    evals=[(dtrain_full,'train')],
    verbose_eval=100
)

# 6) Save the model (optional)
final_bst.save_model(os.path.join(BASE_OUTPUTS, 'xgb_optuna_model.json'))
print("💾 Modelo optimizado guardado.")

[I 2025-07-20 15:50:30,815] A new study created in memory with name: no-name-a1145fd8-3b42-4c8b-82e5-5d53c6c6af9d


  0%|          | 0/5 [00:00<?, ?it/s]

  'learning_rate': trial.suggest_loguniform('learning_rate', 0.05, 0.06),
  'subsample': trial.suggest_uniform('subsample', 0.75, 0.85),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.9, 1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),


[I 2025-07-20 16:07:58,606] Trial 0 finished with value: 5.535149276256561 and parameters: {'max_depth': 8, 'learning_rate': 0.056679046261511316, 'subsample': 0.8222072371851699, 'colsample_bytree': 0.9126245241638805, 'reg_alpha': 3.225577034749856e-06, 'reg_lambda': 2.053015859352387e-05, 'min_child_weight': 7, 'eta': 0}. Best is trial 0 with value: 5.535149276256561.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.05, 0.06),
  'subsample': trial.suggest_uniform('subsample', 0.75, 0.85),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.9, 1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),


[I 2025-07-20 16:29:17,776] Trial 1 finished with value: 6.637826812267304 and parameters: {'max_depth': 8, 'learning_rate': 0.053366930141497924, 'subsample': 0.7593743881872311, 'colsample_bytree': 0.914454431569996, 'reg_alpha': 1.45823865134872e-06, 'reg_lambda': 0.30918447797807735, 'min_child_weight': 1, 'eta': 0}. Best is trial 0 with value: 5.535149276256561.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.05, 0.06),
  'subsample': trial.suggest_uniform('subsample', 0.75, 0.85),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.9, 1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),


[I 2025-07-20 16:48:44,881] Trial 2 finished with value: 6.427153921127319 and parameters: {'max_depth': 8, 'learning_rate': 0.057102599192204055, 'subsample': 0.7904486891465465, 'colsample_bytree': 0.9031032255423622, 'reg_alpha': 8.886481689890709e-07, 'reg_lambda': 0.0005241580606337788, 'min_child_weight': 2, 'eta': 0}. Best is trial 0 with value: 5.535149276256561.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.05, 0.06),
  'subsample': trial.suggest_uniform('subsample', 0.75, 0.85),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.9, 1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),


[I 2025-07-20 17:06:48,800] Trial 3 finished with value: 6.0008597612380985 and parameters: {'max_depth': 8, 'learning_rate': 0.05159204825406145, 'subsample': 0.7686009233917711, 'colsample_bytree': 0.9574989477990986, 'reg_alpha': 0.04110129567347137, 'reg_lambda': 1.1719595868836008e-07, 'min_child_weight': 4, 'eta': 0}. Best is trial 0 with value: 5.535149276256561.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.05, 0.06),
  'subsample': trial.suggest_uniform('subsample', 0.75, 0.85),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.9, 1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),


[I 2025-07-20 17:18:38,079] Trial 4 finished with value: 5.8850068807601925 and parameters: {'max_depth': 8, 'learning_rate': 0.052408135147789005, 'subsample': 0.7603358240296186, 'colsample_bytree': 0.9364004885300342, 'reg_alpha': 0.07478785924206888, 'reg_lambda': 0.009019493316585704, 'min_child_weight': 8, 'eta': 0}. Best is trial 0 with value: 5.535149276256561.
🏆 Mejores parámetros: {'max_depth': 8, 'learning_rate': 0.056679046261511316, 'subsample': 0.8222072371851699, 'colsample_bytree': 0.9126245241638805, 'reg_alpha': 3.225577034749856e-06, 'reg_lambda': 2.053015859352387e-05, 'min_child_weight': 7, 'eta': 0}
🏆 Mejor MAE CV: 5.5351
[0]	train-mae:1.39875
[100]	train-mae:0.14047
[200]	train-mae:0.11145
[300]	train-mae:0.09337
[400]	train-mae:0.07979
[500]	train-mae:0.06901
[600]	train-mae:0.06104
[700]	train-mae:0.05317
[800]	train-mae:0.04712
[900]	train-mae:0.04155
[999]	train-mae:0.03707
💾 Modelo optimizado guardado.


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
import os

# 1) Configure paths
BASE_OUTPUTS = r'C:\Users\Elisabeth\Desktop\MAESTRIA_AUSTRAL\Labo_III\labo3-2025v\entrega_final\output'
FEATURES_DIR = r'C:\Users\Elisabeth\Desktop\MAESTRIA_AUSTRAL\Labo_III\labo3-2025v\entrega_final\features'
MODEL_PATH   = os.path.join(BASE_OUTPUTS, 'xgb_optuna_model.json')
PRED_PATH    = os.path.join(BASE_OUTPUTS, 'pred_modelo_xgb_optuna.csv')

# 2) Load the feature DataFrame as before
df = pd.read_pickle(os.path.join(FEATURES_DIR, "dataset_features_product_id.pkl"))

# 3) Repeat exactly the same preprocessing and split used for training
if 'target' not in df.columns:
    df = df.sort_values(['product_id','periodo']).reset_index(drop=True)
    df['target'] = df.groupby('product_id')['tn'].shift(-2)

# Keep a handle on the unfiltered frame: the most recent periods have no
# target yet and are dropped below, but they are the rows to forecast from.
df_all = df

df = df[df['target'].notna()].reset_index(drop=True)
df['target_log'] = np.log1p(df['target'].clip(lower=0))

exclude = ['share_total','product_id','n_customers','cat2','cat3',
           'period','periodo','prod_start','target','target_log',
           'tn','brand','cust_request_qty','cust_request_tn',
           'plan_precios_cuidados','sku_size']
features = [c for c in df.columns if c not in exclude and pd.api.types.is_numeric_dtype(df[c])]

# Hold out the last two labelled periods as the test set.
max_ord = df['period_ordinal'].max()
train = df[df['period_ordinal'] <= max_ord - 2]
# .copy() so that adding the prediction column below writes to an independent
# frame instead of a slice of `df` (avoids pandas' SettingWithCopyWarning).
test  = df[df['period_ordinal'] >  max_ord - 2].copy()
X_test = test[features]

# 4) Load the optimized model
bst = xgb.Booster()
bst.load_model(MODEL_PATH)

# 5) Evaluation on TEST (predictions are mapped back from log1p scale)
dtest     = xgb.DMatrix(X_test)
pred_test = np.expm1(bst.predict(dtest))
test['tn_pred'] = pred_test

mae_test  = mean_absolute_error(test['target'], pred_test)
rmse_test = np.sqrt(mean_squared_error(test['target'], pred_test))
print(f"🧪 TEST MAE:  {mae_test:.3f}")
print(f"🧪 TEST RMSE: {rmse_test:.3f}")

# 6) Prediction from the LAST PERIOD ("out")
# Use the last period of the unfiltered data. `max_ord` above is computed
# after dropping rows without a target, so it points at the last *labelled*
# period, whose outcome is already known.
last_ord = df_all['period_ordinal'].max()
df_pred = df_all[df_all['period_ordinal'] == last_ord]
dpred   = xgb.DMatrix(df_pred[features])
out     = pd.DataFrame({
    'product_id': df_pred['product_id'],
    'tn_pred'   : np.expm1(bst.predict(dpred))
})

out.to_csv(PRED_PATH, index=False)
print(f"💾 Predicción final guardada en: {PRED_PATH}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['tn_pred'] = pred_test


🧪 TEST MAE:  9.438
🧪 TEST RMSE: 28.013
💾 Predicción final guardada en: C:\Users\Elisabeth\Desktop\MAESTRIA_AUSTRAL\Labo_III\labo3-2025v\entrega_final\output\pred_modelo_xgb_optunav4.csv
