In [6]:
import pandas as pd
import torch
from lightning.pytorch import Trainer
from pytorch_forecasting import TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import NaNLabelEncoder
from pytorch_forecasting.metrics import QuantileLoss
from pytorch_forecasting.models import TemporalFusionTransformer
from pytorch_forecasting.data import GroupNormalizer
from torch.utils.data import DataLoader
import pytorch_lightning as pl
import mlflow
import mlflow.pytorch
from mlflow.models import infer_signature
import numpy as np
from sklearn.metrics import mean_absolute_error

In [10]:
df = pd.read_csv("weekly_demand.csv")
df.head()
df["semana_inicio"] = pd.to_datetime(df["semana_inicio"])
df["time_idx"] = df["semana_inicio"].rank(method="dense").astype("int") - 1
df["platillo_id"] = df["platillo_id"].astype(str)   

In [24]:
hp = dict(
    # datos
    batch_size            = 64,
    max_encoder_length    = 16,
    max_prediction_length = 4,
    # modelo TFT
    learning_rate         = 1e-3,
    hidden_size           = 16,
    lstm_layers           = 1,
    attention_head_size   = 1,
    hidden_cont_size      = 8,   # tamaño de capas para variables continuas
    dropout               = 0.1,
    loss_fn               = "QuantileLoss",
    # entrenamiento
    max_epochs            = 30,
    gradient_clip_val     = 0.1,
    reduce_patience       = 4,
)

quantiles = [0.1, 0.5, 0.9]

training_cutoff = df["time_idx"].max() - hp["max_prediction_length"]
train_df = df[df.time_idx <= training_cutoff].copy()
val_df   = df[df.time_idx >= training_cutoff - hp["max_encoder_length"]].copy()

common = dict(
    time_idx="time_idx",
    target="cantidad",
    group_ids=["platillo_id"],
    max_encoder_length=hp["max_encoder_length"],
    max_prediction_length=hp["max_prediction_length"],
    time_varying_known_reals=[
        "time_idx", "mes", "semana_ano", "trimestre", "ano",
        "dia_festivo", "ocupacion"
    ],
    time_varying_unknown_reals=["cantidad", "lag_1", "lag_2"],
    target_normalizer=GroupNormalizer(groups=["platillo_id"]),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    allow_missing_timesteps=True,
)

training_ds   = TimeSeriesDataSet(train_df, **common)
validation_ds = TimeSeriesDataSet.from_dataset(training_ds, val_df, stop_randomization=True)

train_loader = training_ds.to_dataloader(train=True , batch_size=hp["batch_size"], num_workers=0)
val_loader   = validation_ds.to_dataloader(train=False, batch_size=hp["batch_size"], num_workers=0)

tft = TemporalFusionTransformer.from_dataset(
    training_ds,
    learning_rate        = hp["learning_rate"],
    hidden_size          = hp["hidden_size"],
    lstm_layers          = hp["lstm_layers"],
    attention_head_size  = hp["attention_head_size"],
    hidden_continuous_size = hp["hidden_cont_size"],
    dropout              = hp["dropout"],
    loss                 = QuantileLoss(),
    log_interval         = 10,
    reduce_on_plateau_patience = hp["reduce_patience"],
)

trainer = Trainer(
    max_epochs         = hp["max_epochs"],
    gradient_clip_val  = hp["gradient_clip_val"],
    logger             = False,        # usamos MLflow autolog
    deterministic      = True,
)

mlflow.set_tracking_uri("http://localhost:5001")
mlflow.set_experiment("Oumaji_Demand")
mlflow.pytorch.autolog(log_every_n_epoch=1, log_models=False)

with mlflow.start_run():
    # ---------- PARAMS ----------
    mlflow.log_params({f"data.{k}": v for k, v in hp.items()})
    # ---------- ENTRENAMIENTO ----
    trainer.fit(tft, train_dataloaders=train_loader, val_dataloaders=val_loader)
    # ---------- VALIDACIÓN ----
    pred_q, row_idx, *_ = tft.predict(
    val_loader,
    mode="quantiles",
    mode_kwargs={"quantiles": quantiles},
    return_index=True,
    return_x=True,  
    return_y=False,
    return_decoder_lengths=False,
)

    # —— reconstruir DataFrame ——
    time_map = df[["time_idx", "semana_inicio"]].drop_duplicates().set_index("time_idx")["semana_inicio"]
    dec_idx  = row_idx["decoder_time_idx"].flatten().cpu().numpy().astype(int)
    cat_codes = row_idx["groups"][..., 0].flatten().cpu().numpy()
    horiz     = pred_q.shape[1]
    pred_np   = pred_q.cpu().numpy().reshape(-1, pred_q.shape[-1])
    cat_rep   = np.repeat(cat_codes, horiz)
    code_to_id = dict(enumerate(df["platillo_id"].astype("category").cat.categories))
    plat_ids   = [code_to_id[c] for c in cat_rep]

    val_out = pd.DataFrame({
    "platillo_id": plat_ids,
    "semana_pred": time_map.loc[dec_idx].values,
    "p10": np.round(pred_np[:, 0], 2),
    "p50": np.round(pred_np[:, 1], 2),
    "p90": np.round(pred_np[:, 2], 2),
    })
    
    y_true = (
        df.set_index(["platillo_id", "time_idx"])
          .loc[list(zip(val_out["platillo_id"], dec_idx)), "cantidad"]
          .values
    )
    val_out["cantidad"] = y_true

    mae = mean_absolute_error(val_out["cantidad"], val_out["p50"])
    coverage = ((val_out["cantidad"] >= val_out["p10"]) &
                (val_out["cantidad"] <= val_out["p90"])).mean()

    # ---------- MÉTRICAS ----------
    mlflow.log_metrics({
        "MAE_holdout": mae,
        "p90_coverage": coverage,
    })

    # ---------------- SIGNATURE + MODELO ----------------
    x_batch, _ = next(iter(val_loader))
    x_example_np = {k: v[:1].cpu().numpy() for k, v in x_batch.items()}

    y_pred_np = np.zeros((1, hp["max_prediction_length"] * len(quantiles)), dtype=np.float32)

    signature = infer_signature(x_example_np, y_pred_np)

    mlflow.pytorch.log_model(
        pytorch_model=tft,
        artifact_path="model",
        signature=signature,
        input_example=x_example_np,
        registered_model_name="TFT",
    )



    # ---------- ARTIFACTOS -------
    ckpt = trainer.checkpoint_callback.best_model_path
    if ckpt:
        mlflow.log_artifact(ckpt, artifact_path="checkpoints")

    val_out.to_csv("val_out.csv", index=False)
    mlflow.log_artifact("val_out.csv")
    



/Users/axllopez/Desktop/OneDrive _Instituto Tecnologico_y_de_Estudios_Superiores_de_Monterrey/OCTAVO/Proyecto IDM/Oumaji_Final_Project/.venv/lib/python3.11/site-packages/lightning/pytorch/utilities/parsing.py:209: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
/Users/axllopez/Desktop/OneDrive _Instituto Tecnologico_y_de_Estudios_Superiores_de_Monterrey/OCTAVO/Proyecto IDM/Oumaji_Final_Project/.venv/lib/python3.11/site-packages/lightning/pytorch/utilities/parsing.py:209: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.
Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (mps), used: True
TPU 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/axllopez/Desktop/OneDrive _Instituto Tecnologico_y_de_Estudios_Superiores_de_Monterrey/OCTAVO/Proyecto IDM/Oumaji_Final_Project/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


                                                                           

/Users/axllopez/Desktop/OneDrive _Instituto Tecnologico_y_de_Estudios_Superiores_de_Monterrey/OCTAVO/Proyecto IDM/Oumaji_Final_Project/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 29: 100%|██████████| 49/49 [00:22<00:00,  2.15it/s, train_loss_step=7.420, val_loss=10.50, train_loss_epoch=7.220]

`Trainer.fit` stopped: `max_epochs=30` reached.


Epoch 29: 100%|██████████| 49/49 [00:23<00:00,  2.11it/s, train_loss_step=7.420, val_loss=10.50, train_loss_epoch=7.220]


Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/axllopez/Desktop/OneDrive _Instituto Tecnologico_y_de_Estudios_Superiores_de_Monterrey/OCTAVO/Proyecto IDM/Oumaji_Final_Project/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
  "inputs": {
    "encoder_cat": [
      [
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        []
      ]
    ],
    "encoder_cont": [
      [
        [
     

🏃 View run illustrious-asp-80 at: http://localhost:5001/#/experiments/400716610318719328/runs/b82175e0e93f4c9896edd58b19156386
🧪 View experiment at: http://localhost:5001/#/experiments/400716610318719328


In [22]:
exp_id = "710601173413351203"                 # tu experimento
df_runs = mlflow.search_runs(experiment_ids=[exp_id])

bad = df_runs[df_runs.filter(regex="metrics.").isna().any(axis=1)]
print(bad[['run_id'] + [c for c in bad.columns if c.startswith("metrics.")]])

                             run_id  metrics.MPE_holdout  metrics.val_RMSE  \
0  329315f89fa140659dfcfc1f75d78483                  NaN         32.543385   
1  190652c4253e470e9362deeec98e2160                  NaN         32.018600   
2  e0ec57062fac42fa9a5113ab83e6cdd6                  NaN         33.529804   
3  5110704bfbb84ff6833febf47fb102be                  NaN         31.776373   
4  96204246010d4ee290a76dc6e7344f5b                  NaN         31.068344   
5  fe8872652c464a0e80aa403f4bb67702                  NaN         33.158695   
6  724a93ea7865493481439186f3282889                  NaN         32.286457   
7  cc9e1af8b74c481ea1b45a5329514d3b                  NaN         33.020603   

   metrics.val_loss  metrics.p90_coverage  metrics.val_SMAPE  metrics.val_MAE  \
0         10.487294              0.201613           0.470246        16.831039   
1         10.373519              0.198925           0.457364        16.294785   
2         10.451321              0.196237           0.