In [9]:
import torch

# Backend priority: Apple MPS (must be both available and built) → CUDA → CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available() and torch.backends.mps.is_built()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(f"using device: {device}")

using device: mps


In [10]:
import os
from pathlib import Path

import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler, MaxAbsScaler

# ─── configuration ───────────────────────────────────────────────────────────
# Every path is derived from `root` exactly once, so pointing `root` somewhere
# else relocates all inputs consistently.
root           = Path(".")
forecast_dir   = root / "bmrs_csv_filled"      # holds df_all.csv and the *_FORECASTS.csv files
df_path        = forecast_dir / "df_all.csv"   # forecast_dir already includes `root`
mask_dir       = root / "bmrs_csv_masks"       # only checked for existence in this notebook

# Inclusive window used to slice every loaded frame.
date_start     = "2021-07-01"
date_end       = "2025-06-30"
# Split boundaries: train < train_end_date <= val < val_end_date <= test.
train_end_date = "2025-03-01"
val_end_date   = "2025-05-01"

horizon        = 48      # number of forecast-step columns read per forecast file
use_time_feat  = False   # whether to add trig-based time features

# ─── sanity checks & seeding ─────────────────────────────────────────────────
assert df_path.exists(), f"{df_path} not found"
for d in (forecast_dir, mask_dir):
    assert d.exists(), f"{d} not found"

# Seed every RNG the notebook touches so runs are repeatable.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
random.seed(seed)

In [11]:
# Load the master frame (indexed by startTime) and keep the configured window.
df = (
    pd.read_csv(df_path, index_col="startTime", parse_dates=True)
      .loc[date_start:date_end]
)
print(f"[debug] loaded df rows = {len(df)}")

# drop any forecast/actual cols
# (re-bind instead of inplace=True: `df` is a slice of the frame read above)
to_drop = [c for c in df.columns if "forecast" in c.lower() or "actual" in c.lower()]
df = df.drop(columns=to_drop)

# embed time features (zero-based integer indices)
df["month_idx"]   = df.index.month - 1
df["weekday_idx"] = df.index.dayofweek

# Locate the settlement-period column case-insensitively and pop it under its
# actual name; popping a hard-coded spelling raises KeyError when the casing differs.
sp_col = next((c for c in df.columns if c.lower() == "settlement period"), None)
if sp_col is not None:
    df["sp_idx"] = df.pop(sp_col).astype(int) - 1
else:
    df["sp_idx"] = 0
    print("[warn] 'settlement period' not found; sp_idx set to 0")
df["dtype_idx"]   = (df.index.dayofweek >= 5).astype(int)   # 1 on Sat/Sun, else 0

print(df[["month_idx","weekday_idx","sp_idx","dtype_idx"]].head())

def load_forecast_matrix(name, prefix, idx, horizon):
    """Load one forecast CSV as a dense (len(idx), horizon) matrix.

    Reads ``forecast_dir / f"{name}.csv"``, restricts it to
    ``date_start:date_end``, aligns its rows to ``idx`` and selects the
    columns ``{prefix}_f1 … {prefix}_f{horizon}``.

    Args:
        name: CSV file stem inside ``forecast_dir``.
        prefix: column prefix of the forecast-step columns.
        idx: index the rows are re-aligned to; rows absent from the file
            become zeros.
        horizon: number of forecast steps (columns) in the result.

    Returns:
        numpy array of shape ``(len(idx), horizon)``; column ``k - 1`` always
        holds step ``k``. Missing rows, cells and whole step columns are 0.

    Raises:
        FileNotFoundError: if the CSV does not exist.
    """
    path = forecast_dir / f"{name}.csv"
    if not path.exists():
        raise FileNotFoundError(f"{path} not found")
    fdf = (
        pd.read_csv(path, index_col="startTime", parse_dates=True)
          .loc[date_start:date_end]
          .reindex(idx)
    )
    print(f"[debug] {name}: {len(fdf)} rows reindexed")
    cols    = [f"{prefix}_f{i}" for i in range(1, horizon+1)]
    present = [c for c in cols if c in fdf.columns]
    # Re-index on the full column list rather than padding on the right:
    # a missing intermediate step then becomes a zero column in its own slot
    # instead of shifting every later step one position to the left.
    mat = fdf.reindex(columns=cols).fillna(0).to_numpy()
    if len(present) < horizon:
        print(f"[debug] {name}: padded from {len(present)}→{horizon}")
    else:
        print(f"[debug] {name}: full horizon loaded")
    return mat

# One (rows × horizon) matrix per forecast source, all aligned to df's index,
# stacked along a trailing axis → (rows, horizon, n_sources).
idx = df.index
demand_mat, wind_mat, drm_mat = (
    load_forecast_matrix(file_stem, col_prefix, idx, horizon)
    for file_stem, col_prefix in (
        ("DEMAND_FORECASTS", "demand"),
        ("WIND_FORECASTS",   "wind"),
        ("DRM_FORECASTS",    "drm"),
    )
)
x_fut = np.stack((demand_mat, wind_mat, drm_mat), axis=-1)
assert x_fut.shape[0] == len(df)
print(f"[debug] x_fut shape = {x_fut.shape}")

[debug] loaded df rows = 70126
                     month_idx  weekday_idx  sp_idx  dtype_idx
startTime                                                     
2021-07-01 00:00:00          6            3       2          0
2021-07-01 00:30:00          6            3       3          0
2021-07-01 01:00:00          6            3       4          0
2021-07-01 01:30:00          6            3       5          0
2021-07-01 02:00:00          6            3       6          0
[debug] DEMAND_FORECASTS: 70126 rows reindexed
[debug] DEMAND_FORECASTS: full horizon loaded
[debug] WIND_FORECASTS: 70126 rows reindexed
[debug] WIND_FORECASTS: full horizon loaded
[debug] DRM_FORECASTS: 70126 rows reindexed
[debug] DRM_FORECASTS: full horizon loaded
[debug] x_fut shape = (70126, 48, 3)


  pd.read_csv(path, index_col="startTime", parse_dates=True)


In [12]:
target_col = "Imbalance Price"
assert target_col in df.columns, f"missing target: {target_col}"

# Chronological boolean masks over df.index: train | val | test.
masks = dict(
    train = df.index < train_end_date,
    val   = (df.index >= train_end_date) & (df.index < val_end_date),
    test  = df.index >= val_end_date,
)

# Calendar indices are fed separately; every other column is a history feature.
cal_cols  = ["month_idx", "weekday_idx", "sp_idx", "dtype_idx"]
hist_cols = [col for col in df.columns if col not in cal_cols]

# Materialise the numpy arrays for each split and report their shapes.
splits = {}
for split, mask in masks.items():
    sub = df.loc[mask]
    arrays = dict(
        x_hist = sub[hist_cols].to_numpy(),
        x_cal  = sub[cal_cols].to_numpy(),
        y      = sub[target_col].to_numpy(),
        x_fut  = x_fut[mask],
    )
    splits[split] = arrays
    shapes = ", ".join(f"{key} {arr.shape}" for key, arr in arrays.items())
    print(f"{split:5s} → {shapes}")

train → x_hist (64272, 10), x_cal (64272, 4), y (64272,), x_fut (64272, 48, 3)
val   → x_hist (2928, 10), x_cal (2928, 4), y (2928,), x_fut (2928, 48, 3)
test  → x_hist (2926, 10), x_cal (2926, 4), y (2926,), x_fut (2926, 48, 3)


In [None]:
# ─── factories & hyperparameter configuration ───────────────────────────────
from torch.utils.data import Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from src.models.factory import MODEL_FACTORY
from src.datasets.factory import DATASET_FACTORY

# Lookup table: config key → scikit-learn scaler class (instantiated on use).
transformer_factory = dict(
    MinMax=MinMaxScaler,
    Robust=RobustScaler,
    Standard=StandardScaler,
    MaxAbs=MaxAbsScaler,
)

# Lookup table: config key → torch loss class (instantiated on use).
loss_factory = dict(
    MAE=nn.L1Loss,
    MSE=nn.MSELoss,
    Huber=nn.SmoothL1Loss,
)

# model & sequence configuration:
#   seq_len  – look-back window
#   feed_len – known-future window
#   fut_len  – forecast horizon
seq_len, feed_len, fut_len = 48, 48, 4

# network widths & depths
lstm_hidden, dec_hidden, attn_dim = 64, 64, 48
num_layers, dropout = 1, 0.0

# training settings
batch_size, lr = 144, 1e-4
patience, max_epochs = 20, 200

# scaler & loss choices (must match factory keys)
scaler_used, model_used, loss_used = "MaxAbs", "BiAttnPointForecaster", "Huber"
beta  = 0.01   # only for Huber loss
notes = None

print(f"configs → seq={seq_len}, feed={feed_len}, fut={fut_len}; "
      f"batch={batch_size}, lr={lr}, epochs={max_epochs}")


configs → seq=48, feed=48, fut=4; batch=144, lr=0.0001, epochs=200


In [15]:
# ─── metadata & model directory ─────────────────────────────────────────────
from pathlib import Path

# Run descriptor: each entry becomes one "_"-separated segment of the run tag.
md = dict(
    model         = model_used,
    seq_len       = seq_len,
    feed_len      = feed_len,
    horizon       = fut_len,
    lstm_hidden   = lstm_hidden,
    dec_hidden    = dec_hidden,
    num_layers    = num_layers,
    batch_size    = batch_size,
    learning_rate = lr,
    max_epochs    = max_epochs,
    patience      = patience,
    scaler        = scaler_used,
    loss          = loss_used,
)
if notes:
    md["notes"] = notes   # optional free-text suffix, only when non-empty


def initials(s):
    """Abbreviate a snake_case key to the first letter of each word."""
    return "".join(word[0] for word in s.split("_"))


def _tag_part(key, value):
    """Render one metadata entry as a tag segment."""
    text = str(value)
    # floats below one lose their leading zero: 0.0001 -> .0001
    if isinstance(value, float) and text.startswith("0."):
        text = text[1:]
    # names are written verbatim; numeric settings get their key initials as prefix
    if key in {"model", "scaler", "loss", "notes"}:
        return text
    return f"{initials(key)}{text}"


# build tag parts and combine into tag
parts = [_tag_part(key, value) for key, value in md.items()]
tag   = "_".join(parts)

# first free directory: <tag>, then <tag>_v1, <tag>_v2, ...
models_root = Path("saved_runs")
version     = 0
candidate   = models_root / tag
while candidate.exists():
    version  += 1
    candidate = models_root / f"{tag}_v{version}"

print(f"[info] saving model to: {candidate}")


[info] saving model to: saved_runs/BiAttnPointForecaster_sl48_fl48_h4_lh64_dh64_nl1_bs144_lr.0001_me200_p20_MaxAbs_Huber_v1


In [None]:
# ─── train/val/test loop + early stopping ─────────────────────────────────
import os
import json
import copy
import joblib
import sys
import platform
from datetime import datetime, timezone
import numpy as np
import torch
from sklearn.metrics import mean_absolute_error, mean_squared_error
from src.datasets.datasets import MultiFeedDataset

# create output directory
base_dir = candidate  # pathlib.Path picked by the tagging cell
base_dir.mkdir(parents=True, exist_ok=True)
print(f"→ saving run in: {base_dir}")

# per-split frames, used further down for the run metadata
df_train, df_val, df_test = (df.loc[masks[part]] for part in ("train", "val", "test"))

# 1) scale train/val/test — each scaler is fitted on the training split only

# a) history features
scaler_x = transformer_factory[scaler_used]()
x_train_hist_scaled = scaler_x.fit_transform(splits["train"]["x_hist"])
x_val_hist_scaled, x_test_hist_scaled = (
    scaler_x.transform(splits[part]["x_hist"]) for part in ("val", "test")
)

# b) future features
n_fut_feats = splits["train"]["x_fut"].shape[2]
scaler_f    = transformer_factory[scaler_used]()


def _scale_future(cube, fit=False):
    """Flatten (rows, steps, feats) to 2-D, scale per feature, restore the shape."""
    flat = cube.reshape(-1, n_fut_feats)
    flat = scaler_f.fit_transform(flat) if fit else scaler_f.transform(flat)
    return flat.reshape(cube.shape)


x_fut_train_scaled = _scale_future(splits["train"]["x_fut"], fit=True)
x_fut_val_scaled   = _scale_future(splits["val"]["x_fut"])
x_fut_test_scaled  = _scale_future(splits["test"]["x_fut"])

# c) targets
scaler_y = transformer_factory[scaler_used]()


def _scale_target(values, fit=False):
    """Scale a 1-D target vector through a single-column 2-D view."""
    column = values.reshape(-1, 1)
    scaled = scaler_y.fit_transform(column) if fit else scaler_y.transform(column)
    return scaled.flatten()


y_train_scaled = _scale_target(splits["train"]["y"], fit=True)
y_val_scaled   = _scale_target(splits["val"]["y"])
y_test_scaled  = _scale_target(splits["test"]["y"])

# 2) build dataloaders
pin_memory = (device.type == "cuda")

def to_tensor(x, dtype):
    """Copy an array-like into a torch tensor of the given dtype."""
    return torch.tensor(x, dtype=dtype)


def _build_dataset(part, hist, fut, target):
    """Wrap one split's scaled arrays and calendar indices in a MultiFeedDataset."""
    cal = splits[part]["x_cal"]   # columns follow cal_cols: month, weekday, sp, dtype
    return MultiFeedDataset(
        hist        = to_tensor(hist,   torch.float32),
        full_fut    = to_tensor(fut,    torch.float32),
        y           = to_tensor(target, torch.float32),
        month_idx   = to_tensor(cal[:, 0], torch.long),
        weekday_idx = to_tensor(cal[:, 1], torch.long),
        sp_idx      = to_tensor(cal[:, 2], torch.long),
        dtype_idx   = to_tensor(cal[:, 3], torch.long),
        seq_len     = seq_len,
        feed_len    = feed_len,
        fut_len     = fut_len,
    )


train_ds = _build_dataset("train", x_train_hist_scaled, x_fut_train_scaled, y_train_scaled)
val_ds   = _build_dataset("val",   x_val_hist_scaled,   x_fut_val_scaled,   y_val_scaled)
test_ds  = _build_dataset("test",  x_test_hist_scaled,  x_fut_test_scaled,  y_test_scaled)

# only the training loader shuffles; val/test keep chronological order
loader_kwargs = dict(batch_size=batch_size, pin_memory=pin_memory)
train_loader  = DataLoader(train_ds, shuffle=True,  **loader_kwargs)
val_loader    = DataLoader(val_ds,   shuffle=False, **loader_kwargs)
test_loader   = DataLoader(test_ds,  shuffle=False, **loader_kwargs)

# 3) instantiate model, optimizer, criterion
# NOTE(review): the four addends presumably are the per-calendar-feature
# embedding widths the model expects — confirm against the model class.
time_feat_dim = 4 + 3 + 6 + 2
# NOTE(review): num_layers and dropout from the config cell are not forwarded
# here — confirm whether the model constructor accepts them.
model = MODEL_FACTORY[model_used](
    num_hist_feats = x_train_hist_scaled.shape[1],
    num_fut_feats  = n_fut_feats,
    time_feat_dim  = time_feat_dim,
    lstm_hidden    = lstm_hidden,
    dec_hidden     = dec_hidden,
    attn_dim       = attn_dim,
    hist_len       = seq_len,
    feed_len       = feed_len,
    fut_len        = fut_len
).to(device)

print(f"num_hist_feats: {x_train_hist_scaled.shape[1]}")
print(f"num_fut_feats: {n_fut_feats}")
print(f"time_feat_dim: {time_feat_dim}")

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Compare case-insensitively: the factory key is "Huber", so an exact match
# against "huber" never fires and the configured beta would be ignored.
criterion = (
    loss_factory[loss_used](beta=beta)
    if loss_used.lower() == "huber"
    else loss_factory[loss_used]()
)
# Halve the learning rate after 5 epochs without validation improvement.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5, min_lr=1e-7
)
last_lrs = scheduler.get_last_lr()   # baseline for reporting lr changes during training

# 4) train w/ early stopping on val
# best_ckpt stays None until an epoch's validation loss beats the running best.
best_val, epochs_no_improve, best_ckpt = float('inf'), 0, None

for epoch in range(1, max_epochs+1):
    # ── training pass ──
    model.train()
    total_train_loss = 0.0
    for x_h, x_f, y_t, mi, wi, si, di in train_loader:
        x_h, x_f, y_t = x_h.to(device), x_f.to(device), y_t.to(device)
        mi, wi, si, di = mi.to(device), wi.to(device), si.to(device), di.to(device)

        optimizer.zero_grad()
        out  = model(x_h, x_f, mi, wi, si, di)
        loss = criterion(out, y_t)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
    train_loss = total_train_loss / len(train_loader)   # mean of per-batch losses

    # ── validation pass (no gradients) ──
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for x_h, x_f, y_t, mi, wi, si, di in val_loader:
            x_h, x_f, y_t = x_h.to(device), x_f.to(device), y_t.to(device)
            mi, wi, si, di = mi.to(device), wi.to(device), si.to(device), di.to(device)
            out = model(x_h, x_f, mi, wi, si, di)
            total_val_loss += criterion(out, y_t).item()
    val_loss = total_val_loss / len(val_loader)

    # the plateau scheduler is driven by the validation loss; report any lr change
    scheduler.step(val_loss)
    new_lr = scheduler.get_last_lr()[0]
    if new_lr != last_lrs[0]:
        print(f"→ lr reduced from {last_lrs[0]:.2e} to {new_lr:.2e}")
    last_lrs = scheduler.get_last_lr()

    print(f"[epoch {epoch:03d}] train={train_loss:.5f} val={val_loss:.5f}")
    if val_loss < best_val:
        best_val, epochs_no_improve = val_loss, 0
        # deep copies so later optimizer steps cannot mutate the snapshot
        best_ckpt = {
            "model":     copy.deepcopy(model.state_dict()),
            "optimizer": copy.deepcopy(optimizer.state_dict())
        }
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"→ early stopping after {epoch} epochs")
            break

# restore best checkpoint
# A NaN validation loss never compares below best_val, so no snapshot may exist;
# fail with a clear message instead of a TypeError from indexing None.
if best_ckpt is None:
    raise RuntimeError(
        "no checkpoint was recorded: validation loss never improved "
        "(NaN losses, or max_epochs < 1?)"
    )
model.load_state_dict(best_ckpt["model"])
optimizer.load_state_dict(best_ckpt["optimizer"])

# 5) inference on test
model.eval()
preds_all, trues_all = [], []
with torch.no_grad():
    for x_h, x_f, y_t, mi, wi, si, di in test_loader:
        # targets stay on the CPU; only the model inputs are moved
        x_h, x_f, mi, wi, si, di = (
            t.to(device) for t in (x_h, x_f, mi, wi, si, di)
        )
        out = model(x_h, x_f, mi, wi, si, di)
        preds_all.append(out.cpu().numpy())
        trues_all.append(y_t.numpy())

preds_all = np.concatenate(preds_all, axis=0)
trues_all = np.concatenate(trues_all, axis=0)

# map both series back to the original target units before scoring
preds, trues = (
    scaler_y.inverse_transform(arr.reshape(-1, 1)).flatten()
    for arr in (preds_all, trues_all)
)
errors  = trues - preds
abs_err = np.abs(errors)

mae   = mean_absolute_error(trues, preds)
rmse  = np.sqrt(mean_squared_error(trues, preds))
smape = np.mean(2.0 * abs_err / (np.abs(trues) + np.abs(preds) + 1e-8)) * 100
# piecewise loss: quadratic inside ±beta, linear outside
huber_vals = np.where(abs_err <= beta,
                      0.5 * errors**2 / beta,
                      abs_err - 0.5 * beta)
huber = huber_vals.mean()

print(f"\ntest → mae={mae:.4f}, rmse={rmse:.4f}, smape={smape:.4f}%, huber={huber:.4f}")

# 6) build metadata & save
class NpTorchJSONEncoder(json.JSONEncoder):
    """JSON encoder that also serialises NumPy, PyTorch and datetime values."""

    # (type, converter) pairs; the first isinstance match is used.
    _CONVERTERS = (
        (np.generic,   lambda value: value.item()),
        (np.ndarray,   lambda value: value.tolist()),
        (torch.Tensor, lambda value: value.detach().cpu().tolist()),
        (torch.device, str),
        (datetime,     lambda value: value.isoformat()),
    )

    def default(self, o):
        """Convert `o` to a JSON-native value, else defer to the base class (TypeError)."""
        for kind, convert in self._CONVERTERS:
            if isinstance(o, kind):
                return convert(o)
        return super().default(o)

# Environment: seeds, determinism flags and library versions for this run.
env_meta = {
    "seed_torch":          torch.initial_seed(),
    # Record the configured seed itself: the first word of the NumPy / Python
    # RNG state is not the seed and changes as numbers are drawn.
    "seed_numpy":          seed,
    "seed_python":         seed,
    "cudnn_deterministic": getattr(torch.backends.cudnn, "deterministic", None),
    "cudnn_benchmark":     getattr(torch.backends.cudnn, "benchmark", None),
    "torch_version":       torch.__version__,
    "python_version":      platform.python_version(),
    "run_timestamp":       datetime.now(timezone.utc).isoformat()
}

# Data window, split boundaries and number of dataset items per split.
data_meta = {
    "start":     df_train.index.min().strftime("%Y-%m-%d"),
    "train_end": train_end_date,
    "val_end":   val_end_date,
    "end":       df_test.index.max().strftime("%Y-%m-%d"),
    "n_train":   len(train_loader.dataset),
    "n_val":     len(val_loader.dataset),
    "n_test":    len(test_loader.dataset)
}

# Feature inventory for the three input streams.
feat_meta = {
    "hist_feats": {
        "cols": hist_cols,
        "n": x_train_hist_scaled.shape[1]
    },
    "time_feats": {
        "cols": cal_cols,
        "n": time_feat_dim,
    },
    "fut_feats": {
        "prefixes": ["demand", "wind", "drm"],  # update this to match your data
        "n": n_fut_feats,
        "feed_len": feed_len,
        "fut_len": fut_len
    },
    "total_feats": len(hist_cols) + len(cal_cols) + n_fut_feats
}

loader_meta = {
    "batch_size":  batch_size,
    "shuffle":     {"train": True, "val": False, "test": False},
    # report what the loaders actually use, not the machine's CPU count
    "num_workers": train_loader.num_workers,
    "pin_memory":  pin_memory,
    "device":      str(device),
}

hyperparams_meta = {
    "model":         model_used,
    "seq_len":       seq_len,
    "feed_len":      feed_len,
    "horizon":       fut_len,
    "lstm_hidden":   lstm_hidden,
    "dec_hidden":    dec_hidden,
    "attn_dim":      attn_dim,
    "num_layers":    num_layers,
    "batch_size":    batch_size,
    "learning_rate": lr,
    "scaler":        scaler_used,
    "loss":          loss_used,
    # case-insensitive: the configured key is "Huber", so an exact match
    # against "huber" would silently drop beta from the record
    **({"beta": beta} if loss_used.lower() == "huber" else {})
}

optim_meta = {
    "type": optimizer.__class__.__name__,
    "lr":   optimizer.defaults.get("lr"),
    **{k: optimizer.defaults[k]
       for k in ("betas","eps","weight_decay")
       if k in optimizer.defaults}
}

sched_meta = {
    "type":     scheduler.__class__.__name__,
    "mode":     getattr(scheduler, "mode", None),
    "factor":   getattr(scheduler, "factor", None),
    "patience": getattr(scheduler, "patience", None),
    # plateau schedulers expose min_lrs; fall back to eta_min for other types
    "min_lr":   (scheduler.min_lrs[0]
                 if hasattr(scheduler, "min_lrs")
                 else getattr(scheduler, "eta_min", None)),
    "last_lr":  scheduler.get_last_lr()
}

earlystop_meta = {
    "max_epochs": max_epochs,
    "patience":  patience,
    "final_epoch": epoch,
    # the best epoch lies epochs_no_improve epochs before the last one run
    "best_epoch": epoch - epochs_no_improve
}

# Test-set metrics, computed in original target units.
metrics_meta = {
    "mae":   float(mae),
    "rmse":  float(rmse),
    "smape": float(smape),
    "huber": float(huber)
}


# save model & scalers
# (the model and optimizer hold the restored best-checkpoint weights here)
checkpoint = dict(
    model_state_dict     = model.state_dict(),
    optimizer_state_dict = optimizer.state_dict(),
    scheduler_state_dict = scheduler.state_dict(),
)
torch.save(checkpoint, base_dir / "torch_model.pt")

fitted_scalers = dict(scaler_x=scaler_x, scaler_f=scaler_f, scaler_y=scaler_y)
joblib.dump(fitted_scalers, str(base_dir / "scalers.joblib"))

# one JSON document describing the whole run
summary = dict(
    environment = env_meta,
    data        = data_meta,
    features    = feat_meta,
    dataloader  = loader_meta,
    hyperparams = hyperparams_meta,
    optimizer   = optim_meta,
    scheduler   = sched_meta,
    early_stop  = earlystop_meta,
    metrics     = metrics_meta,
)
with open(base_dir / "test_summary.json", "w") as f:
    json.dump(summary, f, indent=2, cls=NpTorchJSONEncoder)

print(f"✅ saved all outputs to {base_dir}")


num_hist_feats: 10
num_fut_feats: 3
time_feat_dim: 15


KeyboardInterrupt: 