### Sequential Training 

In [None]:
# lightning >=2.2
import torch, pytorch_lightning as pl
from torch.utils.data import IterableDataset, DataLoader
import pandas as pd
import numpy as np

# ---------- 1.  Build an iterable Dataset that streams in time order ----------
class RollingWindowDataset(IterableDataset):
    """
    Stream ``(X_window, y_target)`` pairs in chronological order.

    For every end position ``t`` the input is the ``window`` rows
    ``X[t - window : t]`` and the target is ``y[t + horizon - 1]``, i.e. the
    value ``horizon`` steps after the last row of the window.
    Assumes ``df`` is already sorted by timestamp ascending.

    Args:
        df: Source frame; only ``feature_cols`` and ``target_col`` are read.
        feature_cols: Column labels used as model inputs.
        target_col: Column label of the regression target.
        window: Number of consecutive rows in each input window.
        horizon: How many steps after the window the target lies
            (1 = the row immediately following the window).

    Yields:
        ``(X_win, y_tar)``: a float32 tensor holding ``window`` rows of
        features and a 0-d float32 tensor with the target value.
    """
    def __init__(self, df: pd.DataFrame, feature_cols, target_col,
                 window: int = 24, horizon: int = 1):
        # Convert once to float32 arrays so iteration is plain slicing.
        self.X = df[feature_cols].values.astype(np.float32)
        self.y = df[target_col].values.astype(np.float32)
        self.window, self.horizon = window, horizon

    def __iter__(self):
        # The largest usable t satisfies t + horizon - 1 == len(y) - 1, so the
        # exclusive upper bound is len - horizon + 1.  (Stopping at
        # len - horizon dropped the most recent sample.)  With this bound
        # exactly the last `horizon` rows are never used as inputs.
        stop = len(self.X) - self.horizon + 1
        for t in range(self.window, stop):
            X_win = self.X[t - self.window:t]          # shape (window, n_feat)
            y_tar = self.y[t + self.horizon - 1]       # scalar regression target
            yield torch.from_numpy(X_win), torch.tensor(y_tar)

# ---------- 2.  LightningDataModule ----------
class TSDataModule(pl.LightningDataModule):
    """
    DataModule that splits a time-ordered frame into a leading training part
    and a trailing validation part and serves both without shuffling.

    Args:
        df: Source frame, expected to be sorted by time ascending.
        feature_cols: Column labels used as model inputs.
        target_col: Column label of the regression target.
        window: Rows per input window (forwarded to RollingWindowDataset).
        horizon: Forecast horizon (forwarded to RollingWindowDataset).
        val_size: Fraction of rows, taken from the end, used for validation.
        batch_size: Batch size for both dataloaders.
    """
    def __init__(self, df, feature_cols, target_col, window=24, horizon=1,
                 val_size=0.05, batch_size=32):
        super().__init__()
        # Keep the DataFrame out of hparams: hparams are meant for small
        # configuration values and get persisted with checkpoints / loggers.
        self.save_hyperparameters(ignore=["df"])
        self.df = df

    def setup(self, stage=None):
        # Chronological split: the first (1 - val_size) share of rows trains,
        # the remainder validates.
        split_idx = int(len(self.df) * (1 - self.hparams.val_size))
        self.train_ds = RollingWindowDataset(self.df.iloc[:split_idx],
                                             self.hparams.feature_cols,
                                             self.hparams.target_col,
                                             self.hparams.window,
                                             self.hparams.horizon)
        self.val_ds   = RollingWindowDataset(self.df.iloc[split_idx:],
                                             self.hparams.feature_cols,
                                             self.hparams.target_col,
                                             self.hparams.window,
                                             self.hparams.horizon)

    def train_dataloader(self):
        return DataLoader(self.train_ds,
                          batch_size=self.hparams.batch_size,
                          shuffle=False,  # critical: keep time order
                          drop_last=False)

    def val_dataloader(self):
        return DataLoader(self.val_ds,
                          batch_size=self.hparams.batch_size,
                          shuffle=False)

# ---------- 3.  LightningModule ----------
import torch.nn as nn
import torch.nn.functional as F

class SimpleTSRegressor(pl.LightningModule):
    """
    Baseline forecaster: the whole input window is flattened and passed
    through a one-hidden-layer MLP that emits a single value per sample.
    Swap in an LSTM/Transformer for better time-series handling.
    """
    def __init__(self, n_features, window, lr=1e-3, hidden=128):
        super().__init__()
        self.save_hyperparameters()
        layers = [
            nn.Flatten(),                              # (B, window*n_feat)
            nn.Linear(n_features * window, hidden),
            nn.ReLU(),
            nn.LayerNorm(hidden),
            nn.Linear(hidden, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # Drop the trailing singleton so predictions have shape (B,).
        prediction = self.net(x)
        return prediction.squeeze(-1)

    def _batch_mse(self, batch):
        # Shared by the train and validation steps: MSE of predictions vs targets.
        inputs, targets = batch
        return F.mse_loss(self(inputs), targets)

    def training_step(self, batch, batch_idx):
        train_loss = self._batch_mse(batch)
        self.log("train_loss", train_loss, prog_bar=True)
        return train_loss

    def validation_step(self, batch, batch_idx):
        self.log("val_loss", self._batch_mse(batch), prog_bar=True)

    def configure_optimizers(self):
        # AdamW with the learning rate halved every 10 scheduler steps.
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.lr)
        scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer, step_size=10, gamma=0.5)
        return [optimizer], [scheduler]

# ---------- 4.  Putting it together ----------
def run_training(df, feature_cols, target_col, *, window=24, horizon=1,
                 val_size=0.05, batch_size=64, max_epochs=50):
    """
    Train a SimpleTSRegressor on ``df`` and return the fitted objects.

    Args:
        df: Time-ordered source frame.
        feature_cols: Column labels used as model inputs.
        target_col: Column label of the regression target.
        window: Rows per input window (default: last 24 timesteps).
        horizon: Forecast horizon (default 1 → predict t+1).
        val_size: Trailing fraction of rows held out for validation.
        batch_size: Batch size for both dataloaders.
        max_epochs: Upper bound on training epochs.

    Returns:
        ``(model, trainer)`` so callers can run predictions or reach the
        checkpoint callback; earlier callers that ignored the return value
        are unaffected.
    """
    dm = TSDataModule(df, feature_cols, target_col, window=window,
                      horizon=horizon, val_size=val_size,
                      batch_size=batch_size)
    model = SimpleTSRegressor(n_features=len(feature_cols), window=window)
    # Keep the three checkpoints with the lowest validation loss.
    checkpoint_cb = pl.callbacks.ModelCheckpoint(
        monitor="val_loss", save_top_k=3, mode="min")
    trainer = pl.Trainer(max_epochs=max_epochs,
                         gradient_clip_val=1.0,
                         callbacks=[checkpoint_cb])
    trainer.fit(model, dm)
    return model, trainer

### Loss Logger

In [None]:
from pytorch_lightning.loggers import TensorBoardLogger
# TensorBoard logger: save dir "lightning_logs", run name "seq_ts".
logger = TensorBoardLogger(save_dir="lightning_logs", name="seq_ts")

# Same trainer as before, now with the logger attached and a rich progress bar.
trainer = pl.Trainer(
    callbacks=[
        pl.callbacks.ModelCheckpoint(
            save_top_k=3, mode="min", monitor="val_loss"),
        pl.callbacks.RichProgressBar(),   # nicer bar (optional)
    ],
    logger=logger,                        # ⬅️ add logger
    max_epochs=50,
)
trainer.fit(model, datamodule=dm)

### Choosing different mechanisms 

In [None]:
import torch
import torch.nn as nn
import math

def build_backbone(arch: str,
                   n_features: int,
                   window: int,
                   hidden: int = 128,
                   n_layers: int = 2,
                   n_heads: int = 4):
    """
    Construct the network body for one of the supported architectures.

    Every returned module maps a batch of windows shaped
    (B, window, n_features) to one value per sample, shaped (B, 1).

    Args:
        arch: Case-insensitive name: "mlp", "lstm", "gru", "cnn" or
            "transformer".
        n_features: Number of input features per time step.
        window: Time steps per sample (sizes the MLP input and the
            transformer's positional embedding).
        hidden: Width of the hidden representation.
        n_layers: Stacked layers for "lstm", "gru" and "transformer".
        n_heads: Attention heads for "transformer".

    Returns:
        A newly constructed nn.Module.

    Raises:
        ValueError: If ``arch`` is not one of the supported names.
    """

    def make_mlp():
        # Flatten the window and regress through a single hidden layer.
        return nn.Sequential(
            nn.Flatten(),
            nn.Linear(n_features * window, hidden),
            nn.ReLU(),
            nn.LayerNorm(hidden),
            nn.Linear(hidden, 1),
        )

    def make_lstm():
        class LSTMHead(nn.Module):
            """Stacked LSTM; the output at the final time step feeds a linear head."""

            def __init__(self):
                super().__init__()
                self.lstm = nn.LSTM(input_size=n_features,
                                    hidden_size=hidden,
                                    num_layers=n_layers,
                                    batch_first=True)
                self.fc = nn.Linear(hidden, 1)

            def forward(self, x):             # x: (B, w, n_feat)
                sequence_out, _state = self.lstm(x)
                final_step = sequence_out[:, -1]
                return self.fc(final_step)

        return LSTMHead()

    def make_gru():
        class GRUHead(nn.Module):
            """Stacked GRU; the output at the final time step feeds a linear head."""

            def __init__(self):
                super().__init__()
                self.gru = nn.GRU(input_size=n_features,
                                  hidden_size=hidden,
                                  num_layers=n_layers,
                                  batch_first=True)
                self.fc = nn.Linear(hidden, 1)

            def forward(self, x):
                sequence_out, _state = self.gru(x)
                final_step = sequence_out[:, -1]
                return self.fc(final_step)

        return GRUHead()

    def make_cnn():
        class CNNHead(nn.Module):
            """
            Two dilated 1-D convolutions (dilation 2, then 4) followed by
            average pooling over time.  Padding equals the dilation on both
            sides, so sequence length is preserved; because the padding is
            symmetric the stack is not strictly causal.
            """

            def __init__(self):
                super().__init__()
                first = nn.Conv1d(n_features, hidden,
                                  kernel_size=3, padding=2, dilation=2)
                second = nn.Conv1d(hidden, hidden,
                                   kernel_size=3, padding=4, dilation=4)
                self.conv = nn.Sequential(
                    first,
                    nn.ReLU(),
                    second,
                    nn.ReLU(),
                    nn.AdaptiveAvgPool1d(1),
                )
                self.fc = nn.Linear(hidden, 1)

            def forward(self, x):
                channels_first = x.permute(0, 2, 1)     # (B, C=n_feat, L=window)
                pooled = self.conv(channels_first)      # (B, hidden, 1)
                return self.fc(pooled.squeeze(-1))

        return CNNHead()

    def make_transformer():
        class TransEncHead(nn.Module):
            """
            Linear input projection plus a learned positional embedding,
            a Transformer encoder, and a linear head on the last token.
            """

            def __init__(self):
                super().__init__()
                self.input_proj = nn.Linear(n_features, hidden)
                block = nn.TransformerEncoderLayer(d_model=hidden,
                                                   nhead=n_heads,
                                                   batch_first=True,
                                                   norm_first=True)
                self.encoder = nn.TransformerEncoder(block,
                                                     num_layers=n_layers)
                # Learned positional embedding, one vector per time step.
                self.pos = nn.Parameter(torch.zeros(window, hidden))
                nn.init.uniform_(self.pos, -0.02, 0.02)
                self.fc = nn.Linear(hidden, 1)

            def forward(self, x):
                tokens = self.input_proj(x)
                tokens = tokens + self.pos      # add position
                encoded = self.encoder(tokens)
                return self.fc(encoded[:, -1])

        return TransEncHead()

    factories = {
        "mlp": make_mlp,
        "lstm": make_lstm,
        "gru": make_gru,
        "cnn": make_cnn,
        "transformer": make_transformer,
    }

    arch = arch.lower()
    if arch not in factories:
        raise ValueError(f"Unknown architecture: {arch}")
    return factories[arch]()


    ### Calling Function ###
    
import pytorch_lightning as pl
import torch.nn.functional as F

class FlexibleTSRegressor(pl.LightningModule):
    """
    Regressor whose network body comes from ``build_backbone`` and whose
    training objective is selected by name.

    Args:
        n_features: Number of input features per time step.
        window: Time steps per input window.
        backbone: Architecture name understood by ``build_backbone``.
        hidden: Hidden width passed to the backbone.
        n_layers: Layer count passed to the backbone.
        n_heads: Attention-head count passed to the backbone.
        loss_name: One of "mse", "mae", "huber", "rmse", "quantile"
            (case-insensitive).
        lr: AdamW learning rate.
        **loss_kw: Loss options: ``huber_beta`` (default 1.0) for "huber",
            ``quantile`` (default 0.9) for "quantile".

    Raises:
        ValueError: If ``loss_name`` is not a supported loss.
    """
    _SUPPORTED_LOSSES = ("mse", "mae", "huber", "rmse", "quantile")

    def __init__(self, *,
                 n_features: int,
                 window: int,
                 backbone: str = "mlp",
                 hidden: int = 128,
                 n_layers: int = 2,
                 n_heads: int = 4,
                 loss_name: str = "mse",
                 lr: float = 1e-3,
                 **loss_kw):
        super().__init__()
        self.save_hyperparameters()

        # Fail at construction time instead of at the first training step.
        ln = loss_name.lower()
        if ln not in self._SUPPORTED_LOSSES:
            raise ValueError(
                f"Unknown loss_name: {loss_name!r}; "
                f"expected one of {self._SUPPORTED_LOSSES}")

        # Build the chosen architecture.  n_layers / n_heads are explicit
        # parameters so they reach the backbone; when they were only caught
        # by **loss_kw they were silently ignored.
        self.backbone = build_backbone(backbone,
                                       n_features, window,
                                       hidden=hidden,
                                       n_layers=n_layers,
                                       n_heads=n_heads)

        # ----- loss selection -------------------------------------------
        if ln == "mse":
            self.criterion = nn.MSELoss()
        elif ln == "mae":
            self.criterion = nn.L1Loss()
        elif ln == "huber":
            beta = loss_kw.get("huber_beta", 1.0)
            self.criterion = nn.SmoothL1Loss(beta=beta)
        else:
            self.criterion = None             # rmse / quantile handled below
        self.loss_name = ln
        self.quantile = loss_kw.get("quantile", 0.9)

    # ----- forward & loss ------------------------------------------------
    def forward(self, x):
        # Backbone emits (B, 1); drop the trailing axis to match targets (B,).
        return self.backbone(x).squeeze(-1)

    def _compute_loss(self, y_hat, y):
        """Return the scalar loss selected by ``self.loss_name``."""
        if self.loss_name in ("mse", "mae", "huber"):
            return self.criterion(y_hat, y)
        if self.loss_name == "rmse":
            return F.mse_loss(y_hat, y).sqrt()
        if self.loss_name == "quantile":
            # Pinball loss with err = y - y_hat: under-prediction (err > 0)
            # costs q * err and over-prediction costs (1 - q) * |err|, so the
            # minimiser is the q-th quantile.  Taking the difference the other
            # way round (y_hat - y) would target the (1 - q)-th quantile.
            q = self.quantile
            err = y - y_hat
            return torch.where(err >= 0, q * err, (q - 1) * err).mean()
        raise RuntimeError(f"Unsupported loss_name: {self.loss_name!r}")

    # ----- training & validation ----------------------------------------
    def training_step(self, batch, _):
        x, y = batch
        loss = self._compute_loss(self(x), y)
        self.log("train_loss", loss, prog_bar=True, on_epoch=True)
        return loss

    def validation_step(self, batch, _):
        x, y = batch
        loss = self._compute_loss(self(x), y)
        self.log("val_loss", loss, prog_bar=True, on_epoch=True)

    def configure_optimizers(self):
        # AdamW with the learning rate halved every 10 scheduler steps.
        opt = torch.optim.AdamW(self.parameters(),
                                lr=self.hparams.lr)
        sch = torch.optim.lr_scheduler.StepLR(opt, 10, gamma=0.5)
        return [opt], [sch]

# keep your existing RollingWindowDataset and TSDataModule

window = 24
n_feat = len(feature_cols)

# Each assignment below replaces the previous `model`; keep the one you need.

# ➊ MLP (baseline)
model = FlexibleTSRegressor(
    backbone="mlp",
    n_features=n_feat,
    window=window,
)

# ➋ LSTM
model = FlexibleTSRegressor(
    backbone="lstm",
    n_features=n_feat,
    window=window,
    hidden=256,
    n_layers=2,
)

# ➌ GRU
model = FlexibleTSRegressor(
    backbone="gru",
    n_features=n_feat,
    window=window,
    hidden=256,
    n_layers=3,
)

# ➍ Temporal-CNN
model = FlexibleTSRegressor(
    backbone="cnn",
    n_features=n_feat,
    window=window,
    hidden=128,
)

# ➎ Transformer Encoder, trained with the quantile loss
model = FlexibleTSRegressor(
    backbone="transformer",
    n_features=n_feat,
    window=window,
    hidden=128,
    n_layers=2,
    n_heads=4,
    loss_name="quantile",
    quantile=0.95,
)

In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 1. Imports
# ──────────────────────────────────────────────────────────────────────────────
import os, torch, pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from pytorch_lightning.loggers import TensorBoardLogger

# ──────────────────────────────────────────────────────────────────────────────
# 2. Rolling-window Dataset  (indexable so Lightning knows its length)
# ──────────────────────────────────────────────────────────────────────────────
class RollingWindowDataset(Dataset):
    """
    Map-style dataset of ``(X_window, y_target)`` pairs in time order.

    Sample ``idx`` corresponds to end position ``t = window + idx``: the
    input is ``X[t - window : t]`` and the target is ``y[t + horizon - 1]``.

    Args:
        df: Source frame; only ``feature_cols`` and ``target_col`` are read.
        feature_cols: Column labels used as model inputs.
        target_col: Column label of the regression target.
        window: Number of consecutive rows in each input window.
        horizon: How many steps after the window the target lies.
    """
    def __init__(self, df: pd.DataFrame,
                 feature_cols, target_col,
                 window: int = 24, horizon: int = 1):
        self.X  = df[feature_cols].values.astype("float32")
        self.y  = df[target_col].values.astype("float32")
        self.w  = window
        self.h  = horizon
        # The largest usable t satisfies t + horizon - 1 == len(df) - 1, so
        # the exclusive bound is len(df) - horizon + 1.  (Stopping at
        # len(df) - horizon dropped the most recent sample.)  The range is
        # empty when the frame is too short for a single window.
        self.indices = range(window, len(df) - horizon + 1)

    def __len__(self):               # lets Lightning show a progress-bar total
        return len(self.indices)

    def __getitem__(self, idx):
        t = self.indices[idx]        # raises IndexError past the last sample
        X_win = self.X[t-self.w : t]              # shape (window, n_feat)
        y_tar = self.y[t + self.h - 1]            # scalar
        return torch.from_numpy(X_win), torch.tensor(y_tar)

# ──────────────────────────────────────────────────────────────────────────────
# 3. DataModule (keeps order, no shuffle!)
# ──────────────────────────────────────────────────────────────────────────────
class TSDataModule(pl.LightningDataModule):
    """
    DataModule that splits a time-ordered frame into a leading training part
    and a trailing validation part and serves both without shuffling.

    Args:
        df: Source frame, expected to be sorted by time ascending; a copy is
            kept on the instance.
        feature_cols: Column labels used as model inputs.
        target_col: Column label of the regression target.
        window: Rows per input window (forwarded to RollingWindowDataset).
        horizon: Forecast horizon (forwarded to RollingWindowDataset).
        val_size: Fraction of rows, taken from the end, used for validation.
        batch_size: Batch size for both dataloaders.
        num_workers: DataLoader worker processes; defaults to half the
            detected CPU count (0 is allowed and loads in the main process).
    """
    def __init__(self, df, feature_cols, target_col,
                 window=24, horizon=1,
                 val_size=0.05, batch_size=64,
                 # os.cpu_count() may return None; fall back to 2 so the
                 # default expression cannot raise at class-definition time.
                 num_workers=(os.cpu_count() or 2) // 2):
        super().__init__()
        self.save_hyperparameters(ignore=["df"])
        self.df = df.copy()

    def setup(self, stage=None):
        # Chronological split: leading rows train, trailing rows validate.
        split = int(len(self.df) * (1 - self.hparams.val_size))
        train_df = self.df.iloc[:split]
        val_df   = self.df.iloc[split:]

        self.train_ds = RollingWindowDataset(train_df,
                                             self.hparams.feature_cols,
                                             self.hparams.target_col,
                                             self.hparams.window,
                                             self.hparams.horizon)
        self.val_ds   = RollingWindowDataset(val_df,
                                             self.hparams.feature_cols,
                                             self.hparams.target_col,
                                             self.hparams.window,
                                             self.hparams.horizon)

    def _make_loader(self, dataset):
        # Shared loader settings.  persistent_workers is only legal when
        # there is at least one worker, so enable it conditionally.
        n_workers = self.hparams.num_workers
        return DataLoader(dataset,
                          batch_size=self.hparams.batch_size,
                          shuffle=False,          # keep time order
                          drop_last=False,
                          num_workers=n_workers,
                          persistent_workers=n_workers > 0)

    def train_dataloader(self):
        return self._make_loader(self.train_ds)

    def val_dataloader(self):
        return self._make_loader(self.val_ds)

# ──────────────────────────────────────────────────────────────────────────────
# 4. Simple MLP regressor (swap with LSTM/GRU later if you like)
# ──────────────────────────────────────────────────────────────────────────────
class SimpleTSRegressor(pl.LightningModule):
    """
    MLP baseline: flattens each input window, applies one hidden layer with
    ReLU and LayerNorm, and outputs one value per sample.  Trained with MSE.
    """
    def __init__(self, n_features, window, hidden=128, lr=1e-3):
        super().__init__()
        self.save_hyperparameters()
        layers = [
            nn.Flatten(),
            nn.Linear(window * n_features, hidden),
            nn.ReLU(),
            nn.LayerNorm(hidden),
            nn.Linear(hidden, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # Drop the trailing singleton axis of the final linear layer.
        prediction = self.net(x)
        return prediction.squeeze(-1)

    def _batch_mse(self, batch):
        # Shared by the train and validation steps.
        inputs, targets = batch
        return F.mse_loss(self(inputs), targets)

    def training_step(self, batch, _):
        train_loss = self._batch_mse(batch)
        self.log("train_loss", train_loss, prog_bar=True, on_epoch=True)
        return train_loss

    def validation_step(self, batch, _):
        self.log("val_loss", self._batch_mse(batch),
                 prog_bar=True, on_epoch=True)

    def configure_optimizers(self):
        # AdamW with the learning rate halved every 10 scheduler steps.
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.lr)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 10, gamma=0.5)
        return [optimizer], [scheduler]

# ──────────────────────────────────────────────────────────────────────────────
# 5. One-shot training + return best checkpoint path
# ──────────────────────────────────────────────────────────────────────────────
def train_and_get_best_ckpt(df, feature_cols, target_col,
                            window=24, horizon=1):
    """
    Fit a SimpleTSRegressor for up to 50 epochs and return the path of the
    checkpoint with the lowest ``val_loss``.

    Args:
        df: Time-ordered source frame.
        feature_cols: Column labels used as model inputs.
        target_col: Column label of the regression target.
        window: Rows per input window.
        horizon: Forecast horizon.

    Returns:
        ``best_model_path`` of the ModelCheckpoint callback.
    """
    network = SimpleTSRegressor(n_features=len(feature_cols), window=window)
    data = TSDataModule(df, feature_cols, target_col,
                        window=window, horizon=horizon)

    # Keep only the single best checkpoint by validation loss.
    best_ckpt = pl.callbacks.ModelCheckpoint(
        save_top_k=1, mode="min", monitor="val_loss")
    tb_logger = TensorBoardLogger("lightning_logs", name="seq_ts")

    trainer = pl.Trainer(
        callbacks=[best_ckpt, pl.callbacks.RichProgressBar()],
        logger=tb_logger,
        devices="auto",
        accelerator="auto",
        max_epochs=50,
    )
    trainer.fit(network, data)

    return best_ckpt.best_model_path

# ──────────────────────────────────────────────────────────────────────────────
# 6. ==== RUN TRAINING =========================================================
# df, feature_cols, target_col must already exist in your interpreter
# ──────────────────────────────────────────────────────────────────────────────
ckpt_path = train_and_get_best_ckpt(df, feature_cols, target_col)
print("Best model checkpoint ➜", ckpt_path)

# ──────────────────────────────────────────────────────────────────────────────
# 7. Load best model & make predictions on (for example) the validation slice
# ──────────────────────────────────────────────────────────────────────────────
best_model = SimpleTSRegressor.load_from_checkpoint(
    ckpt_path, n_features=len(feature_cols), window=24)
best_model = best_model.eval()      # eval() returns the module itself

# Rebuild the trailing 5 % slice with the same window / horizon as training.
val_ds = RollingWindowDataset(
    df.iloc[int(len(df)*0.95):],
    feature_cols,
    target_col,
    window=24,
    horizon=1,
)
val_dl = DataLoader(val_ds, shuffle=False, batch_size=64)

# Predict batch by batch without tracking gradients, then stitch the pieces.
with torch.no_grad():
    preds = torch.cat([best_model(x).cpu() for x, _ in val_dl]).numpy()
print("Inference done. 1st five predictions:", preds[:5])

# ──────────────────────────────────────────────────────────────────────────────
# 8. Launch TensorBoard (run this **in a terminal**, not inside Python)
# ──────────────────────────────────────────────────────────────────────────────
#   tensorboard --logdir lightning_logs
# Then open the printed URL in your browser to see train/val loss curves.


In [None]:
# Shell command — run it in a terminal (or prefix it with "!" in a notebook cell):
#   tensorboard --logdir lightning_logs