# ETTS Time-Series Forecasting with S4 (Agnostic Flow)

Train an S4 model on the Electricity Transformer Temperature (ETT) dataset using the same task-agnostic trainer and evaluation flow as the LMU notebook. The configuration is kept generic so that LMU and S4 can be swapped easily for a fair comparison.

In [1]:
from __future__ import annotations
from typing import Dict, Any
from pathlib import Path
import torch

from src.notebooks.etts.utils import ETTSTask, make_block_cfg_ctor, evaluate_best_model

CUDA extension for structured kernels (Cauchy and Vandermonde multiplication) not found. Install by going to extensions/kernels/ and running `python setup.py install`, for improved speed and memory efficiency. Note that the kernel changed for state-spaces 4.0 and must be recompiled.
Falling back on slow Cauchy and Vandermonde kernel. Install at least one of pykeops or the CUDA extension for better speed and memory efficiency.


## Configuration

In [2]:
# --- Paths ---
# NOTE(review): the project root is derived from the current working directory
# (three levels up), so this only resolves correctly when the kernel is started
# from the notebook's own folder — confirm before running from elsewhere.
current_dir = Path.cwd()
project_root = current_dir.parent.parent.parent
data_root = str(project_root / "src" / "datasets" / "etts" / "data")

# S4-specific base parameters (merged into every dataset config below)
s4_base_params = {
    "d_state": 64,
    "channels": 1,
    "bidirectional": False,  # Causal for forecasting
    "mode": "s4d",
    "dt_min": 1e-3,
    "dt_max": 1e-1,
}


# --- Agnostic Configurations for each ETT Dataset ---

def _make_ett_config(
    which: str,
    run_tag: str,
    *,
    depth: int = 4,
    wd: float = 2e-3,
) -> Dict[str, Any]:
    """Build the training configuration for one ETT dataset variant.

    The four dataset configs share every setting except the dataset name, the
    run directory, the model depth and the weight decay, so only those are
    parameters here; everything else is fixed in one place.

    Args:
        which: Dataset variant stored under ``data_loader_kwargs["which"]``
            (e.g. ``"ETTh1"``).
        run_tag: Suffix of the run directory, ``./runs/etts_s4_task_<run_tag>``.
        depth: Value stored under ``"depth"``.
        wd: Value stored under ``"wd"``.

    Returns:
        A fresh dict (with its own nested ``data_loader_kwargs`` dict) that
        also contains every entry of ``s4_base_params``.
    """
    return {
        "data_root": data_root,
        "batch": 64,
        "epochs": 50,
        "lr": 1e-4,
        "wd": wd,
        "amp": True,
        "save_dir": f"./runs/etts_s4_task_{run_tag}",
        "warmup_epochs": 5,
        "patience": 10,
        "min_delta": 0.001,
        "d_model": 256,
        "depth": depth,
        "dropout": 0.3,
        "mlp_ratio": 2.0,
        "droppath_final": 0.1,
        "layerscale_init": 1e-2,
        "residual_gain": 1.0,
        "pool": "none",
        "data_loader_kwargs": {
            "num_workers": 0,
            "which": which,
            "seq_len": 96,
            "pred_len": 24,
            "feature_mode": "target",
            "target_col": "OT",
            "split_ratio": (0.7, 0.1, 0.2),
            "normalize": "zscore",
            "pin_memory": False,
            "persistent_workers": False,
        },
        **s4_base_params,
    }


# ETTh1 is the only variant with a deeper model and a smaller weight decay.
etth1_config_s4 = _make_ett_config("ETTh1", "h1", depth=6, wd=1e-3)
etth2_config_s4 = _make_ett_config("ETTh2", "h2")
ettm1_config_s4 = _make_ett_config("ETTm1", "m1")
ettm2_config_s4 = _make_ett_config("ETTm2", "m2")

# Pick the dataset to train on by selecting one of the config templates.
# `args` is a copy (including the nested loader kwargs) so that the entries
# added to it later (block_cfg_ctor, device, amp override) do not leak back
# into the template dict.
_selected_config = ettm1_config_s4
args: Dict[str, Any] = {
    **_selected_config,
    "data_loader_kwargs": dict(_selected_config["data_loader_kwargs"]),
}

# Agnostic: swap model_kind between "s4" and "lmu" to compare.
# The banner is derived from the same variable so it cannot disagree with the
# block that is actually built.
model_kind = "s4"
print(f"Training {model_kind.upper()} on: {args['data_loader_kwargs']['which']}")

# Both the LMU and the S4 keyword sets are passed; this mirrors the original
# call. Presumably make_block_cfg_ctor picks the ones relevant to `kind` —
# TODO confirm against src.notebooks.etts.utils.
args["block_cfg_ctor"] = make_block_cfg_ctor(
    kind=model_kind,
    dropout=args["dropout"],
    mlp_ratio=args["mlp_ratio"],
    droppath_final=args["droppath_final"],
    layerscale_init=args["layerscale_init"],
    residual_gain=args["residual_gain"],
    pool=args["pool"],
    # LMU
    memory_size=256,
    # S4
    d_state=args["d_state"],
    channels=args["channels"],
    bidirectional=args["bidirectional"],
    mode=args["mode"],
    dt_min=args["dt_min"],
    dt_max=args["dt_max"],
)

# Device selection: prefer MPS, then CUDA, and fall back to the CPU.
if torch.backends.mps.is_available():
    device_name = "mps"
elif torch.cuda.is_available():
    device_name = "cuda"
else:
    device_name = "cpu"

args["device"] = torch.device(device_name)

if device_name == "mps":
    print("Using MPS (Apple Silicon)")
elif device_name == "cpu":
    # Mixed precision is switched off only on the CPU fallback.
    args["amp"] = False


Training S4 on: ETTm1
Using MPS (Apple Silicon)


## Training

In [None]:
from src.train_utils.trainer import Trainer
from src.models.v2.build_model import build_model
import os  # no longer used by this cell; import kept in case other cells rely on it

# Define the task
task = ETTSTask()

# MPS memory cap: limit this process to 0.9 of the recommended working-set size.
# The cell previously also assigned PYTORCH_MPS_HIGH_WATERMARK_RATIO = "0.0"
# *after* this call. That variable is read when the MPS allocator starts up, so
# the late assignment had no effect in this process, and its value ("0.0" means
# no limit) contradicted the 0.9 cap. It was removed; the effective behaviour
# (0.9 cap) is unchanged. To lift the limit instead, export the variable before
# starting the kernel and drop the call below.
if args.get("device") and args["device"].type == "mps":
    torch.mps.set_per_process_memory_fraction(0.9)

trainer = Trainer(args=args, task=task, model_builder=build_model)

# fit() returns the best validation metric and the path of the saved checkpoint.
best_metric, best_path = trainer.fit()

# Per-epoch metric history, consumed by the plotting cell.
history = trainer.history

print(f"\nTraining complete! Best validation {trainer.early_key}: {best_metric:.6f}")
print(f"Best model saved to: {best_path}")


  A = T @ M @ np.linalg.inv(T)
  A = T @ M @ np.linalg.inv(T)
  A = T @ M @ np.linalg.inv(T)
                                                        

💾 saved best model to ./runs/etts_s4_task_m1/best.pt
✅ new best mse 2.5395
Epoch 000/50 | train 1.2114/1.2114 | val 2.5395/2.5395 | t 118.5s/7.6s | lr 1.00e-07


train:  18%|█▊        | 134/760 [00:21<01:40,  6.21it/s]

## Plot History

In [None]:
import matplotlib.pyplot as plt

# One entry per side-by-side panel:
# (history keys to draw, y-axis label, panel title)
panels = (
    (("train_loss", "val_loss"), "MSE Loss", "Training Loss"),
    (("train_mae", "val_mae"), "MAE", "Mean Absolute Error"),
)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (series_keys, y_label, panel_title) in zip(axes, panels):
    # Train and validation curves share one panel so they can be compared directly.
    for series_key in series_keys:
        ax.plot(history[series_key], label=series_key)
    ax.set_xlabel("epoch")
    ax.set_ylabel(y_label)
    ax.legend()
    ax.set_title(panel_title)
    ax.grid(True)

fig.tight_layout()
plt.show()


## Test Evaluation

In [None]:
from src.models.v2.build_model import build_model

# Evaluate the checkpoint written during training; `task` and `best_path`
# come from the training cell, so that cell must have been run first.
eval_kwargs = dict(
    args=args,
    task=task,
    model_builder=build_model,
    best_model_path=best_path,
)
preds, targets = evaluate_best_model(**eval_kwargs)