In [1]:
import torch, os
device = "mps" if torch.backends.mps.is_available() else "cpu"
print("Device:", device)

Device: mps


In [2]:
# # tvar_neural_operator.py
# # Time-Varying AR as a Neural Operator over Continuous Delays
# # - Learns k_t(τ) with Fourier features (operator-style kernel in delay domain)
# # - Works across sampling rates via continuous-τ interpolation
# # - Includes multi-horizon loss and refresh-every-k rollout

# import math
# import numpy as np
# import torch
# import torch.nn as nn
# import torch.nn.functional as F

# # ----------------------------
# # Utilities
# # ----------------------------
# def to_tensor(x, device="cpu", dtype=torch.float32):
#     if isinstance(x, np.ndarray):
#         x = torch.from_numpy(x)
#     return x.to(device=device, dtype=dtype)

# def fractional_delay_samples(x,  # [B, T]
#                              tau_grid,  # [L] delays in *seconds*
#                              dt,        # scalar seconds per sample
#                              t_offset=0):
#     """
#     Gather x(t - τ_ℓ) with linear interpolation for a batch over time.
#     - x: [B,T]
#     - tau_grid: [L] (seconds), monotone increasing (e.g., np.linspace(Δmin, Δmax, L))
#     - dt: float (seconds/sample)
#     - t_offset: starting absolute time index (in samples) for the first output time step.
#       For plain teacher-forced 1-step at times t = 0..T-1, set t_offset=0.
#       For test windows taken from the middle of a long series, set t_offset to that start index.
#     Returns:
#       Xlags: [B, T, L] with x(t - τℓ).
#     """
#     B, T = x.shape
#     device = x.device
#     L = len(tau_grid)
#     # Convert τ (sec) -> τ_idx (samples)
#     tau_idx = to_tensor(np.asarray(tau_grid) / dt, device=device)  # [L]
#     # times in *sample indices* for each step we predict
#     t_idx = to_tensor(np.arange(T) + t_offset, device=device).view(1, T, 1)  # [1,T,1]
#     # desired source indices (float)
#     src = t_idx - tau_idx.view(1, 1, L)  # [1,T,L]
#     src0 = torch.clamp(torch.floor(src), 0, T - 1)        # lower index
#     src1 = torch.clamp(src0 + 1, 0, T - 1)                # upper index
#     w = (src - src0).to(x.dtype)                          # interpolation weight

#     # Gather with batch broadcasting
#     idx0 = src0.long().expand(B, -1, -1)  # [B,T,L]
#     idx1 = src1.long().expand(B, -1, -1)  # [B,T,L]
#     x_exp = x.unsqueeze(-1).expand(-1, -1, L)  # [B,T,L] (for fancy indexing we gather per time below)
#     # We need to gather per time step; easiest is to reshape
#     x0 = torch.gather(x_exp, 1, idx0)  # [B,T,L]
#     x1 = torch.gather(x_exp, 1, idx1)  # [B,T,L]
#     return (1 - w) * x0 + w * x1       # [B,T,L]

# def total_variation_time(k):
#     # k: [B,T,L] -> TV along time
#     return (k[:, 1:, :] - k[:, :-1, :]).abs().mean()

# def l1_energy(k):
#     return k.abs().mean()

# # ----------------------------
# # Model: TVAROperator
# # ----------------------------
# class TVAROperator(nn.Module):
#     """
#     Time-varying AR as a neural operator over continuous delays:
#       y_t = c(t) + ∫ k_t(τ) x(t-τ) dτ  ≈  c(t) + Σ k_t(τ_ℓ) x(t-τ_ℓ) Δτ
#     We discretize τ on a grid and parameterize k_t(τ) via Fourier features of τ,
#     with time-varying amplitudes produced by a small causal context encoder.

#     Inputs:
#       L        : number of delay points (τ samples) within [tau_min, tau_max]
#       tau_min  : minimum delay (seconds) > 0
#       tau_max  : maximum delay (seconds) > tau_min
#       n_modes  : # Fourier modes for kernel over τ
#       hidden   : channels in context encoder
#     """
#     def __init__(self, L=128, tau_min=0.0, tau_max=0.5, n_modes=16, hidden=64):
#         super().__init__()
#         assert tau_max > tau_min >= 0.0
#         self.L = L
#         self.tau_min = tau_min
#         self.tau_max = tau_max
#         self.register_buffer("tau_grid", torch.linspace(tau_min, tau_max, L))  # seconds

#         # Causal context encoder over time (1D convs, left padding)
#         self.ctx = nn.Sequential(
#             nn.Conv1d(1, hidden, kernel_size=9, padding=8, dilation=2), nn.ReLU(),
#             nn.Conv1d(hidden, hidden, kernel_size=5, padding=4, dilation=2), nn.ReLU()
#         )

#         # Fourier basis over τ (fixed frequencies 0..π * modes)
#         self.register_buffer("freqs", torch.linspace(0.0, math.pi, n_modes))
#         self.head_a = nn.Linear(hidden, n_modes)  # cos amplitudes
#         self.head_b = nn.Linear(hidden, n_modes)  # sin amplitudes
#         self.bias   = nn.Linear(hidden, 1)        # c(t), time-varying intercept

#         # Optional global gain to stabilize scale
#         self.kernel_gain = nn.Parameter(torch.tensor(0.1))

#     def make_kernel(self, h):
#         """
#         h: [B,T,H] context features -> k: [B,T,L] kernel over delays τ
#         k_t(τ) = Σ_m [ a_m(t) cos(ω_m τ) + b_m(t) sin(ω_m τ) ]
#         """
#         B, T, H = h.shape
#         a = self.head_a(h)  # [B,T,M]
#         b = self.head_b(h)  # [B,T,M]
#         # Build Fourier features over τ once, broadcast to batch/time
#         tau = self.tau_grid.view(1, 1, self.L, 1)         # [1,1,L,1]
#         omega = self.freqs.view(1, 1, 1, -1)              # [1,1,1,M]
#         cosF = torch.cos(omega * tau)                     # [1,1,L,M]
#         sinF = torch.sin(omega * tau)                     # [1,1,L,M]
#         # combine with amplitudes
#         k = (a.unsqueeze(2) * cosF + b.unsqueeze(2) * sinF).sum(-1)  # [B,T,L]
#         return self.kernel_gain * k

#     # def forward(self, x, dt, t_offset=0, return_kernel=False):
#     #     """
#     #     x: [B,T] observed scalar series
#     #     dt: float seconds per sample (tensor or python float)
#     #     t_offset: absolute index (samples) for time=0 of x wrt the original series (for eval alignment)
#     #     returns:
#     #       yhat: [B,T]
#     #       (optional) k: [B,T,L], c: [B,T], Xlags: [B,T,L]
#     #     """
#     #     B, T = x.shape
#     #     # causal context features
#     #     # x1 = x.unsqueeze(1)                               # [B,1,T]
#     #     # h  = self.ctx(F.pad(x1, (32, 0)))                 # left pad -> causal
#     #     # hT = h.transpose(1, 2)                            # [B,T,H]
#     #     # k  = self.make_kernel(hT)                         # [B,T,L]
#     #     # c  = self.bias(hT).squeeze(-1)                    # [B,T]

#     #     # # sample lagged signal at continuous τ-grid
#     #     # Xlags = fractional_delay_samples(x, self.tau_grid, float(dt), t_offset=t_offset)  # [B,T,L]

#     #    # inside TVAROperator.forward(...)
#     #     x1 = x.unsqueeze(1)                               # [B,1,T]
#     #     h  = self.ctx(F.pad(x1, (32, 0)))                 # [B,H,T+32] due to extra pad
#     #     h  = h[..., -x.shape[-1]:]                        # <-- crop to last T to align
#     #     hT = h.transpose(1, 2)                            # [B,T,H]
#     #     k  = self.make_kernel(hT)                         # [B,T,L]
#     #     c  = self.bias(hT).squeeze(-1)                    # [B,T]
#     #     Xlags = fractional_delay_samples(x, self.tau_grid, float(dt), t_offset=t_offset)  # [B,T,L]
#     #     yhat = (k * Xlags).sum(-1) * delta_tau + c

#     #     # Riemann sum over τ (Δτ constant)
#     #     delta_tau = (self.tau_max - self.tau_min) / max(self.L - 1, 1)
#     #     yhat = (k * Xlags).sum(-1) * delta_tau + c        # [B,T]
#     #     if return_kernel:
#     #         return yhat, k, c, Xlags
#     #     return yhat
#     def forward(self, x, dt, t_offset=0, return_kernel=False):
#         B, T = x.shape

#         # causal context features
#         x1 = x.unsqueeze(1)                               # [B,1,T]
#         h  = self.ctx(F.pad(x1, (32, 0)))                 # [B,H,T+32] due to extra left pad
#         h  = h[..., -T:]                                   # <-- crop back to length T
#         hT = h.transpose(1, 2)                            # [B,T,H]

#         k  = self.make_kernel(hT)                         # [B,T,L]
#         c  = self.bias(hT).squeeze(-1)                    # [B,T]

#         # sample lagged signal at continuous τ-grid
#         Xlags = fractional_delay_samples(
#             x, self.tau_grid, float(dt), t_offset=t_offset
#         )                                                 # [B,T,L]

#         # Riemann sum over τ (Δτ constant)  <-- define BEFORE yhat
#         delta_tau = (self.tau_max - self.tau_min) / max(self.L - 1, 1)

#         yhat = (k * Xlags).sum(-1) * delta_tau + c        # [B,T]

#         if return_kernel:
#             return yhat, k, c, Xlags
#         return yhat


# # ----------------------------
# # Losses & training helpers
# # ----------------------------
# def rollout_multi_horizon(model, x, dt, horizons, refresh_every=1):
#     """
#     Multi-horizon prediction with optional refresh-every-k hybrid.
#     - x: [B,T] full sequence used for teacher forcing at refresh points
#     - horizons: list like [1,5,20]
#     Returns:
#       dict: {h: yhat_h [B,T]} where yhat_h aligns so that yhat_h[:, t] predicts x[:, t+h]
#     """
#     B, T = x.shape
#     device = x.device
#     outs = {}
#     # For each horizon, we do a causal pass that mixes teacher-forced prefixes and open-loop segments.
#     # Simple approach: simulate step-by-step and store predictions for all horizons we need.
#     max_h = max(horizons)
#     # buffer for open-loop generation
#     x_gen = x.clone()  # start from truth; we overwrite future points when we go open-loop

#     for t in range(T - max_h):
#         # every refresh_every steps, reset generator segment to truth
#         if (t % max(1, int(refresh_every))) == 0:
#             # ensure the last window aligns with truth up to t
#             x_gen[:, :t+1] = x[:, :t+1]

#         # predict next step using the operator at the *current* dt
#         # we need a window covering (at least) the τ_max back in time; the model samples internally
#         # For efficiency, we call model once per block; here we do a tiny call that returns the immediate next step.
#         yhat_step = model(x_gen[:, :t+1], dt, t_offset=0)  # [B, t+1]
#         next_pred = yhat_step[:, -1]                       # [B]
#         # write predicted (open-loop) step into x_gen
#         x_gen[:, t+1] = next_pred

#         # record the horizons that equal 1 at this step (and later ones below)
#         # We'll collect full arrays after the loop.
#         pass

#     # After generating, compute horizon-specific aligned predictions in one shot
#     for h in horizons:
#         # For each t, yhat_h[t] should be the model's prediction of x[t+h].
#         # A simple approximation from the above loop is to take x_gen shifted by -h.
#         yhat_h = torch.zeros_like(x)
#         yhat_h[:, :-h] = x_gen[:, h:]
#         yhat_h[:, -h:] = x_gen[:, -1:].expand(-1, h)  # dummy fill (unused in loss)
#         outs[h] = yhat_h
#     return outs

# def loss_step(model, batch_x, dt, horizons=(1,5,20), refresh_every=1,
#               lambda_tv=1e-3, lambda_l1=1e-4):
#     """
#     Compute:
#       - 1-step MSE (teacher-forced, direct forward)
#       - multi-horizon rollout MSE with refresh-every-k hybrid
#       - TV and L1 regularization on kernels
#     """
#     y1, k, c, _ = model(batch_x, dt, return_kernel=True)   # teacher-forced pass
#     # 1-step alignment: y1[:, :-1] predicts x[:, 1:]
#     mse_1 = F.mse_loss(y1[:, :-1], batch_x[:, 1:])

#     # rollout losses
#     outs = rollout_multi_horizon(model, batch_x, dt, horizons, refresh_every=refresh_every)
#     mse_roll = 0.0
#     for h in horizons:
#         # only compare where target exists
#         mse_roll = mse_roll + F.mse_loss(outs[h][:, :-h], batch_x[:, h:])
#     mse_roll = mse_roll / len(horizons)

#     # regularization
#     reg = lambda_tv * total_variation_time(k) + lambda_l1 * l1_energy(k)
#     return mse_1, mse_roll, reg, {"k": k.detach()}

# # ----------------------------
# # Data prep: toy Lorenz x(t)
# # ----------------------------
# def lorenz(T_steps=30000, dt=0.005, sigma=10.0, rho=28.0, beta=8/3, x0=(1.0,1.0,1.0)):
#     x, y, z = x0
#     xs, ys, zs = [], [], []
#     for _ in range(T_steps):
#         def f(x,y,z):
#             dx = sigma*(y-x)
#             dy = x*(rho - z) - y
#             dz = x*y - beta*z
#             return dx, dy, dz
#         k1 = f(x,y,z)
#         k2 = f(x + 0.5*dt*k1[0], y + 0.5*dt*k1[1], z + 0.5*dt*k1[2])
#         k3 = f(x + 0.5*dt*k2[0], y + 0.5*dt*k2[1], z + 0.5*dt*k2[2])
#         k4 = f(x + dt*k3[0], y + dt*k3[1], z + dt*k3[2])
#         x += (dt/6.0)*(k1[0] + 2*k2[0] + 2*k3[0] + k4[0])
#         y += (dt/6.0)*(k1[1] + 2*k2[1] + 2*k3[1] + k4[1])
#         z += (dt/6.0)*(k1[2] + 2*k2[2] + 2*k3[2] + k4[2])
#         xs.append(x); ys.append(y); zs.append(z)
#     return np.array(xs), np.array(ys), np.array(zs)

# # ----------------------------
# # Minimal training loop (demo)
# # ----------------------------
# if __name__ == "__main__":
#     device = "cuda" if torch.cuda.is_available() else "cpu"

#     # --- Generate toy data (replace with your real series) ---
#     dt0 = 0.005
#     xs, ys, zs = lorenz(T_steps=20000, dt=dt0)
#     x_np = xs.astype(np.float32)
#     # standardize (optional but helpful)
#     x_np = (x_np - x_np.mean()) / (x_np.std() + 1e-8)

#     # Train / test split
#     T = len(x_np)
#     split = int(0.8 * T)
#     x_train = to_tensor(x_np[:split][None, :], device)   # [B=1, T_tr]
#     x_test  = to_tensor(x_np[split:][None, :], device)   # [B=1, T_te]

#     # --- Build model ---
#     # Choose τ window in *seconds* (e.g., up to 0.5 s; tune to your data)
#     tau_min, tau_max = 0.0, 0.5
#     model = TVAROperator(L=128, tau_min=tau_min, tau_max=tau_max, n_modes=16, hidden=64).to(device)
#     opt = torch.optim.Adam(model.parameters(), lr=2e-3, weight_decay=1e-6)

#     # --- Train ---
#     horizons = (1, 5, 20)
#     refresh_every = 5   # hybrid rollout during training (matches your pipeline idea)

#     for epoch in range(20):
#         model.train()
#         mse1, mser, reg, extras = loss_step(model, x_train, dt0, horizons=horizons,
#                                             refresh_every=refresh_every,
#                                             lambda_tv=1e-3, lambda_l1=1e-4)
#         loss = mse1 + mser + reg
#         opt.zero_grad(set_to_none=True)
#         loss.backward()
#         nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#         opt.step()

#         if (epoch+1) % 5 == 0:
#             print(f"[{epoch+1:03d}] 1-step: {mse1.item():.4e}  roll:{mser.item():.4e}  reg:{reg.item():.4e}")

#     # --- Evaluate on test at original dt ---
#     model.eval()
#     with torch.no_grad():
#         # Teacher-forced 1-step MSE
#         yhat_test = model(x_test, dt0)        # [1,Tte]
#         mse1_te = F.mse_loss(yhat_test[:, :-1], x_test[:, 1:]).item()

#         # Hybrid rollout with your knob (refresh-every-k)
#         k_refresh = 20
#         outs = rollout_multi_horizon(model, x_test, dt0, horizons=(1,5,20,50), refresh_every=k_refresh)
#         mse_roll_20 = F.mse_loss(outs[20][:, :-20], x_test[:, 20:]).item()
#         print(f"Test 1-step MSE: {mse1_te:.4e}   20-step (refresh={k_refresh}) MSE: {mse_roll_20:.4e}")

#     # --- (Key demo) Generalize to a different sampling rate ---
#     # Downsample by 2 -> dt' = 2*dt0 ; the *same model* is used, only dt changes.
#     with torch.no_grad():
#         x_coarse = x_np[::2]
#         x_coarse = to_tensor(((x_coarse - x_coarse.mean()) / (x_coarse.std() + 1e-8))[None, :], device)
#         dt1 = dt0 * 2.0
#         yhat_coarse = model(x_coarse, dt1)  # uses same τ-grid in seconds, resampled via interpolation
#         mse1_coarse = F.mse_loss(yhat_coarse[:, :-1], x_coarse[:, 1:]).item()
#         print(f"Generalization to dt'={dt1:.4f}: 1-step MSE={mse1_coarse:.4e}")

#     # --- Inspect learned kernel on test (optional) ---
#     with torch.no_grad():
#         _, k_test, _, _ = model(x_test, dt0, return_kernel=True)  # [1,Tte,L]
#         k_mean = k_test.mean(dim=1).squeeze(0).cpu().numpy()      # [L]
#         print("Kernel summary over τ (mean over time): mean=", k_mean.mean(), " std=", k_mean.std())


In [11]:
# tvar_neural_operator.py
# Time-Varying AR as a Neural Operator over Continuous Delays
# - Learns k_t(τ) with Fourier features (operator-style kernel in delay domain)
# - Works across sampling rates via continuous-τ interpolation
# - Includes multi-horizon loss and refresh-every-k rollout

import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# ----------------------------
# Utilities
# ----------------------------
def to_tensor(x, device="mpc", dtype=torch.float32):
    if isinstance(x, np.ndarray):
        x = torch.from_numpy(x)
    return x.to(device=device, dtype=dtype)

# def fractional_delay_samples(x,  # [B, T]
#                              tau_grid,  # [L] delays in *seconds*
#                              dt,        # scalar seconds per sample
#                              t_offset=0):
#     """
#     Gather x(t - τ_ℓ) with linear interpolation for a batch over time.
#     - x: [B,T]
#     - tau_grid: [L] (seconds), monotone increasing (e.g., np.linspace(Δmin, Δmax, L))
#     - dt: float (seconds/sample)
#     - t_offset: starting absolute time index (in samples) for the first output time step.
#       For plain teacher-forced 1-step at times t = 0..T-1, set t_offset=0.
#       For test windows taken from the middle of a long series, set t_offset to that start index.
#     Returns:
#       Xlags: [B, T, L] with x(t - τℓ).
#     """
#     B, T = x.shape
#     device = x.device
#     L = len(tau_grid)
#     # Convert τ (sec) -> τ_idx (samples)
#     tau_idx = to_tensor(np.asarray(tau_grid) / dt, device=device)  # [L]
#     # times in *sample indices* for each step we predict
#     t_idx = to_tensor(np.arange(T) + t_offset, device=device).view(1, T, 1)  # [1,T,1]
#     # desired source indices (float)
#     src = t_idx - tau_idx.view(1, 1, L)  # [1,T,L]
#     src0 = torch.clamp(torch.floor(src), 0, T - 1)        # lower index
#     src1 = torch.clamp(src0 + 1, 0, T - 1)                # upper index
#     w = (src - src0).to(x.dtype)                          # interpolation weight

#     # Gather with batch broadcasting
#     idx0 = src0.long().expand(B, -1, -1)  # [B,T,L]
#     idx1 = src1.long().expand(B, -1, -1)  # [B,T,L]
#     x_exp = x.unsqueeze(-1).expand(-1, -1, L)  # [B,T,L] (for fancy indexing we gather per time below)
#     # We need to gather per time step; easiest is to reshape
#     x0 = torch.gather(x_exp, 1, idx0)  # [B,T,L]
#     x1 = torch.gather(x_exp, 1, idx1)  # [B,T,L]
#     return (1 - w) * x0 + w * x1       # [B,T,L]

# def fractional_delay_samples(x,  # [B, T]
#                              tau_grid,  # [L] delays in *seconds* (torch tensor or numpy)
#                              dt,        # float seconds per sample
#                              t_offset=0):
#     """
#     Gather x(t - τ_ℓ) with linear interpolation for a batch over time.
#     Returns Xlags: [B, T, L]
#     """
#     B, T = x.shape
#     device = x.device
#     dtype  = x.dtype

#     # ---- make τ indices on the same device, no NumPy conversion ----
#     if isinstance(tau_grid, torch.Tensor):
#         tau_idx = tau_grid.to(device=device, dtype=dtype) / float(dt)   # [L]
#     else:
#         tau_idx = torch.as_tensor(tau_grid, device=device, dtype=dtype) / float(dt)  # [L]

#     # time indices (in samples) for each step
#     t_idx = torch.arange(T, device=device, dtype=dtype).view(1, T, 1) + float(t_offset)  # [1,T,1]

#     # desired (float) source indices and interpolation weights
#     src  = t_idx - tau_idx.view(1, 1, -1)                                  # [1,T,L]
#     src0 = torch.clamp(torch.floor(src), 0, T - 1).to(torch.long)          # [1,T,L]
#     src1 = torch.clamp(src0 + 1,          0, T - 1)                        # [1,T,L]
#     w    = (src - src0.to(dtype)).to(dtype)                                 # [1,T,L]

#     # gather values
#     idx0 = src0.expand(B, -1, -1)                                          # [B,T,L]
#     idx1 = src1.expand(B, -1, -1)                                          # [B,T,L]
#     x_exp = x.unsqueeze(-1).expand(-1, -1, tau_idx.numel())                # [B,T,L]
#     x0 = torch.gather(x_exp, 1, idx0)                                      # [B,T,L]
#     x1 = torch.gather(x_exp, 1, idx1)                                      # [B,T,L]

#     return (1 - w) * x0 + w * x1

def fractional_delay_samples(x, tau_grid, dt, t_offset=0):
    B, T = x.shape
    device, dtype = x.device, x.dtype

    # keep everything on-device; no numpy
    if isinstance(tau_grid, torch.Tensor):
        tau_idx = tau_grid.to(device=device, dtype=dtype) / float(dt)
    else:
        tau_idx = torch.as_tensor(tau_grid, device=device, dtype=dtype) / float(dt)

    t_idx = torch.arange(T, device=device, dtype=dtype).view(1, T, 1) + float(t_offset)

    src  = t_idx - tau_idx.view(1, 1, -1)                          # [1,T,L]
    src0 = torch.clamp(torch.floor(src), 0, T - 1).to(torch.long)   # [1,T,L]
    src1 = torch.clamp(src0 + 1,          0, T - 1)                 # [1,T,L]
    w    = (src - src0.to(dtype)).to(dtype)                         # [1,T,L]

    idx0 = src0.expand(B, -1, -1).contiguous()                      # [B,T,L]
    idx1 = src1.expand(B, -1, -1).contiguous()                      # [B,T,L]

    # use repeat (real memory) not expand (as_strided view)
    L = tau_idx.numel()
    x_exp = x.unsqueeze(-1).repeat(1, 1, L).contiguous()            # [B,T,L]

    x0 = torch.gather(x_exp, 1, idx0)
    x1 = torch.gather(x_exp, 1, idx1)
    return (1 - w) * x0 + w * x1



def total_variation_time(k):
    # k: [B,T,L] -> TV along time
    return (k[:, 1:, :] - k[:, :-1, :]).abs().mean()

def l1_energy(k):
    return k.abs().mean()

# ----------------------------
# Model: TVAROperator
# ----------------------------
class TVAROperator(nn.Module):
    """
    Time-varying AR as a neural operator over continuous delays:
      y_t = c(t) + ∫ k_t(τ) x(t-τ) dτ  ≈  c(t) + Σ k_t(τ_ℓ) x(t-τ_ℓ) Δτ
    We discretize τ on a grid and parameterize k_t(τ) via Fourier features of τ,
    with time-varying amplitudes produced by a small causal context encoder.

    Inputs:
      L        : number of delay points (τ samples) within [tau_min, tau_max]
      tau_min  : minimum delay (seconds) > 0
      tau_max  : maximum delay (seconds) > tau_min
      n_modes  : # Fourier modes for kernel over τ
      hidden   : channels in context encoder
    """
    def __init__(self, L=128, tau_min=0.0, tau_max=0.5, n_modes=16, hidden=64):
        super().__init__()
        assert tau_max > tau_min >= 0.0
        self.L = L
        self.tau_min = tau_min
        self.tau_max = tau_max
        self.register_buffer("tau_grid", torch.linspace(tau_min, tau_max, L))  # seconds

        # Causal context encoder over time (1D convs, left padding)
        self.ctx = nn.Sequential(
            nn.Conv1d(1, hidden, kernel_size=9, padding=8, dilation=2), nn.ReLU(),
            nn.Conv1d(hidden, hidden, kernel_size=5, padding=4, dilation=2), nn.ReLU()
        )

        # Fourier basis over τ (fixed frequencies 0..π * modes)
        self.register_buffer("freqs", torch.linspace(0.0, math.pi, n_modes))
        self.head_a = nn.Linear(hidden, n_modes)  # cos amplitudes
        self.head_b = nn.Linear(hidden, n_modes)  # sin amplitudes
        self.bias   = nn.Linear(hidden, 1)        # c(t), time-varying intercept

        # Optional global gain to stabilize scale
        self.kernel_gain = nn.Parameter(torch.tensor(0.1))

    def make_kernel(self, h):
        """
        h: [B,T,H] context features -> k: [B,T,L] kernel over delays τ
        k_t(τ) = Σ_m [ a_m(t) cos(ω_m τ) + b_m(t) sin(ω_m τ) ]
        """
        B, T, H = h.shape
        a = self.head_a(h)  # [B,T,M]
        b = self.head_b(h)  # [B,T,M]
        # Build Fourier features over τ once, broadcast to batch/time
        tau = self.tau_grid.view(1, 1, self.L, 1)         # [1,1,L,1]
        omega = self.freqs.view(1, 1, 1, -1)              # [1,1,1,M]
        cosF = torch.cos(omega * tau)                     # [1,1,L,M]
        sinF = torch.sin(omega * tau)                     # [1,1,L,M]
        # combine with amplitudes
        k = (a.unsqueeze(2) * cosF + b.unsqueeze(2) * sinF).sum(-1)  # [B,T,L]
        return self.kernel_gain * k

    # def forward(self, x, dt, t_offset=0, return_kernel=False):
    #     """
    #     x: [B,T] observed scalar series
    #     dt: float seconds per sample (tensor or python float)
    #     t_offset: absolute index (samples) for time=0 of x wrt the original series (for eval alignment)
    #     returns:
    #       yhat: [B,T]
    #       (optional) k: [B,T,L], c: [B,T], Xlags: [B,T,L]
    #     """
    #     B, T = x.shape
    #     # causal context features
    #     # x1 = x.unsqueeze(1)                               # [B,1,T]
    #     # h  = self.ctx(F.pad(x1, (32, 0)))                 # left pad -> causal
    #     # hT = h.transpose(1, 2)                            # [B,T,H]
    #     # k  = self.make_kernel(hT)                         # [B,T,L]
    #     # c  = self.bias(hT).squeeze(-1)                    # [B,T]

    #     # # sample lagged signal at continuous τ-grid
    #     # Xlags = fractional_delay_samples(x, self.tau_grid, float(dt), t_offset=t_offset)  # [B,T,L]

    #    # inside TVAROperator.forward(...)
    #     x1 = x.unsqueeze(1)                               # [B,1,T]
    #     h  = self.ctx(F.pad(x1, (32, 0)))                 # [B,H,T+32] due to extra pad
    #     h  = h[..., -x.shape[-1]:]                        # <-- crop to last T to align
    #     hT = h.transpose(1, 2)                            # [B,T,H]
    #     k  = self.make_kernel(hT)                         # [B,T,L]
    #     c  = self.bias(hT).squeeze(-1)                    # [B,T]
    #     Xlags = fractional_delay_samples(x, self.tau_grid, float(dt), t_offset=t_offset)  # [B,T,L]
    #     yhat = (k * Xlags).sum(-1) * delta_tau + c

    #     # Riemann sum over τ (Δτ constant)
    #     delta_tau = (self.tau_max - self.tau_min) / max(self.L - 1, 1)
    #     yhat = (k * Xlags).sum(-1) * delta_tau + c        # [B,T]
    #     if return_kernel:
    #         return yhat, k, c, Xlags
    #     return yhat
    def forward(self, x, dt, t_offset=0, return_kernel=False):
        B, T = x.shape

        # causal context features
        x1 = x.unsqueeze(1)                               # [B,1,T]
        h  = self.ctx(F.pad(x1, (32, 0)))                 # [B,H,T+32] due to extra left pad
        h  = h[..., -T:]                                   # <-- crop back to length T
        hT = h.transpose(1, 2)                            # [B,T,H]

        k  = self.make_kernel(hT)                         # [B,T,L]
        c  = self.bias(hT).squeeze(-1)                    # [B,T]

        # sample lagged signal at continuous τ-grid
        Xlags = fractional_delay_samples(
            x, self.tau_grid, float(dt), t_offset=t_offset
        )                                                 # [B,T,L]

        # Riemann sum over τ (Δτ constant)  <-- define BEFORE yhat
        delta_tau = (self.tau_max - self.tau_min) / max(self.L - 1, 1)

        yhat = (k * Xlags).sum(-1) * delta_tau + c        # [B,T]

        if return_kernel:
            return yhat, k, c, Xlags
        return yhat


# ----------------------------
# Losses & training helpers
# ----------------------------
def rollout_multi_horizon(model, x, dt, horizons, refresh_every=1):
    """
    Multi-horizon prediction with optional refresh-every-k hybrid.
    - x: [B,T] full sequence used for teacher forcing at refresh points
    - horizons: list like [1,5,20]
    Returns:
      dict: {h: yhat_h [B,T]} where yhat_h aligns so that yhat_h[:, t] predicts x[:, t+h]
    """
    B, T = x.shape
    device = x.device
    outs = {}
    # For each horizon, we do a causal pass that mixes teacher-forced prefixes and open-loop segments.
    # Simple approach: simulate step-by-step and store predictions for all horizons we need.
    max_h = max(horizons)
    # buffer for open-loop generation
    x_gen = x.clone()  # start from truth; we overwrite future points when we go open-loop

    for t in range(T - max_h):
        # every refresh_every steps, reset generator segment to truth
        if (t % max(1, int(refresh_every))) == 0:
            # ensure the last window aligns with truth up to t
            x_gen[:, :t+1] = x[:, :t+1]

        # predict next step using the operator at the *current* dt
        # we need a window covering (at least) the τ_max back in time; the model samples internally
        # For efficiency, we call model once per block; here we do a tiny call that returns the immediate next step.
        yhat_step = model(x_gen[:, :t+1], dt, t_offset=0)  # [B, t+1]
        next_pred = yhat_step[:, -1]                       # [B]
        # write predicted (open-loop) step into x_gen
        x_gen[:, t+1] = next_pred

        # record the horizons that equal 1 at this step (and later ones below)
        # We'll collect full arrays after the loop.
        pass

    # After generating, compute horizon-specific aligned predictions in one shot
    for h in horizons:
        # For each t, yhat_h[t] should be the model's prediction of x[t+h].
        # A simple approximation from the above loop is to take x_gen shifted by -h.
        yhat_h = torch.zeros_like(x)
        yhat_h[:, :-h] = x_gen[:, h:]
        yhat_h[:, -h:] = x_gen[:, -1:].expand(-1, h)  # dummy fill (unused in loss)
        outs[h] = yhat_h
    return outs

def loss_step(model, batch_x, dt, horizons=(1,5,20), refresh_every=1,
              lambda_tv=1e-3, lambda_l1=1e-4):
    """
    Compute:
      - 1-step MSE (teacher-forced, direct forward)
      - multi-horizon rollout MSE with refresh-every-k hybrid
      - TV and L1 regularization on kernels
    """
    y1, k, c, _ = model(batch_x, dt, return_kernel=True)   # teacher-forced pass
    # 1-step alignment: y1[:, :-1] predicts x[:, 1:]
    mse_1 = F.mse_loss(y1[:, :-1], batch_x[:, 1:])

    # rollout losses
    outs = rollout_multi_horizon(model, batch_x, dt, horizons, refresh_every=refresh_every)
    mse_roll = 0.0
    for h in horizons:
        # only compare where target exists
        mse_roll = mse_roll + F.mse_loss(outs[h][:, :-h], batch_x[:, h:])
    mse_roll = mse_roll / len(horizons)

    # regularization
    reg = lambda_tv * total_variation_time(k) + lambda_l1 * l1_energy(k)
    return mse_1, mse_roll, reg, {"k": k.detach()}

# ----------------------------
# Data prep: toy Lorenz x(t)
# ----------------------------
def lorenz(T_steps=30000, dt=0.005, sigma=10.0, rho=28.0, beta=8/3, x0=(1.0,1.0,1.0)):
    x, y, z = x0
    xs, ys, zs = [], [], []
    for _ in range(T_steps):
        def f(x,y,z):
            dx = sigma*(y-x)
            dy = x*(rho - z) - y
            dz = x*y - beta*z
            return dx, dy, dz
        k1 = f(x,y,z)
        k2 = f(x + 0.5*dt*k1[0], y + 0.5*dt*k1[1], z + 0.5*dt*k1[2])
        k3 = f(x + 0.5*dt*k2[0], y + 0.5*dt*k2[1], z + 0.5*dt*k2[2])
        k4 = f(x + dt*k3[0], y + dt*k3[1], z + dt*k3[2])
        x += (dt/6.0)*(k1[0] + 2*k2[0] + 2*k3[0] + k4[0])
        y += (dt/6.0)*(k1[1] + 2*k2[1] + 2*k3[1] + k4[1])
        z += (dt/6.0)*(k1[2] + 2*k2[2] + 2*k3[2] + k4[2])
        xs.append(x); ys.append(y); zs.append(z)
    return np.array(xs), np.array(ys), np.array(zs)

# ----------------------------
# Minimal training loop (demo)
# ----------------------------
if __name__ == "__main__":
    device = "mps" if torch.backends.mps.is_available() else "cpu"

    # --- Generate toy data (replace with your real series) ---
    dt0 = 0.005
    xs, ys, zs = lorenz(T_steps=20000, dt=dt0)
    x_np = xs.astype(np.float32)
    # standardize (optional but helpful)
    x_np = (x_np - x_np.mean()) / (x_np.std() + 1e-8)

    # Train / test split
    T = len(x_np)
    split = int(0.8 * T)
    x_train = to_tensor(x_np[:split][None, :], device)   # [B=1, T_tr]
    x_test  = to_tensor(x_np[split:][None, :], device)   # [B=1, T_te]

    # --- Build model ---
    # Choose τ window in *seconds* (e.g., up to 0.5 s; tune to your data)
    tau_min, tau_max = 0.0, 0.5
    model = TVAROperator(L=128, tau_min=tau_min, tau_max=tau_max, n_modes=16, hidden=64).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=2e-3, weight_decay=1e-6)

    # --- Train ---
    horizons = (1, 5, 20)
    refresh_every = 5   # hybrid rollout during training (matches your pipeline idea)

    for epoch in range(20):
        model.train()
        mse1, mser, reg, extras = loss_step(model, x_train, dt0, horizons=horizons,
                                            refresh_every=refresh_every,
                                            lambda_tv=1e-3, lambda_l1=1e-4)
        loss = mse1 + mser + reg
        opt.zero_grad(set_to_none=True)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()

        if (epoch+1) % 5 == 0:
            print(f"[{epoch+1:03d}] 1-step: {mse1.item():.4e}  roll:{mser.item():.4e}  reg:{reg.item():.4e}")

    # --- Evaluate on test at original dt ---
    model.eval()
    with torch.no_grad():
        # Teacher-forced 1-step MSE
        yhat_test = model(x_test, dt0)        # [1,Tte]
        mse1_te = F.mse_loss(yhat_test[:, :-1], x_test[:, 1:]).item()

        # Hybrid rollout with your knob (refresh-every-k)
        k_refresh = 20
        outs = rollout_multi_horizon(model, x_test, dt0, horizons=(1,5,20,50), refresh_every=k_refresh)
        mse_roll_20 = F.mse_loss(outs[20][:, :-20], x_test[:, 20:]).item()
        print(f"Test 1-step MSE: {mse1_te:.4e}   20-step (refresh={k_refresh}) MSE: {mse_roll_20:.4e}")

    # --- (Key demo) Generalize to a different sampling rate ---
    # Downsample by 2 -> dt' = 2*dt0 ; the *same model* is used, only dt changes.
    with torch.no_grad():
        x_coarse = x_np[::2]
        x_coarse = to_tensor(((x_coarse - x_coarse.mean()) / (x_coarse.std() + 1e-8))[None, :], device)
        dt1 = dt0 * 2.0
        yhat_coarse = model(x_coarse, dt1)  # uses same τ-grid in seconds, resampled via interpolation
        mse1_coarse = F.mse_loss(yhat_coarse[:, :-1], x_coarse[:, 1:]).item()
        print(f"Generalization to dt'={dt1:.4f}: 1-step MSE={mse1_coarse:.4e}")

    # --- Inspect learned kernel on test (optional) ---
    with torch.no_grad():
        _, k_test, _, _ = model(x_test, dt0, return_kernel=True)  # [1,Tte,L]
        k_mean = k_test.mean(dim=1).squeeze(0).cpu().numpy()      # [L]
        print("Kernel summary over τ (mean over time): mean=", k_mean.mean(), " std=", k_mean.std())


KeyboardInterrupt: 

In [12]:
# ----------------------------
# Data prep: toy Lorenz x(t)
# ----------------------------
def lorenz(T_steps=30000, dt=0.005, sigma=10.0, rho=28.0, beta=8/3, x0=(1.0,1.0,1.0)):
    x, y, z = x0
    xs, ys, zs = [], [], []
    for _ in range(T_steps):
        def f(x,y,z):
            dx = sigma*(y-x)
            dy = x*(rho - z) - y
            dz = x*y - beta*z
            return dx, dy, dz
        k1 = f(x,y,z)
        k2 = f(x + 0.5*dt*k1[0], y + 0.5*dt*k1[1], z + 0.5*dt*k1[2])
        k3 = f(x + 0.5*dt*k2[0], y + 0.5*dt*k2[1], z + 0.5*dt*k2[2])
        k4 = f(x + dt*k3[0], y + dt*k3[1], z + dt*k3[2])
        x += (dt/6.0)*(k1[0] + 2*k2[0] + 2*k3[0] + k4[0])
        y += (dt/6.0)*(k1[1] + 2*k2[1] + 2*k3[1] + k4[1])
        z += (dt/6.0)*(k1[2] + 2*k2[2] + 2*k3[2] + k4[2])
        xs.append(x); ys.append(y); zs.append(z)
    return np.array(xs), np.array(ys), np.array(zs)

In [5]:



# ----------------------------
# Minimal training loop (demo)
# ----------------------------
if __name__ == "__main__":
    # device = "cuda" if torch.cuda.is_available() else "cpu"
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    # if MPS is flaky, force CPU once to confirm it’s your code, not the backend:
    # device = "cpu"
    print("Device:", device)


    # --- Generate toy data (replace with your real series) ---
    dt0 = 0.005
    xs, ys, zs = lorenz(T_steps=20000, dt=dt0)
    x_np = xs.astype(np.float32)
    # standardize (optional but helpful)
    x_np = (x_np - x_np.mean()) / (x_np.std() + 1e-8)

    # Train / test split
    T = len(x_np)
    split = int(0.8 * T)
    x_train = to_tensor(x_np[:split][None, :], device)   # [B=1, T_tr]
    x_test  = to_tensor(x_np[split:][None, :], device)   # [B=1, T_te]

    # --- Build model ---
    # Choose τ window in *seconds* (e.g., up to 0.5 s; tune to your data)
    tau_min, tau_max = dt0, 0.5
    model = TVAROperator(L=128, tau_min=tau_min, tau_max=tau_max, n_modes=16, hidden=64).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=2e-3, weight_decay=1e-6)

    # --- Train ---
    horizons = (1, 5, 20)
    refresh_every = 5   # hybrid rollout during training (matches your pipeline idea)

    for epoch in range(20):
        model.train()
        mse1, mser, reg, extras = loss_step(model, x_train, dt0, horizons=horizons,
                                            refresh_every=refresh_every,
                                            lambda_tv=1e-3, lambda_l1=1e-4)
        loss = mse1 + mser + reg
        opt.zero_grad(set_to_none=True)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()

        if (epoch+1) % 5 == 0:
            print(f"[{epoch+1:03d}] 1-step: {mse1.item():.4e}  roll:{mser.item():.4e}  reg:{reg.item():.4e}")

    # --- Evaluate on test at original dt ---
    model.eval()
    with torch.no_grad():
        # Teacher-forced 1-step MSE
        yhat_test = model(x_test, dt0)        # [1,Tte]
        mse1_te = F.mse_loss(yhat_test[:, :-1], x_test[:, 1:]).item()

        # Hybrid rollout with your knob (refresh-every-k)
        k_refresh = 20
        outs = rollout_multi_horizon(model, x_test, dt0, horizons=(1,5,20,50), refresh_every=k_refresh)
        mse_roll_20 = F.mse_loss(outs[20][:, :-20], x_test[:, 20:]).item()
        print(f"Test 1-step MSE: {mse1_te:.4e}   20-step (refresh={k_refresh}) MSE: {mse_roll_20:.4e}")

    # --- (Key demo) Generalize to a different sampling rate ---
    # Downsample by 2 -> dt' = 2*dt0 ; the *same model* is used, only dt changes.
    with torch.no_grad():
        x_coarse = x_np[::2]
        x_coarse = to_tensor(((x_coarse - x_coarse.mean()) / (x_coarse.std() + 1e-8))[None, :], device)
        dt1 = dt0 * 2.0
        yhat_coarse = model(x_coarse, dt1)  # uses same τ-grid in seconds, resampled via interpolation
        mse1_coarse = F.mse_loss(yhat_coarse[:, :-1], x_coarse[:, 1:]).item()
        print(f"Generalization to dt'={dt1:.4f}: 1-step MSE={mse1_coarse:.4e}")

    # --- Inspect learned kernel on test (optional) ---
    with torch.no_grad():
        _, k_test, _, _ = model(x_test, dt0, return_kernel=True)  # [1,Tte,L]
        k_mean = k_test.mean(dim=1).squeeze(0).cpu().numpy()      # [L]
        print("Kernel summary over τ (mean over time): mean=", k_mean.mean(), " std=", k_mean.std())


Device: mps


KeyboardInterrupt: 

In [13]:
from tqdm.auto import tqdm

# --- Tiny debug config ---
dt0 = 0.05
# use only a small slice initially
x_np_small = x_np[:400].astype(np.float32)
x_train = to_tensor(x_np_small[:3200][None, :], device)
x_test  = to_tensor(x_np_small[3200:][None, :], device)

model = TVAROperator(
    L=32,
    tau_min=dt0,      # avoid τ=0 for forecasting
    tau_max=0.25,
    n_modes=8,
    hidden=16
).to(device)

opt = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-6)
horizons = (1, 5, 10)
refresh_every = 5

# Optional: keep kernels small & stable
torch.set_float32_matmul_precision("high")

# Forward smoke test
with torch.no_grad():
    yhat, k, c, Xlags = model(x_train, dt0, return_kernel=True)
    assert yhat.shape == x_train.shape
    assert k.shape == Xlags.shape == (x_train.size(0), x_train.size(1), model.L)
    assert c.shape == (x_train.size(0), x_train.size(1))
    for name, t in [("yhat", yhat), ("k", k), ("c", c), ("Xlags", Xlags)]:
        assert torch.isfinite(t).all(), f"non-finite in {name}"

# Tiny training loop
LOG_EVERY = 1
for epoch in tqdm(range(1, 6), desc="Training"):
# for epoch in range(5):
    model.train()
    mse1, mser, reg, _ = loss_step(
        model, x_train, dt0,
        horizons=horizons,
        refresh_every=refresh_every,
        lambda_tv=1e-3, lambda_l1=1e-4
    )
    # guard against NaNs
    for name, t in [("mse1", mse1), ("mser", mser), ("reg", reg)]:
        if not torch.isfinite(t):
            raise RuntimeError(f"non-finite {name}: {t.item()}")
    loss = mse1 + mser + reg
    opt.zero_grad(set_to_none=True)
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    opt.step()
    if (epoch + 1) % LOG_EVERY == 0 or epoch == 0:
        print(f"[{epoch+1}] 1-step {mse1.item():.4e} | roll {mser.item():.4e} | reg {reg.item():.4e}", flush=True)
# Quick eval
model.eval()
with torch.no_grad():
    yhat_te = model(x_test, dt0)
    mse1_te = F.mse_loss(yhat_te[:, :-1], x_test[:, 1:]).item()
    outs = rollout_multi_horizon(model, x_test, dt0, horizons=(1,5,10), refresh_every=10)
    mse10 = F.mse_loss(outs[10][:, :-10], x_test[:, 10:]).item()
    print(f"Test 1-step: {mse1_te:.4e} | 10-step hybrid: {mse10:.4e}")


Training:   0%|          | 0/5 [00:00<?, ?it/s]

[2] 1-step 1.0390e+00 | roll 1.1882e-02 | reg 1.5435e-05


Training:  20%|██        | 1/5 [04:57<19:49, 297.36s/it]

[3] 1-step 9.7421e-01 | roll 1.1617e-02 | reg 1.6640e-05


Training:  40%|████      | 2/5 [06:04<08:05, 161.98s/it]

[4] 1-step 9.1016e-01 | roll 1.1343e-02 | reg 1.7920e-05


Training:  60%|██████    | 3/5 [07:11<03:56, 118.49s/it]

[5] 1-step 8.4813e-01 | roll 1.1066e-02 | reg 1.9290e-05


Training:  80%|████████  | 4/5 [08:15<01:37, 97.08s/it] 

[6] 1-step 7.8913e-01 | roll 1.0786e-02 | reg 2.0719e-05


Training: 100%|██████████| 5/5 [09:19<00:00, 111.88s/it]


RuntimeError: The size of tensor a (32) must match the size of tensor b (0) at non-singleton dimension 1

In [10]:
# Data
dt0 = 0.005
xs, ys, zs = lorenz(T_steps=8000, dt=dt0)  # ↓
x_np = ((xs - xs.mean()) / (xs.std() + 1e-8)).astype(np.float32)
split = int(0.8 * len(x_np))
x_train = to_tensor(x_np[:split][None, :], device)
x_test  = to_tensor(x_np[split:][None, :], device)

# Model
model = TVAROperator(
    L=32,            # ↓ 16 if you need more speed
    tau_min=dt0,
    tau_max=0.25,    # ↓ 0.15 for more speed
    n_modes=8,       # ↓ 4
    hidden=16        # ↓ 8
).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-6)

# Training
epochs = 5
horizons = (1,)     # training only; see USE_ROLLOUT below
refresh_every = 10
USE_ROLLOUT = False # ← main speed win

for epoch in range(1, epochs+1):
    model.train()
    mse1, mser, reg, _ = loss_step(model, x_train, dt0,
                                   horizons=horizons, refresh_every=refresh_every,
                                   lambda_tv=1e-3, lambda_l1=1e-4)
    if not USE_ROLLOUT:
        mser = 0.0 * mse1
    loss = mse1 + mser + reg
    opt.zero_grad(set_to_none=True); loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), 1.0); opt.step()
    print(f"[{epoch}] 1step={mse1.item():.3e} reg={reg.item():.3e}")

# Eval (run rollout here if you want)
model.eval()
with torch.no_grad():
    yhat_te = model(x_test, dt0)
    mse1_te = F.mse_loss(yhat_te[:, :-1], x_test[:, 1:]).item()
    outs = rollout_multi_horizon(model, x_test, dt0,
                                 horizons=(1,5,10), refresh_every=20)  # bigger refresh
    mse10 = F.mse_loss(outs[10][:, :-10], x_test[:, 10:]).item()
    print(f"Test 1-step={mse1_te:.3e} | 10-step hybrid={mse10:.3e}")


KeyboardInterrupt: 