# TinyTimeMixers (TTM) GPU Demo

This notebook demonstrates the TinyTimeMixers implementation on GPU.

**Paper:** arXiv 2401.03955 - Tiny Time Mixers (TTM): Fast Pre-trained Models for Enhanced Zero/Few-Shot Forecasting of Multivariate Time Series

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/evelynmitchell/TinyTimeMixers/blob/main/notebooks/TTM_GPU_Demo.ipynb)

## Setup

First, let's install the TinyTimeMixers package and dependencies.

In [None]:
# Install TinyTimeMixers from GitHub
!pip install git+https://github.com/evelynmitchell/TinyTimeMixers.git

# Or install dependencies manually if developing locally
# !pip install torch numpy einops tqdm

In [None]:
import time

import numpy as np
import torch

# Report the PyTorch build and, when CUDA is usable, the name and total
# memory of device 0.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(
        f"CUDA device: {torch.cuda.get_device_name(0)}",
        f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB",
        sep="\n",
    )

## Model Overview

TinyTimeMixers (TTM) is a lightweight time series foundation model based on the TSMixer architecture.

In [None]:
from tinytimemixers import TTM, TTMConfig

# Instantiate the configuration with its defaults and list the fields
# that are used later in this notebook.
config = TTMConfig()
print(
    "TTM Configuration:",
    f"  Context length: {config.context_length}",
    f"  Prediction length: {config.prediction_length}",
    f"  Patch length: {config.patch_length}",
    f"  Hidden features: {config.hidden_features}",
    f"  Backbone levels: {config.num_backbone_levels}",
    f"  Blocks per level: {config.blocks_per_level}",
    sep="\n",
)

In [None]:
# Select the compute device: CUDA when usable, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Build a 7-channel model from the config and move it to that device
model = TTM(config, num_channels=7)
model = model.to(device)
print("\nModel created!")
print(f"Total parameters: {model.get_num_parameters():,}")

# List the parameter count reported for each named entry
print("\nParameter breakdown:")
parameter_breakdown = model.get_parameter_breakdown()
for name, count in parameter_breakdown.items():
    print(f"  {name}: {count:,}")

## Forward Pass Demo

Let's test the model with random data.

In [None]:
# Random input batch laid out as (batch, channels, context_length)
batch_size, num_channels = 32, 7
context_length = config.context_length

x = torch.randn((batch_size, num_channels, context_length)).to(device)
print(f"Input shape: {x.shape}")

# Inference only: eval mode, and no autograd bookkeeping
model.eval()
with torch.no_grad():
    y = model(x)

print(f"Output shape: {y.shape}")
print(
    f"\nExpected: (batch={batch_size}, channels={num_channels}, pred_len={config.prediction_length})"
)

## Inference Speed Benchmark

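Measure inference latency and throughput on the current device (GPU if available, otherwise CPU), then sweep over batch sizes on the GPU.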

In [None]:
def benchmark_inference(model, x, num_runs=100, warmup=10):
    """Measure the mean wall-clock time of a forward pass.

    The model is switched to eval mode (and left in it) and every forward
    pass runs under ``torch.no_grad()``. When ``x`` is a CUDA tensor the
    device is synchronized before the timer starts and after every timed
    call, so the measurement covers kernel execution, not just kernel launch.

    Args:
        model: Module exposing ``eval()``; invoked as ``model(x)``.
        x: Input tensor passed unchanged to every forward pass.
        num_runs: Number of timed forward passes; must be at least 1.
        warmup: Number of untimed forward passes executed first.

    Returns:
        Mean time per forward pass, in milliseconds.

    Raises:
        ValueError: If ``num_runs`` is less than 1.
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be >= 1, got {num_runs}")

    model.eval()
    on_gpu = x.is_cuda

    with torch.no_grad():
        # Warmup passes are excluded from the timing
        for _ in range(warmup):
            model(x)

        # Drain queued GPU work so it is not attributed to the timed section
        if on_gpu:
            torch.cuda.synchronize()

        # perf_counter is monotonic and high-resolution; time.time() can jump
        # with system clock adjustments and is coarse on some platforms.
        start = time.perf_counter()
        for _ in range(num_runs):
            model(x)
            if on_gpu:
                torch.cuda.synchronize()
        elapsed = time.perf_counter() - start

    return elapsed / num_runs * 1000  # ms per inference


# Time single-sample inference on the selected device
batch_size = 1
x = torch.randn((batch_size, 7, config.context_length)).to(device)

ms_per_inference = benchmark_inference(model=model, x=x)
print(f"Device: {device}")
print(f"Batch size: {batch_size}")
print(f"Inference time: {ms_per_inference:.2f} ms")
print(f"Throughput: {1000/ms_per_inference:.1f} samples/sec")

In [None]:
# Benchmark with different batch sizes (skipped entirely without CUDA)
if torch.cuda.is_available():
    print("Batch size vs Throughput:\n")
    for bs in [1, 4, 16, 32, 64, 128]:
        try:
            x = torch.randn(bs, 7, config.context_length).to(device)
            ms = benchmark_inference(model, x, num_runs=50)
        except RuntimeError as err:
            # CUDA out-of-memory is raised as a RuntimeError (subclass). Any
            # other RuntimeError is a genuine failure and must not be
            # reported as OOM, so let it propagate.
            if "out of memory" not in str(err).lower():
                raise
            print(f"  Batch {bs:3d}: OOM")
            break  # larger batches would fail as well
        throughput = bs * 1000 / ms  # ms is per batch, so scale by batch size
        print(f"  Batch {bs:3d}: {ms:6.2f} ms, {throughput:7.1f} samples/sec")

## Training Example

Simple training loop example with synthetic data.

In [None]:
# Generate synthetic dataset
def generate_synthetic_data(num_samples, num_channels, context_len, pred_len, seed=None):
    """Generate noisy sinusoid-plus-trend series split into context and target.

    Every (sample, channel) pair gets its own amplitude in [0.5, 2.0), phase
    in [0, 2*pi) and linear trend slope in [-0.1, 0.1), plus Gaussian noise
    with standard deviation 0.1. All series share one time axis of
    ``context_len + pred_len`` points spanning [0, 4*pi].

    Args:
        num_samples: Number of independent multivariate series to generate.
        num_channels: Number of channels per series.
        context_len: Number of leading time steps returned as model input.
        pred_len: Number of trailing time steps returned as target.
        seed: Optional seed for the random generator. ``None`` (the default)
            draws fresh entropy; pass an int for reproducible data.

    Returns:
        Tuple ``(X, Y)`` of float32 tensors with shapes
        ``(num_samples, num_channels, context_len)`` and
        ``(num_samples, num_channels, pred_len)``.
    """
    rng = np.random.default_rng(seed)
    total_len = context_len + pred_len
    t = np.linspace(0, 4 * np.pi, total_len)

    # One parameter per (sample, channel); the trailing axis of size 1
    # broadcasts against the shared time axis.
    param_shape = (num_samples, num_channels, 1)
    amplitude = rng.uniform(0.5, 2.0, param_shape)
    phase = rng.uniform(0, 2 * np.pi, param_shape)
    trend = rng.uniform(-0.1, 0.1, param_shape)
    noise = rng.standard_normal((num_samples, num_channels, total_len)) * 0.1

    series = amplitude * np.sin(t + phase) + trend * t + noise

    # torch.tensor copies, so X and Y are independent contiguous tensors
    X = torch.tensor(series[:, :, :context_len], dtype=torch.float32)
    Y = torch.tensor(series[:, :, context_len:], dtype=torch.float32)
    return X, Y


# Draw separate training and validation sets sized to the model's
# context and prediction windows
X_train, Y_train = generate_synthetic_data(
    num_samples=1000,
    num_channels=7,
    context_len=config.context_length,
    pred_len=config.prediction_length,
)
X_val, Y_val = generate_synthetic_data(
    num_samples=100,
    num_channels=7,
    context_len=config.context_length,
    pred_len=config.prediction_length,
)

print(f"Training data: X={X_train.shape}, Y={Y_train.shape}")
print(f"Validation data: X={X_val.shape}, Y={Y_val.shape}")

In [None]:
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Pair inputs with targets so each loader yields (X, Y) batches
train_dataset = TensorDataset(X_train, Y_train)
val_dataset = TensorDataset(X_val, Y_val)

# Only the training split is shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32)

# Replace the benchmarked model with a new instance on the same device
model = TTM(config, num_channels=7)
model = model.to(device)

# Mean-squared-error objective, optimized with AdamW
criterion = torch.nn.MSELoss()
optimizer = optim.AdamW(params=model.parameters(), lr=1e-4, weight_decay=0.01)

print(f"Training on {device}...")

In [None]:
# Training loop: one optimization pass over train_loader per epoch, followed
# by a gradient-free evaluation pass over val_loader.
num_epochs = 10

for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0.0
    for X_batch, Y_batch in train_loader:
        X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)

        optimizer.zero_grad()
        Y_pred = model(X_batch)
        loss = criterion(Y_pred, Y_batch)
        loss.backward()
        optimizer.step()

        # MSELoss defaults to a per-batch mean. Weight it by the batch size so
        # the smaller final batch does not count as much as a full one.
        train_loss += loss.item() * X_batch.size(0)

    train_loss /= len(train_loader.dataset)

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for X_batch, Y_batch in val_loader:
            X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
            Y_pred = model(X_batch)
            val_loss += criterion(Y_pred, Y_batch).item() * X_batch.size(0)

    val_loss /= len(val_loader.dataset)

    print(
        f"Epoch {epoch+1:2d}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}"
    )

## Visualize Predictions

In [None]:
import matplotlib.pyplot as plt

# Forecast the first validation window with the trained model
model.eval()
with torch.no_grad():
    sample_x = X_val[0:1].to(device)
    sample_y_true = Y_val[0:1]
    sample_y_pred = model(sample_x).cpu()

# 2x2 grid: one panel for each of the first four channels
fig, axes = plt.subplots(2, 2, figsize=(14, 8))
axes = axes.flatten()

for channel, ax in enumerate(axes):
    history = sample_x[0, channel].cpu().numpy()
    actual = sample_y_true[0, channel].numpy()
    forecast = sample_y_pred[0, channel].numpy()
    split = len(history)  # x position where the context ends

    # Context window, then ground truth and prediction over the horizon
    ax.plot(range(split), history, "b-", label="Context")
    ax.plot(
        range(split, split + len(actual)), actual, "g-", label="True", linewidth=2
    )
    ax.plot(
        range(split, split + len(forecast)),
        forecast,
        "r--",
        label="Predicted",
        linewidth=2,
    )

    # Dotted line marks the boundary between context and forecast
    ax.axvline(x=split, color="k", linestyle=":", alpha=0.5)
    ax.set_title(f"Channel {channel}")
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Save/Load Model

In [None]:
# Write the model to disk
model.save("ttm_trained.pt")
print("Model saved to ttm_trained.pt")

# Read it back and place the restored copy on the active device
loaded_model = TTM.load("ttm_trained.pt").to(device)
print(f"Model loaded. Parameters: {loaded_model.get_num_parameters():,}")

## Memory Usage

In [None]:
# Report peak GPU memory for one inference pass and one training pass at
# batch size 64. Skipped entirely when CUDA is unavailable.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

    # Inference pass: eval mode and no_grad, so no autograd graph is recorded.
    # Without this the reported peak would include activations saved for
    # backward, and the graph kept alive by `y` would inflate the training
    # measurement below.
    x = torch.randn(64, 7, config.context_length).to(device)
    model.eval()
    with torch.no_grad():
        y = model(x)

    allocated = torch.cuda.max_memory_allocated() / 1e6
    print(f"Peak GPU memory: {allocated:.1f} MB")

    # Training pass: forward + backward against a random target
    # (no optimizer step is taken, so the weights are unchanged)
    torch.cuda.reset_peak_memory_stats()
    model.train()
    optimizer.zero_grad()
    y_pred = model(x)
    loss = criterion(y_pred, torch.randn_like(y_pred))
    loss.backward()

    allocated = torch.cuda.max_memory_allocated() / 1e6
    print(f"Peak GPU memory (training): {allocated:.1f} MB")

## Summary

This notebook demonstrated:
1. Creating and configuring a TTM model
2. Forward pass and inference benchmarking
3. Training loop with synthetic data
4. Visualization of predictions
5. Model saving/loading
6. GPU memory usage measurement

For production use, you would:
- Use real time series datasets (Monash, GIFT-Eval)
- Implement proper train/val/test splits
- Add learning rate scheduling
- Use the TTMForFinetune class with frozen backbone