# IIR2D Explainer Notebook: Fast Filters, Weird Art, Real Utility

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/fyremael/iir2d/blob/main/docs/notebooks/IIR2D_Explainer_Colab.ipynb)

This notebook teaches **what IIR2D is**, **how to use all 8 filters**, and **why it matters** for image/video pipelines.

We'll use playful exemplars (cosmic portrait, mountain scroll, microbe swarm), then map them to production intuition.

## What You'll Do

1. Boot the environment (Colab-friendly).
2. Auto-bootstrap the native IIR2D CUDA core (no JAX path).
3. Generate entertaining synthetic scenes.
4. Run GPU-first filter demos directly through `iir2d_forward_cuda`.
5. Compare border modes and precision modes on the GPU core.
6. Run a mini temporal/video-style GPU demo.
7. Use a live side-by-side video scrubber with per-frame metrics.
8. Benchmark CPU reference vs GPU core timings side-by-side.
9. Export selected outputs as README-ready PNG/MP4.
10. Auto-generate a benchmark claims packet from notebook runs.

In [None]:
# Colab + local setup
import os
import sys
import subprocess
from pathlib import Path

# True only when the Colab runtime package has already been imported.
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    # Fetch the repository once and work from inside the checkout.
    repo = Path("/content/iir2d")
    if not repo.exists():
        subprocess.check_call(["git", "clone", "https://github.com/fyremael/iir2d.git", str(repo)])
    os.chdir(repo)

# Find the directory that contains scripts/. If the working directory does not,
# try a few nearby locations and keep the working directory when none matches.
root = Path.cwd().resolve()
if not (root / "scripts").exists():
    nearby = (root.parent, root.parent.parent, root / "iir2d_op")
    root = next((d for d in nearby if d.exists() and (d / "scripts").exists()), root)

os.chdir(root)
# Make both the repository root and its python/ folder importable.
for import_dir in (root, root / "python"):
    if str(import_dir) not in sys.path:
        sys.path.insert(0, str(import_dir))

# Install the array/plotting packages only when importing them fails.
for pkg in ("numpy", "matplotlib"):
    try:
        __import__(pkg)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])

print(f"ROOT={root}")

## GPU Bootstrap (Native CUDA Core)

This cell prepares the native `iir2d_forward_cuda` C API on GPU.
- First choice: download the prebuilt Linux binary bundle from the latest GitHub release.
- Fallback: build from source with `cmake + nvcc` if available.

JAX is not used in this notebook execution path: the shared library is still named `libiir2d_jax.so`, but the cells below call it through `ctypes`.

In [None]:
import os
import tarfile
import urllib.request
import urllib.error
import json as _json
import shutil
from pathlib import Path


def run_cmd(cmd):
    """Run *cmd* and return ``(returncode, stdout, stderr)``.

    Both output streams are captured as text and stripped of surrounding
    whitespace.

    Args:
        cmd: Argument list passed to ``subprocess.run`` (no shell is involved).

    Returns:
        A 3-tuple ``(returncode, stdout, stderr)``. When the program cannot be
        started at all (for example the executable is not installed), the
        tuple is ``(127, "", <error text>)`` instead of an exception, so
        callers can treat "tool missing" like any other failed command.
    """
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        # FileNotFoundError / PermissionError: the binary is absent or not
        # executable. 127 mirrors the shell's "command not found" status.
        return 127, "", str(exc)
    return proc.returncode, proc.stdout.strip(), proc.stderr.strip()


def has_cuda_gpu():
    """Report whether ``nvidia-smi -L`` lists at least one device.

    Returns:
        ``(True, listing)`` when the command exits with status 0 and prints
        something; otherwise ``(False, reason)`` where *reason* is the command's
        stderr, its stdout, or a fixed fallback message, in that order of
        preference.
    """
    status, listing, errors = run_cmd(["nvidia-smi", "-L"])
    detected = status == 0 and bool(listing)
    if not detected:
        return False, errors or listing or "nvidia-smi not available"
    return True, listing


def download_latest_linux_bundle(repo_root: Path):
    """Download and unpack the Linux binary bundle of the latest GitHub release.

    The archive is stored under ``<repo_root>/.colab_cache`` and extracted into
    a fresh ``release_bundle`` directory there. Any previous extraction is
    removed first.

    Args:
        repo_root: Directory in which the ``.colab_cache`` folder is created.

    Returns:
        ``(lib_path, asset_name)``: the path of the extracted
        ``libiir2d_jax.so`` and the file name of the release asset.

    Raises:
        RuntimeError: No matching asset exists, an archive member would be
            written outside the extraction directory, or the library is
            missing after extraction.
        Exception: Network, HTTP and tar errors from ``urllib`` / ``tarfile``
            propagate unchanged.
    """
    api = "https://api.github.com/repos/fyremael/iir2d/releases/latest"
    req = urllib.request.Request(api, headers={"User-Agent": "iir2d-colab-notebook"})
    with urllib.request.urlopen(req, timeout=30) as r:
        rel = _json.loads(r.read().decode("utf-8"))

    # Pick the first asset that matches the Linux bundle naming scheme.
    asset = None
    for a in rel.get("assets", []):
        name = a.get("name", "")
        if name.startswith("iir2d-linux-x86_64-") and name.endswith(".tar.gz"):
            asset = a
            break
    if asset is None:
        raise RuntimeError("No Linux binary bundle found in latest release assets.")

    cache_dir = repo_root / ".colab_cache"
    cache_dir.mkdir(parents=True, exist_ok=True)
    archive_path = cache_dir / asset["name"]
    extract_dir = cache_dir / "release_bundle"

    urllib.request.urlretrieve(asset["browser_download_url"], archive_path)

    # Start from an empty directory so stale files never shadow the new bundle.
    if extract_dir.exists():
        shutil.rmtree(extract_dir)
    extract_dir.mkdir(parents=True, exist_ok=True)

    with tarfile.open(archive_path, "r:gz") as tf:
        if hasattr(tarfile, "data_filter"):
            # Interpreters with extraction filters: refuse absolute paths,
            # ".." escapes, device nodes and links that leave the destination.
            tf.extractall(extract_dir, filter="data")
        else:
            # Older interpreters: at least check that every member path stays
            # inside the extraction directory before writing anything.
            base = extract_dir.resolve()
            for member in tf.getmembers():
                target = (base / member.name).resolve()
                if target != base and base not in target.parents:
                    raise RuntimeError(f"Unsafe path in release bundle: {member.name!r}")
            tf.extractall(extract_dir)

    lib = extract_dir / "libiir2d_jax.so"
    if not lib.exists():
        raise RuntimeError("Downloaded bundle did not contain libiir2d_jax.so")
    return lib, asset["name"]


# Probe for a GPU once. The IIR2D_CORE_* globals set below are read by later
# cells as guards and for diagnostics.
GPU_AVAILABLE, GPU_INFO = has_cuda_gpu()
IIR2D_CORE_READY = False
IIR2D_CORE_REASON = ""
IIR2D_CORE_LIB_PATH = ""

print("GPU_AVAILABLE:", GPU_AVAILABLE)
if GPU_INFO:
    # Only the first line of the probe output (or of the failure reason) is shown.
    print(GPU_INFO.splitlines()[0])

if GPU_AVAILABLE:
    repo_root = Path.cwd()
    # The shared library is copied into the Python package directory.
    dst_dir = repo_root / "python" / "iir2d_jax"
    dst_dir.mkdir(parents=True, exist_ok=True)
    dst_lib = dst_dir / "libiir2d_jax.so"

    lib_src = None

    try:
        # First choice: prebuilt bundle from the latest release.
        lib_src, asset_name = download_latest_linux_bundle(repo_root)
        print("Using release bundle:", asset_name)
    except Exception as exc:
        # Any download/extract failure falls through to a local source build.
        print("Release bundle fetch failed, attempting local build.")
        print("Reason:", exc)

        nvcc = shutil.which("nvcc")
        cmake = shutil.which("cmake")
        if not nvcc or not cmake:
            # No toolchain: leave lib_src as None so the core stays "not ready".
            IIR2D_CORE_REASON = "GPU detected but neither release bundle nor local CUDA build is available."
        else:
            # A failing cmake step raises CalledProcessError and aborts this cell.
            build_dir = repo_root / "build_wsl"
            subprocess.check_call(["cmake", "-S", ".", "-B", str(build_dir), "-DCMAKE_BUILD_TYPE=Release"])
            subprocess.check_call(["cmake", "--build", str(build_dir), "-j"])
            cand = build_dir / "libiir2d_jax.so"
            if not cand.exists():
                raise RuntimeError("Build completed but libiir2d_jax.so not found.")
            lib_src = cand

    if lib_src is not None:
        shutil.copy2(lib_src, dst_lib)

        import ctypes
        from scripts.core_harness import load_core_library, find_cudart, configure_core_lib, configure_cudart

        # NOTE(review): presumably load_core_library locates the copy made above
        # and the configure_* helpers set ctypes argtypes/restype - confirm in
        # scripts/core_harness.
        core_lib, lib_path = load_core_library(repo_root)
        cudart = find_cudart()
        configure_core_lib(core_lib)
        configure_cudart(cudart)

        # Publish the loaded handles; IIR2D_CUDART and IIR2D_CORE_LIB exist only
        # when this branch has run.
        IIR2D_CORE_READY = True
        IIR2D_CORE_LIB_PATH = str(lib_path)
        IIR2D_CUDART = cudart
        IIR2D_CORE_LIB = core_lib
        IIR2D_CORE_REASON = "ready"
else:
    IIR2D_CORE_REASON = "No CUDA GPU detected"

print("IIR2D_CORE_READY:", IIR2D_CORE_READY)
print("IIR2D_CORE_REASON:", IIR2D_CORE_REASON)
if IIR2D_CORE_LIB_PATH:
    print("IIR2D_CORE_LIB_PATH:", IIR2D_CORE_LIB_PATH)

In [None]:
import ctypes
import math
import time
import numpy as np
import matplotlib.pyplot as plt

from scripts.iir2d_cpu_reference import iir2d_cpu_reference
from scripts.core_harness import (
    BORDER_MAP,
    PRECISION_MAP,
    IIR2D_Params,
    CUDA_MEMCPY_HOST_TO_DEVICE,
    CUDA_MEMCPY_DEVICE_TO_HOST,
    cuda_check,
)

# Dark figure background for every plot produced after this point.
plt.style.use("dark_background")
# Seeds NumPy's legacy global RNG. The code visible in this notebook that draws
# random numbers builds its own seeded default_rng instead.
np.random.seed(42)

# Fail fast: the demos below call the GPU core by default.
if not IIR2D_CORE_READY:
    raise RuntimeError(
        "Native iir2d GPU core is not ready. "
        "Use a CUDA GPU runtime in Colab and rerun bootstrap. "
        f"Reason: {IIR2D_CORE_REASON}"
    )

# Panel titles keyed by filter id (1-8).
FILTER_LABELS = {
    1: "F1 EMA",
    2: "F2 SOS",
    3: "F3 Biquad",
    4: "F4 SOS",
    5: "F5 FB First",
    6: "F6 Deriche-ish",
    7: "F7 Sharper EMA",
    8: "F8 State",
}

# Filter ids rendered by the default tour; ids 3, 4 and 8 are not in this list.
DEFAULT_SHOWCASE_FILTERS = [1, 2, 5, 6, 7]


def normalize01(x):
    """Rescale *x* linearly so its minimum maps to 0 and its maximum to about 1.

    The input is converted to float32 first. A small epsilon in the divisor
    keeps a constant input from dividing by zero (it then maps to all zeros).
    """
    arr = np.asarray(x, dtype=np.float32)
    floor = float(arr.min())
    span = float(arr.max()) - floor + 1e-8
    return (arr - floor) / span


def show(img, title="", ax=None):
    """Draw *img* (clipped to [0, 1]) on *ax* with a title and no axis frame.

    When *ax* is omitted, a new 6x4 inch figure is created for the image.
    """
    target = ax
    if target is None:
        target = plt.subplots(figsize=(6, 4))[1]
    target.imshow(np.clip(img, 0.0, 1.0))
    target.set_title(title)
    target.axis("off")


def _dtype_for_precision(precision):
    if precision == "f64":
        return np.float64
    return np.float32


def _iir2d_core_cuda_2d(x2d, filter_id=4, border_mode="mirror", precision="f32", border_const=0.0):
    """Filter one 2D channel with the native ``iir2d_forward_cuda`` entry point.

    The input is copied to a freshly allocated device buffer, filtered into a
    second device buffer, and copied back to a new host array.

    Args:
        x2d: 2D array-like channel; converted to a C-ordered array whose dtype
            is chosen by ``_dtype_for_precision(precision)``.
        filter_id: Integer filter selector stored in the parameter struct.
        border_mode: Key into ``BORDER_MAP``.
        precision: Key into ``PRECISION_MAP``; also selects the host dtype.
        border_const: Float stored in the parameter struct.

    Returns:
        A new array with the same shape and dtype as the converted input.

    Raises:
        RuntimeError: The GPU core is not ready, or ``iir2d_forward_cuda``
            returned a non-zero status.
        ValueError: The converted input is not 2D.
        KeyError: ``border_mode`` or ``precision`` is not a known key.
    """
    if not IIR2D_CORE_READY:
        raise RuntimeError("GPU core path is not ready in this runtime.")

    dtype = _dtype_for_precision(precision)
    x = np.asarray(x2d, dtype=dtype, order="C")
    if x.ndim != 2:
        raise ValueError("Expected 2D channel input")

    h, w = x.shape
    out = np.empty_like(x)
    # Input and output device buffers are both allocated with this byte count.
    nbytes = int(x.nbytes)

    params = IIR2D_Params(
        width=int(w),
        height=int(h),
        filter_id=int(filter_id),
        border_mode=int(BORDER_MAP[border_mode]),
        border_const=float(border_const),
        precision=int(PRECISION_MAP[precision]),
    )

    # Device pointers start as NULL; the finally block frees only those that were set.
    d_in = ctypes.c_void_p()
    d_out = ctypes.c_void_p()
    try:
        # NOTE(review): cuda_check presumably raises on a non-zero CUDA status - confirm in core_harness.
        cuda_check(IIR2D_CUDART, IIR2D_CUDART.cudaMalloc(ctypes.byref(d_in), nbytes), "cudaMalloc(d_in)")
        cuda_check(IIR2D_CUDART, IIR2D_CUDART.cudaMalloc(ctypes.byref(d_out), nbytes), "cudaMalloc(d_out)")

        # Host -> device copy of the input channel.
        cuda_check(
            IIR2D_CUDART,
            IIR2D_CUDART.cudaMemcpy(
                d_in,
                x.ctypes.data_as(ctypes.c_void_p),
                nbytes,
                CUDA_MEMCPY_HOST_TO_DEVICE,
            ),
            "cudaMemcpy(H2D)",
        )

        rc = IIR2D_CORE_LIB.iir2d_forward_cuda(d_in, d_out, ctypes.byref(params))
        if rc != 0:
            # Prefer the library's own status text; fall back to the numeric code.
            msg = IIR2D_CORE_LIB.iir2d_status_string(rc)
            text = msg.decode("utf-8") if msg else str(rc)
            raise RuntimeError(f"iir2d_forward_cuda failed: {text} ({rc})")

        # Wait for the device before reading the result back.
        cuda_check(IIR2D_CUDART, IIR2D_CUDART.cudaDeviceSynchronize(), "cudaDeviceSynchronize")

        # Device -> host copy of the filtered channel.
        cuda_check(
            IIR2D_CUDART,
            IIR2D_CUDART.cudaMemcpy(
                out.ctypes.data_as(ctypes.c_void_p),
                d_out,
                nbytes,
                CUDA_MEMCPY_DEVICE_TO_HOST,
            ),
            "cudaMemcpy(D2H)",
        )

        return out
    finally:
        # Runs on success and on error; cudaFree return codes are not checked here.
        if d_in.value:
            IIR2D_CUDART.cudaFree(d_in)
        if d_out.value:
            IIR2D_CUDART.cudaFree(d_out)


def apply_iir2d_rgb(img, filter_id=4, border_mode="mirror", precision="f32", border_const=0.0, backend="gpu"):
    """Filter every channel of an ``(H, W, C)`` image and clip the result to [0, 1].

    Each channel is filtered independently with the same settings. All
    channels present in *img* are processed, so RGBA or single-channel
    ``(H, W, 1)`` inputs work as well as RGB.

    Args:
        img: Image array-like with the channel axis last.
        filter_id: Filter selector forwarded to the backend.
        border_mode: Border handling name forwarded to the backend.
        precision: Precision name forwarded to the backend; "f64" yields a
            float64 result, anything else float32.
        border_const: Border constant forwarded to the backend as a float.
        backend: "gpu" for the native CUDA core, "reference" or "cpu" for the
            CPU reference implementation.

    Returns:
        Array with the same shape as *img*, clipped to [0, 1].

    Raises:
        ValueError: *backend* is unknown or *img* is not 3-dimensional.
    """
    # Validate before allocating or touching either backend.
    if backend not in ("gpu", "reference", "cpu"):
        raise ValueError(f"Invalid backend {backend!r}")

    img = np.asarray(img)
    if img.ndim != 3:
        raise ValueError(f"Expected an (H, W, C) image, got shape {img.shape}")

    channel_filter = _iir2d_core_cuda_2d if backend == "gpu" else iir2d_cpu_reference

    out = np.empty(img.shape, dtype=_dtype_for_precision(precision))
    # Loop over the real channel count so no slice of the np.empty buffer is
    # ever returned uninitialised.
    for c in range(img.shape[-1]):
        out[..., c] = channel_filter(
            img[..., c],
            filter_id=filter_id,
            border_mode=border_mode,
            border_const=float(border_const),
            precision=precision,
        )
    return np.clip(out, 0.0, 1.0)


def make_cosmic_portrait(h=384, w=608):
    """Build the synthetic "cosmic portrait" test image.

    A swirling three-channel background is blended with soft Gaussian-shaped
    layers (head, beard, hair and two eyes) on a [-1, 1] x [-1, 1] grid.

    Returns:
        ``(h, w, 3)`` array clipped to [0, 1].
    """
    y, x = np.mgrid[-1:1:complex(0, h), -1:1:complex(0, w)]
    radial = np.sqrt(x * x + y * y)
    swirl = np.sin(8 * radial - 3 * np.arctan2(y, x))

    # Each background channel is normalised on its own before stacking.
    background = np.stack(
        [
            normalize01(0.2 + 0.1 * np.cos(3 * x) + 0.2 * swirl),
            normalize01(0.1 + 0.2 * np.sin(2 * y + 3 * x) + 0.15 * swirl),
            normalize01(0.35 + 0.4 * np.cos(2 * radial) + 0.25 * swirl),
        ],
        axis=-1,
    )

    def tint(mask, weights):
        # Scale one scalar mask by per-channel weights to get an RGB layer.
        return np.stack([weight * mask for weight in weights], axis=-1)

    head = np.exp(-((x / 0.52) ** 2 + ((y + 0.02) / 0.72) ** 2) * 2.5)
    beard = np.exp(-((x / 0.45) ** 2 + ((y - 0.43) / 0.30) ** 2) * 5.0)
    hair_envelope = np.exp(-((x / 0.70) ** 2 + ((y + 0.25) / 0.55) ** 2) * 2.6)
    hair = hair_envelope * (0.6 + 0.4 * np.sin(18 * x + 8 * y))
    eye_l = np.exp(-(((x + 0.18) / 0.07) ** 2 + ((y + 0.06) / 0.05) ** 2) * 8)
    eye_r = np.exp(-(((x - 0.18) / 0.07) ** 2 + ((y + 0.06) / 0.05) ** 2) * 8)
    both_eyes = eye_l + eye_r

    skin = tint(head, (0.95, 0.78, 0.76))
    beard_rgb = tint(beard, (0.25, 0.28, 0.22))
    hair_rgb = tint(hair, (0.45, 0.30, 0.18))
    eyes = tint(both_eyes, (0.45, 0.25, 0.70))

    composite = 0.55 * background + skin + beard_rgb + hair_rgb + eyes
    return np.clip(composite, 0.0, 1.0)


def make_mountain_scroll(h=360, w=620):
    """Build the synthetic "mountain scroll" test image.

    Layered sinusoids form the base; three narrow vertical strips, each with a
    tapered cap above it, are then darkened on top.

    Returns:
        ``(h, w, 3)`` array clipped to [0, 1].
    """
    y, x = np.mgrid[0:1:complex(0, h), 0:1:complex(0, w)]
    ridge = 0.45 * np.sin(10 * x + 4 * np.sin(5 * x)) + 0.25 * np.sin(24 * x + 6 * y)
    clouds = 0.4 * np.cos(7 * y + 3 * np.sin(8 * x))
    texture = 0.15 * np.sin(80 * x * y) + 0.12 * np.cos(60 * (x - y))
    base = normalize01(ridge + clouds + texture)

    img = np.stack(
        [
            normalize01(base * 0.9 + 0.2 * np.sin(8 * y)),
            normalize01(base * 0.8 + 0.25 * np.cos(10 * x)),
            normalize01(base * 1.05 + 0.3 * np.cos(6 * y)),
        ],
        axis=-1,
    )

    strip_tint = np.array([0.25, 0.23, 0.22])
    cap_tint = np.array([0.35, 0.2, 0.2])
    for cx in (0.28, 0.5, 0.72):
        offset = np.abs(x - cx)
        # The strip is darkened first; the cap overlaps it slightly, so the
        # overlap is darkened twice.
        strip = (offset < 0.012) & (y > 0.38) & (y < 0.70)
        img[strip] *= strip_tint
        cap = (offset < 0.02) & (y > 0.34) & (y < 0.40) & (offset < (0.02 - (y - 0.34) * 0.3))
        img[cap] *= cap_tint

    return np.clip(img, 0.0, 1.0)


def make_microbe_swarm(h=390, w=610, n=180):
    """Build the synthetic "microbe swarm" test image.

    *n* randomly placed, randomly coloured elliptical Gaussian blobs are added
    to a radial glow, followed by a faint 32-pixel checker pattern. The random
    generator uses a fixed seed, so the image is the same on every call.

    Returns:
        ``(h, w, 3)`` float32 array in [0, 1].
    """
    rng = np.random.default_rng(7)
    # Pixel coordinates as float32 grids of shape (h, w).
    x, y = np.meshgrid(np.arange(w, dtype=np.float32), np.arange(h, dtype=np.float32))
    img = np.zeros((h, w, 3), dtype=np.float32)

    # Radial glow in the green and blue channels.
    xn = (x / w) * 2 - 1
    yn = (y / h) * 2 - 1
    rad = np.sqrt(xn * xn + yn * yn)
    img[..., 1] = normalize01(np.exp(-(rad ** 2) * 1.8) * 0.8)
    img[..., 2] = normalize01(np.exp(-(rad ** 2) * 2.6) * 0.5)

    for _ in range(n):
        # Draw order (centre x, centre y, radius x, radius y, then r, g, b)
        # fixes the random stream and therefore the picture.
        cx = rng.uniform(0, w)
        cy = rng.uniform(0, h)
        rx = rng.uniform(5, 16)
        ry = rng.uniform(5, 16)
        blob = np.exp(-(((x - cx) / rx) ** 2 + ((y - cy) / ry) ** 2))
        red = rng.uniform(0.3, 1.0)
        green = rng.uniform(0.2, 0.95)
        blue = rng.uniform(0.1, 0.7)
        color = np.array([red, green, blue], dtype=np.float32)
        img += blob[..., None] * color[None, None, :] * 0.42

    chips = (((x // 32 + y // 32) % 2) == 0).astype(np.float32)
    img += np.stack([0.1 * chips, 0.07 * chips, 0.02 * chips], axis=-1)

    return np.clip(normalize01(img), 0.0, 1.0)

In [None]:
# Build the three synthetic scenes once; later cells look them up by title.
scenes = {
    "Cosmic Portrait": make_cosmic_portrait(),
    "Mountain Scroll": make_mountain_scroll(),
    "Microbe Swarm": make_microbe_swarm(),
}

# One row, one panel per scene, in dictionary order.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for ax, (name, img) in zip(axes, scenes.items()):
    show(img, name, ax=ax)
plt.tight_layout()

## Default Filter Tour (Native GPU Core)

This gallery runs directly through the CUDA core (`iir2d_forward_cuda`).
CPU is only used later for explicit reference checks.

In [None]:
# Scene and filter ids shown in the gallery.
scene_name = "Cosmic Portrait"
src_img = scenes[scene_name]
showcase_filters = DEFAULT_SHOWCASE_FILTERS

# One extra panel for the unfiltered original; lay the panels out three per row.
n_panels = len(showcase_filters) + 1
cols = 3
rows = math.ceil(n_panels / cols)
fig, axes = plt.subplots(rows, cols, figsize=(5 * cols, 4.4 * rows))
# Flatten so panels can be addressed by a single running index.
flat_axes = np.array(axes).reshape(-1)

show(src_img, "Original", ax=flat_axes[0])
for idx, fid in enumerate(showcase_filters, start=1):
    # backend is left at its default ("gpu").
    out = apply_iir2d_rgb(src_img, filter_id=fid, border_mode="mirror", precision="f32")
    show(out, FILTER_LABELS[fid], ax=flat_axes[idx])

# Hide any grid cells left over after the last panel.
for ax in flat_axes[n_panels:]:
    ax.axis("off")

plt.suptitle(f"IIR2D Default Visual Tour — {scene_name}", fontsize=16)
plt.tight_layout()

## Checkerboard Lab (Interactive)

This section intentionally explores checkerboard/patch artifacts for block-scan filters (`F3/F4/F8`) on the native GPU core.
Use sliders for image dimensions and scene frequency to make boundary artifacts more or less visible.

In [None]:
def make_boundary_stress_scene(height=512, width=512, freq=1.0):
    """Build a smooth angular/radial test pattern with a faint checker overlay.

    Args:
        height: Image height in pixels.
        width: Image width in pixels.
        freq: Multiplier applied to every spatial frequency in the pattern.

    Returns:
        ``(height, width, 3)`` array clipped to [0, 1].
    """
    y, x = np.mgrid[-1:1:complex(0, height), -1:1:complex(0, width)]
    radial = np.sqrt(x * x + y * y)
    ang = np.arctan2(y, x)
    f = float(freq)

    channels = (
        normalize01(0.55 * np.cos(6 * f * ang) + 0.45 * np.sin(4 * f * radial + 8 * f * x)),
        normalize01(0.50 * np.sin(7 * f * ang + 5 * f * radial) + 0.35 * np.cos(10 * f * y)),
        normalize01(0.40 * np.cos(9 * f * radial) + 0.50 * np.sin(5 * f * x - 3 * f * y)),
    )

    # Low-amplitude checker (0 or 0.08) added equally to all three channels.
    col_cell = np.floor((x + 1.0) * width / 32)
    row_cell = np.floor((y + 1.0) * height / 32)
    checker = (((col_cell + row_cell) % 2) * 0.08).astype(np.float32)

    img = np.stack([channel + checker for channel in channels], axis=-1)
    return np.clip(img, 0.0, 1.0)


def boundary_discontinuity(img, grid=256):
    """Mean absolute jump across every *grid*-aligned row and column boundary.

    For each interior multiple of *grid*, the pixels on either side of the
    boundary are compared. The per-boundary means are then averaged.

    Returns:
        The average jump as a float, or 0.0 when the image has no interior
        boundary at this grid size.
    """
    n_rows, n_cols = img.shape[:2]
    jumps = [
        np.mean(np.abs(img[:, col, :] - img[:, col - 1, :]))
        for col in range(grid, n_cols, grid)
    ]
    jumps += [
        np.mean(np.abs(img[row, :, :] - img[row - 1, :, :]))
        for row in range(grid, n_rows, grid)
    ]
    if not jumps:
        return 0.0
    return float(np.mean(jumps))


def render_checkerboard_lab(filter_id=8, width=512, height=512, freq=1.0, border_mode="mirror"):
    """Render the stress scene next to two GPU-filtered versions of it.

    The middle panel uses *filter_id* and is annotated with the boundary jump
    measured on a 256-pixel grid; faint guide lines mark that grid. The right
    panel uses filter 2 with the same border mode for comparison.
    """
    scene = make_boundary_stress_scene(height=height, width=width, freq=freq)

    gpu_kwargs = dict(border_mode=border_mode, precision="f32", backend="gpu")
    blockscan = apply_iir2d_rgb(scene, filter_id=filter_id, **gpu_kwargs)
    stable = apply_iir2d_rgb(scene, filter_id=2, **gpu_kwargs)

    disc = boundary_discontinuity(blockscan, grid=256)

    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    panels = (
        (scene, f"Original {width}x{height}"),
        (blockscan, f"{FILTER_LABELS[filter_id]} | boundary jump={disc:.5f}"),
        (stable, "Stable reference (F2 GPU)"),
    )
    for ax, (image, caption) in zip(axes, panels):
        show(image, caption, ax=ax)

    # Guide lines sit between pixels, hence the half-pixel offset.
    overlay = axes[1]
    guide_style = dict(color="white", alpha=0.25, linewidth=0.8)
    for bx in range(256, width, 256):
        overlay.axvline(bx - 0.5, **guide_style)
    for by in range(256, height, 256):
        overlay.axhline(by - 0.5, **guide_style)

    plt.tight_layout()
    plt.show()


# The lab renders through the GPU backend only.
if not IIR2D_CORE_READY:
    raise RuntimeError("Checkerboard lab requires native GPU core.")

try:
    import ipywidgets as widgets

    # Interactive path: each control maps onto one render_checkerboard_lab argument.
    widgets.interact(
        render_checkerboard_lab,
        filter_id=widgets.Dropdown(options=[3, 4, 8], value=8, description="filter"),
        width=widgets.IntSlider(value=512, min=256, max=896, step=64, description="width"),
        height=widgets.IntSlider(value=512, min=256, max=896, step=64, description="height"),
        freq=widgets.FloatSlider(value=1.0, min=0.5, max=2.0, step=0.1, description="freq"),
        border_mode=widgets.Dropdown(options=["mirror", "clamp", "wrap", "constant"], value="mirror", description="border"),
    )
except Exception:
    # Best-effort fallback: any failure above (including ipywidgets not being
    # importable) produces three static renders at fixed sizes instead.
    for dims in ((512, 512), (640, 512), (768, 640)):
        render_checkerboard_lab(filter_id=8, width=dims[0], height=dims[1], freq=1.0, border_mode="mirror")

In [None]:
# Same scene and filter under each border mode, one panel per mode.
src_img = scenes["Mountain Scroll"]
border_modes = ["clamp", "mirror", "wrap", "constant"]

fig, axes = plt.subplots(1, 4, figsize=(20, 5))
for ax, border in zip(axes, border_modes):
    # border_const is passed for every mode.
    # NOTE(review): presumably only the "constant" mode reads it - confirm in the core.
    out = apply_iir2d_rgb(src_img, filter_id=2, border_mode=border, precision="f32", border_const=0.08)
    show(out, f"Border: {border}", ax=ax)

plt.suptitle("Border Semantics Matter (Filter 2)", fontsize=15)
plt.tight_layout()

In [None]:
# Run the same filter in each precision mode and measure drift against f32.
src_img = scenes["Microbe Swarm"]

out_f32 = apply_iir2d_rgb(src_img, filter_id=2, border_mode="mirror", precision="f32")
out_mixed = apply_iir2d_rgb(src_img, filter_id=2, border_mode="mirror", precision="mixed")
out_f64 = apply_iir2d_rgb(src_img.astype(np.float64), filter_id=2, border_mode="mirror", precision="f64")

# Mean absolute difference to the f32 result; the f64 output is cast down first.
delta_mixed = np.abs(out_f32 - out_mixed).mean()
delta_f64 = np.abs(out_f32 - out_f64.astype(np.float32)).mean()

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
show(out_f32, "f32", ax=axes[0])
show(out_mixed, f"mixed (mean |d|={delta_mixed:.6f})", ax=axes[1])
show(out_f64, f"f64 (mean |d|={delta_f64:.6f})", ax=axes[2])
plt.suptitle("Precision Modes: Visual + Numeric Drift", fontsize=15)
plt.tight_layout()

In [None]:
def make_videoish_sequence(base, n=12):
    """Turn one image into *n* jittered, noisy frames.

    Frame *t* is *base* shifted with wrap-around along a small circular path,
    dimmed to 90 %, perturbed with Gaussian noise (sigma 0.03) and clipped to
    [0, 1]. The noise generator has a fixed seed, so the sequence is the same
    on every call.
    """
    rng = np.random.default_rng(13)

    def jittered(t):
        phase = 2 * math.pi * t / n
        dx = int(5 * math.sin(phase))
        dy = int(4 * math.cos(phase))
        # Roll rows by dy and columns by dx in a single call.
        shifted = np.roll(base, shift=(dy, dx), axis=(0, 1))
        noise = rng.normal(0.0, 0.03, size=base.shape).astype(np.float32)
        return np.clip(0.9 * shifted + noise, 0.0, 1.0)

    return [jittered(t) for t in range(n)]


def temporal_energy(frames):
    """Average mean-absolute difference between consecutive frames.

    Args:
        frames: Sequence of equally shaped arrays in temporal order.

    Returns:
        Mean over all consecutive pairs of ``mean(|frame[i + 1] - frame[i]|)``
        as a float. Sequences with fewer than two frames have no consecutive
        pair and give 0.0.
    """
    # Without this guard np.mean([]) would return NaN and emit a RuntimeWarning.
    if len(frames) < 2:
        return 0.0
    diffs = [np.mean(np.abs(nxt - cur)) for cur, nxt in zip(frames, frames[1:])]
    return float(np.mean(diffs))


# 12 synthetic frames in, the same 12 frames filtered independently out.
seq_in = make_videoish_sequence(scenes["Cosmic Portrait"], n=12)
seq_out = [apply_iir2d_rgb(f, filter_id=2, border_mode="mirror", precision="f32") for f in seq_in]

# Frame-to-frame change before and after filtering.
e_in = temporal_energy(seq_in)
e_out = temporal_energy(seq_out)

# Only the first six frames are drawn: inputs on the top row, outputs below.
fig, axes = plt.subplots(2, 6, figsize=(22, 8))
for i in range(6):
    show(seq_in[i], f"In t={i}", ax=axes[0, i])
    show(seq_out[i], f"Out t={i}", ax=axes[1, i])
plt.suptitle(
    f"Video-ish sequence (top=raw, bottom=filtered) | mean temporal energy: {e_in:.4f} -> {e_out:.4f}",
    fontsize=14,
)
plt.tight_layout()

## Live Video Scrubber (Side-by-Side + Per-Frame Metrics)

Scrub frame-by-frame through original vs filtered sequences and inspect quality metrics per frame.

In [None]:
# The scrubber frames are filtered with backend="gpu", so stop early without the core.
if not IIR2D_CORE_READY:
    raise RuntimeError("Video scrubber requires native GPU core.")


def _psnr(a, b, peak=1.0):
    mse = float(np.mean((a - b) ** 2))
    if mse <= 1e-12:
        return 99.0
    return 10.0 * math.log10((peak * peak) / mse)


def _frame_metrics(a, b):
    """Return the error metrics between two frames as a dict.

    Keys: ``"mae"`` (mean absolute error), ``"rmse"`` (root mean squared
    error) and ``"psnr"`` (from ``_psnr`` with a peak of 1.0).
    """
    delta = a - b
    return {
        "mae": float(np.mean(np.abs(delta))),
        "rmse": float(np.sqrt(np.mean(delta * delta))),
        "psnr": _psnr(a, b, peak=1.0),
    }


def _temporal_jump(seq, idx):
    if idx == 0:
        return 0.0
    return float(np.mean(np.abs(seq[idx] - seq[idx - 1])))


# 16-frame sequence for the scrubber; each frame is filtered independently on the GPU.
seq_scrub_in = make_videoish_sequence(scenes["Cosmic Portrait"], n=16)
seq_scrub_out = [
    apply_iir2d_rgb(f, filter_id=2, border_mode="mirror", precision="f32", backend="gpu")
    for f in seq_scrub_in
]

# Metrics are computed once here so scrubbing only has to look them up.
per_frame = [_frame_metrics(a, b) for a, b in zip(seq_scrub_in, seq_scrub_out)]


def render_scrubber(frame_idx=0, show_diff=True):
    """Show one scrubber frame: original, filtered and, optionally, their difference.

    The figure title carries the precomputed per-frame metrics and the
    frame-to-frame jump of the input and output sequences.
    """
    i = int(frame_idx)
    src_f = seq_scrub_in[i]
    out_f = seq_scrub_out[i]
    m = per_frame[i]
    tj_in = _temporal_jump(seq_scrub_in, i)
    tj_out = _temporal_jump(seq_scrub_out, i)

    panels = [
        (src_f, f"Original t={i}"),
        (out_f, f"IIR2D F2 t={i}"),
    ]
    if show_diff:
        # The difference is renormalised per frame so small changes stay visible.
        panels.append((normalize01(np.abs(out_f - src_f)), "|Diff| (normalized)"))

    # 6 inches of width per panel: 12x6 without the diff, 18x6 with it.
    fig, axes = plt.subplots(1, len(panels), figsize=(6 * len(panels), 6))
    for ax, (image, caption) in zip(axes, panels):
        show(image, caption, ax=ax)

    fig.suptitle(
        f"Frame {i}: MAE={m['mae']:.4f}, RMSE={m['rmse']:.4f}, PSNR={m['psnr']:.2f} dB | "
        f"temporal jump in/out={tj_in:.4f}/{tj_out:.4f}",
        fontsize=12,
    )
    plt.tight_layout()
    plt.show()


try:
    import ipywidgets as widgets

    # Interactive path: a frame slider plus a toggle for the difference panel.
    widgets.interact(
        render_scrubber,
        frame_idx=widgets.IntSlider(value=0, min=0, max=len(seq_scrub_in) - 1, step=1, description="frame"),
        show_diff=widgets.Checkbox(value=True, description="show diff"),
    )
except Exception:
    # Best-effort static fallback: every fifth frame. Deriving the indices from
    # the sequence length (0, 5, 10, 15 for 16 frames) keeps this from running
    # past the end if the frame count above is changed.
    for i in range(0, len(seq_scrub_in), 5):
        render_scrubber(frame_idx=i, show_diff=True)

In [None]:
if not IIR2D_CORE_READY:
    raise RuntimeError("Benchmark cell requires native GPU core.")

# Fixed-seed random image so every run times the same input.
x = np.random.default_rng(0).random((256, 256, 3), dtype=np.float32)

# One untimed call per filter before measuring: the first call for a filter can
# include one-time start-up cost that would otherwise inflate its average.
quick_warmup = 1
quick_iters = 3

results_gpu = []
for fid in range(1, 9):
    for _ in range(quick_warmup):
        _ = apply_iir2d_rgb(x, filter_id=fid, border_mode="mirror", precision="f32", backend="gpu")

    t0 = time.perf_counter()
    for _ in range(quick_iters):
        _ = apply_iir2d_rgb(x, filter_id=fid, border_mode="mirror", precision="f32", backend="gpu")
    dt = (time.perf_counter() - t0) / quick_iters
    results_gpu.append((fid, dt * 1000.0))

print(f"Native GPU core timing (256x256x3, avg of {quick_iters} runs):")
for fid, ms in results_gpu:
    print(f"  Filter {fid}: {ms:8.2f} ms")

# CPU reference is parity-only: a single timed run of one filter, for scale.
ref_t0 = time.perf_counter()
_ = apply_iir2d_rgb(x, filter_id=2, border_mode="mirror", precision="f32", backend="reference")
ref_ms = (time.perf_counter() - ref_t0) * 1000.0
print(f"Reference sample (CPU, Filter 2): {ref_ms:.2f} ms")

## CPU vs GPU Timing (Reference vs Production Path)

This benchmark compares:
- **GPU**: native CUDA core (`iir2d_forward_cuda`) — production path
- **CPU**: canonical reference implementation — correctness/parity path

Use this to quantify why the GPU core is the primary offering.

In [None]:
if not IIR2D_CORE_READY:
    raise RuntimeError("CPU-vs-GPU benchmark requires native GPU core.")

# Fixed-seed random image so both backends time the same input.
bench_img = np.random.default_rng(123).random((320, 320, 3), dtype=np.float32)
filter_ids = list(range(1, 9))

# Keep counts modest for notebook responsiveness while still illustrative.
gpu_warmup = 1
gpu_iters = 3
cpu_iters = 1

# One (filter id, GPU ms, CPU ms, CPU/GPU ratio) tuple per filter.
rows = []
for fid in filter_ids:
    # Untimed GPU warm-up calls.
    for _ in range(gpu_warmup):
        _ = apply_iir2d_rgb(bench_img, filter_id=fid, border_mode="mirror", precision="f32", backend="gpu")

    t0 = time.perf_counter()
    for _ in range(gpu_iters):
        _ = apply_iir2d_rgb(bench_img, filter_id=fid, border_mode="mirror", precision="f32", backend="gpu")
    gpu_ms = (time.perf_counter() - t0) * 1000.0 / gpu_iters

    # The CPU reference is timed without a warm-up.
    t0 = time.perf_counter()
    for _ in range(cpu_iters):
        _ = apply_iir2d_rgb(bench_img, filter_id=fid, border_mode="mirror", precision="f32", backend="reference")
    cpu_ms = (time.perf_counter() - t0) * 1000.0 / cpu_iters

    # Guard against a zero GPU time when forming the ratio.
    speedup = cpu_ms / max(gpu_ms, 1e-9)
    rows.append((fid, gpu_ms, cpu_ms, speedup))

print("320x320x3, mirror border, f32")
print(f"{'Filter':>6} {'GPU ms':>10} {'CPU ms':>10} {'CPU/GPU':>10}")
for fid, gpu_ms, cpu_ms, speedup in rows:
    print(f"{fid:>6d} {gpu_ms:>10.2f} {cpu_ms:>10.2f} {speedup:>10.2f}x")

mean_speedup = float(np.mean([r[3] for r in rows]))
med_speedup = float(np.median([r[3] for r in rows]))
print(f"\nMean speedup:   {mean_speedup:.2f}x")
print(f"Median speedup: {med_speedup:.2f}x")

labels = [f"F{fid}" for fid, *_ in rows]
gpu_vals = [r[1] for r in rows]
cpu_vals = [r[2] for r in rows]

# Grouped bar chart: GPU bar left of each tick, CPU bar right of it.
x = np.arange(len(labels))
width = 0.38
fig, ax = plt.subplots(figsize=(12, 5))
ax.bar(x - width / 2, gpu_vals, width, label="GPU core", color="#0ea5e9")
ax.bar(x + width / 2, cpu_vals, width, label="CPU reference", color="#f97316")
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.set_ylabel("Latency (ms)")
ax.set_title("IIR2D CPU vs GPU Timing by Filter")
ax.legend()
ax.grid(alpha=0.25, axis="y")
plt.tight_layout()
plt.show()

## Export Selected Demos (README-Ready PNG/MP4)

This cell exports representative notebook outputs into `docs/notebooks/exports/` for direct repository use.

In [None]:
# Exports include GPU-filtered images, so the core must be loaded.
if not IIR2D_CORE_READY:
    raise RuntimeError("Export section requires native GPU core.")

# NOTE(review): the setup cell only pip-installs numpy and matplotlib, so imageio
# (and an ffmpeg backend for the MP4 export) must already be present - confirm
# for non-Colab environments.
import imageio.v2 as imageio

# Relative to the current working directory.
export_dir = Path("docs/notebooks/exports")
export_dir.mkdir(parents=True, exist_ok=True)


def _save_png(path: Path, img: np.ndarray):
    """Write *img* to *path* as an 8-bit image.

    Args:
        path: Destination file.
        img: Float image; values are clipped to [0, 1] before conversion.
    """
    arr = np.clip(img, 0.0, 1.0)
    # Round to the nearest 8-bit level. A bare astype() truncates, which darkens
    # every pixel by up to one level and maps only exactly 1.0 to 255.
    arr8 = np.rint(arr * 255.0).astype(np.uint8)
    imageio.imwrite(path, arr8)


def _make_side_by_side_frame(a: np.ndarray, b: np.ndarray):
    h = min(a.shape[0], b.shape[0])
    w = min(a.shape[1], b.shape[1])
    aa = a[:h, :w]
    bb = b[:h, :w]
    sep = np.zeros((h, 6, 3), dtype=np.float32)
    sep[..., 1] = 1.0
    return np.concatenate([aa, sep, bb], axis=1)


def _write_mp4(path: Path, frames, fps=8):
    """Encode *frames* to an H.264 video at *path*.

    Args:
        path: Destination file.
        frames: Iterable of float images; each is clipped to [0, 1] before
            conversion to 8 bits.
        fps: Frames per second passed to the writer.
    """
    with imageio.get_writer(path, fps=fps, codec="libx264", quality=7) as w:
        for fr in frames:
            arr = np.clip(fr, 0.0, 1.0)
            # Round rather than truncate so frames are not darkened by up to
            # one 8-bit level.
            w.append_data(np.rint(arr * 255.0).astype(np.uint8))


# Source image plus two GPU-filtered variants for the still exports.
scene = scenes["Cosmic Portrait"]
scene_f2 = apply_iir2d_rgb(scene, filter_id=2, border_mode="mirror", precision="f32", backend="gpu")
scene_f4 = apply_iir2d_rgb(scene, filter_id=4, border_mode="mirror", precision="f32", backend="gpu")

_save_png(export_dir / "readme_cosmic_original.png", scene)
_save_png(export_dir / "readme_cosmic_f2_gpu.png", scene_f2)
_save_png(export_dir / "readme_cosmic_f4_gpu.png", scene_f4)

# export a filter-tour figure png
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
show(scene, "Original", ax=axes[0])
show(scene_f2, "F2 GPU", ax=axes[1])
show(scene_f4, "F4 GPU", ax=axes[2])
fig.tight_layout()
fig_path = export_dir / "readme_filter_triptych.png"
fig.savefig(fig_path, dpi=170, bbox_inches="tight")
# Close the figure after saving so it is not also rendered inline.
plt.close(fig)

# export side-by-side mp4 from scrubber sequence
# (reuses seq_scrub_in / seq_scrub_out from the scrubber cell, which must have run first)
video_frames = [_make_side_by_side_frame(a, b) for a, b in zip(seq_scrub_in, seq_scrub_out)]
video_path = export_dir / "readme_video_compare_f2.mp4"
_write_mp4(video_path, video_frames, fps=8)

# List every readme_* file in the export directory, including ones from earlier runs.
print("Exported assets:")
for path in sorted(export_dir.glob("readme_*")):
    print(f"- {path} ({path.stat().st_size} bytes)")

## Auto-Generate Benchmark Claims Packet

Runs the official CUDA benchmark harness on a compact matrix and builds a claims packet markdown file.

In [None]:
if not IIR2D_CORE_READY:
    raise RuntimeError("Claims packet generation requires native GPU core.")

import shlex

# Outputs go into the same exports folder as the images and video.
claims_dir = Path("docs/notebooks/exports")
claims_dir.mkdir(parents=True, exist_ok=True)

bench_csv = claims_dir / "notebook_core_bench.csv"
claims_md = claims_dir / "notebook_claims_packet.md"

# Compact benchmark matrix: two sizes x four filters, one border mode, one precision.
bench_cmd = [
    sys.executable,
    "scripts/benchmark_core_cuda.py",
    "--sizes", "256x256,512x512",
    "--filter_ids", "1,2,4,8",
    "--border_modes", "mirror",
    "--precisions", "f32",
    "--warmup", "2",
    "--iters", "6",
    "--out_csv", str(bench_csv),
]

print("Running benchmark harness...")
# check_call raises CalledProcessError if the script exits non-zero.
subprocess.check_call(bench_cmd)

# Shell-quoted copy of the command, passed to the packet builder as text.
bench_cmd_str = " ".join(shlex.quote(tok) for tok in bench_cmd)
packet_cmd = [
    sys.executable,
    "scripts/build_benchmark_claims_packet.py",
    "--in_csv", str(bench_csv),
    "--out_md", str(claims_md),
    "--benchmark_command", bench_cmd_str,
]

print("Building claims packet...")
subprocess.check_call(packet_cmd)

print(f"Claims packet: {claims_md}")
print("Preview:")
print("-" * 80)
# Show only the first 28 lines of the generated markdown.
print("\n".join(claims_md.read_text(encoding="utf-8").splitlines()[:28]))
print("-" * 80)

In [None]:
# Native core sanity check + reference parity spot-check
# A GPU without a loaded core means the bootstrap failed: report it loudly.
if GPU_AVAILABLE and not IIR2D_CORE_READY:
    raise RuntimeError(
        "GPU detected but native iir2d core is not ready. "
        f"reason={IIR2D_CORE_REASON}"
    )

if not IIR2D_CORE_READY:
    print("No CUDA GPU runtime detected; GPU core demo unavailable in this environment.")
else:
    # Single channel (red) of the portrait, as float32.
    x_np = scenes["Cosmic Portrait"][..., 0].astype(np.float32)

    # Same filter settings through the GPU core and the CPU reference.
    y_gpu = _iir2d_core_cuda_2d(x_np, filter_id=2, border_mode="mirror", precision="f32")
    y_ref = iir2d_cpu_reference(x_np, filter_id=2, border_mode="mirror", precision="f32")

    mean_abs = float(np.mean(np.abs(y_gpu - y_ref)))

    print("GPU core: active")
    print("core lib:", IIR2D_CORE_LIB_PATH)
    print("output stats:", float(y_gpu.min()), float(y_gpu.max()), float(y_gpu.mean()))
    # The difference is printed only; no tolerance is enforced here.
    print(f"reference parity spot-check (mean abs diff): {mean_abs:.6e}")

## Takeaways

- This notebook is native-core GPU-first: demos call `iir2d_forward_cuda` directly.
- JAX is intentionally removed from the execution path.
- CPU is used for reference/parity and explicit CPU-vs-GPU comparison only.
- Border semantics materially affect outputs.
- Checkerboard-like patches in `F3/F4/F8` come from block-scan boundary composition and are isolated to the interactive lab.
- The notebook now includes scrubber metrics, export tooling, and claims packet generation for commercialization-ready outputs.

Next steps if needed:
1. tune export presets for GitHub-dark/light thumbnail readability,
2. add optional ffmpeg overlays for titles/watermarks,
3. expand claims matrix sizes for full protocol coverage.