# Colab Quickstart (5-10 min)

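This notebook makes sure the package is installed (skipped when it is already importable; otherwise installed from a local checkout if one is found, else from Git) and writes a minimal experiment config next to the notebook.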

It runs the full end-to-end pipeline on a tiny fixture dataset (no NinaPro download):
`prepare -> splits -> traineval -> report`, then runs `size` for quick sizing estimates.

Outputs are written under `runs/colab_quickstart/`.

> This is a tutorial run (smoke-like config), not a benchmark for reporting results.


In [8]:
import subprocess
import sys
from pathlib import Path


In [9]:
def sh(cmd):
    """Echo a command line, then execute it.

    Args:
        cmd: Sequence of command arguments; each item is stringified for the
            echoed line and the sequence is handed to ``subprocess`` as-is.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    echoed = " ".join(str(part) for part in cmd)
    print("+", echoed)
    subprocess.check_call(cmd)


# pip requirement used when no local checkout is found (repository URL + ref).
GIT_URL = (
    "git+https://github.com/geronimobergk/semg-protocol-sensitivity.git"
    + "@fix/colab-notebook"
)

try:
    import tinyml_semg_classifier  # noqa: F401

    print("Package already available - skipping install")
except Exception:
    # The package could not be imported: install it, preferring an editable
    # install from a checkout in the working directory or its parent.
    repo_root = Path.cwd().resolve()
    pip_install = [sys.executable, "-m", "pip", "install"]
    local_checkout = next(
        (
            candidate
            for candidate in (repo_root, repo_root.parent)
            if (candidate / "pyproject.toml").exists()
        ),
        None,
    )
    if local_checkout is None:
        sh(pip_install + [GIT_URL])
    else:
        sh(pip_install + ["-e", str(local_checkout)])


Package already available - skipping install


In [10]:
# sanity check
import torch

# Report the interpreter and torch versions, and whether CUDA is usable.
python_version = sys.version.split()[0]
cuda_available = torch.cuda.is_available()
print(f"python: {python_version} | torch: {torch.__version__} | cuda: {cuda_available}")


python: 3.14.0 | torch: 2.9.1 | cuda: False


## Write a tiny experiment config

We generate a small synthetic fixture (written under `runs/colab_quickstart/`) and write a minimal smoke-like config next to the notebook.


In [11]:
import textwrap

import numpy as np


# The notebook's working directory anchors the config file and all outputs.
NOTEBOOK_DIR = Path.cwd().resolve()

# Every artifact of this quickstart lives under runs/colab_quickstart/.
OUT_ROOT = NOTEBOOK_DIR.joinpath("runs", "colab_quickstart").resolve()
OUT_ROOT.mkdir(parents=True, exist_ok=True)

fixture_path = OUT_ROOT.joinpath("fixture_tiny.npz")


def fixture_needs_regen(path: Path) -> bool:
    """Decide whether the fixture archive at ``path`` must be (re)generated.

    The archive is reusable only when all of the following hold:

    * the file exists and ``np.load`` can read it;
    * it contains ``X`` and the six per-window metadata arrays;
    * ``X`` and every metadata array share the same, non-zero row count;
    * no two windows share the same metadata tuple.

    Args:
        path: Location of the ``.npz`` fixture file.

    Returns:
        ``True`` if the fixture is missing, unreadable, incomplete, empty,
        misaligned, or contains duplicate metadata rows; ``False`` if it can
        be reused as-is.
    """
    if not path.exists():
        return True

    meta_names = [
        "subject_id",
        "rep_id",
        "gesture_id",
        "exercise_id",
        "sample_start",
        "sample_end",
    ]
    required = ["X", *meta_names]
    try:
        with np.load(path) as data:
            if any(name not in data for name in required):
                return True
            meta = [np.asarray(data[name]) for name in meta_names]
            num_rows = meta[0].shape[0]
            # A fixture without any window holds nothing to work with.
            if num_rows == 0:
                return True
            # X must be row-aligned with the metadata, not merely present.
            if np.asarray(data["X"]).shape[0] != num_rows:
                return True
            if any(arr.shape[0] != num_rows for arr in meta):
                return True
            # Each window must be identified by a unique metadata tuple.
            rows = set(zip(*(arr.tolist() for arr in meta)))
            return len(rows) != num_rows
    except Exception:
        # Best effort: any failure to read or interpret the archive simply
        # means it should be rebuilt.
        return True


if fixture_needs_regen(fixture_path):
    # Fixed seed so the synthetic signal is identical on every regeneration.
    rng = np.random.default_rng(0)

    subjects = [1, 2]
    reps = [1, 2, 3]
    gestures = [1, 2]
    windows_per_combo = 5
    num_electrodes = 2
    num_samples = 8

    # One (subject, rep, gesture) triple per window, ordered subject-major,
    # with each combination repeated ``windows_per_combo`` times.
    records = []
    for subject in subjects:
        for rep in reps:
            for gesture in gestures:
                records.extend([(subject, rep, gesture)] * windows_per_combo)
    num_windows = len(records)

    X = rng.standard_normal((num_windows, num_electrodes, num_samples)).astype(
        "float32"
    )

    subject_col, rep_col, gesture_col = zip(*records)
    subject_id = np.array(subject_col, dtype="int32")
    rep_id = np.array(rep_col, dtype="int32")
    gesture_id = np.array(gesture_col, dtype="int32")
    exercise_id = np.ones(num_windows, dtype="int32")

    # Windows are laid out back to back: window i covers samples
    # [i * num_samples, i * num_samples + num_samples - 1].
    sample_start = np.arange(num_windows, dtype="int32") * num_samples
    sample_end = sample_start + (num_samples - 1)

    fixture_arrays = {
        "X": X,
        "subject_id": subject_id,
        "rep_id": rep_id,
        "gesture_id": gesture_id,
        "exercise_id": exercise_id,
        "sample_start": sample_start,
        "sample_end": sample_end,
    }
    np.savez(fixture_path, **fixture_arrays)

    print("Wrote fixture:", fixture_path)


# The experiment config is written next to the notebook; all outputs go under
# OUT_ROOT.
config_path = NOTEBOOK_DIR / "colab_quickstart.yaml"

# Paths are interpolated into YAML double-quoted scalars, where a backslash
# starts an escape sequence. Forward-slash (POSIX-style) paths keep the YAML
# valid on Windows and are unchanged on POSIX systems.
artifacts_root = (OUT_ROOT / "artifacts").as_posix()
runs_root = (OUT_ROOT / "runs").as_posix()
reports_root = (OUT_ROOT / "reports").as_posix()

# Single braces ({artifacts_root}) are filled in by ``str.format`` below.
# Doubled braces ({{...}}) are written to the file as literal {...}
# placeholders.
config_text = textwrap.dedent(
    """    profile: colab_quickstart

    experiment:
      id: colab_quickstart
      artifacts_root: "{artifacts_root}"
      runs_root: "{runs_root}"
      reports_root: "{reports_root}"

    dataset:
      source: fixture
      fixture_path: "{fixture_path}"
      sampling_rate_hz: 2000
      channels: 2
      subjects: [1, 2]

    preprocess:
      id: fixture_tiny
      window_ms: 4
      hop_ms: 2
      window_samples: 8
      hop_samples: 4
      cache: false
      output:
        windows_path: "{{artifacts_root}}/data/{{preprocess_id}}/windows_s{{subject_id}}.npy"
        meta_path: "{{artifacts_root}}/data/{{preprocess_id}}/meta.json"

    manifest:
      id: fixture_tiny
      output:
        manifest_csv: "{{artifacts_root}}/manifests/{{manifest_id}}/manifest.csv"

    splits:
      cache: false
      allow_missing_classes: true

    protocols:
      pooled_repdisjoint:
        type: pooled_repdisjoint
        output_dir: "{artifacts_root}/splits/protocol=pooled_repdisjoint"
        reps:
          all: [1, 2, 3]
          test: [2]
          val: [3]
      loso:
        type: loso
        output_dir: "{artifacts_root}/splits/protocol=loso"
        subjects: [1, 2]
        reps:
          all: [1, 2, 3]
          test: [2]
          val: [3]

    models:
      tiny_cnn:
        architecture: ST_CNN_GN
        params:
          num_electrodes: 2
          num_samples: 8
          conv_channels: [4]
          kernel_size: [3, 3]
          pool_sizes: [[1, 1]]
          conv_dropout: 0.0
          gn_groups: 1
          head_hidden: [8, 4]
          head_dropout: 0.0
          num_classes: 2

    train:
      device: cpu
      seeds: [0]
      max_steps: 10
      max_epochs: 1
      batch_size: 4
      num_workers: 0
      log_every: 1
      optimizer:
        name: adamw
        lr: 0.001
        weight_decay: 0.0
      checkpoint:
        primary: last
        save_best: true
        save_last: true
      early_stopping:
        enabled: false

    eval:
      max_batches: 5
      latency:
        enabled: false

    plan:
      models: [tiny_cnn]
      protocols: [pooled_repdisjoint, loso]
      max_jobs: 1
    """
).format(
    artifacts_root=artifacts_root,
    runs_root=runs_root,
    reports_root=reports_root,
    fixture_path=fixture_path.as_posix(),
)

config_path.write_text(config_text, encoding="utf-8")

print("Config written to:", config_path)
print("Outputs root:", OUT_ROOT)


Config written to: /Users/geronimo/Projects/semg-protocol-sensitivity/notebooks/colab_quickstart.yaml
Outputs root: /Users/geronimo/Projects/semg-protocol-sensitivity/notebooks/runs/colab_quickstart


## Run the tiny end-to-end pipeline

This executes:

- `prepare` (fixture preprocessing)
- `splits` (pooled rep-disjoint + LOSO)
- `traineval` (tiny CNN, capped steps)
- `report` (aggregated tables)

Config: `colab_quickstart.yaml`.


In [12]:
from tinyml_semg_classifier.cli import run_pipeline
from tinyml_semg_classifier.config import load_config


# Load the YAML config written by the previous cell.
config_file = str(config_path)
cfg = load_config(config_file)

# Run the whole pipeline on the loaded config.
run_pipeline(cfg)


## Inspect outputs

We print the generated protocol tables and one example `metrics.json` to confirm the pipeline produced results end-to-end.


In [13]:
import json

# Show the aggregated report tables.
reports_root = OUT_ROOT.joinpath("reports")
tables_path = reports_root.joinpath("protocol_tables.md")

print("protocol_tables.md ->", tables_path)
with tables_path.open(encoding="utf-8") as tables_file:
    print(tables_file.read())

# Count every per-run metrics file and preview the first one in sorted order.
metrics_paths = sorted(OUT_ROOT.joinpath("runs").rglob("metrics.json"))
print("metrics.json files:", len(metrics_paths))
for sample in metrics_paths[:1]:
    print("Example run ->", sample)
    with sample.open(encoding="utf-8") as metrics_file:
        payload = json.load(metrics_file)
    pretty = json.dumps(payload, indent=2)
    # Cap the preview so a large payload does not flood the cell output.
    print(pretty[:2000])


protocol_tables.md -> /Users/geronimo/Projects/semg-protocol-sensitivity/notebooks/runs/colab_quickstart/reports/protocol_tables.md
#### Table 1 – Performance across evaluation protocols (mean ± std)

| Protocol                     | Model       | Balanced Acc. [%] | Macro-F1 [%] |
| ---------------------------- | ----------- | ----------------- | ------------ |
| Single-subject, rep-disjoint | ST-CNN | n/a | n/a |
| Pooled, rep-disjoint | ST-CNN | **45.0 ± 0.0** | 43.7 ± 0.0 |
| Cross-subject (LOSO) | ST-CNN | **40.0 ± 0.0** | 38.8 ± 1.3 |

### Table 2 – Protocol-Dependent Model Ranking

| Evaluation Protocol                 | Primary Generalization Axis | Better Model | Δ Balanced Accuracy (pp) | Ranking Stability |
| ----------------------------------- | --------------------------- | ------------ | ------------------------ | ----------------- |
| Single-subject, repetition-disjoint | Repetitions (within-user) | n/a | n/a | n/a |
| Pooled, repetition-disjoint | Repetitions (seen user

## Sizing: `size`

`size` benchmarks a few steps, probes concurrency, and estimates wall-time plus resources.

We keep this tiny and CPU-only for Colab speed.


In [14]:
import json

from tinyml_semg_classifier.cli import size
from tinyml_semg_classifier.config import load_config


cfg = load_config(str(config_path))

# Smallest possible benchmark: a handful of steps on CPU, no concurrency probe
# beyond a single job on a single device.
bench_options = {
    "warmup_steps": 1,
    "bench_train_steps": 5,
    "bench_val_steps": 5,
    "device": "cpu",
    "max_k": 1,
    "max_gpus": 1,
    "alpha": 1.0,
}
size(cfg, **bench_options)

# Read back the sizing report from the artifacts directory.
sizing_path = OUT_ROOT.joinpath("artifacts", "sizing", "sizing.json")
with sizing_path.open(encoding="utf-8") as sizing_file:
    sizing = json.load(sizing_file)

print("sizing.json ->", sizing_path)
# Cap the dump so a large report does not flood the cell output.
print(json.dumps(sizing, indent=2)[:2000])

# Highlight the two summary sections when they are present and non-empty.
recommendation = sizing.get("recommendation") or {}
walltime_by_gpus = sizing.get("walltime_by_gpus") or []
for label, section in (
    ("Recommendation:", recommendation),
    ("Walltime by GPUs:", walltime_by_gpus),
):
    if section:
        print(label, section)


sizing.json -> /Users/geronimo/Projects/semg-protocol-sensitivity/notebooks/runs/colab_quickstart/artifacts/sizing/sizing.json
{
  "baseline_per_model": {
    "tiny_cnn": {
      "rss_peak_gb": 0.3553924560546875,
      "samples_sec": 2895.87384846481,
      "vram_peak_gb": 0.0
    }
  },
  "hardware_detected": {
    "cpu_cores": 8,
    "disk_free_gb": 25.99346923828125,
    "gpu_name": null,
    "ram_total_gb": 16.0,
    "vram_total_gb": null
  },
  "probe_jobs_per_gpu": [],
  "recommendation": {
    "cpu_cores": 1,
    "jobs_per_gpu": 1,
    "ram_total_gb": 0.5330886840820312,
    "ssd_gb": 0.13186692167073488,
    "vram_per_gpu_gb": 0.0
  },
  "walltime_by_gpus": [
    {
      "concurrency": 1,
      "gpus": 1,
      "wall_hours": 6.677479298827502e-06
    }
  ],
  "workload": {
    "runs": {
      "models": [
        "tiny_cnn"
      ],
      "num_models": 1,
      "num_seeds": 1,
      "num_splits": 3,
      "total_runs": 3
    },
    "splits": {
      "count": 3,
      "num_test_

## Next steps

- Edit `colab_quickstart.yaml` to change models, protocols, or seeds.
- Switch `profile` to `dev_mini` for a larger fixture run.
- Point `dataset.source` to NinaPro data for a full run.
