# Colab Quickstart (5–10 min)

This notebook is an executable entry point to the repository.

It runs the full end-to-end pipeline on a tiny **fixture** dataset (no NinaPro download):
`prepare → splits → traineval → report`, then runs `bench` + `estimate`.

Outputs are written to `runs/colab_quickstart/` so your working tree stays clean.

> This is a smoke test / tutorial run (`--profile smoke`), not a benchmark for reporting results.

In [None]:
import importlib
import os
import subprocess
import sys
import shutil
from pathlib import Path

# Git remote that is shallow-cloned when the notebook is not already running
# inside a checkout of the repository.
# NOTE(review): the slug spells "semig" while the Python package checked for
# below is `tinyml_semg_classifier` — confirm the repository name is intended.
REPO_URL = "https://github.com/geronimos/tinyml-semig-classifier.git"
# Clone destination, relative to the current working directory.
REPO_DIR = Path("tinyml-semig-classifier")


def _run(cmd: list[str]) -> None:
    """Echo *cmd* shell-trace style, then execute it.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero status.
    """
    print(f"+ {' '.join(cmd)}")
    subprocess.check_call(cmd)


def _is_repo_root(path: Path) -> bool:
    """Return True when *path* contains all three repository marker entries.

    The markers are the `pyproject.toml` file, the `tinyml_semg_classifier`
    package directory, and the base experiment config used by this notebook.
    """
    if not (path / "pyproject.toml").exists():
        return False
    if not (path / "tinyml_semg_classifier").is_dir():
        return False
    experiment_config = path / "configs/experiments/protocol_sensitivity_semg_cnn.yml"
    return experiment_config.exists()


def _find_repo_root(start: Path) -> Path | None:
    """Return the nearest directory at or above *start* that is a repo root.

    Candidates are checked from *start* outward through its parents; None is
    returned when none of them passes `_is_repo_root`.
    """
    search_path = (start, *start.parents)
    return next((folder for folder in search_path if _is_repo_root(folder)), None)


# Reuse an enclosing checkout when the notebook already runs inside the repo;
# otherwise shallow-clone it into REPO_DIR (skipped if that directory exists).
repo_root = _find_repo_root(Path.cwd())
if repo_root is None:
    if not REPO_DIR.exists():
        _run(["git", "clone", "--depth", "1", REPO_URL, str(REPO_DIR)])
    repo_root = REPO_DIR.resolve()

# Later cells use repo-relative paths (config file, `runs/...`), so work from the root.
os.chdir(repo_root)

# The commit hash is purely informational: do not abort the quickstart when the
# sources are not a git checkout (e.g. an unpacked archive) or git is unavailable.
try:
    commit = subprocess.check_output(["git", "rev-parse", "HEAD"], text=True).strip()
except (subprocess.CalledProcessError, OSError):
    commit = "unknown"
print("Repo:", Path.cwd())
print("Commit:", commit)

# Install the project into the *current* Python environment (kernel).
# We prefer `uv` because many uv-managed environments do not include `pip`.
uv = shutil.which("uv")
if uv is None:
    try:
        _run([sys.executable, "-m", "pip", "install", "-q", "uv"])
    except (subprocess.CalledProcessError, OSError) as exc:
        raise RuntimeError(
            "Missing `uv` and could not install it. Install uv from https://docs.astral.sh/uv/ "
            "or run this notebook in an environment with pip."
        ) from exc
    uv = shutil.which("uv")

# pip can place the `uv` launcher in a scripts directory that is not on PATH.
# The wheel is still runnable as a module of the interpreter it was installed
# into, so fall back to `python -m uv` instead of failing.
uv_cmd = [uv] if uv is not None else [sys.executable, "-m", "uv"]
_run([*uv_cmd, "pip", "install", "--python", sys.executable, "-e", "."])

# Packages installed after the interpreter started may be hidden by cached
# import finders; refresh them before importing the new dependency.
importlib.invalidate_caches()
torch = importlib.import_module("torch")

print("Python:", sys.version.split()[0])
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

## Configure outputs (via `--overrides`)

The base experiment config writes to `artifacts/`, `runs/`, and `reports/`. For a tutorial run, we redirect **all** outputs under `runs/colab_quickstart/`.

This keeps committed report files untouched while still showcasing the full pipeline.

In [None]:
from pathlib import Path


# Absolute root under which every tutorial output is redirected.
OUT_ROOT = Path("runs/colab_quickstart").resolve()
OUT_ROOT.mkdir(parents=True, exist_ok=True)
overrides_path = OUT_ROOT / "overrides_colab_quickstart.yaml"

# Paths are written with forward slashes: inside a double-quoted YAML scalar a
# backslash starts an escape sequence, so native Windows paths would be corrupted.
overrides_path.write_text(
    "experiment:\n"
    f'  artifacts_root: "{(OUT_ROOT / "artifacts").as_posix()}"\n'
    f'  runs_root: "{(OUT_ROOT / "runs").as_posix()}"\n'
    f'  reports_root: "{(OUT_ROOT / "reports").as_posix()}"\n',
    encoding="utf-8",
)

print("Overrides written to:", overrides_path)
print("Outputs root:", OUT_ROOT)

## Run the tiny end-to-end pipeline

This executes:

- `prepare` (fixture preprocessing)
- `splits` (pooled rep-disjoint + LOSO)
- `traineval` (tiny CNN, capped steps)
- `report` (aggregated tables)

All with `--profile smoke`.

In [None]:
import subprocess
import sys

# Base experiment config and profile; the sizing cell further down reuses both.
BASE_CONFIG = "configs/experiments/protocol_sensitivity_semg_cnn.yml"
PROFILE = "smoke"

# Assemble the `run` invocation piece by piece: module entry point first, then
# the config, the profile, and the overrides file written by the previous cell.
cmd = [sys.executable, "-m", "tinyml_semg_classifier.cli", "run"]
cmd += ["-c", BASE_CONFIG]
cmd += ["--profile", PROFILE]
cmd += ["--overrides", str(overrides_path)]

print("Running:", " ".join(cmd))
# check=True turns a non-zero exit status into a CalledProcessError.
subprocess.run(cmd, check=True)

## Inspect outputs

We print the generated protocol tables and one example `metrics.json` to confirm the pipeline produced results end-to-end.

In [None]:
import json
from pathlib import Path

# Show the aggregated protocol tables written under the redirected reports root.
reports_root = OUT_ROOT / "reports"
tables_path = reports_root / "protocol_tables.md"

print("protocol_tables.md ->", tables_path)
print(tables_path.read_text(encoding="utf-8"))

# Collect every metrics.json below the redirected runs root, in sorted order.
metrics_paths = sorted((OUT_ROOT / "runs").rglob("metrics.json"))
print("metrics.json files:", len(metrics_paths))

# Only the first file is shown, and its pretty-printed JSON is capped at
# 2000 characters.
if len(metrics_paths) > 0:
    sample = metrics_paths[0]
    print("Example run ->", sample)
    payload = json.loads(sample.read_text(encoding="utf-8"))
    pretty_payload = json.dumps(payload, indent=2)
    print(pretty_payload[:2000])

## Sizing: `bench` + `estimate`

`bench` measures step time (and GPU memory if on CUDA). `estimate` uses split counts + the bench results to compute end-to-end runtime and resource estimates.

We keep the benchmark tiny and CPU-only (`--device cpu`) for Colab speed; `estimate` is still computed for a single GPU (`--gpus 1`).

In [None]:
import json
import subprocess
import sys

def _cli_command(subcommand: str, *extra_args: str) -> list[str]:
    """Build a CLI invocation for *subcommand* with the shared config flags.

    The config, profile, and overrides flags are the same for every
    subcommand; *extra_args* are appended after them unchanged.
    """
    shared_flags = ["-c", BASE_CONFIG, "--profile", PROFILE, "--overrides", str(overrides_path)]
    return [sys.executable, "-m", "tinyml_semg_classifier.cli", subcommand, *shared_flags, *extra_args]


# Benchmark first: one warm-up step, five measured steps, pinned to the CPU.
cmd_bench = _cli_command(
    "bench", "--warmup-steps", "1", "--measure-steps", "5", "--device", "cpu"
)
print("Running:", " ".join(cmd_bench))
subprocess.run(cmd_bench, check=True)

# Then the estimate, requested for one GPU with alpha 1.0.
cmd_estimate = _cli_command("estimate", "--gpus", "1", "--alpha", "1.0")
print("Running:", " ".join(cmd_estimate))
subprocess.run(cmd_estimate, check=True)

# Both JSON files are read from the sizing folder under the redirected artifacts root.
sizing_dir = OUT_ROOT / "artifacts" / "sizing"
bench_path = sizing_dir / "bench.json"
estimate_path = sizing_dir / "estimate.json"

# Parse both files before printing anything, bench first.
bench, estimate = (
    json.loads(result_file.read_text(encoding="utf-8"))
    for result_file in (bench_path, estimate_path)
)

print("bench.json ->", bench_path)
bench_dump = json.dumps(bench, indent=2)
print(bench_dump[:2000])

print("estimate.json ->", estimate_path)
print("Wall-time hours (estimate):", estimate["time_hours"]["wall"])
print("Peak VRAM GB (estimate):", estimate["resources"]["gpu_vram_gb"])

## Next steps

- Switch `PROFILE` to `dev_mini` for a slightly larger fixture run.
- Run without a profile (or with `dry_run`) for real NinaPro data.
- Use `configs/experiments/protocol_sensitivity_semg_cnn.yml` to change protocols, models, or seeds.