From 1e3e7d24e50fad6686c321c6d9364b36c6a450bf Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 26 Feb 2026 09:18:46 -0800 Subject: [PATCH 1/5] Add Helion Kernel Challenge competition with 9 problems New competition inspired by Helion kernel ideas covering attention mechanisms, sampling strategies, quantization, and sequence modeling operators from production LLM architectures. Problems: - GQA: Causal Grouped Query Attention (Llama 3 style) - MLA: Multi-Head Latent Attention decode (DeepSeek-V2/V3) - KDA: Kimi Delta Attention (linear attention + delta rule) - Causal Conv1d: Causal depthwise 1D convolution (Mamba) - FP8 Quant: Per-token-group FP8 E4M3 quantization - INT8 Quant: Per-token INT8 symmetric quantization - Min-P: Adaptive probability threshold sampling - Top-K: Top-k sampling via binary search - Top-P: Nucleus sampling via binary search Deadline: March 14, 2026 midnight PST --- problems/helion.yaml | 49 +++ problems/helion/causal_conv1d_py/reference.py | 35 ++ .../helion/causal_conv1d_py/submission.py | 14 + problems/helion/causal_conv1d_py/task.py | 12 + problems/helion/causal_conv1d_py/task.yml | 51 +++ problems/helion/eval.py | 375 ++++++++++++++++++ problems/helion/fp8_quant_py/reference.py | 59 +++ problems/helion/fp8_quant_py/submission.py | 25 ++ problems/helion/fp8_quant_py/task.py | 11 + problems/helion/fp8_quant_py/task.yml | 58 +++ problems/helion/gqa_py/reference.py | 44 ++ problems/helion/gqa_py/submission.py | 20 + problems/helion/gqa_py/task.py | 13 + problems/helion/gqa_py/task.yml | 52 +++ problems/helion/int8_quant_py/reference.py | 56 +++ problems/helion/int8_quant_py/submission.py | 22 + problems/helion/int8_quant_py/task.py | 10 + problems/helion/int8_quant_py/task.yml | 57 +++ problems/helion/kda_py/reference.py | 66 +++ problems/helion/kda_py/submission.py | 37 ++ problems/helion/kda_py/task.py | 13 + problems/helion/kda_py/task.yml | 57 +++ problems/helion/min_p_py/reference.py | 37 ++ problems/helion/min_p_py/submission.py | 16 + problems/helion/min_p_py/task.py | 10 + problems/helion/min_p_py/task.yml | 56 +++ problems/helion/mla_py/reference.py | 42 ++ problems/helion/mla_py/submission.py | 18 + problems/helion/mla_py/task.py | 13 + problems/helion/mla_py/task.yml | 55 +++ problems/helion/template.py | 5 + problems/helion/top_k_py/reference.py | 39 ++ problems/helion/top_k_py/submission.py | 22 + problems/helion/top_k_py/task.py | 11 + problems/helion/top_k_py/task.yml | 52 +++ problems/helion/top_p_py/reference.py | 45 +++ problems/helion/top_p_py/submission.py | 20 + problems/helion/top_p_py/task.py | 10 + problems/helion/top_p_py/task.yml | 55 +++ problems/helion/utils.py | 176 ++++++++ 40 files changed, 1818 insertions(+) create mode 100644 problems/helion.yaml create mode 100644 problems/helion/causal_conv1d_py/reference.py create mode 100644 problems/helion/causal_conv1d_py/submission.py create mode 100644 problems/helion/causal_conv1d_py/task.py create mode 100644 problems/helion/causal_conv1d_py/task.yml create mode 100644 problems/helion/eval.py create mode 100644 problems/helion/fp8_quant_py/reference.py create mode 100644 problems/helion/fp8_quant_py/submission.py create mode 100644 problems/helion/fp8_quant_py/task.py create mode 100644 problems/helion/fp8_quant_py/task.yml create mode 100644 problems/helion/gqa_py/reference.py create mode 100644 problems/helion/gqa_py/submission.py create mode 100644 problems/helion/gqa_py/task.py create mode 100644 problems/helion/gqa_py/task.yml create mode 100644 
problems/helion/int8_quant_py/reference.py create mode 100644 problems/helion/int8_quant_py/submission.py create mode 100644 problems/helion/int8_quant_py/task.py create mode 100644 problems/helion/int8_quant_py/task.yml create mode 100644 problems/helion/kda_py/reference.py create mode 100644 problems/helion/kda_py/submission.py create mode 100644 problems/helion/kda_py/task.py create mode 100644 problems/helion/kda_py/task.yml create mode 100644 problems/helion/min_p_py/reference.py create mode 100644 problems/helion/min_p_py/submission.py create mode 100644 problems/helion/min_p_py/task.py create mode 100644 problems/helion/min_p_py/task.yml create mode 100644 problems/helion/mla_py/reference.py create mode 100644 problems/helion/mla_py/submission.py create mode 100644 problems/helion/mla_py/task.py create mode 100644 problems/helion/mla_py/task.yml create mode 100644 problems/helion/template.py create mode 100644 problems/helion/top_k_py/reference.py create mode 100644 problems/helion/top_k_py/submission.py create mode 100644 problems/helion/top_k_py/task.py create mode 100644 problems/helion/top_k_py/task.yml create mode 100644 problems/helion/top_p_py/reference.py create mode 100644 problems/helion/top_p_py/submission.py create mode 100644 problems/helion/top_p_py/task.py create mode 100644 problems/helion/top_p_py/task.yml create mode 100644 problems/helion/utils.py diff --git a/problems/helion.yaml b/problems/helion.yaml new file mode 100644 index 00000000..8fe5ce9a --- /dev/null +++ b/problems/helion.yaml @@ -0,0 +1,49 @@ +name: Helion Kernel Challenge +deadline: "2026-03-14" +description: "GPU kernel challenges inspired by Helion kernel ideas — attention mechanisms, sampling strategies, quantization, and sequence modeling operators from production LLM architectures." 
+problems: + - directory: helion/gqa_py + name: gqa + deadline: "2026-03-14 00:00" + gpus: + - NVIDIA + - directory: helion/mla_py + name: mla + deadline: "2026-03-14 00:00" + gpus: + - NVIDIA + - directory: helion/kda_py + name: kda + deadline: "2026-03-14 00:00" + gpus: + - NVIDIA + - directory: helion/causal_conv1d_py + name: causal_conv1d + deadline: "2026-03-14 00:00" + gpus: + - NVIDIA + - directory: helion/fp8_quant_py + name: fp8_quant + deadline: "2026-03-14 00:00" + gpus: + - NVIDIA + - directory: helion/int8_quant_py + name: int8_quant + deadline: "2026-03-14 00:00" + gpus: + - NVIDIA + - directory: helion/min_p_py + name: min_p + deadline: "2026-03-14 00:00" + gpus: + - NVIDIA + - directory: helion/top_k_py + name: top_k + deadline: "2026-03-14 00:00" + gpus: + - NVIDIA + - directory: helion/top_p_py + name: top_p + deadline: "2026-03-14 00:00" + gpus: + - NVIDIA diff --git a/problems/helion/causal_conv1d_py/reference.py b/problems/helion/causal_conv1d_py/reference.py new file mode 100644 index 00000000..e132fbf5 --- /dev/null +++ b/problems/helion/causal_conv1d_py/reference.py @@ -0,0 +1,35 @@ +import torch +import torch.nn.functional as F +from task import input_t, output_t +from utils import make_match_reference, DeterministicContext + + +def generate_input(B: int, D: int, S: int, W: int, seed: int) -> input_t: + gen = torch.Generator(device="cuda") + gen.manual_seed(seed) + x = torch.randn(B, D, S, dtype=torch.float32, device="cuda", generator=gen).contiguous() + weight = torch.randn(D, W, dtype=torch.float32, device="cuda", generator=gen).contiguous() + bias = torch.randn(D, dtype=torch.float32, device="cuda", generator=gen).contiguous() + return x, weight, bias + + +def ref_kernel(data: input_t) -> output_t: + with DeterministicContext(): + x, weight, bias = data + B, D, S = x.shape + W = weight.shape[1] + + # Causal (left) padding + x_padded = F.pad(x, (W - 1, 0)) + + # Depthwise conv1d (groups=D) + output = F.conv1d( + x_padded, + weight.unsqueeze(1), # [D, 1, W] + bias=bias, + groups=D, + ) + return output + + +check_implementation = make_match_reference(ref_kernel, rtol=1e-4, atol=1e-4) diff --git a/problems/helion/causal_conv1d_py/submission.py b/problems/helion/causal_conv1d_py/submission.py new file mode 100644 index 00000000..ba89f5ad --- /dev/null +++ b/problems/helion/causal_conv1d_py/submission.py @@ -0,0 +1,14 @@ +from task import input_t, output_t + + +def custom_kernel(data: input_t) -> output_t: + import torch + import torch.nn.functional as F + + x, weight, bias = data + W = weight.shape[1] + D = x.shape[1] + + x_padded = F.pad(x, (W - 1, 0)) + output = F.conv1d(x_padded, weight.unsqueeze(1), bias=bias, groups=D) + return output diff --git a/problems/helion/causal_conv1d_py/task.py b/problems/helion/causal_conv1d_py/task.py new file mode 100644 index 00000000..00a02fe6 --- /dev/null +++ b/problems/helion/causal_conv1d_py/task.py @@ -0,0 +1,12 @@ +from typing import TypedDict, TypeVar +import torch + +input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor]) +output_t = TypeVar("output_t", bound=torch.Tensor) + +class TestSpec(TypedDict): + B: int + D: int + S: int + W: int + seed: int diff --git a/problems/helion/causal_conv1d_py/task.yml b/problems/helion/causal_conv1d_py/task.yml new file mode 100644 index 00000000..8ef81809 --- /dev/null +++ b/problems/helion/causal_conv1d_py/task.yml @@ -0,0 +1,51 @@ +files: + - {"name": "submission.py", "source": "@SUBMISSION@"} + - {"name": "task.py", "source": "task.py"} + - {"name": 
"utils.py", "source": "../utils.py"} + - {"name": "reference.py", "source": "reference.py"} + - {"name": "eval.py", "source": "../eval.py"} + +lang: "py" + +description: | + Implement a causal depthwise 1D convolution kernel. + + This is a core component of Mamba/Mamba-2 architectures. Each channel is + convolved independently (depthwise) with causal (left) zero-padding so that + output[t] depends only on input[t-W+1:t+1]. + + For each batch b, channel d, and time t: + out[b, d, t] = bias[d] + sum_{k=0}^{W-1} weight[d, k] * x[b, d, t - W + 1 + k] + where out-of-bounds values are treated as zero. + + Input: tuple(x, weight, bias) where: + - x: torch.Tensor of shape [B, D, S] (float32) + - weight: torch.Tensor of shape [D, W] (float32) + - bias: torch.Tensor of shape [D] (float32) + + Output: torch.Tensor of shape [B, D, S] (float32) + +config: + main: "eval.py" + +templates: + Python: "../template.py" + +tests: + - {"B": 1, "D": 64, "S": 64, "W": 4, "seed": 4242} + - {"B": 2, "D": 128, "S": 128, "W": 4, "seed": 5236} + - {"B": 1, "D": 256, "S": 256, "W": 3, "seed": 1001} + - {"B": 1, "D": 128, "S": 64, "W": 8, "seed": 5531} + - {"B": 4, "D": 64, "S": 128, "W": 4, "seed": 9173} + +benchmarks: + - {"B": 1, "D": 768, "S": 512, "W": 4, "seed": 31232} + - {"B": 1, "D": 768, "S": 2048, "W": 4, "seed": 4052} + - {"B": 1, "D": 1536, "S": 2048, "W": 4, "seed": 2146} + - {"B": 1, "D": 2560, "S": 2048, "W": 4, "seed": 3129} + - {"B": 1, "D": 2560, "S": 4096, "W": 4, "seed": 54352} + +test_timeout: 180 +benchmark_timeout: 180 +ranked_timeout: 420 +ranking_by: "geom" diff --git a/problems/helion/eval.py b/problems/helion/eval.py new file mode 100644 index 00000000..981b9322 --- /dev/null +++ b/problems/helion/eval.py @@ -0,0 +1,375 @@ +import base64 +import dataclasses +import multiprocessing +import re +import time +import os +import sys +import math +from pathlib import Path +from typing import Any, Optional + +import torch.cuda + +from utils import set_seed, clear_l2_cache +try: + from task import TestSpec +except ImportError: + TestSpec = dict + +from reference import check_implementation, generate_input + + +class PopcornOutput: + def __init__(self, fd: int): + self.file = os.fdopen(fd, 'w') + os.set_inheritable(fd, False) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.file.close() + + def print(self, *args, **kwargs): + print(*args, **kwargs, file=self.file, flush=True) + + def log(self, key, value): + self.print(f"{key}: {value}") + + +@dataclasses.dataclass +class TestCase: + args: dict + spec: str + + +def _combine(a: int, b: int) -> int: + # combine two integers into one: + # we need this to generate a secret seed based on the test-level seed and + # the global secret seed. + # the test-level seeds are public knowledge, and typically relatively small numbers, + # so we need to make sure they don't provide any useful info for the full seed. + # This Cantor construction ensures that if the secret seed is a large number, + # then so is the overall seed. 
+ return int(a + (a+b)*(a+b+1)//2) + + +def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]: + try: + content = Path(file_name).read_text() + except Exception as E: + print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr) + exit(113) + + tests = [] + lines = content.splitlines() + match = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*" + for line in lines: + parts = line.split(";") + case = {} + for part in parts: + matched = re.match(match, part) + if not re.fullmatch(match, part): + print(f"invalid test case: '{line}': '{part}'", file=sys.stderr) + exit(113) + key = matched[1] + val = matched[2] + try: + val = int(val) + except ValueError: + pass + + case[key] = val + tests.append(TestCase(spec=line, args=case)) + + if seed is not None: + for test in tests: + if "seed" in test.args: + test.args["seed"] = _combine(test.args["seed"], seed) + + return tests + + +@dataclasses.dataclass +class Stats: + runs: int + mean: float + std: float + err: float + best: float + worst: float + + +def calculate_stats(durations: list[int]): + """ + Calculate statistical data from a list of durations. + + @param durations: A list of durations in nanoseconds. + @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations. + """ + runs = len(durations) + total = sum(durations) + best = min(durations) + worst = max(durations) + + avg = total / runs + variance = sum(map(lambda x: (x - avg)**2, durations)) + std = math.sqrt(variance / (runs - 1)) + err = std / math.sqrt(runs) + + return Stats(runs=runs, mean=avg, std=std, err=err, best=float(best), + worst=float(worst)) + + +def _clone_data(data): + """ + Recursively goes through data and clones all tensors. + """ + if isinstance(data, tuple): + return tuple(_clone_data(x) for x in data) + elif isinstance(data, list): + return [_clone_data(x) for x in data] + elif isinstance(data, dict): + return {k: _clone_data(v) for k, v in data.items()} + elif isinstance(data, torch.Tensor): + return data.clone() + else: + return data + + +def _run_single_test(test: TestCase): + """ + Runs a single test case. Do not call directly + """ + from submission import custom_kernel + data = generate_input(**test.args) + torch.cuda.synchronize() + submission_output = custom_kernel(_clone_data(data)) + torch.cuda.synchronize() + return check_implementation(data, submission_output) + + +def run_single_test(pool: multiprocessing.Pool, test: TestCase): + """ + Runs a single test in another process. + """ + return pool.apply(_run_single_test, (test,)) + + +def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]): + """ + Executes the actual test case code and checks for correctness. + + @param logger: A PopcornOutput object used for logging test results. + @param tests: A list of TestCase objects representing the test cases to be executed. + @return: An integer representing the exit status: 0 if all tests pass, otherwise 112. 
+ """ + passed = True + logger.log("test-count", len(tests)) + for idx, test in enumerate(tests): + logger.log(f"test.{idx}.spec", test.spec) + good, message = run_single_test(pool, test) + if not good: + logger.log(f"test.{idx}.status", "fail") + logger.log(f"test.{idx}.error", message) + passed = False + else: + logger.log(f"test.{idx}.status", "pass") + if message: + logger.log(f"test.{idx}.message", message) + + if passed: + logger.log("check", "pass") + return 0 + else: + logger.log("check", "fail") + return 112 + + +def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any: + """ + Runs one benchmark. Do not call directly. + """ + from submission import custom_kernel + + durations = [] + # generate input data once + data = generate_input(**test.args) + check_copy = _clone_data(data) + # first, one obligatory correctness check + output = custom_kernel(data) + good, message = check_implementation(check_copy, output) + if not good: + return message + + # now, do multiple timing runs without further correctness testing + # there is an upper bound of 100 runs, and a lower bound of 3 runs; + # otherwise, we repeat until we either measure at least 10 full seconds, + # or the relative error of the mean is below 1%. + + bm_start_time = time.perf_counter_ns() + for i in range(max_repeats): + if recheck: + # ensure we use a different seed for every benchmark + if "seed" in test.args: + test.args["seed"] += 13 + + data = generate_input(**test.args) + check_copy = _clone_data(data) + torch.cuda.synchronize() + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + clear_l2_cache() + + start_event.record() + output = custom_kernel(data) + end_event.record() + torch.cuda.synchronize() + duration = start_event.elapsed_time(end_event) * 1e6 # Convert ms to ns + + if recheck: + good, message = check_implementation(check_copy, output) + if not good: + return message + + del output + durations.append(duration) + + if i > 1: + total_bm_duration = time.perf_counter_ns() - bm_start_time + stats = calculate_stats(durations) + # stop if either + # a) relative error dips below 0.1% + # b) we exceed the total time limit for benchmarking the kernel + # c) we exceed 2 minutes of total wallclock time. + if stats.err / stats.mean < 0.001 or stats.mean * stats.runs > max_time_ns or total_bm_duration > 120e9: + break + + return calculate_stats(durations) + + +def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int, + max_time_ns: float): + """ + For a particular test case, check correctness (if applicable) and grab runtime results. + + @param pool: Process on which the benchmark will be launched. + @param test: TestCase object. + @param recheck: Flag for whether to explicitly check functional correctness. + @param max_repeats: Number of trials to repeat. + @param max_time_ns: Timeout time in nanoseconds. + @return: A Stats object for this particular benchmark case or an error if the test fails. + """ + return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns)) + + +def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]): + """ + Executes benchmarking code for a CUDA Kernel and logs runtimes. + + @param logger: A PopcornOutput object used for logging benchmark results. + @param pool: Process on which the benchmarks will be launched. 
+ @param tests: A list of TestCase objects representing the test cases to be benchmarked. + @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112. + """ + # warm up + run_single_benchmark(pool, tests[0], False, 100, 10e7) + + passed = True + logger.log("benchmark-count", len(tests)) + for idx, test in enumerate(tests): + logger.log(f"benchmark.{idx}.spec", test.spec) + result = run_single_benchmark(pool, test, False, 100, 10e9) + if isinstance(result, Stats): + for field in dataclasses.fields(Stats): + logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name)) + else: + passed = False + logger.log(f"benchmark.{idx}.status", "fail") + logger.log(f"benchmark.{idx}.error", result) + + if passed: + logger.log("check", "pass") + return 0 + else: + logger.log("check", "fail") + return 112 + + +def run_single_profile(test: TestCase) -> str: + """ + Runs a single test case. Do not call directly + """ + from submission import custom_kernel + from torch.profiler import profile, record_function, ProfilerActivity + data = generate_input(**test.args) + torch.cuda.synchronize() + + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + submission_output = custom_kernel(_clone_data(data)) + torch.cuda.synchronize() + return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20) + + +def run_profiling(logger: PopcornOutput, tests: list[TestCase]): + logger.log("benchmark-count", len(tests)) + for idx, test in enumerate(tests): + logger.log(f"benchmark.{idx}.spec", test.spec) + report = run_single_profile(test) + logger.log(f"benchmark.{idx}.report", base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8")) + logger.log("check", "pass") + return 0 + + +def main(): + fd = os.getenv("POPCORN_FD") + if not fd: + return 111 + + if len(sys.argv) < 3: + return 2 + + mode = sys.argv[1] + seed = os.getenv("POPCORN_SEED") + os.unsetenv("POPCORN_SEED") + seed = int(seed) if seed else None + set_seed(seed or 42) + tests = get_test_cases(sys.argv[2], seed) + + with PopcornOutput(int(fd)) as logger: + import multiprocessing + mp_context = multiprocessing.get_context('spawn') + with mp_context.Pool(1) as pool: + if mode == "test": + return run_testing(logger, pool, tests) + if mode == "benchmark": + return run_benchmarking(logger, pool, tests) + + if mode == "leaderboard": + # warmup + run_single_benchmark(pool, tests[0], False, 100, 1e7) + logger.log("benchmark-count", len(tests)) + passed = True + for i in range(len(tests)): + result = run_single_benchmark(pool, tests[i], True, 100, 30e9) + logger.log(f"benchmark.{i}.spec", tests[i].spec) + if isinstance(result, Stats): + for field in dataclasses.fields(Stats): + logger.log(f"benchmark.{i}.{field.name}", getattr(result, field.name)) + else: + passed = False + logger.log(f"benchmark.{i}.status", "fail") + logger.log(f"benchmark.{i}.error", str(result)) # TODO: Make sure result implements __str__? 
+ break + + logger.log("check", "pass" if passed else "fail") + elif mode == "profile": + run_profiling(logger, tests) + else: + # TODO: Implement script mode + return 2 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/problems/helion/fp8_quant_py/reference.py b/problems/helion/fp8_quant_py/reference.py new file mode 100644 index 00000000..b8792248 --- /dev/null +++ b/problems/helion/fp8_quant_py/reference.py @@ -0,0 +1,59 @@ +import torch +from task import input_t, output_t +from utils import verbose_allclose + +FP8_MAX = 448.0 +FP8_MIN = -448.0 +FP8_EPS = 1e-10 + + +def generate_input(num_tokens: int, hidden_dim: int, group_size: int, seed: int) -> input_t: + gen = torch.Generator(device="cuda") + gen.manual_seed(seed) + x = torch.randn(num_tokens, hidden_dim, dtype=torch.float32, device="cuda", generator=gen).contiguous() + x_q = torch.empty(num_tokens, hidden_dim, dtype=torch.float32, device="cuda").contiguous() + x_s = torch.empty(num_tokens, hidden_dim // group_size, dtype=torch.float32, device="cuda").contiguous() + return x, x_q, x_s + + +def ref_kernel(data: input_t) -> output_t: + x, x_q, x_s = data + num_tokens, hidden_dim = x.shape + num_groups = x_s.shape[1] + group_size = hidden_dim // num_groups + + x_f32 = x.float() + x_grouped = x_f32.reshape(num_tokens, num_groups, group_size) + + # Per-group absmax + absmax = x_grouped.abs().amax(dim=-1).clamp(min=FP8_EPS) + + # Scale = absmax / fp8_max + scale = absmax / FP8_MAX + + # Quantize + quantized = (x_grouped / scale.unsqueeze(-1)).clamp(FP8_MIN, FP8_MAX) + quantized = quantized.reshape(num_tokens, hidden_dim) + + x_q[...] = quantized + x_s[...] = scale + return x_q, x_s + + +def check_implementation(data, output): + expected = ref_kernel(data) + expected_q, expected_s = expected + received_q, received_s = output + + reasons_q = verbose_allclose(received_q, expected_q, rtol=1e-3, atol=1e-3) + reasons_s = verbose_allclose(received_s, expected_s, rtol=1e-4, atol=1e-6) + + reasons = [] + if reasons_q: + reasons.append("quantized values mismatch: " + " ".join(reasons_q)) + if reasons_s: + reasons.append("scales mismatch: " + " ".join(reasons_s)) + + if reasons: + return False, " | ".join(reasons) + return True, "" diff --git a/problems/helion/fp8_quant_py/submission.py b/problems/helion/fp8_quant_py/submission.py new file mode 100644 index 00000000..39cf1d08 --- /dev/null +++ b/problems/helion/fp8_quant_py/submission.py @@ -0,0 +1,25 @@ +from task import input_t, output_t + + +FP8_MAX = 448.0 +FP8_MIN = -448.0 +FP8_EPS = 1e-10 + + +def custom_kernel(data: input_t) -> output_t: + x, x_q, x_s = data + num_tokens, hidden_dim = x.shape + num_groups = x_s.shape[1] + group_size = hidden_dim // num_groups + + x_f32 = x.float() + x_grouped = x_f32.reshape(num_tokens, num_groups, group_size) + + absmax = x_grouped.abs().amax(dim=-1).clamp(min=FP8_EPS) + scale = absmax / FP8_MAX + quantized = (x_grouped / scale.unsqueeze(-1)).clamp(FP8_MIN, FP8_MAX) + quantized = quantized.reshape(num_tokens, hidden_dim) + + x_q[...] = quantized + x_s[...] 
= scale + return x_q, x_s diff --git a/problems/helion/fp8_quant_py/task.py b/problems/helion/fp8_quant_py/task.py new file mode 100644 index 00000000..8fb6c1f0 --- /dev/null +++ b/problems/helion/fp8_quant_py/task.py @@ -0,0 +1,11 @@ +from typing import TypedDict, TypeVar +import torch + +input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor]) +output_t = TypeVar("output_t", bound=tuple[torch.Tensor, torch.Tensor]) + +class TestSpec(TypedDict): + num_tokens: int + hidden_dim: int + group_size: int + seed: int diff --git a/problems/helion/fp8_quant_py/task.yml b/problems/helion/fp8_quant_py/task.yml new file mode 100644 index 00000000..d8288ead --- /dev/null +++ b/problems/helion/fp8_quant_py/task.yml @@ -0,0 +1,58 @@ +files: + - {"name": "submission.py", "source": "@SUBMISSION@"} + - {"name": "task.py", "source": "task.py"} + - {"name": "utils.py", "source": "../utils.py"} + - {"name": "reference.py", "source": "reference.py"} + - {"name": "eval.py", "source": "../eval.py"} + +lang: "py" + +description: | + Implement a per-token-group FP8 E4M3 quantization kernel. + + This is THE standard activation quantization method in production LLM inference + (DeepSeek-V3, Llama 3, Qwen3). It dynamically quantizes activations to FP8 + format with per-group scale factors for W8A8 quantized inference. + + For each group of `group_size` contiguous elements: + 1. absmax = max(|x_group|) + 2. scale = max(absmax, eps) / 448.0 + 3. x_q = clamp(x / scale, -448.0, 448.0) + + Where 448.0 is the max representable value in FP8 E4M3 format. + + NOTE: Output is float32 clamped to FP8 range (for broad GPU compatibility). + + Input: tuple(x, x_q, x_s) where: + - x: torch.Tensor of shape [num_tokens, hidden_dim] (float32) + - x_q: pre-allocated output [num_tokens, hidden_dim] (float32) + - x_s: pre-allocated scales [num_tokens, hidden_dim // group_size] (float32) + + Output: tuple(x_q, x_s) where: + - x_q: quantized values [num_tokens, hidden_dim] (float32, clamped to FP8 range) + - x_s: per-group scale factors [num_tokens, hidden_dim // group_size] (float32) + +config: + main: "eval.py" + +templates: + Python: "../template.py" + +tests: + - {"num_tokens": 1, "hidden_dim": 256, "group_size": 64, "seed": 4242} + - {"num_tokens": 4, "hidden_dim": 512, "group_size": 128, "seed": 5236} + - {"num_tokens": 16, "hidden_dim": 1024, "group_size": 64, "seed": 1001} + - {"num_tokens": 1, "hidden_dim": 4096, "group_size": 128, "seed": 5531} + - {"num_tokens": 8, "hidden_dim": 4096, "group_size": 128, "seed": 9173} + +benchmarks: + - {"num_tokens": 1, "hidden_dim": 4096, "group_size": 128, "seed": 31232} + - {"num_tokens": 16, "hidden_dim": 4096, "group_size": 128, "seed": 4052} + - {"num_tokens": 256, "hidden_dim": 4096, "group_size": 128, "seed": 2146} + - {"num_tokens": 256, "hidden_dim": 8192, "group_size": 128, "seed": 3129} + - {"num_tokens": 4096, "hidden_dim": 7168, "group_size": 128, "seed": 54352} + +test_timeout: 180 +benchmark_timeout: 180 +ranked_timeout: 420 +ranking_by: "geom" diff --git a/problems/helion/gqa_py/reference.py b/problems/helion/gqa_py/reference.py new file mode 100644 index 00000000..b5675ed9 --- /dev/null +++ b/problems/helion/gqa_py/reference.py @@ -0,0 +1,44 @@ +import torch +from task import input_t, output_t +from utils import make_match_reference, DeterministicContext + + +def generate_input(B: int, H_q: int, H_kv: int, S: int, D: int, seed: int) -> input_t: + gen = torch.Generator(device="cuda") + gen.manual_seed(seed) + q = torch.randn(B, H_q, S, D, 
dtype=torch.float16, device="cuda", generator=gen).contiguous() + k = torch.randn(B, H_kv, S, D, dtype=torch.float16, device="cuda", generator=gen).contiguous() + v = torch.randn(B, H_kv, S, D, dtype=torch.float16, device="cuda", generator=gen).contiguous() + return q, k, v + + +def ref_kernel(data: input_t) -> output_t: + with DeterministicContext(): + q, k, v = data + B, H_q, S, D = q.shape + H_kv = k.shape[1] + kv_group_num = H_q // H_kv + sm_scale = D ** -0.5 + + # Expand K and V to match Q's head count + k_expanded = k.repeat_interleave(kv_group_num, dim=1) + v_expanded = v.repeat_interleave(kv_group_num, dim=1) + + # Compute attention scores: [B, H_q, S, S] + attn_scores = torch.matmul(q.float(), k_expanded.float().transpose(-2, -1)) * sm_scale + + # Apply causal mask + causal_mask = torch.triu( + torch.ones(S, S, dtype=torch.bool, device=q.device), + diagonal=1 + ) + attn_scores = attn_scores.masked_fill(causal_mask, float("-inf")) + + # Softmax and apply to values + attn_weights = torch.softmax(attn_scores, dim=-1) + output = torch.matmul(attn_weights, v_expanded.float()) + + return output.to(q.dtype) + + +check_implementation = make_match_reference(ref_kernel, rtol=1e-2, atol=1e-2) diff --git a/problems/helion/gqa_py/submission.py b/problems/helion/gqa_py/submission.py new file mode 100644 index 00000000..25baeebb --- /dev/null +++ b/problems/helion/gqa_py/submission.py @@ -0,0 +1,20 @@ +from task import input_t, output_t + + +def custom_kernel(data: input_t) -> output_t: + import torch + q, k, v = data + B, H_q, S, D = q.shape + H_kv = k.shape[1] + kv_group_num = H_q // H_kv + sm_scale = D ** -0.5 + + k_expanded = k.repeat_interleave(kv_group_num, dim=1) + v_expanded = v.repeat_interleave(kv_group_num, dim=1) + + attn_scores = torch.matmul(q.float(), k_expanded.float().transpose(-2, -1)) * sm_scale + causal_mask = torch.triu(torch.ones(S, S, dtype=torch.bool, device=q.device), diagonal=1) + attn_scores = attn_scores.masked_fill(causal_mask, float("-inf")) + attn_weights = torch.softmax(attn_scores, dim=-1) + output = torch.matmul(attn_weights, v_expanded.float()) + return output.to(q.dtype) diff --git a/problems/helion/gqa_py/task.py b/problems/helion/gqa_py/task.py new file mode 100644 index 00000000..6f3b7f9e --- /dev/null +++ b/problems/helion/gqa_py/task.py @@ -0,0 +1,13 @@ +from typing import TypedDict, TypeVar +import torch + +input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor]) +output_t = TypeVar("output_t", bound=torch.Tensor) + +class TestSpec(TypedDict): + B: int + H_q: int + H_kv: int + S: int + D: int + seed: int diff --git a/problems/helion/gqa_py/task.yml b/problems/helion/gqa_py/task.yml new file mode 100644 index 00000000..05f5d175 --- /dev/null +++ b/problems/helion/gqa_py/task.yml @@ -0,0 +1,52 @@ +files: + - {"name": "submission.py", "source": "@SUBMISSION@"} + - {"name": "task.py", "source": "task.py"} + - {"name": "utils.py", "source": "../utils.py"} + - {"name": "reference.py", "source": "reference.py"} + - {"name": "eval.py", "source": "../eval.py"} + +lang: "py" + +description: | + Implement a causal Grouped Query Attention (GQA) kernel. + + GQA shares KV heads across groups of query heads, reducing KV cache from + 2 * H_q * D to 2 * H_kv * D per token. Multiple query heads share the same + K and V, with kv_group_num = H_q // H_kv. 
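As a shape-level sketch of this grouping (toy sizes, per-head loop written for clarity rather than speed; the reference implementation in this patch instead expands K/V with repeat_interleave), query head h simply reads KV head h // kv_group_num:

# Minimal per-head illustration of GQA head grouping (toy shapes, not a fast kernel).
import torch

B, H_q, H_kv, S, D = 1, 4, 2, 8, 16
kv_group_num = H_q // H_kv
q = torch.randn(B, H_q, S, D)
k = torch.randn(B, H_kv, S, D)
v = torch.randn(B, H_kv, S, D)

causal_mask = torch.triu(torch.ones(S, S, dtype=torch.bool), diagonal=1)
out = torch.empty(B, H_q, S, D)
for h in range(H_q):
    kv_head = h // kv_group_num  # each query head reads its shared KV head
    scores = (q[:, h] @ k[:, kv_head].transpose(-2, -1)) * D ** -0.5
    scores = scores.masked_fill(causal_mask, float("-inf"))
    out[:, h] = torch.softmax(scores, dim=-1) @ v[:, kv_head]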
+ + For each query head h with kv_head = h // kv_group_num: + O[h] = softmax(Q[h] @ K[kv_head]^T / sqrt(D), causal_mask) @ V[kv_head] + + Input: tuple(q, k, v) where: + - q: torch.Tensor of shape [B, H_q, S, D] (float16) + - k: torch.Tensor of shape [B, H_kv, S, D] (float16) + - v: torch.Tensor of shape [B, H_kv, S, D] (float16) + + Output: torch.Tensor of shape [B, H_q, S, D] (float16) + + The attention is causal: position i can only attend to positions <= i. + +config: + main: "eval.py" + +templates: + Python: "../template.py" + +tests: + - {"B": 1, "H_q": 8, "H_kv": 2, "S": 64, "D": 64, "seed": 4242} + - {"B": 2, "H_q": 8, "H_kv": 4, "S": 64, "D": 64, "seed": 5236} + - {"B": 1, "H_q": 8, "H_kv": 8, "S": 64, "D": 64, "seed": 1001} + - {"B": 1, "H_q": 8, "H_kv": 1, "S": 64, "D": 64, "seed": 5531} + - {"B": 1, "H_q": 16, "H_kv": 4, "S": 128, "D": 64, "seed": 9173} + +benchmarks: + - {"B": 1, "H_q": 32, "H_kv": 8, "S": 512, "D": 128, "seed": 31232} + - {"B": 1, "H_q": 32, "H_kv": 8, "S": 1024, "D": 128, "seed": 4052} + - {"B": 1, "H_q": 32, "H_kv": 8, "S": 2048, "D": 128, "seed": 2146} + - {"B": 1, "H_q": 64, "H_kv": 8, "S": 1024, "D": 128, "seed": 3129} + - {"B": 1, "H_q": 64, "H_kv": 8, "S": 2048, "D": 128, "seed": 54352} + +test_timeout: 180 +benchmark_timeout: 180 +ranked_timeout: 420 +ranking_by: "geom" diff --git a/problems/helion/int8_quant_py/reference.py b/problems/helion/int8_quant_py/reference.py new file mode 100644 index 00000000..20f54276 --- /dev/null +++ b/problems/helion/int8_quant_py/reference.py @@ -0,0 +1,56 @@ +import torch +from task import input_t, output_t +from utils import verbose_allclose + +INT8_MAX = 127 +INT8_MIN = -128 +INT8_EPS = 1e-10 + + +def generate_input(num_tokens: int, hidden_dim: int, seed: int) -> input_t: + gen = torch.Generator(device="cuda") + gen.manual_seed(seed) + x = torch.randn(num_tokens, hidden_dim, dtype=torch.float32, device="cuda", generator=gen).contiguous() + x_q = torch.empty(num_tokens, hidden_dim, dtype=torch.float32, device="cuda").contiguous() + x_s = torch.empty(num_tokens, dtype=torch.float32, device="cuda").contiguous() + return x, x_q, x_s + + +def ref_kernel(data: input_t) -> output_t: + x, x_q, x_s = data + + x_f32 = x.float() + + # Per-token absmax + absmax = x_f32.abs().amax(dim=-1).clamp(min=INT8_EPS) + + # Scale = absmax / 127 + scale = absmax / INT8_MAX + + # Quantize + quantized = torch.round(x_f32 / scale.unsqueeze(-1)) + quantized = quantized.clamp(INT8_MIN, INT8_MAX) + + x_q[...] = quantized + x_s[...] 
= scale + return x_q, x_s + + +def check_implementation(data, output): + expected = ref_kernel(data) + expected_q, expected_s = expected + received_q, received_s = output + + reasons_s = verbose_allclose(received_s, expected_s, rtol=1e-4, atol=1e-6) + # Allow +/- 1 LSB for quantized values due to rounding differences + reasons_q = verbose_allclose(received_q, expected_q, rtol=0, atol=1.0) + + reasons = [] + if reasons_q: + reasons.append("quantized values mismatch: " + " ".join(reasons_q)) + if reasons_s: + reasons.append("scales mismatch: " + " ".join(reasons_s)) + + if reasons: + return False, " | ".join(reasons) + return True, "" diff --git a/problems/helion/int8_quant_py/submission.py b/problems/helion/int8_quant_py/submission.py new file mode 100644 index 00000000..615e33ac --- /dev/null +++ b/problems/helion/int8_quant_py/submission.py @@ -0,0 +1,22 @@ +from task import input_t, output_t + + +INT8_MAX = 127 +INT8_MIN = -128 +INT8_EPS = 1e-10 + + +def custom_kernel(data: input_t) -> output_t: + import torch + + x, x_q, x_s = data + + x_f32 = x.float() + absmax = x_f32.abs().amax(dim=-1).clamp(min=INT8_EPS) + scale = absmax / INT8_MAX + quantized = torch.round(x_f32 / scale.unsqueeze(-1)) + quantized = quantized.clamp(INT8_MIN, INT8_MAX) + + x_q[...] = quantized + x_s[...] = scale + return x_q, x_s diff --git a/problems/helion/int8_quant_py/task.py b/problems/helion/int8_quant_py/task.py new file mode 100644 index 00000000..a71f48b8 --- /dev/null +++ b/problems/helion/int8_quant_py/task.py @@ -0,0 +1,10 @@ +from typing import TypedDict, TypeVar +import torch + +input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor]) +output_t = TypeVar("output_t", bound=tuple[torch.Tensor, torch.Tensor]) + +class TestSpec(TypedDict): + num_tokens: int + hidden_dim: int + seed: int diff --git a/problems/helion/int8_quant_py/task.yml b/problems/helion/int8_quant_py/task.yml new file mode 100644 index 00000000..4d325291 --- /dev/null +++ b/problems/helion/int8_quant_py/task.yml @@ -0,0 +1,57 @@ +files: + - {"name": "submission.py", "source": "@SUBMISSION@"} + - {"name": "task.py", "source": "task.py"} + - {"name": "utils.py", "source": "../utils.py"} + - {"name": "reference.py", "source": "reference.py"} + - {"name": "eval.py", "source": "../eval.py"} + +lang: "py" + +description: | + Implement a per-token INT8 symmetric quantization kernel. + + This is the standard activation quantization for INT8 inference in production + LLM engines (vLLM, SGLang). It dynamically quantizes each token's activations + to INT8 with a per-token scale factor. + + For each token (row) of length hidden_dim: + 1. absmax = max(|x_row|) + 2. scale = max(absmax, eps) / 127 + 3. x_q = round(x / scale) + 4. x_q = clamp(x_q, -128, 127) + + NOTE: Output is float32 with integer values (for broad GPU compatibility). 
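As a worked sketch of steps 1-4 on a single toy token (illustrative values only, not taken from the task inputs):

# Worked example of per-token INT8 symmetric quantization (toy values).
import torch

x = torch.tensor([[2.0, -4.0, 1.0]])             # one token, hidden_dim = 3
absmax = x.abs().amax(dim=-1).clamp(min=1e-10)   # 4.0
scale = absmax / 127                             # ~0.0315
x_q = torch.round(x / scale.unsqueeze(-1)).clamp(-128, 127)
# x_q == [[64., -127., 32.]]; x_q * scale recovers x to within ~scale/2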
+ + Input: tuple(x, x_q, x_s) where: + - x: torch.Tensor of shape [num_tokens, hidden_dim] (float32) + - x_q: pre-allocated output [num_tokens, hidden_dim] (float32) + - x_s: pre-allocated scales [num_tokens] (float32) + + Output: tuple(x_q, x_s) where: + - x_q: quantized values [num_tokens, hidden_dim] (float32, integer-valued) + - x_s: per-token scale factors [num_tokens] (float32) + +config: + main: "eval.py" + +templates: + Python: "../template.py" + +tests: + - {"num_tokens": 1, "hidden_dim": 256, "seed": 4242} + - {"num_tokens": 4, "hidden_dim": 512, "seed": 5236} + - {"num_tokens": 16, "hidden_dim": 1024, "seed": 1001} + - {"num_tokens": 1, "hidden_dim": 4096, "seed": 5531} + - {"num_tokens": 8, "hidden_dim": 4096, "seed": 9173} + +benchmarks: + - {"num_tokens": 1, "hidden_dim": 4096, "seed": 31232} + - {"num_tokens": 16, "hidden_dim": 4096, "seed": 4052} + - {"num_tokens": 256, "hidden_dim": 4096, "seed": 2146} + - {"num_tokens": 256, "hidden_dim": 8192, "seed": 3129} + - {"num_tokens": 4096, "hidden_dim": 7168, "seed": 54352} + +test_timeout: 180 +benchmark_timeout: 180 +ranked_timeout: 420 +ranking_by: "geom" diff --git a/problems/helion/kda_py/reference.py b/problems/helion/kda_py/reference.py new file mode 100644 index 00000000..74ccf361 --- /dev/null +++ b/problems/helion/kda_py/reference.py @@ -0,0 +1,66 @@ +import torch +from task import input_t, output_t +from utils import make_match_reference, DeterministicContext + + +def generate_input(B: int, T: int, H: int, K: int, V: int, seed: int) -> input_t: + gen = torch.Generator(device="cuda") + gen.manual_seed(seed) + q = torch.randn(B, T, H, K, dtype=torch.float32, device="cuda", generator=gen).contiguous() + k = torch.randn(B, T, H, K, dtype=torch.float32, device="cuda", generator=gen).contiguous() + # L2-normalize k along last dim + k = k / (k.norm(dim=-1, keepdim=True) + 1e-6) + v = torch.randn(B, T, H, V, dtype=torch.float32, device="cuda", generator=gen).contiguous() + g = torch.randn(B, T, H, K, dtype=torch.float32, device="cuda", generator=gen).contiguous() + beta = torch.sigmoid(torch.randn(B, T, H, dtype=torch.float32, device="cuda", generator=gen)).contiguous() + return q, k, v, g, beta + + +def ref_kernel(data: input_t) -> output_t: + with DeterministicContext(): + q, k, v, g, beta = data + B, T, H, K = q.shape + V_dim = v.shape[-1] + scale = K ** -0.5 + + # Reshape for batched matmul: [B*H, T, K/V] + q_r = q.permute(0, 2, 1, 3).reshape(B * H, T, K) + k_r = k.permute(0, 2, 1, 3).reshape(B * H, T, K) + v_r = v.permute(0, 2, 1, 3).reshape(B * H, T, V_dim) + g_r = g.permute(0, 2, 1, 3).reshape(B * H, T, K) + beta_r = beta.permute(0, 2, 1).reshape(B * H, T) + + # Initialize hidden state [B*H, K, V] + S = torch.zeros(B * H, K, V_dim, dtype=torch.float32, device=q.device) + outputs = [] + + for t in range(T): + # Per-channel decay + decay = torch.exp(g_r[:, t, :]) # [B*H, K] + S = S * decay.unsqueeze(-1) # [B*H, K, V] + + # Prediction: k_t^T @ S -> [B*H, V] + k_t = k_r[:, t, :] # [B*H, K] + predicted = torch.bmm(k_t.unsqueeze(1), S).squeeze(1) # [B*H, V] + + # Delta: v_t - predicted + v_t = v_r[:, t, :] # [B*H, V] + delta = v_t - predicted # [B*H, V] + + # Correction: S += k_t @ (beta_t * delta)^T + b_t = beta_r[:, t].unsqueeze(-1) # [B*H, 1] + correction = torch.bmm(k_t.unsqueeze(-1), (b_t * delta).unsqueeze(1)) # [B*H, K, V] + S = S + correction + + # Output: scale * q_t @ S + q_t = q_r[:, t, :] # [B*H, K] + o_t = scale * torch.bmm(q_t.unsqueeze(1), S).squeeze(1) # [B*H, V] + outputs.append(o_t) + + # Stack and reshape: 
[B*H, T, V] -> [B, T, H, V] + output = torch.stack(outputs, dim=1) + output = output.reshape(B, H, T, V_dim).permute(0, 2, 1, 3).contiguous() + return output + + +check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3) diff --git a/problems/helion/kda_py/submission.py b/problems/helion/kda_py/submission.py new file mode 100644 index 00000000..834b0833 --- /dev/null +++ b/problems/helion/kda_py/submission.py @@ -0,0 +1,37 @@ +from task import input_t, output_t + + +def custom_kernel(data: input_t) -> output_t: + import torch + + q, k, v, g, beta = data + B, T, H, K = q.shape + V_dim = v.shape[-1] + scale = K ** -0.5 + + q_r = q.permute(0, 2, 1, 3).reshape(B * H, T, K) + k_r = k.permute(0, 2, 1, 3).reshape(B * H, T, K) + v_r = v.permute(0, 2, 1, 3).reshape(B * H, T, V_dim) + g_r = g.permute(0, 2, 1, 3).reshape(B * H, T, K) + beta_r = beta.permute(0, 2, 1).reshape(B * H, T) + + S = torch.zeros(B * H, K, V_dim, dtype=torch.float32, device=q.device) + outputs = [] + + for t in range(T): + decay = torch.exp(g_r[:, t, :]) + S = S * decay.unsqueeze(-1) + k_t = k_r[:, t, :] + predicted = torch.bmm(k_t.unsqueeze(1), S).squeeze(1) + v_t = v_r[:, t, :] + delta = v_t - predicted + b_t = beta_r[:, t].unsqueeze(-1) + correction = torch.bmm(k_t.unsqueeze(-1), (b_t * delta).unsqueeze(1)) + S = S + correction + q_t = q_r[:, t, :] + o_t = scale * torch.bmm(q_t.unsqueeze(1), S).squeeze(1) + outputs.append(o_t) + + output = torch.stack(outputs, dim=1) + output = output.reshape(B, H, T, V_dim).permute(0, 2, 1, 3).contiguous() + return output diff --git a/problems/helion/kda_py/task.py b/problems/helion/kda_py/task.py new file mode 100644 index 00000000..08d4b4f6 --- /dev/null +++ b/problems/helion/kda_py/task.py @@ -0,0 +1,13 @@ +from typing import TypedDict, TypeVar +import torch + +input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]) +output_t = TypeVar("output_t", bound=torch.Tensor) + +class TestSpec(TypedDict): + B: int + T: int + H: int + K: int + V: int + seed: int diff --git a/problems/helion/kda_py/task.yml b/problems/helion/kda_py/task.yml new file mode 100644 index 00000000..dd5fb3f9 --- /dev/null +++ b/problems/helion/kda_py/task.yml @@ -0,0 +1,57 @@ +files: + - {"name": "submission.py", "source": "@SUBMISSION@"} + - {"name": "task.py", "source": "task.py"} + - {"name": "utils.py", "source": "../utils.py"} + - {"name": "reference.py", "source": "reference.py"} + - {"name": "eval.py", "source": "../eval.py"} + +lang: "py" + +description: | + Implement Kimi Delta Attention (KDA), a linear attention mechanism with + per-channel decay gates and delta rule error correction. 
+ + KDA combines ideas from Gated Linear Attention (GLA) and DeltaNet: + - Per-channel exponential decay gates (from GLA) + - Delta rule error correction: predict, compute error, update state + + Recurrence per timestep t: + S_t = diag(exp(g_t)) @ S_{t-1} # per-channel decay + predicted = k_t^T @ S_t # prediction + delta = v_t - predicted # error + S_t = S_t + k_t @ (beta_t * delta)^T # correction + o_t = (1/sqrt(K)) * q_t @ S_t # output + + Input: tuple(q, k, v, g, beta) where: + - q: torch.Tensor of shape [B, T, H, K] (float32) + - k: torch.Tensor of shape [B, T, H, K] (float32, L2-normalized) + - v: torch.Tensor of shape [B, T, H, V] (float32) + - g: torch.Tensor of shape [B, T, H, K] (float32, per-channel log-space gates) + - beta: torch.Tensor of shape [B, T, H] (float32, update strength in [0,1]) + + Output: torch.Tensor of shape [B, T, H, V] (float32) + +config: + main: "eval.py" + +templates: + Python: "../template.py" + +tests: + - {"B": 1, "T": 32, "H": 4, "K": 32, "V": 32, "seed": 4242} + - {"B": 2, "T": 32, "H": 4, "K": 32, "V": 32, "seed": 5236} + - {"B": 1, "T": 64, "H": 4, "K": 64, "V": 64, "seed": 1001} + - {"B": 1, "T": 64, "H": 8, "K": 32, "V": 32, "seed": 5531} + - {"B": 1, "T": 128, "H": 4, "K": 64, "V": 64, "seed": 9173} + +benchmarks: + - {"B": 1, "T": 256, "H": 4, "K": 64, "V": 64, "seed": 31232} + - {"B": 1, "T": 512, "H": 8, "K": 128, "V": 128, "seed": 4052} + - {"B": 1, "T": 1024, "H": 8, "K": 128, "V": 128, "seed": 2146} + - {"B": 1, "T": 1024, "H": 16, "K": 128, "V": 256, "seed": 3129} + - {"B": 1, "T": 2048, "H": 16, "K": 128, "V": 256, "seed": 54352} + +test_timeout: 180 +benchmark_timeout: 300 +ranked_timeout: 600 +ranking_by: "geom" diff --git a/problems/helion/min_p_py/reference.py b/problems/helion/min_p_py/reference.py new file mode 100644 index 00000000..4173253f --- /dev/null +++ b/problems/helion/min_p_py/reference.py @@ -0,0 +1,37 @@ +import torch +from task import input_t, output_t +from utils import make_match_reference, DeterministicContext + + +def generate_input(batch_size: int, vocab_size: int, seed: int) -> input_t: + gen = torch.Generator(device="cuda") + gen.manual_seed(seed) + logits = torch.randn(batch_size, vocab_size, dtype=torch.float32, device="cuda", generator=gen).contiguous() + # min_p values between 0.05 and 0.2 + min_p = (torch.rand(batch_size, device="cuda", generator=gen) * 0.15 + 0.05).to(torch.float32).contiguous() + return logits, min_p + + +def ref_kernel(data: input_t) -> output_t: + with DeterministicContext(): + logits, min_p = data + logits = logits.float() + + # 1. Softmax + probs = torch.softmax(logits, dim=-1) + + # 2. Find max prob and compute threshold + max_probs = probs.max(dim=-1, keepdim=True).values + threshold = min_p[:, None] * max_probs + + # 3. Apply threshold + filtered = torch.where(probs >= threshold, probs, torch.zeros_like(probs)) + + # 4. 
Renormalize + filtered_sum = filtered.sum(dim=-1, keepdim=True) + result = filtered / filtered_sum + + return result + + +check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-5) diff --git a/problems/helion/min_p_py/submission.py b/problems/helion/min_p_py/submission.py new file mode 100644 index 00000000..080acf6b --- /dev/null +++ b/problems/helion/min_p_py/submission.py @@ -0,0 +1,16 @@ +from task import input_t, output_t + + +def custom_kernel(data: input_t) -> output_t: + import torch + + logits, min_p = data + logits = logits.float() + + probs = torch.softmax(logits, dim=-1) + max_probs = probs.max(dim=-1, keepdim=True).values + threshold = min_p[:, None] * max_probs + filtered = torch.where(probs >= threshold, probs, torch.zeros_like(probs)) + filtered_sum = filtered.sum(dim=-1, keepdim=True) + result = filtered / filtered_sum + return result diff --git a/problems/helion/min_p_py/task.py b/problems/helion/min_p_py/task.py new file mode 100644 index 00000000..708b66a3 --- /dev/null +++ b/problems/helion/min_p_py/task.py @@ -0,0 +1,10 @@ +from typing import TypedDict, TypeVar +import torch + +input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor]) +output_t = TypeVar("output_t", bound=torch.Tensor) + +class TestSpec(TypedDict): + batch_size: int + vocab_size: int + seed: int diff --git a/problems/helion/min_p_py/task.yml b/problems/helion/min_p_py/task.yml new file mode 100644 index 00000000..6c092780 --- /dev/null +++ b/problems/helion/min_p_py/task.yml @@ -0,0 +1,56 @@ +files: + - {"name": "submission.py", "source": "@SUBMISSION@"} + - {"name": "task.py", "source": "task.py"} + - {"name": "utils.py", "source": "../utils.py"} + - {"name": "reference.py", "source": "reference.py"} + - {"name": "eval.py", "source": "../eval.py"} + +lang: "py" + +description: | + Implement a min-p sampling kernel. + + Min-p is an adaptive probability filtering method that keeps all tokens + whose probability is at least min_p times the maximum probability. It is + simpler than top-p (no sort needed) and more adaptive than top-k. + + Algorithm: + 1. probs = softmax(logits) + 2. max_prob = max(probs) + 3. threshold = min_p * max_prob + 4. filtered = where(probs >= threshold, probs, 0) + 5. output = filtered / sum(filtered) + + When the model is confident (high max_prob), the threshold is high and few + tokens survive. When uncertain, more tokens survive. 
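As a small numeric sketch of the filtering and renormalization (toy probabilities standing in for softmax(logits)):

# Toy min-p example: keep tokens with prob >= min_p * max_prob, then renormalize.
import torch

probs = torch.tensor([[0.5, 0.3, 0.15, 0.05]])   # stands in for softmax(logits)
min_p = torch.tensor([0.2])
threshold = min_p[:, None] * probs.max(dim=-1, keepdim=True).values   # 0.2 * 0.5 = 0.1
filtered = torch.where(probs >= threshold, probs, torch.zeros_like(probs))
result = filtered / filtered.sum(dim=-1, keepdim=True)
# result ~= [[0.5263, 0.3158, 0.1579, 0.0]]; the 0.05 token falls below 0.1 and is dropped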
+ + Input: tuple(logits, min_p) where: + - logits: torch.Tensor of shape [batch_size, vocab_size] (float32) + - min_p: torch.Tensor of shape [batch_size] (float32, values in [0, 1]) + + Output: torch.Tensor of shape [batch_size, vocab_size] (float32, renormalized probabilities) + +config: + main: "eval.py" + +templates: + Python: "../template.py" + +tests: + - {"batch_size": 1, "vocab_size": 256, "seed": 4242} + - {"batch_size": 4, "vocab_size": 1024, "seed": 5236} + - {"batch_size": 2, "vocab_size": 4096, "seed": 1001} + - {"batch_size": 1, "vocab_size": 32000, "seed": 5531} + - {"batch_size": 8, "vocab_size": 32000, "seed": 9173} + +benchmarks: + - {"batch_size": 1, "vocab_size": 32000, "seed": 31232} + - {"batch_size": 8, "vocab_size": 32000, "seed": 4052} + - {"batch_size": 1, "vocab_size": 128256, "seed": 2146} + - {"batch_size": 8, "vocab_size": 128256, "seed": 3129} + - {"batch_size": 1, "vocab_size": 151936, "seed": 54352} + +test_timeout: 180 +benchmark_timeout: 180 +ranked_timeout: 420 +ranking_by: "geom" diff --git a/problems/helion/mla_py/reference.py b/problems/helion/mla_py/reference.py new file mode 100644 index 00000000..62eb0a2d --- /dev/null +++ b/problems/helion/mla_py/reference.py @@ -0,0 +1,42 @@ +import torch +import math +from task import input_t, output_t +from utils import make_match_reference, DeterministicContext + + +def generate_input(B: int, H: int, S: int, d_c: int, d_r: int, seed: int) -> input_t: + gen = torch.Generator(device="cuda") + gen.manual_seed(seed) + q_nope = torch.randn(B, H, d_c, dtype=torch.float16, device="cuda", generator=gen).contiguous() + q_pe = torch.randn(B, H, d_r, dtype=torch.float16, device="cuda", generator=gen).contiguous() + kv_c = torch.randn(B, S, d_c, dtype=torch.float16, device="cuda", generator=gen).contiguous() + k_pe = torch.randn(B, S, d_r, dtype=torch.float16, device="cuda", generator=gen).contiguous() + return q_nope, q_pe, kv_c, k_pe + + +def ref_kernel(data: input_t) -> output_t: + with DeterministicContext(): + q_nope, q_pe, kv_c, k_pe = data + B, H, d_c = q_nope.shape + d_r = q_pe.shape[-1] + sm_scale = 1.0 / math.sqrt(d_c + d_r) + + # Content score: [B, H, S] = [B, H, d_c] @ [B, d_c, S] + score_content = torch.bmm(q_nope.float(), kv_c.float().transpose(-2, -1)) + + # Position score: [B, H, S] = [B, H, d_r] @ [B, d_r, S] + score_position = torch.bmm(q_pe.float(), k_pe.float().transpose(-2, -1)) + + # Combined score with scaling + scores = (score_content + score_position) * sm_scale + + # Softmax + attn_weights = torch.softmax(scores, dim=-1) + + # Output: [B, H, d_c] = [B, H, S] @ [B, S, d_c] + output = torch.bmm(attn_weights, kv_c.float()) + + return output.to(q_nope.dtype) + + +check_implementation = make_match_reference(ref_kernel, rtol=1e-2, atol=1e-2) diff --git a/problems/helion/mla_py/submission.py b/problems/helion/mla_py/submission.py new file mode 100644 index 00000000..a3aed999 --- /dev/null +++ b/problems/helion/mla_py/submission.py @@ -0,0 +1,18 @@ +from task import input_t, output_t + + +def custom_kernel(data: input_t) -> output_t: + import torch + import math + + q_nope, q_pe, kv_c, k_pe = data + B, H, d_c = q_nope.shape + d_r = q_pe.shape[-1] + sm_scale = 1.0 / math.sqrt(d_c + d_r) + + score_content = torch.bmm(q_nope.float(), kv_c.float().transpose(-2, -1)) + score_position = torch.bmm(q_pe.float(), k_pe.float().transpose(-2, -1)) + scores = (score_content + score_position) * sm_scale + attn_weights = torch.softmax(scores, dim=-1) + output = torch.bmm(attn_weights, kv_c.float()) + return 
output.to(q_nope.dtype) diff --git a/problems/helion/mla_py/task.py b/problems/helion/mla_py/task.py new file mode 100644 index 00000000..0451a805 --- /dev/null +++ b/problems/helion/mla_py/task.py @@ -0,0 +1,13 @@ +from typing import TypedDict, TypeVar +import torch + +input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]) +output_t = TypeVar("output_t", bound=torch.Tensor) + +class TestSpec(TypedDict): + B: int + H: int + S: int + d_c: int + d_r: int + seed: int diff --git a/problems/helion/mla_py/task.yml b/problems/helion/mla_py/task.yml new file mode 100644 index 00000000..6a084cbf --- /dev/null +++ b/problems/helion/mla_py/task.yml @@ -0,0 +1,55 @@ +files: + - {"name": "submission.py", "source": "@SUBMISSION@"} + - {"name": "task.py", "source": "task.py"} + - {"name": "utils.py", "source": "../utils.py"} + - {"name": "reference.py", "source": "reference.py"} + - {"name": "eval.py", "source": "../eval.py"} + +lang: "py" + +description: | + Implement a Multi-Head Latent Attention (MLA) decode kernel. + + MLA is from DeepSeek-V2/V3. It compresses the KV cache to a low-dimensional + latent space with decoupled RoPE: + + score = (Q_nope @ KV_c^T + Q_pe @ K_pe^T) * sm_scale + O = softmax(score) @ KV_c + + The value reuses the compressed latent (V = KV_c), achieving 32x KV cache + compression on DeepSeek-V3. + + sm_scale = 1 / sqrt(d_c + d_r) + + Input: tuple(q_nope, q_pe, kv_c, k_pe) where: + - q_nope: torch.Tensor of shape [B, H, d_c] (float16) — content query + - q_pe: torch.Tensor of shape [B, H, d_r] (float16) — position query + - kv_c: torch.Tensor of shape [B, S, d_c] (float16) — compressed KV cache + - k_pe: torch.Tensor of shape [B, S, d_r] (float16) — position keys + + Output: torch.Tensor of shape [B, H, d_c] (float16) + +config: + main: "eval.py" + +templates: + Python: "../template.py" + +tests: + - {"B": 1, "H": 8, "S": 64, "d_c": 128, "d_r": 32, "seed": 4242} + - {"B": 2, "H": 8, "S": 64, "d_c": 128, "d_r": 32, "seed": 5236} + - {"B": 1, "H": 16, "S": 128, "d_c": 256, "d_r": 64, "seed": 1001} + - {"B": 1, "H": 8, "S": 256, "d_c": 128, "d_r": 32, "seed": 5531} + - {"B": 1, "H": 32, "S": 128, "d_c": 128, "d_r": 64, "seed": 9173} + +benchmarks: + - {"B": 1, "H": 32, "S": 256, "d_c": 512, "d_r": 64, "seed": 31232} + - {"B": 1, "H": 32, "S": 512, "d_c": 512, "d_r": 64, "seed": 4052} + - {"B": 1, "H": 32, "S": 1024, "d_c": 512, "d_r": 64, "seed": 2146} + - {"B": 1, "H": 32, "S": 2048, "d_c": 512, "d_r": 64, "seed": 3129} + - {"B": 1, "H": 128, "S": 1024, "d_c": 512, "d_r": 64, "seed": 54352} + +test_timeout: 180 +benchmark_timeout: 180 +ranked_timeout: 420 +ranking_by: "geom" diff --git a/problems/helion/template.py b/problems/helion/template.py new file mode 100644 index 00000000..4aec6a6c --- /dev/null +++ b/problems/helion/template.py @@ -0,0 +1,5 @@ +from task import input_t, output_t + + +def custom_kernel(data: input_t) -> output_t: + pass diff --git a/problems/helion/top_k_py/reference.py b/problems/helion/top_k_py/reference.py new file mode 100644 index 00000000..2aa66d1e --- /dev/null +++ b/problems/helion/top_k_py/reference.py @@ -0,0 +1,39 @@ +import torch +from task import input_t, output_t +from utils import make_match_reference, DeterministicContext + + +def generate_input(batch_size: int, vocab_size: int, k: int, seed: int) -> input_t: + gen = torch.Generator(device="cuda") + gen.manual_seed(seed) + logits = torch.randn(batch_size, vocab_size, dtype=torch.float32, device="cuda", generator=gen).contiguous() + top_k = 
torch.full((batch_size,), k, dtype=torch.int32, device="cuda") + return logits, top_k + + +def ref_kernel(data: input_t) -> output_t: + with DeterministicContext(): + logits, top_k = data + logits = logits.float() + batch_size, vocab_size = logits.shape + + # 1. Softmax + probs = torch.softmax(logits, dim=-1) + + # 2. Top-k filtering (per-batch k) + filtered = torch.zeros_like(probs) + for b in range(batch_size): + k = min(top_k[b].item(), vocab_size) + topk_vals, _ = torch.topk(probs[b], k) + threshold = topk_vals[-1] + mask = probs[b] >= threshold + filtered[b] = torch.where(mask, probs[b], torch.zeros_like(probs[b])) + + # 3. Renormalize + filtered_sum = filtered.sum(dim=-1, keepdim=True) + result = filtered / filtered_sum + + return result + + +check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-5) diff --git a/problems/helion/top_k_py/submission.py b/problems/helion/top_k_py/submission.py new file mode 100644 index 00000000..e5ac2e04 --- /dev/null +++ b/problems/helion/top_k_py/submission.py @@ -0,0 +1,22 @@ +from task import input_t, output_t + + +def custom_kernel(data: input_t) -> output_t: + import torch + + logits, top_k = data + logits = logits.float() + batch_size, vocab_size = logits.shape + + probs = torch.softmax(logits, dim=-1) + filtered = torch.zeros_like(probs) + for b in range(batch_size): + k = min(top_k[b].item(), vocab_size) + topk_vals, _ = torch.topk(probs[b], k) + threshold = topk_vals[-1] + mask = probs[b] >= threshold + filtered[b] = torch.where(mask, probs[b], torch.zeros_like(probs[b])) + + filtered_sum = filtered.sum(dim=-1, keepdim=True) + result = filtered / filtered_sum + return result diff --git a/problems/helion/top_k_py/task.py b/problems/helion/top_k_py/task.py new file mode 100644 index 00000000..02ee370a --- /dev/null +++ b/problems/helion/top_k_py/task.py @@ -0,0 +1,11 @@ +from typing import TypedDict, TypeVar +import torch + +input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor]) +output_t = TypeVar("output_t", bound=torch.Tensor) + +class TestSpec(TypedDict): + batch_size: int + vocab_size: int + k: int + seed: int diff --git a/problems/helion/top_k_py/task.yml b/problems/helion/top_k_py/task.yml new file mode 100644 index 00000000..7bf194e7 --- /dev/null +++ b/problems/helion/top_k_py/task.yml @@ -0,0 +1,52 @@ +files: + - {"name": "submission.py", "source": "@SUBMISSION@"} + - {"name": "task.py", "source": "task.py"} + - {"name": "utils.py", "source": "../utils.py"} + - {"name": "reference.py", "source": "reference.py"} + - {"name": "eval.py", "source": "../eval.py"} + +lang: "py" + +description: | + Implement a top-k sampling kernel. + + Top-k keeps only the k highest-probability tokens and zeros out the rest. + A key insight: instead of O(V log V) sort, use binary search on the + probability threshold to find the k-th largest in O(V) time. + + Algorithm: + 1. probs = softmax(logits) + 2. Binary search for threshold T where count(probs >= T) >= k + 3. filtered = where(probs >= threshold, probs, 0) + 4. 
output = filtered / sum(filtered) + + Input: tuple(logits, top_k) where: + - logits: torch.Tensor of shape [batch_size, vocab_size] (float32) + - top_k: torch.Tensor of shape [batch_size] (int32) + + Output: torch.Tensor of shape [batch_size, vocab_size] (float32, renormalized probabilities) + +config: + main: "eval.py" + +templates: + Python: "../template.py" + +tests: + - {"batch_size": 1, "vocab_size": 256, "k": 50, "seed": 4242} + - {"batch_size": 4, "vocab_size": 1024, "k": 100, "seed": 5236} + - {"batch_size": 2, "vocab_size": 4096, "k": 10, "seed": 1001} + - {"batch_size": 1, "vocab_size": 32000, "k": 50, "seed": 5531} + - {"batch_size": 1, "vocab_size": 256, "k": 1, "seed": 9173} + +benchmarks: + - {"batch_size": 1, "vocab_size": 32000, "k": 50, "seed": 31232} + - {"batch_size": 8, "vocab_size": 32000, "k": 50, "seed": 4052} + - {"batch_size": 1, "vocab_size": 128256, "k": 50, "seed": 2146} + - {"batch_size": 8, "vocab_size": 128256, "k": 50, "seed": 3129} + - {"batch_size": 1, "vocab_size": 151936, "k": 50, "seed": 54352} + +test_timeout: 180 +benchmark_timeout: 180 +ranked_timeout: 420 +ranking_by: "geom" diff --git a/problems/helion/top_p_py/reference.py b/problems/helion/top_p_py/reference.py new file mode 100644 index 00000000..40bb1ca6 --- /dev/null +++ b/problems/helion/top_p_py/reference.py @@ -0,0 +1,45 @@ +import torch +from task import input_t, output_t +from utils import make_match_reference, DeterministicContext + + +def generate_input(batch_size: int, vocab_size: int, seed: int) -> input_t: + gen = torch.Generator(device="cuda") + gen.manual_seed(seed) + logits = torch.randn(batch_size, vocab_size, dtype=torch.float32, device="cuda", generator=gen).contiguous() + # top_p values between 0.85 and 0.95 + top_p = (torch.rand(batch_size, device="cuda", generator=gen) * 0.1 + 0.85).to(torch.float32).contiguous() + return logits, top_p + + +def ref_kernel(data: input_t) -> output_t: + with DeterministicContext(): + logits, top_p = data + logits = logits.float() + + # 1. Softmax + probs = torch.softmax(logits, dim=-1) + + # 2. Sort descending + sorted_probs, _sorted_indices = torch.sort(probs, descending=True, dim=-1) + + # 3. Cumulative sum + cumsum = torch.cumsum(sorted_probs, dim=-1) + + # 4. Find threshold per batch element + shifted_cumsum = cumsum - sorted_probs + nucleus_mask = shifted_cumsum <= top_p[:, None] + masked_sorted = torch.where(nucleus_mask, sorted_probs, torch.ones_like(sorted_probs)) + threshold = masked_sorted.amin(dim=-1, keepdim=True) + + # 5. Apply threshold to original probs + filtered = torch.where(probs >= threshold, probs, torch.zeros_like(probs)) + + # 6. 
Renormalize + filtered_sum = filtered.sum(dim=-1, keepdim=True) + result = filtered / filtered_sum + + return result + + +check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-5) diff --git a/problems/helion/top_p_py/submission.py b/problems/helion/top_p_py/submission.py new file mode 100644 index 00000000..48e3fd93 --- /dev/null +++ b/problems/helion/top_p_py/submission.py @@ -0,0 +1,20 @@ +from task import input_t, output_t + + +def custom_kernel(data: input_t) -> output_t: + import torch + + logits, top_p = data + logits = logits.float() + + probs = torch.softmax(logits, dim=-1) + sorted_probs, _ = torch.sort(probs, descending=True, dim=-1) + cumsum = torch.cumsum(sorted_probs, dim=-1) + shifted_cumsum = cumsum - sorted_probs + nucleus_mask = shifted_cumsum <= top_p[:, None] + masked_sorted = torch.where(nucleus_mask, sorted_probs, torch.ones_like(sorted_probs)) + threshold = masked_sorted.amin(dim=-1, keepdim=True) + filtered = torch.where(probs >= threshold, probs, torch.zeros_like(probs)) + filtered_sum = filtered.sum(dim=-1, keepdim=True) + result = filtered / filtered_sum + return result diff --git a/problems/helion/top_p_py/task.py b/problems/helion/top_p_py/task.py new file mode 100644 index 00000000..708b66a3 --- /dev/null +++ b/problems/helion/top_p_py/task.py @@ -0,0 +1,10 @@ +from typing import TypedDict, TypeVar +import torch + +input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor]) +output_t = TypeVar("output_t", bound=torch.Tensor) + +class TestSpec(TypedDict): + batch_size: int + vocab_size: int + seed: int diff --git a/problems/helion/top_p_py/task.yml b/problems/helion/top_p_py/task.yml new file mode 100644 index 00000000..4b2f0318 --- /dev/null +++ b/problems/helion/top_p_py/task.yml @@ -0,0 +1,55 @@ +files: + - {"name": "submission.py", "source": "@SUBMISSION@"} + - {"name": "task.py", "source": "task.py"} + - {"name": "utils.py", "source": "../utils.py"} + - {"name": "reference.py", "source": "reference.py"} + - {"name": "eval.py", "source": "../eval.py"} + +lang: "py" + +description: | + Implement a top-p (nucleus) sampling kernel. + + Top-p keeps the smallest set of tokens whose cumulative probability mass + exceeds the threshold top_p. This is the most widely used sampling strategy + across production LLMs (Qwen3 top_p=0.95, DeepSeek-V3 top_p=0.95). + + A key insight: instead of O(V log V) sort, use binary search on the + probability threshold to find the nucleus boundary in O(V) time. + + Algorithm: + 1. probs = softmax(logits) + 2. Binary search for threshold T where sum(probs >= T) >= top_p + 3. filtered = where(probs >= threshold, probs, 0) + 4. 
output = filtered / sum(filtered) + + Input: tuple(logits, top_p) where: + - logits: torch.Tensor of shape [batch_size, vocab_size] (float32) + - top_p: torch.Tensor of shape [batch_size] (float32, values in [0, 1]) + + Output: torch.Tensor of shape [batch_size, vocab_size] (float32, renormalized probabilities) + +config: + main: "eval.py" + +templates: + Python: "../template.py" + +tests: + - {"batch_size": 1, "vocab_size": 256, "seed": 4242} + - {"batch_size": 4, "vocab_size": 1024, "seed": 5236} + - {"batch_size": 2, "vocab_size": 4096, "seed": 1001} + - {"batch_size": 1, "vocab_size": 32000, "seed": 5531} + - {"batch_size": 8, "vocab_size": 32000, "seed": 9173} + +benchmarks: + - {"batch_size": 1, "vocab_size": 32000, "seed": 31232} + - {"batch_size": 8, "vocab_size": 32000, "seed": 4052} + - {"batch_size": 1, "vocab_size": 128256, "seed": 2146} + - {"batch_size": 8, "vocab_size": 128256, "seed": 3129} + - {"batch_size": 1, "vocab_size": 151936, "seed": 54352} + +test_timeout: 180 +benchmark_timeout: 180 +ranked_timeout: 420 +ranking_by: "geom" diff --git a/problems/helion/utils.py b/problems/helion/utils.py new file mode 100644 index 00000000..e8a9082f --- /dev/null +++ b/problems/helion/utils.py @@ -0,0 +1,176 @@ +import os +import random +import numpy as np +import torch + + +def set_seed(seed=42): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def get_device(use_cuda: bool = True) -> torch.device: + """Get the appropriate device (GPU or CPU).""" + if use_cuda: + if torch.cuda.is_available(): + return torch.device("cuda") + elif torch.backends.mps.is_available(): + return torch.device("mps") + else: + print("No compatible GPU found. Falling back to CPU.") + return torch.device("cpu") + + +# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py +@torch.no_grad() +def verbose_allclose( + received: torch.Tensor, + expected: torch.Tensor, + rtol=1e-05, + atol=1e-08, + max_print=5 +) -> list[str]: + """ + Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches. + + Parameters: + received (torch.Tensor): Tensor we actually got. + expected (torch.Tensor): Tensor we expected to receive. + rtol (float): Relative tolerance; relative to expected + atol (float): Absolute tolerance. + max_print (int): Maximum number of mismatched elements to print. + + Raises: + AssertionError: If the tensors are not all close within the given tolerance. 
+ """ + # Check if the shapes of the tensors match + if received.shape != expected.shape: + return ["SIZE MISMATCH"] + + # Calculate the difference between the tensors + diff = torch.abs(received - expected) + + # Determine the tolerance + tolerance = atol + rtol * torch.abs(expected) + + # Find tolerance mismatched elements + tol_mismatched = diff > tolerance + + # Find nan mismatched elements + nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected)) + + # Find +inf mismatched elements + posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected)) + # Find -inf mismatched elements + neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected)) + + # Find all mismatched elements + mismatched = torch.logical_or( + torch.logical_or(tol_mismatched, nan_mismatched), + torch.logical_or(posinf_mismatched, neginf_mismatched), + ) + + mismatched_indices = torch.nonzero(mismatched) + + # Count the number of mismatched elements + num_mismatched = mismatched.count_nonzero().item() + + # Generate detailed information if there are mismatches + if num_mismatched >= 1: + mismatch_details = [f"Number of mismatched elements: {num_mismatched}"] + + for index in mismatched_indices[:max_print]: + i = tuple(index.tolist()) + mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}") + if num_mismatched > max_print: + mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.") + return mismatch_details + + return [] + + +@torch.no_grad() +def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5): + """ + Assert that two tensors are element-wise perfectly equal, providing detailed information about mismatches. + + Parameters: + received (torch.Tensor): Tensor we actually got. + expected (torch.Tensor): Tensor we expected to receive. + max_print (int): Maximum number of mismatched elements to print. + + Returns: + Empty string if tensors are equal, otherwise detailed error information + """ + mismatched = torch.not_equal(received, expected) + mismatched_indices = torch.nonzero(mismatched) + + # Count the number of mismatched elements + num_mismatched = mismatched.count_nonzero().item() + + # Generate detailed information if there are mismatches + if num_mismatched >= 1: + mismatch_details = [f"Number of mismatched elements: {num_mismatched}"] + + for index in mismatched_indices[:max_print]: + i = tuple(index.tolist()) + mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}") + if num_mismatched > max_print: + mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.") + return mismatch_details + + return [] + + +def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08) -> tuple[bool, str]: + """ + Convenient "default" implementation for tasks' `check_implementation` function. + """ + expected = reference(data) + reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol) + + if len(reasons) > 0: + return False, "mismatch found! 
custom implementation doesn't match reference: " + " ".join(reasons) + + return True, '' + + +def make_match_reference(reference: callable, **kwargs): + def wrapped(data, output): + return match_reference(data, output, reference=reference, **kwargs) + return wrapped + + +class DeterministicContext: + def __init__(self): + self.allow_tf32 = None + self.deterministic = None + self.cublas = None + + def __enter__(self): + self.cublas = os.environ.get('CUBLAS_WORKSPACE_CONFIG', '') + self.allow_tf32 = torch.backends.cudnn.allow_tf32 + self.deterministic = torch.backends.cudnn.deterministic + torch.backends.cudnn.allow_tf32 = False + torch.backends.cudnn.deterministic = True + torch.use_deterministic_algorithms(True) + return self + + def __exit__(self, exc_type, exc_value, traceback): + torch.backends.cudnn.allow_tf32 = self.allow_tf32 + torch.backends.cudnn.deterministic = self.deterministic + torch.use_deterministic_algorithms(False) + os.environ['CUBLAS_WORKSPACE_CONFIG'] = self.cublas + +def clear_l2_cache(): + # import cupy as cp + # cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0) + # create a large dummy tensor + dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda") + # write stuff to + dummy.fill_(42) + del dummy \ No newline at end of file From 509b97754e6177502e28f6772f45d76accf0ebab Mon Sep 17 00:00:00 2001 From: Will Feng Date: Tue, 3 Mar 2026 11:35:47 -0800 Subject: [PATCH 2/5] remove unused kernels --- problems/helion/gqa_py/reference.py | 44 -------------- problems/helion/gqa_py/submission.py | 20 ------- problems/helion/gqa_py/task.py | 13 ---- problems/helion/gqa_py/task.yml | 52 ---------------- problems/helion/int8_quant_py/reference.py | 56 ----------------- problems/helion/int8_quant_py/submission.py | 22 ------- problems/helion/int8_quant_py/task.py | 10 ---- problems/helion/int8_quant_py/task.yml | 57 ------------------ problems/helion/kda_py/reference.py | 66 --------------------- problems/helion/kda_py/submission.py | 37 ------------ problems/helion/kda_py/task.py | 13 ---- problems/helion/kda_py/task.yml | 57 ------------------ problems/helion/min_p_py/reference.py | 37 ------------ problems/helion/min_p_py/submission.py | 16 ----- problems/helion/min_p_py/task.py | 10 ---- problems/helion/min_p_py/task.yml | 56 ----------------- problems/helion/mla_py/reference.py | 42 ------------- problems/helion/mla_py/submission.py | 18 ------ problems/helion/mla_py/task.py | 13 ---- problems/helion/mla_py/task.yml | 55 ----------------- problems/helion/top_k_py/reference.py | 39 ------------ problems/helion/top_k_py/submission.py | 22 ------- problems/helion/top_k_py/task.py | 11 ---- problems/helion/top_k_py/task.yml | 52 ---------------- problems/helion/top_p_py/reference.py | 45 -------------- problems/helion/top_p_py/submission.py | 20 ------- problems/helion/top_p_py/task.py | 10 ---- problems/helion/top_p_py/task.yml | 55 ----------------- 28 files changed, 948 deletions(-) delete mode 100644 problems/helion/gqa_py/reference.py delete mode 100644 problems/helion/gqa_py/submission.py delete mode 100644 problems/helion/gqa_py/task.py delete mode 100644 problems/helion/gqa_py/task.yml delete mode 100644 problems/helion/int8_quant_py/reference.py delete mode 100644 problems/helion/int8_quant_py/submission.py delete mode 100644 problems/helion/int8_quant_py/task.py delete mode 100644 problems/helion/int8_quant_py/task.yml delete mode 100644 problems/helion/kda_py/reference.py delete mode 100644 
problems/helion/kda_py/submission.py delete mode 100644 problems/helion/kda_py/task.py delete mode 100644 problems/helion/kda_py/task.yml delete mode 100644 problems/helion/min_p_py/reference.py delete mode 100644 problems/helion/min_p_py/submission.py delete mode 100644 problems/helion/min_p_py/task.py delete mode 100644 problems/helion/min_p_py/task.yml delete mode 100644 problems/helion/mla_py/reference.py delete mode 100644 problems/helion/mla_py/submission.py delete mode 100644 problems/helion/mla_py/task.py delete mode 100644 problems/helion/mla_py/task.yml delete mode 100644 problems/helion/top_k_py/reference.py delete mode 100644 problems/helion/top_k_py/submission.py delete mode 100644 problems/helion/top_k_py/task.py delete mode 100644 problems/helion/top_k_py/task.yml delete mode 100644 problems/helion/top_p_py/reference.py delete mode 100644 problems/helion/top_p_py/submission.py delete mode 100644 problems/helion/top_p_py/task.py delete mode 100644 problems/helion/top_p_py/task.yml diff --git a/problems/helion/gqa_py/reference.py b/problems/helion/gqa_py/reference.py deleted file mode 100644 index b5675ed9..00000000 --- a/problems/helion/gqa_py/reference.py +++ /dev/null @@ -1,44 +0,0 @@ -import torch -from task import input_t, output_t -from utils import make_match_reference, DeterministicContext - - -def generate_input(B: int, H_q: int, H_kv: int, S: int, D: int, seed: int) -> input_t: - gen = torch.Generator(device="cuda") - gen.manual_seed(seed) - q = torch.randn(B, H_q, S, D, dtype=torch.float16, device="cuda", generator=gen).contiguous() - k = torch.randn(B, H_kv, S, D, dtype=torch.float16, device="cuda", generator=gen).contiguous() - v = torch.randn(B, H_kv, S, D, dtype=torch.float16, device="cuda", generator=gen).contiguous() - return q, k, v - - -def ref_kernel(data: input_t) -> output_t: - with DeterministicContext(): - q, k, v = data - B, H_q, S, D = q.shape - H_kv = k.shape[1] - kv_group_num = H_q // H_kv - sm_scale = D ** -0.5 - - # Expand K and V to match Q's head count - k_expanded = k.repeat_interleave(kv_group_num, dim=1) - v_expanded = v.repeat_interleave(kv_group_num, dim=1) - - # Compute attention scores: [B, H_q, S, S] - attn_scores = torch.matmul(q.float(), k_expanded.float().transpose(-2, -1)) * sm_scale - - # Apply causal mask - causal_mask = torch.triu( - torch.ones(S, S, dtype=torch.bool, device=q.device), - diagonal=1 - ) - attn_scores = attn_scores.masked_fill(causal_mask, float("-inf")) - - # Softmax and apply to values - attn_weights = torch.softmax(attn_scores, dim=-1) - output = torch.matmul(attn_weights, v_expanded.float()) - - return output.to(q.dtype) - - -check_implementation = make_match_reference(ref_kernel, rtol=1e-2, atol=1e-2) diff --git a/problems/helion/gqa_py/submission.py b/problems/helion/gqa_py/submission.py deleted file mode 100644 index 25baeebb..00000000 --- a/problems/helion/gqa_py/submission.py +++ /dev/null @@ -1,20 +0,0 @@ -from task import input_t, output_t - - -def custom_kernel(data: input_t) -> output_t: - import torch - q, k, v = data - B, H_q, S, D = q.shape - H_kv = k.shape[1] - kv_group_num = H_q // H_kv - sm_scale = D ** -0.5 - - k_expanded = k.repeat_interleave(kv_group_num, dim=1) - v_expanded = v.repeat_interleave(kv_group_num, dim=1) - - attn_scores = torch.matmul(q.float(), k_expanded.float().transpose(-2, -1)) * sm_scale - causal_mask = torch.triu(torch.ones(S, S, dtype=torch.bool, device=q.device), diagonal=1) - attn_scores = attn_scores.masked_fill(causal_mask, float("-inf")) - attn_weights = 
torch.softmax(attn_scores, dim=-1) - output = torch.matmul(attn_weights, v_expanded.float()) - return output.to(q.dtype) diff --git a/problems/helion/gqa_py/task.py b/problems/helion/gqa_py/task.py deleted file mode 100644 index 6f3b7f9e..00000000 --- a/problems/helion/gqa_py/task.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import TypedDict, TypeVar -import torch - -input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor]) -output_t = TypeVar("output_t", bound=torch.Tensor) - -class TestSpec(TypedDict): - B: int - H_q: int - H_kv: int - S: int - D: int - seed: int diff --git a/problems/helion/gqa_py/task.yml b/problems/helion/gqa_py/task.yml deleted file mode 100644 index 05f5d175..00000000 --- a/problems/helion/gqa_py/task.yml +++ /dev/null @@ -1,52 +0,0 @@ -files: - - {"name": "submission.py", "source": "@SUBMISSION@"} - - {"name": "task.py", "source": "task.py"} - - {"name": "utils.py", "source": "../utils.py"} - - {"name": "reference.py", "source": "reference.py"} - - {"name": "eval.py", "source": "../eval.py"} - -lang: "py" - -description: | - Implement a causal Grouped Query Attention (GQA) kernel. - - GQA shares KV heads across groups of query heads, reducing KV cache from - 2 * H_q * D to 2 * H_kv * D per token. Multiple query heads share the same - K and V, with kv_group_num = H_q // H_kv. - - For each query head h with kv_head = h // kv_group_num: - O[h] = softmax(Q[h] @ K[kv_head]^T / sqrt(D), causal_mask) @ V[kv_head] - - Input: tuple(q, k, v) where: - - q: torch.Tensor of shape [B, H_q, S, D] (float16) - - k: torch.Tensor of shape [B, H_kv, S, D] (float16) - - v: torch.Tensor of shape [B, H_kv, S, D] (float16) - - Output: torch.Tensor of shape [B, H_q, S, D] (float16) - - The attention is causal: position i can only attend to positions <= i. 
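For intuition, a minimal cross-check sketch of the same causal GQA computation using PyTorch's scaled_dot_product_attention (the helper name gqa_sdpa_sketch and the standalone-function framing are illustrative only, not part of the task files):

    import torch
    import torch.nn.functional as F

    def gqa_sdpa_sketch(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
        # q: [B, H_q, S, D]; k, v: [B, H_kv, S, D]
        groups = q.shape[1] // k.shape[1]
        # Expand the shared KV heads so every query head has a matching KV head.
        k_rep = k.repeat_interleave(groups, dim=1)
        v_rep = v.repeat_interleave(groups, dim=1)
        # is_causal=True applies the same lower-triangular mask as the reference,
        # and the default scale is D ** -0.5, matching sm_scale.
        return F.scaled_dot_product_attention(q, k_rep, v_rep, is_causal=True)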
- -config: - main: "eval.py" - -templates: - Python: "../template.py" - -tests: - - {"B": 1, "H_q": 8, "H_kv": 2, "S": 64, "D": 64, "seed": 4242} - - {"B": 2, "H_q": 8, "H_kv": 4, "S": 64, "D": 64, "seed": 5236} - - {"B": 1, "H_q": 8, "H_kv": 8, "S": 64, "D": 64, "seed": 1001} - - {"B": 1, "H_q": 8, "H_kv": 1, "S": 64, "D": 64, "seed": 5531} - - {"B": 1, "H_q": 16, "H_kv": 4, "S": 128, "D": 64, "seed": 9173} - -benchmarks: - - {"B": 1, "H_q": 32, "H_kv": 8, "S": 512, "D": 128, "seed": 31232} - - {"B": 1, "H_q": 32, "H_kv": 8, "S": 1024, "D": 128, "seed": 4052} - - {"B": 1, "H_q": 32, "H_kv": 8, "S": 2048, "D": 128, "seed": 2146} - - {"B": 1, "H_q": 64, "H_kv": 8, "S": 1024, "D": 128, "seed": 3129} - - {"B": 1, "H_q": 64, "H_kv": 8, "S": 2048, "D": 128, "seed": 54352} - -test_timeout: 180 -benchmark_timeout: 180 -ranked_timeout: 420 -ranking_by: "geom" diff --git a/problems/helion/int8_quant_py/reference.py b/problems/helion/int8_quant_py/reference.py deleted file mode 100644 index 20f54276..00000000 --- a/problems/helion/int8_quant_py/reference.py +++ /dev/null @@ -1,56 +0,0 @@ -import torch -from task import input_t, output_t -from utils import verbose_allclose - -INT8_MAX = 127 -INT8_MIN = -128 -INT8_EPS = 1e-10 - - -def generate_input(num_tokens: int, hidden_dim: int, seed: int) -> input_t: - gen = torch.Generator(device="cuda") - gen.manual_seed(seed) - x = torch.randn(num_tokens, hidden_dim, dtype=torch.float32, device="cuda", generator=gen).contiguous() - x_q = torch.empty(num_tokens, hidden_dim, dtype=torch.float32, device="cuda").contiguous() - x_s = torch.empty(num_tokens, dtype=torch.float32, device="cuda").contiguous() - return x, x_q, x_s - - -def ref_kernel(data: input_t) -> output_t: - x, x_q, x_s = data - - x_f32 = x.float() - - # Per-token absmax - absmax = x_f32.abs().amax(dim=-1).clamp(min=INT8_EPS) - - # Scale = absmax / 127 - scale = absmax / INT8_MAX - - # Quantize - quantized = torch.round(x_f32 / scale.unsqueeze(-1)) - quantized = quantized.clamp(INT8_MIN, INT8_MAX) - - x_q[...] = quantized - x_s[...] = scale - return x_q, x_s - - -def check_implementation(data, output): - expected = ref_kernel(data) - expected_q, expected_s = expected - received_q, received_s = output - - reasons_s = verbose_allclose(received_s, expected_s, rtol=1e-4, atol=1e-6) - # Allow +/- 1 LSB for quantized values due to rounding differences - reasons_q = verbose_allclose(received_q, expected_q, rtol=0, atol=1.0) - - reasons = [] - if reasons_q: - reasons.append("quantized values mismatch: " + " ".join(reasons_q)) - if reasons_s: - reasons.append("scales mismatch: " + " ".join(reasons_s)) - - if reasons: - return False, " | ".join(reasons) - return True, "" diff --git a/problems/helion/int8_quant_py/submission.py b/problems/helion/int8_quant_py/submission.py deleted file mode 100644 index 615e33ac..00000000 --- a/problems/helion/int8_quant_py/submission.py +++ /dev/null @@ -1,22 +0,0 @@ -from task import input_t, output_t - - -INT8_MAX = 127 -INT8_MIN = -128 -INT8_EPS = 1e-10 - - -def custom_kernel(data: input_t) -> output_t: - import torch - - x, x_q, x_s = data - - x_f32 = x.float() - absmax = x_f32.abs().amax(dim=-1).clamp(min=INT8_EPS) - scale = absmax / INT8_MAX - quantized = torch.round(x_f32 / scale.unsqueeze(-1)) - quantized = quantized.clamp(INT8_MIN, INT8_MAX) - - x_q[...] = quantized - x_s[...] 
= scale - return x_q, x_s diff --git a/problems/helion/int8_quant_py/task.py b/problems/helion/int8_quant_py/task.py deleted file mode 100644 index a71f48b8..00000000 --- a/problems/helion/int8_quant_py/task.py +++ /dev/null @@ -1,10 +0,0 @@ -from typing import TypedDict, TypeVar -import torch - -input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor]) -output_t = TypeVar("output_t", bound=tuple[torch.Tensor, torch.Tensor]) - -class TestSpec(TypedDict): - num_tokens: int - hidden_dim: int - seed: int diff --git a/problems/helion/int8_quant_py/task.yml b/problems/helion/int8_quant_py/task.yml deleted file mode 100644 index 4d325291..00000000 --- a/problems/helion/int8_quant_py/task.yml +++ /dev/null @@ -1,57 +0,0 @@ -files: - - {"name": "submission.py", "source": "@SUBMISSION@"} - - {"name": "task.py", "source": "task.py"} - - {"name": "utils.py", "source": "../utils.py"} - - {"name": "reference.py", "source": "reference.py"} - - {"name": "eval.py", "source": "../eval.py"} - -lang: "py" - -description: | - Implement a per-token INT8 symmetric quantization kernel. - - This is the standard activation quantization for INT8 inference in production - LLM engines (vLLM, SGLang). It dynamically quantizes each token's activations - to INT8 with a per-token scale factor. - - For each token (row) of length hidden_dim: - 1. absmax = max(|x_row|) - 2. scale = max(absmax, eps) / 127 - 3. x_q = round(x / scale) - 4. x_q = clamp(x_q, -128, 127) - - NOTE: Output is float32 with integer values (for broad GPU compatibility). - - Input: tuple(x, x_q, x_s) where: - - x: torch.Tensor of shape [num_tokens, hidden_dim] (float32) - - x_q: pre-allocated output [num_tokens, hidden_dim] (float32) - - x_s: pre-allocated scales [num_tokens] (float32) - - Output: tuple(x_q, x_s) where: - - x_q: quantized values [num_tokens, hidden_dim] (float32, integer-valued) - - x_s: per-token scale factors [num_tokens] (float32) - -config: - main: "eval.py" - -templates: - Python: "../template.py" - -tests: - - {"num_tokens": 1, "hidden_dim": 256, "seed": 4242} - - {"num_tokens": 4, "hidden_dim": 512, "seed": 5236} - - {"num_tokens": 16, "hidden_dim": 1024, "seed": 1001} - - {"num_tokens": 1, "hidden_dim": 4096, "seed": 5531} - - {"num_tokens": 8, "hidden_dim": 4096, "seed": 9173} - -benchmarks: - - {"num_tokens": 1, "hidden_dim": 4096, "seed": 31232} - - {"num_tokens": 16, "hidden_dim": 4096, "seed": 4052} - - {"num_tokens": 256, "hidden_dim": 4096, "seed": 2146} - - {"num_tokens": 256, "hidden_dim": 8192, "seed": 3129} - - {"num_tokens": 4096, "hidden_dim": 7168, "seed": 54352} - -test_timeout: 180 -benchmark_timeout: 180 -ranked_timeout: 420 -ranking_by: "geom" diff --git a/problems/helion/kda_py/reference.py b/problems/helion/kda_py/reference.py deleted file mode 100644 index 74ccf361..00000000 --- a/problems/helion/kda_py/reference.py +++ /dev/null @@ -1,66 +0,0 @@ -import torch -from task import input_t, output_t -from utils import make_match_reference, DeterministicContext - - -def generate_input(B: int, T: int, H: int, K: int, V: int, seed: int) -> input_t: - gen = torch.Generator(device="cuda") - gen.manual_seed(seed) - q = torch.randn(B, T, H, K, dtype=torch.float32, device="cuda", generator=gen).contiguous() - k = torch.randn(B, T, H, K, dtype=torch.float32, device="cuda", generator=gen).contiguous() - # L2-normalize k along last dim - k = k / (k.norm(dim=-1, keepdim=True) + 1e-6) - v = torch.randn(B, T, H, V, dtype=torch.float32, device="cuda", generator=gen).contiguous() - g = torch.randn(B, 
T, H, K, dtype=torch.float32, device="cuda", generator=gen).contiguous() - beta = torch.sigmoid(torch.randn(B, T, H, dtype=torch.float32, device="cuda", generator=gen)).contiguous() - return q, k, v, g, beta - - -def ref_kernel(data: input_t) -> output_t: - with DeterministicContext(): - q, k, v, g, beta = data - B, T, H, K = q.shape - V_dim = v.shape[-1] - scale = K ** -0.5 - - # Reshape for batched matmul: [B*H, T, K/V] - q_r = q.permute(0, 2, 1, 3).reshape(B * H, T, K) - k_r = k.permute(0, 2, 1, 3).reshape(B * H, T, K) - v_r = v.permute(0, 2, 1, 3).reshape(B * H, T, V_dim) - g_r = g.permute(0, 2, 1, 3).reshape(B * H, T, K) - beta_r = beta.permute(0, 2, 1).reshape(B * H, T) - - # Initialize hidden state [B*H, K, V] - S = torch.zeros(B * H, K, V_dim, dtype=torch.float32, device=q.device) - outputs = [] - - for t in range(T): - # Per-channel decay - decay = torch.exp(g_r[:, t, :]) # [B*H, K] - S = S * decay.unsqueeze(-1) # [B*H, K, V] - - # Prediction: k_t^T @ S -> [B*H, V] - k_t = k_r[:, t, :] # [B*H, K] - predicted = torch.bmm(k_t.unsqueeze(1), S).squeeze(1) # [B*H, V] - - # Delta: v_t - predicted - v_t = v_r[:, t, :] # [B*H, V] - delta = v_t - predicted # [B*H, V] - - # Correction: S += k_t @ (beta_t * delta)^T - b_t = beta_r[:, t].unsqueeze(-1) # [B*H, 1] - correction = torch.bmm(k_t.unsqueeze(-1), (b_t * delta).unsqueeze(1)) # [B*H, K, V] - S = S + correction - - # Output: scale * q_t @ S - q_t = q_r[:, t, :] # [B*H, K] - o_t = scale * torch.bmm(q_t.unsqueeze(1), S).squeeze(1) # [B*H, V] - outputs.append(o_t) - - # Stack and reshape: [B*H, T, V] -> [B, T, H, V] - output = torch.stack(outputs, dim=1) - output = output.reshape(B, H, T, V_dim).permute(0, 2, 1, 3).contiguous() - return output - - -check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3) diff --git a/problems/helion/kda_py/submission.py b/problems/helion/kda_py/submission.py deleted file mode 100644 index 834b0833..00000000 --- a/problems/helion/kda_py/submission.py +++ /dev/null @@ -1,37 +0,0 @@ -from task import input_t, output_t - - -def custom_kernel(data: input_t) -> output_t: - import torch - - q, k, v, g, beta = data - B, T, H, K = q.shape - V_dim = v.shape[-1] - scale = K ** -0.5 - - q_r = q.permute(0, 2, 1, 3).reshape(B * H, T, K) - k_r = k.permute(0, 2, 1, 3).reshape(B * H, T, K) - v_r = v.permute(0, 2, 1, 3).reshape(B * H, T, V_dim) - g_r = g.permute(0, 2, 1, 3).reshape(B * H, T, K) - beta_r = beta.permute(0, 2, 1).reshape(B * H, T) - - S = torch.zeros(B * H, K, V_dim, dtype=torch.float32, device=q.device) - outputs = [] - - for t in range(T): - decay = torch.exp(g_r[:, t, :]) - S = S * decay.unsqueeze(-1) - k_t = k_r[:, t, :] - predicted = torch.bmm(k_t.unsqueeze(1), S).squeeze(1) - v_t = v_r[:, t, :] - delta = v_t - predicted - b_t = beta_r[:, t].unsqueeze(-1) - correction = torch.bmm(k_t.unsqueeze(-1), (b_t * delta).unsqueeze(1)) - S = S + correction - q_t = q_r[:, t, :] - o_t = scale * torch.bmm(q_t.unsqueeze(1), S).squeeze(1) - outputs.append(o_t) - - output = torch.stack(outputs, dim=1) - output = output.reshape(B, H, T, V_dim).permute(0, 2, 1, 3).contiguous() - return output diff --git a/problems/helion/kda_py/task.py b/problems/helion/kda_py/task.py deleted file mode 100644 index 08d4b4f6..00000000 --- a/problems/helion/kda_py/task.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import TypedDict, TypeVar -import torch - -input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]) -output_t = TypeVar("output_t", bound=torch.Tensor) - 
-class TestSpec(TypedDict): - B: int - T: int - H: int - K: int - V: int - seed: int diff --git a/problems/helion/kda_py/task.yml b/problems/helion/kda_py/task.yml deleted file mode 100644 index dd5fb3f9..00000000 --- a/problems/helion/kda_py/task.yml +++ /dev/null @@ -1,57 +0,0 @@ -files: - - {"name": "submission.py", "source": "@SUBMISSION@"} - - {"name": "task.py", "source": "task.py"} - - {"name": "utils.py", "source": "../utils.py"} - - {"name": "reference.py", "source": "reference.py"} - - {"name": "eval.py", "source": "../eval.py"} - -lang: "py" - -description: | - Implement Kimi Delta Attention (KDA), a linear attention mechanism with - per-channel decay gates and delta rule error correction. - - KDA combines ideas from Gated Linear Attention (GLA) and DeltaNet: - - Per-channel exponential decay gates (from GLA) - - Delta rule error correction: predict, compute error, update state - - Recurrence per timestep t: - S_t = diag(exp(g_t)) @ S_{t-1} # per-channel decay - predicted = k_t^T @ S_t # prediction - delta = v_t - predicted # error - S_t = S_t + k_t @ (beta_t * delta)^T # correction - o_t = (1/sqrt(K)) * q_t @ S_t # output - - Input: tuple(q, k, v, g, beta) where: - - q: torch.Tensor of shape [B, T, H, K] (float32) - - k: torch.Tensor of shape [B, T, H, K] (float32, L2-normalized) - - v: torch.Tensor of shape [B, T, H, V] (float32) - - g: torch.Tensor of shape [B, T, H, K] (float32, per-channel log-space gates) - - beta: torch.Tensor of shape [B, T, H] (float32, update strength in [0,1]) - - Output: torch.Tensor of shape [B, T, H, V] (float32) - -config: - main: "eval.py" - -templates: - Python: "../template.py" - -tests: - - {"B": 1, "T": 32, "H": 4, "K": 32, "V": 32, "seed": 4242} - - {"B": 2, "T": 32, "H": 4, "K": 32, "V": 32, "seed": 5236} - - {"B": 1, "T": 64, "H": 4, "K": 64, "V": 64, "seed": 1001} - - {"B": 1, "T": 64, "H": 8, "K": 32, "V": 32, "seed": 5531} - - {"B": 1, "T": 128, "H": 4, "K": 64, "V": 64, "seed": 9173} - -benchmarks: - - {"B": 1, "T": 256, "H": 4, "K": 64, "V": 64, "seed": 31232} - - {"B": 1, "T": 512, "H": 8, "K": 128, "V": 128, "seed": 4052} - - {"B": 1, "T": 1024, "H": 8, "K": 128, "V": 128, "seed": 2146} - - {"B": 1, "T": 1024, "H": 16, "K": 128, "V": 256, "seed": 3129} - - {"B": 1, "T": 2048, "H": 16, "K": 128, "V": 256, "seed": 54352} - -test_timeout: 180 -benchmark_timeout: 300 -ranked_timeout: 600 -ranking_by: "geom" diff --git a/problems/helion/min_p_py/reference.py b/problems/helion/min_p_py/reference.py deleted file mode 100644 index 4173253f..00000000 --- a/problems/helion/min_p_py/reference.py +++ /dev/null @@ -1,37 +0,0 @@ -import torch -from task import input_t, output_t -from utils import make_match_reference, DeterministicContext - - -def generate_input(batch_size: int, vocab_size: int, seed: int) -> input_t: - gen = torch.Generator(device="cuda") - gen.manual_seed(seed) - logits = torch.randn(batch_size, vocab_size, dtype=torch.float32, device="cuda", generator=gen).contiguous() - # min_p values between 0.05 and 0.2 - min_p = (torch.rand(batch_size, device="cuda", generator=gen) * 0.15 + 0.05).to(torch.float32).contiguous() - return logits, min_p - - -def ref_kernel(data: input_t) -> output_t: - with DeterministicContext(): - logits, min_p = data - logits = logits.float() - - # 1. Softmax - probs = torch.softmax(logits, dim=-1) - - # 2. Find max prob and compute threshold - max_probs = probs.max(dim=-1, keepdim=True).values - threshold = min_p[:, None] * max_probs - - # 3. 
Apply threshold - filtered = torch.where(probs >= threshold, probs, torch.zeros_like(probs)) - - # 4. Renormalize - filtered_sum = filtered.sum(dim=-1, keepdim=True) - result = filtered / filtered_sum - - return result - - -check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-5) diff --git a/problems/helion/min_p_py/submission.py b/problems/helion/min_p_py/submission.py deleted file mode 100644 index 080acf6b..00000000 --- a/problems/helion/min_p_py/submission.py +++ /dev/null @@ -1,16 +0,0 @@ -from task import input_t, output_t - - -def custom_kernel(data: input_t) -> output_t: - import torch - - logits, min_p = data - logits = logits.float() - - probs = torch.softmax(logits, dim=-1) - max_probs = probs.max(dim=-1, keepdim=True).values - threshold = min_p[:, None] * max_probs - filtered = torch.where(probs >= threshold, probs, torch.zeros_like(probs)) - filtered_sum = filtered.sum(dim=-1, keepdim=True) - result = filtered / filtered_sum - return result diff --git a/problems/helion/min_p_py/task.py b/problems/helion/min_p_py/task.py deleted file mode 100644 index 708b66a3..00000000 --- a/problems/helion/min_p_py/task.py +++ /dev/null @@ -1,10 +0,0 @@ -from typing import TypedDict, TypeVar -import torch - -input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor]) -output_t = TypeVar("output_t", bound=torch.Tensor) - -class TestSpec(TypedDict): - batch_size: int - vocab_size: int - seed: int diff --git a/problems/helion/min_p_py/task.yml b/problems/helion/min_p_py/task.yml deleted file mode 100644 index 6c092780..00000000 --- a/problems/helion/min_p_py/task.yml +++ /dev/null @@ -1,56 +0,0 @@ -files: - - {"name": "submission.py", "source": "@SUBMISSION@"} - - {"name": "task.py", "source": "task.py"} - - {"name": "utils.py", "source": "../utils.py"} - - {"name": "reference.py", "source": "reference.py"} - - {"name": "eval.py", "source": "../eval.py"} - -lang: "py" - -description: | - Implement a min-p sampling kernel. - - Min-p is an adaptive probability filtering method that keeps all tokens - whose probability is at least min_p times the maximum probability. It is - simpler than top-p (no sort needed) and more adaptive than top-k. - - Algorithm: - 1. probs = softmax(logits) - 2. max_prob = max(probs) - 3. threshold = min_p * max_prob - 4. filtered = where(probs >= threshold, probs, 0) - 5. output = filtered / sum(filtered) - - When the model is confident (high max_prob), the threshold is high and few - tokens survive. When uncertain, more tokens survive. 
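A small worked example of that adaptivity, assuming min_p = 0.1: if the top token has probability 0.6, the cutoff is 0.1 * 0.6 = 0.06 and only strong candidates survive; if the distribution is nearly flat with a top probability of 0.02, the cutoff drops to 0.002 and most of the vocabulary is kept.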
- - Input: tuple(logits, min_p) where: - - logits: torch.Tensor of shape [batch_size, vocab_size] (float32) - - min_p: torch.Tensor of shape [batch_size] (float32, values in [0, 1]) - - Output: torch.Tensor of shape [batch_size, vocab_size] (float32, renormalized probabilities) - -config: - main: "eval.py" - -templates: - Python: "../template.py" - -tests: - - {"batch_size": 1, "vocab_size": 256, "seed": 4242} - - {"batch_size": 4, "vocab_size": 1024, "seed": 5236} - - {"batch_size": 2, "vocab_size": 4096, "seed": 1001} - - {"batch_size": 1, "vocab_size": 32000, "seed": 5531} - - {"batch_size": 8, "vocab_size": 32000, "seed": 9173} - -benchmarks: - - {"batch_size": 1, "vocab_size": 32000, "seed": 31232} - - {"batch_size": 8, "vocab_size": 32000, "seed": 4052} - - {"batch_size": 1, "vocab_size": 128256, "seed": 2146} - - {"batch_size": 8, "vocab_size": 128256, "seed": 3129} - - {"batch_size": 1, "vocab_size": 151936, "seed": 54352} - -test_timeout: 180 -benchmark_timeout: 180 -ranked_timeout: 420 -ranking_by: "geom" diff --git a/problems/helion/mla_py/reference.py b/problems/helion/mla_py/reference.py deleted file mode 100644 index 62eb0a2d..00000000 --- a/problems/helion/mla_py/reference.py +++ /dev/null @@ -1,42 +0,0 @@ -import torch -import math -from task import input_t, output_t -from utils import make_match_reference, DeterministicContext - - -def generate_input(B: int, H: int, S: int, d_c: int, d_r: int, seed: int) -> input_t: - gen = torch.Generator(device="cuda") - gen.manual_seed(seed) - q_nope = torch.randn(B, H, d_c, dtype=torch.float16, device="cuda", generator=gen).contiguous() - q_pe = torch.randn(B, H, d_r, dtype=torch.float16, device="cuda", generator=gen).contiguous() - kv_c = torch.randn(B, S, d_c, dtype=torch.float16, device="cuda", generator=gen).contiguous() - k_pe = torch.randn(B, S, d_r, dtype=torch.float16, device="cuda", generator=gen).contiguous() - return q_nope, q_pe, kv_c, k_pe - - -def ref_kernel(data: input_t) -> output_t: - with DeterministicContext(): - q_nope, q_pe, kv_c, k_pe = data - B, H, d_c = q_nope.shape - d_r = q_pe.shape[-1] - sm_scale = 1.0 / math.sqrt(d_c + d_r) - - # Content score: [B, H, S] = [B, H, d_c] @ [B, d_c, S] - score_content = torch.bmm(q_nope.float(), kv_c.float().transpose(-2, -1)) - - # Position score: [B, H, S] = [B, H, d_r] @ [B, d_r, S] - score_position = torch.bmm(q_pe.float(), k_pe.float().transpose(-2, -1)) - - # Combined score with scaling - scores = (score_content + score_position) * sm_scale - - # Softmax - attn_weights = torch.softmax(scores, dim=-1) - - # Output: [B, H, d_c] = [B, H, S] @ [B, S, d_c] - output = torch.bmm(attn_weights, kv_c.float()) - - return output.to(q_nope.dtype) - - -check_implementation = make_match_reference(ref_kernel, rtol=1e-2, atol=1e-2) diff --git a/problems/helion/mla_py/submission.py b/problems/helion/mla_py/submission.py deleted file mode 100644 index a3aed999..00000000 --- a/problems/helion/mla_py/submission.py +++ /dev/null @@ -1,18 +0,0 @@ -from task import input_t, output_t - - -def custom_kernel(data: input_t) -> output_t: - import torch - import math - - q_nope, q_pe, kv_c, k_pe = data - B, H, d_c = q_nope.shape - d_r = q_pe.shape[-1] - sm_scale = 1.0 / math.sqrt(d_c + d_r) - - score_content = torch.bmm(q_nope.float(), kv_c.float().transpose(-2, -1)) - score_position = torch.bmm(q_pe.float(), k_pe.float().transpose(-2, -1)) - scores = (score_content + score_position) * sm_scale - attn_weights = torch.softmax(scores, dim=-1) - output = torch.bmm(attn_weights, kv_c.float()) - return 
output.to(q_nope.dtype) diff --git a/problems/helion/mla_py/task.py b/problems/helion/mla_py/task.py deleted file mode 100644 index 0451a805..00000000 --- a/problems/helion/mla_py/task.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import TypedDict, TypeVar -import torch - -input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]) -output_t = TypeVar("output_t", bound=torch.Tensor) - -class TestSpec(TypedDict): - B: int - H: int - S: int - d_c: int - d_r: int - seed: int diff --git a/problems/helion/mla_py/task.yml b/problems/helion/mla_py/task.yml deleted file mode 100644 index 6a084cbf..00000000 --- a/problems/helion/mla_py/task.yml +++ /dev/null @@ -1,55 +0,0 @@ -files: - - {"name": "submission.py", "source": "@SUBMISSION@"} - - {"name": "task.py", "source": "task.py"} - - {"name": "utils.py", "source": "../utils.py"} - - {"name": "reference.py", "source": "reference.py"} - - {"name": "eval.py", "source": "../eval.py"} - -lang: "py" - -description: | - Implement a Multi-Head Latent Attention (MLA) decode kernel. - - MLA is from DeepSeek-V2/V3. It compresses the KV cache to a low-dimensional - latent space with decoupled RoPE: - - score = (Q_nope @ KV_c^T + Q_pe @ K_pe^T) * sm_scale - O = softmax(score) @ KV_c - - The value reuses the compressed latent (V = KV_c), achieving 32x KV cache - compression on DeepSeek-V3. - - sm_scale = 1 / sqrt(d_c + d_r) - - Input: tuple(q_nope, q_pe, kv_c, k_pe) where: - - q_nope: torch.Tensor of shape [B, H, d_c] (float16) — content query - - q_pe: torch.Tensor of shape [B, H, d_r] (float16) — position query - - kv_c: torch.Tensor of shape [B, S, d_c] (float16) — compressed KV cache - - k_pe: torch.Tensor of shape [B, S, d_r] (float16) — position keys - - Output: torch.Tensor of shape [B, H, d_c] (float16) - -config: - main: "eval.py" - -templates: - Python: "../template.py" - -tests: - - {"B": 1, "H": 8, "S": 64, "d_c": 128, "d_r": 32, "seed": 4242} - - {"B": 2, "H": 8, "S": 64, "d_c": 128, "d_r": 32, "seed": 5236} - - {"B": 1, "H": 16, "S": 128, "d_c": 256, "d_r": 64, "seed": 1001} - - {"B": 1, "H": 8, "S": 256, "d_c": 128, "d_r": 32, "seed": 5531} - - {"B": 1, "H": 32, "S": 128, "d_c": 128, "d_r": 64, "seed": 9173} - -benchmarks: - - {"B": 1, "H": 32, "S": 256, "d_c": 512, "d_r": 64, "seed": 31232} - - {"B": 1, "H": 32, "S": 512, "d_c": 512, "d_r": 64, "seed": 4052} - - {"B": 1, "H": 32, "S": 1024, "d_c": 512, "d_r": 64, "seed": 2146} - - {"B": 1, "H": 32, "S": 2048, "d_c": 512, "d_r": 64, "seed": 3129} - - {"B": 1, "H": 128, "S": 1024, "d_c": 512, "d_r": 64, "seed": 54352} - -test_timeout: 180 -benchmark_timeout: 180 -ranked_timeout: 420 -ranking_by: "geom" diff --git a/problems/helion/top_k_py/reference.py b/problems/helion/top_k_py/reference.py deleted file mode 100644 index 2aa66d1e..00000000 --- a/problems/helion/top_k_py/reference.py +++ /dev/null @@ -1,39 +0,0 @@ -import torch -from task import input_t, output_t -from utils import make_match_reference, DeterministicContext - - -def generate_input(batch_size: int, vocab_size: int, k: int, seed: int) -> input_t: - gen = torch.Generator(device="cuda") - gen.manual_seed(seed) - logits = torch.randn(batch_size, vocab_size, dtype=torch.float32, device="cuda", generator=gen).contiguous() - top_k = torch.full((batch_size,), k, dtype=torch.int32, device="cuda") - return logits, top_k - - -def ref_kernel(data: input_t) -> output_t: - with DeterministicContext(): - logits, top_k = data - logits = logits.float() - batch_size, vocab_size = logits.shape - - # 1. 
Softmax - probs = torch.softmax(logits, dim=-1) - - # 2. Top-k filtering (per-batch k) - filtered = torch.zeros_like(probs) - for b in range(batch_size): - k = min(top_k[b].item(), vocab_size) - topk_vals, _ = torch.topk(probs[b], k) - threshold = topk_vals[-1] - mask = probs[b] >= threshold - filtered[b] = torch.where(mask, probs[b], torch.zeros_like(probs[b])) - - # 3. Renormalize - filtered_sum = filtered.sum(dim=-1, keepdim=True) - result = filtered / filtered_sum - - return result - - -check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-5) diff --git a/problems/helion/top_k_py/submission.py b/problems/helion/top_k_py/submission.py deleted file mode 100644 index e5ac2e04..00000000 --- a/problems/helion/top_k_py/submission.py +++ /dev/null @@ -1,22 +0,0 @@ -from task import input_t, output_t - - -def custom_kernel(data: input_t) -> output_t: - import torch - - logits, top_k = data - logits = logits.float() - batch_size, vocab_size = logits.shape - - probs = torch.softmax(logits, dim=-1) - filtered = torch.zeros_like(probs) - for b in range(batch_size): - k = min(top_k[b].item(), vocab_size) - topk_vals, _ = torch.topk(probs[b], k) - threshold = topk_vals[-1] - mask = probs[b] >= threshold - filtered[b] = torch.where(mask, probs[b], torch.zeros_like(probs[b])) - - filtered_sum = filtered.sum(dim=-1, keepdim=True) - result = filtered / filtered_sum - return result diff --git a/problems/helion/top_k_py/task.py b/problems/helion/top_k_py/task.py deleted file mode 100644 index 02ee370a..00000000 --- a/problems/helion/top_k_py/task.py +++ /dev/null @@ -1,11 +0,0 @@ -from typing import TypedDict, TypeVar -import torch - -input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor]) -output_t = TypeVar("output_t", bound=torch.Tensor) - -class TestSpec(TypedDict): - batch_size: int - vocab_size: int - k: int - seed: int diff --git a/problems/helion/top_k_py/task.yml b/problems/helion/top_k_py/task.yml deleted file mode 100644 index 7bf194e7..00000000 --- a/problems/helion/top_k_py/task.yml +++ /dev/null @@ -1,52 +0,0 @@ -files: - - {"name": "submission.py", "source": "@SUBMISSION@"} - - {"name": "task.py", "source": "task.py"} - - {"name": "utils.py", "source": "../utils.py"} - - {"name": "reference.py", "source": "reference.py"} - - {"name": "eval.py", "source": "../eval.py"} - -lang: "py" - -description: | - Implement a top-k sampling kernel. - - Top-k keeps only the k highest-probability tokens and zeros out the rest. - A key insight: instead of O(V log V) sort, use binary search on the - probability threshold to find the k-th largest in O(V) time. - - Algorithm: - 1. probs = softmax(logits) - 2. Binary search for threshold T where count(probs >= T) >= k - 3. filtered = where(probs >= threshold, probs, 0) - 4. 
output = filtered / sum(filtered) - - Input: tuple(logits, top_k) where: - - logits: torch.Tensor of shape [batch_size, vocab_size] (float32) - - top_k: torch.Tensor of shape [batch_size] (int32) - - Output: torch.Tensor of shape [batch_size, vocab_size] (float32, renormalized probabilities) - -config: - main: "eval.py" - -templates: - Python: "../template.py" - -tests: - - {"batch_size": 1, "vocab_size": 256, "k": 50, "seed": 4242} - - {"batch_size": 4, "vocab_size": 1024, "k": 100, "seed": 5236} - - {"batch_size": 2, "vocab_size": 4096, "k": 10, "seed": 1001} - - {"batch_size": 1, "vocab_size": 32000, "k": 50, "seed": 5531} - - {"batch_size": 1, "vocab_size": 256, "k": 1, "seed": 9173} - -benchmarks: - - {"batch_size": 1, "vocab_size": 32000, "k": 50, "seed": 31232} - - {"batch_size": 8, "vocab_size": 32000, "k": 50, "seed": 4052} - - {"batch_size": 1, "vocab_size": 128256, "k": 50, "seed": 2146} - - {"batch_size": 8, "vocab_size": 128256, "k": 50, "seed": 3129} - - {"batch_size": 1, "vocab_size": 151936, "k": 50, "seed": 54352} - -test_timeout: 180 -benchmark_timeout: 180 -ranked_timeout: 420 -ranking_by: "geom" diff --git a/problems/helion/top_p_py/reference.py b/problems/helion/top_p_py/reference.py deleted file mode 100644 index 40bb1ca6..00000000 --- a/problems/helion/top_p_py/reference.py +++ /dev/null @@ -1,45 +0,0 @@ -import torch -from task import input_t, output_t -from utils import make_match_reference, DeterministicContext - - -def generate_input(batch_size: int, vocab_size: int, seed: int) -> input_t: - gen = torch.Generator(device="cuda") - gen.manual_seed(seed) - logits = torch.randn(batch_size, vocab_size, dtype=torch.float32, device="cuda", generator=gen).contiguous() - # top_p values between 0.85 and 0.95 - top_p = (torch.rand(batch_size, device="cuda", generator=gen) * 0.1 + 0.85).to(torch.float32).contiguous() - return logits, top_p - - -def ref_kernel(data: input_t) -> output_t: - with DeterministicContext(): - logits, top_p = data - logits = logits.float() - - # 1. Softmax - probs = torch.softmax(logits, dim=-1) - - # 2. Sort descending - sorted_probs, _sorted_indices = torch.sort(probs, descending=True, dim=-1) - - # 3. Cumulative sum - cumsum = torch.cumsum(sorted_probs, dim=-1) - - # 4. Find threshold per batch element - shifted_cumsum = cumsum - sorted_probs - nucleus_mask = shifted_cumsum <= top_p[:, None] - masked_sorted = torch.where(nucleus_mask, sorted_probs, torch.ones_like(sorted_probs)) - threshold = masked_sorted.amin(dim=-1, keepdim=True) - - # 5. Apply threshold to original probs - filtered = torch.where(probs >= threshold, probs, torch.zeros_like(probs)) - - # 6. 
Renormalize - filtered_sum = filtered.sum(dim=-1, keepdim=True) - result = filtered / filtered_sum - - return result - - -check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-5) diff --git a/problems/helion/top_p_py/submission.py b/problems/helion/top_p_py/submission.py deleted file mode 100644 index 48e3fd93..00000000 --- a/problems/helion/top_p_py/submission.py +++ /dev/null @@ -1,20 +0,0 @@ -from task import input_t, output_t - - -def custom_kernel(data: input_t) -> output_t: - import torch - - logits, top_p = data - logits = logits.float() - - probs = torch.softmax(logits, dim=-1) - sorted_probs, _ = torch.sort(probs, descending=True, dim=-1) - cumsum = torch.cumsum(sorted_probs, dim=-1) - shifted_cumsum = cumsum - sorted_probs - nucleus_mask = shifted_cumsum <= top_p[:, None] - masked_sorted = torch.where(nucleus_mask, sorted_probs, torch.ones_like(sorted_probs)) - threshold = masked_sorted.amin(dim=-1, keepdim=True) - filtered = torch.where(probs >= threshold, probs, torch.zeros_like(probs)) - filtered_sum = filtered.sum(dim=-1, keepdim=True) - result = filtered / filtered_sum - return result diff --git a/problems/helion/top_p_py/task.py b/problems/helion/top_p_py/task.py deleted file mode 100644 index 708b66a3..00000000 --- a/problems/helion/top_p_py/task.py +++ /dev/null @@ -1,10 +0,0 @@ -from typing import TypedDict, TypeVar -import torch - -input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor]) -output_t = TypeVar("output_t", bound=torch.Tensor) - -class TestSpec(TypedDict): - batch_size: int - vocab_size: int - seed: int diff --git a/problems/helion/top_p_py/task.yml b/problems/helion/top_p_py/task.yml deleted file mode 100644 index 4b2f0318..00000000 --- a/problems/helion/top_p_py/task.yml +++ /dev/null @@ -1,55 +0,0 @@ -files: - - {"name": "submission.py", "source": "@SUBMISSION@"} - - {"name": "task.py", "source": "task.py"} - - {"name": "utils.py", "source": "../utils.py"} - - {"name": "reference.py", "source": "reference.py"} - - {"name": "eval.py", "source": "../eval.py"} - -lang: "py" - -description: | - Implement a top-p (nucleus) sampling kernel. - - Top-p keeps the smallest set of tokens whose cumulative probability mass - exceeds the threshold top_p. This is the most widely used sampling strategy - across production LLMs (Qwen3 top_p=0.95, DeepSeek-V3 top_p=0.95). - - A key insight: instead of O(V log V) sort, use binary search on the - probability threshold to find the nucleus boundary in O(V) time. - - Algorithm: - 1. probs = softmax(logits) - 2. Binary search for threshold T where sum(probs >= T) >= top_p - 3. filtered = where(probs >= threshold, probs, 0) - 4. 
output = filtered / sum(filtered) - - Input: tuple(logits, top_p) where: - - logits: torch.Tensor of shape [batch_size, vocab_size] (float32) - - top_p: torch.Tensor of shape [batch_size] (float32, values in [0, 1]) - - Output: torch.Tensor of shape [batch_size, vocab_size] (float32, renormalized probabilities) - -config: - main: "eval.py" - -templates: - Python: "../template.py" - -tests: - - {"batch_size": 1, "vocab_size": 256, "seed": 4242} - - {"batch_size": 4, "vocab_size": 1024, "seed": 5236} - - {"batch_size": 2, "vocab_size": 4096, "seed": 1001} - - {"batch_size": 1, "vocab_size": 32000, "seed": 5531} - - {"batch_size": 8, "vocab_size": 32000, "seed": 9173} - -benchmarks: - - {"batch_size": 1, "vocab_size": 32000, "seed": 31232} - - {"batch_size": 8, "vocab_size": 32000, "seed": 4052} - - {"batch_size": 1, "vocab_size": 128256, "seed": 2146} - - {"batch_size": 8, "vocab_size": 128256, "seed": 3129} - - {"batch_size": 1, "vocab_size": 151936, "seed": 54352} - -test_timeout: 180 -benchmark_timeout: 180 -ranked_timeout: 420 -ranking_by: "geom" From 2abe9d63ffae00537f426ca37bec0e7c8f08251d Mon Sep 17 00:00:00 2001 From: Will Feng Date: Tue, 3 Mar 2026 12:33:06 -0800 Subject: [PATCH 3/5] add gated deltanet kernels --- .../reference.py | 79 +++++++++++++++++++ .../submission.py | 39 +++++++++ .../gated_deltanet_chunk_fwd_h_py/task.py | 14 ++++ .../gated_deltanet_chunk_fwd_h_py/task.yml | 69 ++++++++++++++++ .../reference.py | 59 ++++++++++++++ .../submission.py | 38 +++++++++ .../gated_deltanet_chunk_fwd_o_py/task.py | 13 +++ .../gated_deltanet_chunk_fwd_o_py/task.yml | 61 ++++++++++++++ .../reference.py | 61 ++++++++++++++ .../submission.py | 25 ++++++ .../gated_deltanet_recompute_w_u_py/task.py | 13 +++ .../gated_deltanet_recompute_w_u_py/task.yml | 66 ++++++++++++++++ 12 files changed, 537 insertions(+) create mode 100644 problems/helion/gated_deltanet_chunk_fwd_h_py/reference.py create mode 100644 problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py create mode 100644 problems/helion/gated_deltanet_chunk_fwd_h_py/task.py create mode 100644 problems/helion/gated_deltanet_chunk_fwd_h_py/task.yml create mode 100644 problems/helion/gated_deltanet_chunk_fwd_o_py/reference.py create mode 100644 problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py create mode 100644 problems/helion/gated_deltanet_chunk_fwd_o_py/task.py create mode 100644 problems/helion/gated_deltanet_chunk_fwd_o_py/task.yml create mode 100644 problems/helion/gated_deltanet_recompute_w_u_py/reference.py create mode 100644 problems/helion/gated_deltanet_recompute_w_u_py/submission.py create mode 100644 problems/helion/gated_deltanet_recompute_w_u_py/task.py create mode 100644 problems/helion/gated_deltanet_recompute_w_u_py/task.yml diff --git a/problems/helion/gated_deltanet_chunk_fwd_h_py/reference.py b/problems/helion/gated_deltanet_chunk_fwd_h_py/reference.py new file mode 100644 index 00000000..ecee1896 --- /dev/null +++ b/problems/helion/gated_deltanet_chunk_fwd_h_py/reference.py @@ -0,0 +1,79 @@ +import torch +from task import input_t, output_t +from utils import verbose_allclose + +CHUNK_SIZE = 64 + + +def generate_input(B: int, T: int, H: int, K: int, V: int, use_initial_state: bool, seed: int) -> input_t: + gen = torch.Generator(device="cuda") + gen.manual_seed(seed) + k = torch.randn(B, T, H, K, dtype=torch.float32, device="cuda", generator=gen).contiguous() + w = torch.randn(B, T, H, K, dtype=torch.float32, device="cuda", generator=gen).contiguous() + u = torch.randn(B, T, H, V, dtype=torch.float32, 
device="cuda", generator=gen).contiguous() + # Use negative values for g to keep exp(g) bounded in (0, 1] and prevent overflow + g = -torch.abs(torch.randn(B, T, H, dtype=torch.float32, device="cuda", generator=gen)).contiguous() + if use_initial_state: + initial_state = torch.randn(B, H, K, V, dtype=torch.float32, device="cuda", generator=gen).contiguous() + else: + initial_state = torch.zeros(B, H, K, V, dtype=torch.float32, device="cuda").contiguous() + return k, w, u, g, initial_state + + +def ref_kernel(data: input_t) -> output_t: + k, w, u, g, initial_state = data + B, T, H, K = k.shape + V = u.shape[-1] + BT = CHUNK_SIZE + NT = T // BT + + h = torch.empty(B, NT, H, K, V, dtype=torch.float32, device=k.device) + v_new = torch.empty_like(u) + + for b in range(B): + for hh in range(H): + b_h = initial_state[b, hh].float().clone() # [K, V] + + for c in range(NT): + cs = c * BT + ce = cs + BT + + # Store current state + h[b, c, hh] = b_h + + # v_new = u - w @ h_state + b_w = w[b, cs:ce, hh].float() # [BT, K] + b_u = u[b, cs:ce, hh].float() # [BT, V] + b_v = b_u - torch.matmul(b_w, b_h) # [BT, V] + v_new[b, cs:ce, hh] = b_v + + # Gating + b_g = g[b, cs:ce, hh].float() # [BT] + b_g_last = b_g[-1] + b_v_gated = b_v * torch.exp(b_g_last - b_g)[:, None] + + # Decay and update + b_h = b_h * torch.exp(b_g_last) + b_k = k[b, cs:ce, hh].float() # [BT, K] + b_h = b_h + torch.matmul(b_k.T, b_v_gated) + + return h, v_new + + +def check_implementation(data, output): + expected = ref_kernel(data) + exp_h, exp_v = expected + got_h, got_v = output + + reasons_h = verbose_allclose(got_h, exp_h, rtol=1e-2, atol=1e-2) + reasons_v = verbose_allclose(got_v, exp_v, rtol=1e-2, atol=1e-2) + + reasons = [] + if reasons_h: + reasons.append("h mismatch: " + " ".join(reasons_h)) + if reasons_v: + reasons.append("v_new mismatch: " + " ".join(reasons_v)) + + if reasons: + return False, " | ".join(reasons) + return True, "" diff --git a/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py b/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py new file mode 100644 index 00000000..38fa590a --- /dev/null +++ b/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py @@ -0,0 +1,39 @@ +from task import input_t, output_t + + +def custom_kernel(data: input_t) -> output_t: + import torch + + k, w, u, g, initial_state = data + B, T, H, K = k.shape + V = u.shape[-1] + BT = 64 + NT = T // BT + + h = torch.empty(B, NT, H, K, V, dtype=torch.float32, device=k.device) + v_new = torch.empty_like(u) + + for b in range(B): + for hh in range(H): + b_h = initial_state[b, hh].float().clone() + + for c in range(NT): + cs = c * BT + ce = cs + BT + + h[b, c, hh] = b_h + + b_w = w[b, cs:ce, hh].float() + b_u = u[b, cs:ce, hh].float() + b_v = b_u - torch.matmul(b_w, b_h) + v_new[b, cs:ce, hh] = b_v + + b_g = g[b, cs:ce, hh].float() + b_g_last = b_g[-1] + b_v_gated = b_v * torch.exp(b_g_last - b_g)[:, None] + + b_h = b_h * torch.exp(b_g_last) + b_k = k[b, cs:ce, hh].float() + b_h = b_h + torch.matmul(b_k.T, b_v_gated) + + return h, v_new diff --git a/problems/helion/gated_deltanet_chunk_fwd_h_py/task.py b/problems/helion/gated_deltanet_chunk_fwd_h_py/task.py new file mode 100644 index 00000000..435bb18c --- /dev/null +++ b/problems/helion/gated_deltanet_chunk_fwd_h_py/task.py @@ -0,0 +1,14 @@ +from typing import TypedDict, TypeVar +import torch + +input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]) +output_t = TypeVar("output_t", bound=tuple[torch.Tensor, torch.Tensor]) 
+ +class TestSpec(TypedDict): + B: int + T: int + H: int + K: int + V: int + use_initial_state: bool + seed: int diff --git a/problems/helion/gated_deltanet_chunk_fwd_h_py/task.yml b/problems/helion/gated_deltanet_chunk_fwd_h_py/task.yml new file mode 100644 index 00000000..5567bfae --- /dev/null +++ b/problems/helion/gated_deltanet_chunk_fwd_h_py/task.yml @@ -0,0 +1,69 @@ +files: + - {"name": "submission.py", "source": "@SUBMISSION@"} + - {"name": "task.py", "source": "task.py"} + - {"name": "utils.py", "source": "../utils.py"} + - {"name": "reference.py", "source": "reference.py"} + - {"name": "eval.py", "source": "../eval.py"} + +lang: "py" + +description: | + Implement the chunk_fwd_h (inter-chunk state recurrence) kernel for Gated DeltaNet. + + This kernel maintains a hidden state h of shape [K, V] across chunks and computes + v_new (corrected values) for each chunk. It is the sequential bottleneck in the + chunkwise parallel forward pass of Gated DeltaNet (arXiv:2412.06464, ICLR 2025). + + The sequence is divided into chunks of BT=64 timesteps. Processing is sequential + across chunks but parallel across (B, H) and within each chunk: + + For each (b, h) pair, starting with h_state = initial_state[b, h] (zeros or provided): + For each chunk c = 0, 1, ..., NT-1: + 1. Store: h_out[b, c, h] = h_state + 2. Compute: v_new = u - w @ h_state + 3. Gate: v_gated[t] = v_new[t] * exp(g[last_t] - g[t]) + 4. Decay: h_state = h_state * exp(g[last_t]) + 5. Update: h_state = h_state + k^T @ v_gated + + Input: tuple(k, w, u, g, initial_state) where: + - k: torch.Tensor of shape [B, T, H, K] (float32) — keys + - w: torch.Tensor of shape [B, T, H, K] (float32) — WY-transformed keys + - u: torch.Tensor of shape [B, T, H, V] (float32) — WY-transformed values + - g: torch.Tensor of shape [B, T, H] (float32) — cumulative gate + - initial_state: torch.Tensor of shape [B, H, K, V] (float32) — initial hidden state (zeros or random) + + Output: tuple(h, v_new) where: + - h: torch.Tensor of shape [B, NT, H, K, V] (float32) — per-chunk hidden states + - v_new: torch.Tensor of shape [B, T, H, V] (float32) — corrected values + + Constraint: T must be a multiple of 64. NT = T // 64. + + See also: Helion examples/gdn_fwd_h.py for a related implementation + (simpler variant that returns only h, without v_new or initial_state support). 
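+
+  Illustrative PyTorch sketch of a single chunk step for one (b, h) pair, mirroring the
+  reference implementation above (your kernel does not have to follow this structure;
+  k_chunk etc. denote the per-chunk slices such as k[b, cs:ce, h]):
+
+      h_out[b, c, h] = h_state                                         # 1. store current state
+      v_chunk = u_chunk - w_chunk @ h_state                            # 2. corrected values, [BT, V]
+      v_new[b, cs:ce, h] = v_chunk                                     #    written to the v_new output
+      v_gated = v_chunk * torch.exp(g_chunk[-1] - g_chunk)[:, None]    # 3. gate within the chunk
+      h_state = h_state * torch.exp(g_chunk[-1])                       # 4. decay by the chunk-final gate
+      h_state = h_state + k_chunk.T @ v_gated                          # 5. rank-BT state update, [K, V]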
+ +config: + main: "eval.py" + +templates: + Python: "../template.py" + +tests: + - {"B": 1, "T": 64, "H": 2, "K": 64, "V": 64, "use_initial_state": false, "seed": 4242} + - {"B": 2, "T": 128, "H": 4, "K": 64, "V": 64, "use_initial_state": true, "seed": 5236} + - {"B": 1, "T": 256, "H": 4, "K": 64, "V": 128, "use_initial_state": false, "seed": 1001} + - {"B": 1, "T": 64, "H": 1, "K": 128, "V": 128, "use_initial_state": true, "seed": 5531} + - {"B": 2, "T": 128, "H": 2, "K": 100, "V": 100, "use_initial_state": true, "seed": 9173} + +benchmarks: + - {"B": 1, "T": 64, "H": 1, "K": 64, "V": 64, "use_initial_state": false, "seed": 31232} + - {"B": 2, "T": 512, "H": 3, "K": 64, "V": 64, "use_initial_state": true, "seed": 4052} + - {"B": 2, "T": 1024, "H": 3, "K": 64, "V": 64, "use_initial_state": false, "seed": 2146} + - {"B": 3, "T": 1024, "H": 4, "K": 100, "V": 100, "use_initial_state": true, "seed": 3129} + - {"B": 4, "T": 1024, "H": 4, "K": 128, "V": 128, "use_initial_state": false, "seed": 54352} + - {"B": 2, "T": 1536, "H": 4, "K": 128, "V": 128, "use_initial_state": true, "seed": 71234} + - {"B": 4, "T": 2048, "H": 8, "K": 64, "V": 64, "use_initial_state": true, "seed": 82345} + +test_timeout: 180 +benchmark_timeout: 180 +ranked_timeout: 420 +ranking_by: "geom" diff --git a/problems/helion/gated_deltanet_chunk_fwd_o_py/reference.py b/problems/helion/gated_deltanet_chunk_fwd_o_py/reference.py new file mode 100644 index 00000000..0078d9e5 --- /dev/null +++ b/problems/helion/gated_deltanet_chunk_fwd_o_py/reference.py @@ -0,0 +1,59 @@ +import torch +from task import input_t, output_t +from utils import make_match_reference + +CHUNK_SIZE = 64 + + +def generate_input(B: int, T: int, H: int, K: int, V: int, seed: int) -> input_t: + gen = torch.Generator(device="cuda") + gen.manual_seed(seed) + NT = T // CHUNK_SIZE + q = torch.randn(B, T, H, K, dtype=torch.float32, device="cuda", generator=gen).contiguous() + k = torch.randn(B, T, H, K, dtype=torch.float32, device="cuda", generator=gen).contiguous() + v_new = torch.randn(B, T, H, V, dtype=torch.float32, device="cuda", generator=gen).contiguous() + h = torch.randn(B, NT, H, K, V, dtype=torch.float32, device="cuda", generator=gen).contiguous() + # Use negative values for g to keep exp(g) bounded in (0, 1] + g = -torch.abs(torch.randn(B, T, H, dtype=torch.float32, device="cuda", generator=gen)).contiguous() + return q, k, v_new, h, g + + +def ref_kernel(data: input_t) -> output_t: + q, k, v_new, h, g = data + B, T, H, K = q.shape + V = v_new.shape[-1] + BT = CHUNK_SIZE + scale = K ** -0.5 + + o = torch.empty_like(v_new) + causal = torch.tril(torch.ones(BT, BT, device=q.device, dtype=torch.bool)) + + for cs in range(0, T, BT): + ce = cs + BT + c_idx = cs // BT + + # Reshape to [B, H, BT, ...] 
for batched matmul + b_q = q[:, cs:ce, :, :].permute(0, 2, 1, 3).float() # [B, H, BT, K] + b_k = k[:, cs:ce, :, :].permute(0, 2, 1, 3).float() # [B, H, BT, K] + b_v = v_new[:, cs:ce, :, :].permute(0, 2, 1, 3).float() # [B, H, BT, V] + b_h = h[:, c_idx, :, :, :].float() # [B, H, K, V] + b_g = g[:, cs:ce, :].permute(0, 2, 1).float() # [B, H, BT] + + # Inter-chunk: q @ h * exp(g) + inter = torch.matmul(b_q, b_h) # [B, H, BT, V] + inter = inter * torch.exp(b_g).unsqueeze(-1) + + # Intra-chunk: causal(q @ k^T * exp(g_diff)) @ v_new + attn = torch.matmul(b_q, b_k.transpose(-1, -2)) # [B, H, BT, BT] + g_diff = b_g.unsqueeze(-1) - b_g.unsqueeze(-2) # [B, H, BT, BT] + attn = attn * torch.exp(g_diff) + attn = attn.masked_fill(~causal, 0.0) + intra = torch.matmul(attn, b_v) # [B, H, BT, V] + + b_o = (inter + intra) * scale + o[:, cs:ce, :, :] = b_o.permute(0, 2, 1, 3) + + return o + + +check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3) diff --git a/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py b/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py new file mode 100644 index 00000000..0b5f02cd --- /dev/null +++ b/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py @@ -0,0 +1,38 @@ +from task import input_t, output_t + + +def custom_kernel(data: input_t) -> output_t: + import torch + + q, k, v_new, h, g = data + B, T, H, K = q.shape + V = v_new.shape[-1] + BT = 64 + scale = K ** -0.5 + + o = torch.empty_like(v_new) + causal = torch.tril(torch.ones(BT, BT, device=q.device, dtype=torch.bool)) + + for cs in range(0, T, BT): + ce = cs + BT + c_idx = cs // BT + + b_q = q[:, cs:ce, :, :].permute(0, 2, 1, 3).float() + b_k = k[:, cs:ce, :, :].permute(0, 2, 1, 3).float() + b_v = v_new[:, cs:ce, :, :].permute(0, 2, 1, 3).float() + b_h = h[:, c_idx, :, :, :].float() + b_g = g[:, cs:ce, :].permute(0, 2, 1).float() + + inter = torch.matmul(b_q, b_h) + inter = inter * torch.exp(b_g).unsqueeze(-1) + + attn = torch.matmul(b_q, b_k.transpose(-1, -2)) + g_diff = b_g.unsqueeze(-1) - b_g.unsqueeze(-2) + attn = attn * torch.exp(g_diff) + attn = attn.masked_fill(~causal, 0.0) + intra = torch.matmul(attn, b_v) + + b_o = (inter + intra) * scale + o[:, cs:ce, :, :] = b_o.permute(0, 2, 1, 3) + + return o diff --git a/problems/helion/gated_deltanet_chunk_fwd_o_py/task.py b/problems/helion/gated_deltanet_chunk_fwd_o_py/task.py new file mode 100644 index 00000000..08d4b4f6 --- /dev/null +++ b/problems/helion/gated_deltanet_chunk_fwd_o_py/task.py @@ -0,0 +1,13 @@ +from typing import TypedDict, TypeVar +import torch + +input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]) +output_t = TypeVar("output_t", bound=torch.Tensor) + +class TestSpec(TypedDict): + B: int + T: int + H: int + K: int + V: int + seed: int diff --git a/problems/helion/gated_deltanet_chunk_fwd_o_py/task.yml b/problems/helion/gated_deltanet_chunk_fwd_o_py/task.yml new file mode 100644 index 00000000..73b9321c --- /dev/null +++ b/problems/helion/gated_deltanet_chunk_fwd_o_py/task.yml @@ -0,0 +1,61 @@ +files: + - {"name": "submission.py", "source": "@SUBMISSION@"} + - {"name": "task.py", "source": "task.py"} + - {"name": "utils.py", "source": "../utils.py"} + - {"name": "reference.py", "source": "reference.py"} + - {"name": "eval.py", "source": "../eval.py"} + +lang: "py" + +description: | + Implement the chunk_fwd_o (output computation) kernel for Gated DeltaNet. 
+ + This kernel computes the final output by combining inter-chunk (state-based) + and intra-chunk (attention-based) contributions for the chunkwise parallel + forward pass of Gated DeltaNet (arXiv:2412.06464, ICLR 2025). + + The sequence is divided into chunks of BT=64 timesteps. For each chunk + independently: + inter = q @ h * exp(g) + intra = causal_mask(q @ k^T * exp(g[:, None] - g[None, :])) @ v_new + output = (inter + intra) * scale + + where scale = K^(-0.5), and causal_mask zeros out entries where row < col. + + Input: tuple(q, k, v_new, h, g) where: + - q: torch.Tensor of shape [B, T, H, K] (float32) — queries + - k: torch.Tensor of shape [B, T, H, K] (float32) — keys + - v_new: torch.Tensor of shape [B, T, H, V] (float32) — corrected values + - h: torch.Tensor of shape [B, NT, H, K, V] (float32) — per-chunk states + - g: torch.Tensor of shape [B, T, H] (float32) — cumulative gate + + Output: torch.Tensor of shape [B, T, H, V] (float32) + + Constraint: T must be a multiple of 64. NT = T // 64. scale = K^(-0.5). + +config: + main: "eval.py" + +templates: + Python: "../template.py" + +tests: + - {"B": 1, "T": 64, "H": 2, "K": 64, "V": 64, "seed": 4242} + - {"B": 2, "T": 128, "H": 4, "K": 64, "V": 64, "seed": 5236} + - {"B": 1, "T": 256, "H": 4, "K": 64, "V": 128, "seed": 1001} + - {"B": 1, "T": 64, "H": 1, "K": 128, "V": 128, "seed": 5531} + - {"B": 2, "T": 128, "H": 2, "K": 100, "V": 100, "seed": 9173} + +benchmarks: + - {"B": 1, "T": 64, "H": 1, "K": 64, "V": 64, "seed": 31232} + - {"B": 2, "T": 512, "H": 3, "K": 64, "V": 64, "seed": 4052} + - {"B": 2, "T": 1024, "H": 3, "K": 64, "V": 64, "seed": 2146} + - {"B": 3, "T": 1024, "H": 4, "K": 100, "V": 100, "seed": 3129} + - {"B": 4, "T": 1024, "H": 4, "K": 128, "V": 128, "seed": 54352} + - {"B": 2, "T": 1536, "H": 4, "K": 128, "V": 128, "seed": 71234} + - {"B": 4, "T": 2048, "H": 8, "K": 64, "V": 64, "seed": 82345} + +test_timeout: 180 +benchmark_timeout: 180 +ranked_timeout: 420 +ranking_by: "geom" diff --git a/problems/helion/gated_deltanet_recompute_w_u_py/reference.py b/problems/helion/gated_deltanet_recompute_w_u_py/reference.py new file mode 100644 index 00000000..99750dda --- /dev/null +++ b/problems/helion/gated_deltanet_recompute_w_u_py/reference.py @@ -0,0 +1,61 @@ +import torch +from task import input_t, output_t +from utils import verbose_allclose + +CHUNK_SIZE = 64 + + +def generate_input(B: int, T: int, H: int, K: int, V: int, seed: int) -> input_t: + gen = torch.Generator(device="cuda") + gen.manual_seed(seed) + k = torch.randn(B, T, H, K, dtype=torch.float32, device="cuda", generator=gen).contiguous() + v = torch.randn(B, T, H, V, dtype=torch.float32, device="cuda", generator=gen).contiguous() + beta = torch.randn(B, T, H, dtype=torch.float32, device="cuda", generator=gen).contiguous() + A = torch.randn(B, T, H, CHUNK_SIZE, dtype=torch.float32, device="cuda", generator=gen).contiguous() + # Use negative values for g to keep exp(g) bounded in (0, 1] + g = -torch.abs(torch.randn(B, T, H, dtype=torch.float32, device="cuda", generator=gen)).contiguous() + return k, v, beta, A, g + + +def ref_kernel(data: input_t) -> output_t: + k, v, beta, A, g = data + B, T, H, K = k.shape + V = v.shape[-1] + BT = CHUNK_SIZE + + w = torch.empty_like(k) + u = torch.empty_like(v) + + for cs in range(0, T, BT): + ce = cs + BT + # Reshape to [B, H, BT, BT] for batched matmul + A_bh = A[:, cs:ce, :, :].permute(0, 2, 1, 3).float() + + # u = A @ (v * beta[..., None]) + vb = (v[:, cs:ce, :, :] * beta[:, cs:ce, :, None]).permute(0, 2, 1, 
3).float() + u[:, cs:ce, :, :] = torch.matmul(A_bh, vb).permute(0, 2, 1, 3) + + # w = A @ (k * beta[..., None] * exp(g)[..., None]) + kb = (k[:, cs:ce, :, :] * beta[:, cs:ce, :, None] * torch.exp(g[:, cs:ce, :, None])).permute(0, 2, 1, 3).float() + w[:, cs:ce, :, :] = torch.matmul(A_bh, kb).permute(0, 2, 1, 3) + + return w, u + + +def check_implementation(data, output): + expected = ref_kernel(data) + exp_w, exp_u = expected + got_w, got_u = output + + reasons_w = verbose_allclose(got_w, exp_w, rtol=1e-3, atol=1e-3) + reasons_u = verbose_allclose(got_u, exp_u, rtol=1e-3, atol=1e-3) + + reasons = [] + if reasons_w: + reasons.append("w mismatch: " + " ".join(reasons_w)) + if reasons_u: + reasons.append("u mismatch: " + " ".join(reasons_u)) + + if reasons: + return False, " | ".join(reasons) + return True, "" diff --git a/problems/helion/gated_deltanet_recompute_w_u_py/submission.py b/problems/helion/gated_deltanet_recompute_w_u_py/submission.py new file mode 100644 index 00000000..ec50c3cf --- /dev/null +++ b/problems/helion/gated_deltanet_recompute_w_u_py/submission.py @@ -0,0 +1,25 @@ +from task import input_t, output_t + + +def custom_kernel(data: input_t) -> output_t: + import torch + + k, v, beta, A, g = data + B, T, H, K = k.shape + V = v.shape[-1] + BT = A.shape[-1] + + w = torch.empty_like(k) + u = torch.empty_like(v) + + for cs in range(0, T, BT): + ce = cs + BT + A_bh = A[:, cs:ce, :, :].permute(0, 2, 1, 3).float() + + vb = (v[:, cs:ce, :, :] * beta[:, cs:ce, :, None]).permute(0, 2, 1, 3).float() + u[:, cs:ce, :, :] = torch.matmul(A_bh, vb).permute(0, 2, 1, 3) + + kb = (k[:, cs:ce, :, :] * beta[:, cs:ce, :, None] * torch.exp(g[:, cs:ce, :, None])).permute(0, 2, 1, 3).float() + w[:, cs:ce, :, :] = torch.matmul(A_bh, kb).permute(0, 2, 1, 3) + + return w, u diff --git a/problems/helion/gated_deltanet_recompute_w_u_py/task.py b/problems/helion/gated_deltanet_recompute_w_u_py/task.py new file mode 100644 index 00000000..2887eb89 --- /dev/null +++ b/problems/helion/gated_deltanet_recompute_w_u_py/task.py @@ -0,0 +1,13 @@ +from typing import TypedDict, TypeVar +import torch + +input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]) +output_t = TypeVar("output_t", bound=tuple[torch.Tensor, torch.Tensor]) + +class TestSpec(TypedDict): + B: int + T: int + H: int + K: int + V: int + seed: int diff --git a/problems/helion/gated_deltanet_recompute_w_u_py/task.yml b/problems/helion/gated_deltanet_recompute_w_u_py/task.yml new file mode 100644 index 00000000..f3d83002 --- /dev/null +++ b/problems/helion/gated_deltanet_recompute_w_u_py/task.yml @@ -0,0 +1,66 @@ +files: + - {"name": "submission.py", "source": "@SUBMISSION@"} + - {"name": "task.py", "source": "task.py"} + - {"name": "utils.py", "source": "../utils.py"} + - {"name": "reference.py", "source": "reference.py"} + - {"name": "eval.py", "source": "../eval.py"} + +lang: "py" + +description: | + Implement the recompute_w_u forward kernel for Gated DeltaNet. + + This kernel computes WY-transformed keys (w) and values (u) for the chunkwise + parallel forward pass of Gated DeltaNet (arXiv:2412.06464, ICLR 2025). It is + one of three per-chunk kernels in the forward pipeline. + + The sequence is divided into non-overlapping chunks of BT=64 timesteps. 
+ For each chunk independently: + u = A @ diag(beta) @ v (WY-transformed values) + w = A @ diag(beta * exp(g)) @ k (WY-transformed keys) + + Equivalently: + u = A @ (v * beta[:, None]) + w = A @ (k * beta[:, None] * exp(g)[:, None]) + + where A is a [BT, BT] WY representation matrix per chunk. + + Input: tuple(k, v, beta, A, g) where: + - k: torch.Tensor of shape [B, T, H, K] (float32) — keys + - v: torch.Tensor of shape [B, T, H, V] (float32) — values + - beta: torch.Tensor of shape [B, T, H] (float32) — gating coefficients + - A: torch.Tensor of shape [B, T, H, BT] (float32) — WY matrix (BT=64) + - g: torch.Tensor of shape [B, T, H] (float32) — cumulative gate + + Output: tuple(w, u) where: + - w: torch.Tensor of shape [B, T, H, K] (float32) — WY-transformed keys + - u: torch.Tensor of shape [B, T, H, V] (float32) — WY-transformed values + + Constraint: T must be a multiple of 64. + +config: + main: "eval.py" + +templates: + Python: "../template.py" + +tests: + - {"B": 1, "T": 64, "H": 2, "K": 64, "V": 64, "seed": 4242} + - {"B": 2, "T": 128, "H": 4, "K": 64, "V": 64, "seed": 5236} + - {"B": 1, "T": 256, "H": 4, "K": 64, "V": 128, "seed": 1001} + - {"B": 1, "T": 64, "H": 1, "K": 128, "V": 128, "seed": 5531} + - {"B": 2, "T": 128, "H": 2, "K": 100, "V": 100, "seed": 9173} + +benchmarks: + - {"B": 1, "T": 64, "H": 1, "K": 64, "V": 64, "seed": 31232} + - {"B": 2, "T": 512, "H": 3, "K": 64, "V": 64, "seed": 4052} + - {"B": 2, "T": 1024, "H": 3, "K": 64, "V": 64, "seed": 2146} + - {"B": 3, "T": 1024, "H": 4, "K": 100, "V": 100, "seed": 3129} + - {"B": 4, "T": 1024, "H": 4, "K": 128, "V": 128, "seed": 54352} + - {"B": 2, "T": 1536, "H": 4, "K": 128, "V": 128, "seed": 71234} + - {"B": 4, "T": 2048, "H": 8, "K": 64, "V": 64, "seed": 82345} + +test_timeout: 180 +benchmark_timeout: 180 +ranked_timeout: 420 +ranking_by: "geom" From 8d878d967abcb7309c14fa5487912f7cdb5f1f35 Mon Sep 17 00:00:00 2001 From: Will Feng Date: Tue, 3 Mar 2026 12:49:29 -0800 Subject: [PATCH 4/5] clean up mentions of old kernels --- problems/helion.yaml | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) diff --git a/problems/helion.yaml b/problems/helion.yaml index 8fe5ce9a..32f9b0e8 100644 --- a/problems/helion.yaml +++ b/problems/helion.yaml @@ -1,22 +1,7 @@ name: Helion Kernel Challenge deadline: "2026-03-14" -description: "GPU kernel challenges inspired by Helion kernel ideas — attention mechanisms, sampling strategies, quantization, and sequence modeling operators from production LLM architectures." +description: "GPU kernel challenges inspired by Helion kernel ideas — convolution, quantization, and gated deltanet operators from production LLM architectures." 
problems: - - directory: helion/gqa_py - name: gqa - deadline: "2026-03-14 00:00" - gpus: - - NVIDIA - - directory: helion/mla_py - name: mla - deadline: "2026-03-14 00:00" - gpus: - - NVIDIA - - directory: helion/kda_py - name: kda - deadline: "2026-03-14 00:00" - gpus: - - NVIDIA - directory: helion/causal_conv1d_py name: causal_conv1d deadline: "2026-03-14 00:00" @@ -27,23 +12,18 @@ problems: deadline: "2026-03-14 00:00" gpus: - NVIDIA - - directory: helion/int8_quant_py - name: int8_quant - deadline: "2026-03-14 00:00" - gpus: - - NVIDIA - - directory: helion/min_p_py - name: min_p + - directory: helion/gated_deltanet_chunk_fwd_h_py + name: gated_deltanet_chunk_fwd_h deadline: "2026-03-14 00:00" gpus: - NVIDIA - - directory: helion/top_k_py - name: top_k + - directory: helion/gated_deltanet_chunk_fwd_o_py + name: gated_deltanet_chunk_fwd_o deadline: "2026-03-14 00:00" gpus: - NVIDIA - - directory: helion/top_p_py - name: top_p + - directory: helion/gated_deltanet_recompute_w_u_py + name: gated_deltanet_recompute_w_u deadline: "2026-03-14 00:00" gpus: - NVIDIA From 11ed36b5fa0c709e0fab9c595c145fb2b00cfe19 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Tue, 3 Mar 2026 17:37:51 -0800 Subject: [PATCH 5/5] Fix eval.py test case parser to support underscored keys and booleans The regex only matched [a-zA-Z]+ for keys, which broke parameters like group_size, hidden_dim, num_tokens, and use_initial_state. Also adds true/false boolean value parsing. --- problems/helion/eval.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/problems/helion/eval.py b/problems/helion/eval.py index 981b9322..f99aba46 100644 --- a/problems/helion/eval.py +++ b/problems/helion/eval.py @@ -64,7 +64,7 @@ def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]: tests = [] lines = content.splitlines() - match = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*" + match = r"\s*([a-zA-Z_]\w*):\s*([a-zA-Z_]\w*|[+-]?[0-9]+)\s*" for line in lines: parts = line.split(";") case = {} @@ -78,7 +78,10 @@ def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]: try: val = int(val) except ValueError: - pass + if val == "true": + val = True + elif val == "false": + val = False case[key] = val tests.append(TestCase(spec=line, args=case))
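As an illustration of the new parsing behavior: after split(";"), a spec line such as

    B: 2; T: 128; H: 4; use_initial_state: true; seed: 5236

now yields {"B": 2, "T": 128, "H": 4, "use_initial_state": True, "seed": 5236}. Previously the
[a-zA-Z]+ key pattern did not match use_initial_state at all, and a bare true/false value fell
through the ValueError branch and was kept as the string "true"/"false".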