Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions problems/helion.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
name: Helion Kernel Challenge
deadline: "2026-03-14"
description: "GPU kernel challenges inspired by Helion kernel ideas — convolution, quantization, and gated deltanet operators from production LLM architectures."
problems:
- directory: helion/causal_conv1d_py
name: causal_conv1d
deadline: "2026-03-14 00:00"
gpus:
- NVIDIA
- directory: helion/fp8_quant_py
name: fp8_quant
deadline: "2026-03-14 00:00"
gpus:
- NVIDIA
- directory: helion/gated_deltanet_chunk_fwd_h_py
name: gated_deltanet_chunk_fwd_h
deadline: "2026-03-14 00:00"
gpus:
- NVIDIA
- directory: helion/gated_deltanet_chunk_fwd_o_py
name: gated_deltanet_chunk_fwd_o
deadline: "2026-03-14 00:00"
gpus:
- NVIDIA
- directory: helion/gated_deltanet_recompute_w_u_py
name: gated_deltanet_recompute_w_u
deadline: "2026-03-14 00:00"
gpus:
- NVIDIA
35 changes: 35 additions & 0 deletions problems/helion/causal_conv1d_py/reference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import torch
import torch.nn.functional as F
from task import input_t, output_t
from utils import make_match_reference, DeterministicContext


def generate_input(B: int, D: int, S: int, W: int, seed: int) -> "input_t":
    """Build random CUDA float32 inputs (x, weight, bias) for the causal conv.

    Shapes: x [B, D, S], weight [D, W], bias [D]. Seeded for reproducibility.
    """
    rng = torch.Generator(device="cuda")
    rng.manual_seed(seed)

    def _randn(*shape: int) -> torch.Tensor:
        # All tensors share one seeded generator; draw order fixes the values.
        return torch.randn(*shape, dtype=torch.float32, device="cuda", generator=rng).contiguous()

    # Left-to-right evaluation preserves the original x -> weight -> bias draw order.
    return _randn(B, D, S), _randn(D, W), _randn(D)


def ref_kernel(data: "input_t") -> "output_t":
    """Reference causal depthwise 1D convolution.

    For each batch b, channel d, time t:
        out[b, d, t] = bias[d] + sum_k weight[d, k] * x[b, d, t - W + 1 + k]
    with out-of-range x treated as zero (causal left padding).
    """
    with DeterministicContext():
        x, weight, bias = data
        num_channels = x.shape[1]
        kernel_width = weight.shape[1]

        # Pad W-1 zeros on the left only so output[t] never sees the future.
        causal_x = F.pad(x, (kernel_width - 1, 0))

        # groups == number of channels makes the conv depthwise:
        # weight.unsqueeze(1) has shape [D, 1, W], one filter per channel.
        return F.conv1d(
            causal_x,
            weight.unsqueeze(1),
            bias=bias,
            groups=num_channels,
        )


# Elementwise comparison of a submission's output against ref_kernel with
# rtol/atol 1e-4. make_match_reference is a project helper from utils.py —
# presumably it returns a callable checker; confirm against eval.py usage.
check_implementation = make_match_reference(ref_kernel, rtol=1e-4, atol=1e-4)
14 changes: 14 additions & 0 deletions problems/helion/causal_conv1d_py/submission.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from task import input_t, output_t


def custom_kernel(data: "input_t") -> "output_t":
    """Causal depthwise 1D convolution (baseline submission, mirrors ref_kernel).

    data is (x, weight, bias) with x: [B, D, S], weight: [D, W], bias: [D];
    returns a tensor of shape [B, D, S].
    """
    import torch
    import torch.nn.functional as F

    x, weight, bias = data
    num_channels, kernel_width = weight.shape

    # Zero-pad W-1 steps on the left only, so out[t] depends on x[t-W+1 : t+1].
    causal_input = F.pad(x, (kernel_width - 1, 0))

    # unsqueeze(1) gives [D, 1, W]; groups=D applies one filter per channel.
    return F.conv1d(causal_input, weight.unsqueeze(1), bias=bias, groups=num_channels)
12 changes: 12 additions & 0 deletions problems/helion/causal_conv1d_py/task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from typing import TypedDict, TypeVar
import torch

# I/O contract for this problem: (x, weight, bias) in, convolved tensor out.
# NOTE: these were previously TypeVars. A TypeVar is only meaningful as a
# parameter of a generic function/class; annotating with a bare TypeVar is
# invalid typing. Plain aliases express the intended fixed types and remain
# runtime-compatible for modules doing `from task import input_t, output_t`.
input_t = tuple[torch.Tensor, torch.Tensor, torch.Tensor]  # (x [B,D,S], weight [D,W], bias [D])
output_t = torch.Tensor  # result, shape [B, D, S]

class TestSpec(TypedDict):
    """Parameters of one test/benchmark case for the causal conv1d problem."""
    # Keys mirror the entries in task.yml's `tests`/`benchmarks` lists.
    B: int  # batch size
    D: int  # number of channels (depthwise groups)
    S: int  # sequence length
    W: int  # convolution kernel width (causal window)
    seed: int  # RNG seed passed to generate_input
51 changes: 51 additions & 0 deletions problems/helion/causal_conv1d_py/task.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
files:
- {"name": "submission.py", "source": "@SUBMISSION@"}
- {"name": "task.py", "source": "task.py"}
- {"name": "utils.py", "source": "../utils.py"}
- {"name": "reference.py", "source": "reference.py"}
- {"name": "eval.py", "source": "../eval.py"}

lang: "py"

description: |
Implement a causal depthwise 1D convolution kernel.

This is a core component of Mamba/Mamba-2 architectures. Each channel is
convolved independently (depthwise) with causal (left) zero-padding so that
output[t] depends only on input[t-W+1:t+1].

For each batch b, channel d, and time t:
out[b, d, t] = bias[d] + sum_{k=0}^{W-1} weight[d, k] * x[b, d, t - W + 1 + k]
where out-of-bounds values are treated as zero.

Input: tuple(x, weight, bias) where:
- x: torch.Tensor of shape [B, D, S] (float32)
- weight: torch.Tensor of shape [D, W] (float32)
- bias: torch.Tensor of shape [D] (float32)

Output: torch.Tensor of shape [B, D, S] (float32)

config:
main: "eval.py"

templates:
Python: "../template.py"

tests:
- {"B": 1, "D": 64, "S": 64, "W": 4, "seed": 4242}
- {"B": 2, "D": 128, "S": 128, "W": 4, "seed": 5236}
- {"B": 1, "D": 256, "S": 256, "W": 3, "seed": 1001}
- {"B": 1, "D": 128, "S": 64, "W": 8, "seed": 5531}
- {"B": 4, "D": 64, "S": 128, "W": 4, "seed": 9173}

benchmarks:
- {"B": 1, "D": 768, "S": 512, "W": 4, "seed": 31232}
- {"B": 1, "D": 768, "S": 2048, "W": 4, "seed": 4052}
- {"B": 1, "D": 1536, "S": 2048, "W": 4, "seed": 2146}
- {"B": 1, "D": 2560, "S": 2048, "W": 4, "seed": 3129}
- {"B": 1, "D": 2560, "S": 4096, "W": 4, "seed": 54352}

test_timeout: 180
benchmark_timeout: 180
ranked_timeout: 420
ranking_by: "geom"
Loading