Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions problems/helion.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
name: Helion Kernel Challenge
deadline: "2026-03-14"
description: "GPU kernel challenges inspired by Helion kernel ideas — convolution, quantization, and gated deltanet operators from production LLM architectures."
problems:
- directory: helion/causal_conv1d_py
name: causal_conv1d
deadline: "2026-03-14 00:00"
gpus:
- NVIDIA
- directory: helion/fp8_quant_py
name: fp8_quant
deadline: "2026-03-14 00:00"
gpus:
- NVIDIA
- directory: helion/gated_deltanet_chunk_fwd_h_py
name: gated_deltanet_chunk_fwd_h
deadline: "2026-03-14 00:00"
gpus:
- NVIDIA
- directory: helion/gated_deltanet_chunk_fwd_o_py
name: gated_deltanet_chunk_fwd_o
deadline: "2026-03-14 00:00"
gpus:
- NVIDIA
- directory: helion/gated_deltanet_recompute_w_u_py
name: gated_deltanet_recompute_w_u
deadline: "2026-03-14 00:00"
gpus:
- NVIDIA
35 changes: 35 additions & 0 deletions problems/helion/causal_conv1d_py/reference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import torch
import torch.nn.functional as F
from task import input_t, output_t
from utils import make_match_reference, DeterministicContext


def generate_input(B: int, D: int, S: int, W: int, seed: int) -> "input_t":
    """Build random CUDA float32 inputs (x, weight, bias) for the causal conv.

    Shapes: x [B, D, S], weight [D, W], bias [D]. Seeded for reproducibility.
    """
    rng = torch.Generator(device="cuda")
    rng.manual_seed(seed)

    def _randn(*shape: int) -> torch.Tensor:
        # All tensors share one seeded generator; draw order fixes the values.
        return torch.randn(*shape, dtype=torch.float32, device="cuda", generator=rng).contiguous()

    # Left-to-right evaluation preserves the original x -> weight -> bias draw order.
    return _randn(B, D, S), _randn(D, W), _randn(D)


def ref_kernel(data: "input_t") -> "output_t":
    """Reference causal depthwise 1D convolution.

    For each batch b, channel d, time t:
        out[b, d, t] = bias[d] + sum_k weight[d, k] * x[b, d, t - W + 1 + k]
    with out-of-range x treated as zero (causal left padding).
    """
    with DeterministicContext():
        x, weight, bias = data
        num_channels = x.shape[1]
        kernel_width = weight.shape[1]

        # Pad W-1 zeros on the left only so output[t] never sees the future.
        causal_x = F.pad(x, (kernel_width - 1, 0))

        # groups == number of channels makes the conv depthwise:
        # weight.unsqueeze(1) has shape [D, 1, W], one filter per channel.
        return F.conv1d(
            causal_x,
            weight.unsqueeze(1),
            bias=bias,
            groups=num_channels,
        )


# Elementwise comparison of a submission's output against ref_kernel with
# rtol/atol 1e-4. make_match_reference is a project helper from utils.py —
# presumably it returns a callable checker; confirm against eval.py usage.
check_implementation = make_match_reference(ref_kernel, rtol=1e-4, atol=1e-4)
14 changes: 14 additions & 0 deletions problems/helion/causal_conv1d_py/submission.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from task import input_t, output_t


def custom_kernel(data: "input_t") -> "output_t":
    """Causal depthwise 1D convolution (baseline submission, mirrors ref_kernel).

    data is (x, weight, bias) with x: [B, D, S], weight: [D, W], bias: [D];
    returns a tensor of shape [B, D, S].
    """
    import torch
    import torch.nn.functional as F

    x, weight, bias = data
    num_channels, kernel_width = weight.shape

    # Zero-pad W-1 steps on the left only, so out[t] depends on x[t-W+1 : t+1].
    causal_input = F.pad(x, (kernel_width - 1, 0))

    # unsqueeze(1) gives [D, 1, W]; groups=D applies one filter per channel.
    return F.conv1d(causal_input, weight.unsqueeze(1), bias=bias, groups=num_channels)
12 changes: 12 additions & 0 deletions problems/helion/causal_conv1d_py/task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from typing import TypedDict, TypeVar
import torch

# I/O contract for this problem: (x, weight, bias) in, convolved tensor out.
# NOTE: these were previously TypeVars. A TypeVar is only meaningful as a
# parameter of a generic function/class; annotating with a bare TypeVar is
# invalid typing. Plain aliases express the intended fixed types and remain
# runtime-compatible for modules doing `from task import input_t, output_t`.
input_t = tuple[torch.Tensor, torch.Tensor, torch.Tensor]  # (x [B,D,S], weight [D,W], bias [D])
output_t = torch.Tensor  # result, shape [B, D, S]

class TestSpec(TypedDict):
    """Parameters of one test/benchmark case for the causal conv1d problem."""
    # Keys mirror the entries in task.yml's `tests`/`benchmarks` lists.
    B: int  # batch size
    D: int  # number of channels (depthwise groups)
    S: int  # sequence length
    W: int  # convolution kernel width (causal window)
    seed: int  # RNG seed passed to generate_input
51 changes: 51 additions & 0 deletions problems/helion/causal_conv1d_py/task.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
files:
- {"name": "submission.py", "source": "@SUBMISSION@"}
- {"name": "task.py", "source": "task.py"}
- {"name": "utils.py", "source": "../utils.py"}
- {"name": "reference.py", "source": "reference.py"}
- {"name": "eval.py", "source": "../eval.py"}

lang: "py"

description: |
Implement a causal depthwise 1D convolution kernel.

This is a core component of Mamba/Mamba-2 architectures. Each channel is
convolved independently (depthwise) with causal (left) zero-padding so that
output[t] depends only on input[t-W+1:t+1].

For each batch b, channel d, and time t:
out[b, d, t] = bias[d] + sum_{k=0}^{W-1} weight[d, k] * x[b, d, t - W + 1 + k]
where out-of-bounds values are treated as zero.

Input: tuple(x, weight, bias) where:
- x: torch.Tensor of shape [B, D, S] (float32)
- weight: torch.Tensor of shape [D, W] (float32)
- bias: torch.Tensor of shape [D] (float32)

Output: torch.Tensor of shape [B, D, S] (float32)

config:
main: "eval.py"

templates:
Python: "../template.py"

tests:
- {"B": 1, "D": 64, "S": 64, "W": 4, "seed": 4242}
- {"B": 2, "D": 128, "S": 128, "W": 4, "seed": 5236}
- {"B": 1, "D": 256, "S": 256, "W": 3, "seed": 1001}
- {"B": 1, "D": 128, "S": 64, "W": 8, "seed": 5531}
- {"B": 4, "D": 64, "S": 128, "W": 4, "seed": 9173}

benchmarks:
- {"B": 1, "D": 768, "S": 512, "W": 4, "seed": 31232}
- {"B": 1, "D": 768, "S": 2048, "W": 4, "seed": 4052}
- {"B": 1, "D": 1536, "S": 2048, "W": 4, "seed": 2146}
- {"B": 1, "D": 2560, "S": 2048, "W": 4, "seed": 3129}
- {"B": 1, "D": 2560, "S": 4096, "W": 4, "seed": 54352}

test_timeout: 180
benchmark_timeout: 180
ranked_timeout: 420
ranking_by: "geom"
Loading