From cf40450be5b45adb413b20a4a6b46e30099ff9f7 Mon Sep 17 00:00:00 2001 From: Will Feng Date: Fri, 13 Mar 2026 16:54:55 -0700 Subject: [PATCH 1/5] Add per-shape config dispatch pattern to all Helion submissions - Use SHAPE_CONFIGS dict mapping shape tuples to helion.Config objects - Factory pattern with _make_kernel() creates separate kernel instances per config - All test and benchmark shapes from task.yml listed in SHAPE_CONFIGS - Test shapes: TODO to replace with default config or any config that passes correctness - Benchmark shapes: TODO to replace with autotuned config Co-Authored-By: Claude Opus 4.6 (1M context) --- .../helion/causal_conv1d_py/submission.py | 101 +++++++++------ problems/helion/fp8_quant_py/submission.py | 99 +++++++------- .../submission.py | 119 ++++++++++------- .../submission.py | 109 +++++++++------- .../submission.py | 121 +++++++++++------- problems/helion/template.py | 26 ++++ 6 files changed, 353 insertions(+), 222 deletions(-) diff --git a/problems/helion/causal_conv1d_py/submission.py b/problems/helion/causal_conv1d_py/submission.py index 32037a94..53060a80 100644 --- a/problems/helion/causal_conv1d_py/submission.py +++ b/problems/helion/causal_conv1d_py/submission.py @@ -5,49 +5,72 @@ import helion.language as hl +# Per-shape configs: map (B, D, S, W) to optimized helion.Config objects. +# Autotune locally for each shape, then paste the best config here. +SHAPE_CONFIGS: dict[tuple, helion.Config] = { + # Test shapes + (1, 64, 64, 4): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + (2, 128, 128, 4): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + (1, 256, 256, 3): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + (1, 128, 64, 8): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + (4, 64, 128, 4): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + # Benchmark shapes + (1, 768, 512, 4): helion.Config(...), # TODO: replace with your autotuned config + (1, 768, 2048, 4): helion.Config(...), # TODO: replace with your autotuned config + (1, 1536, 2048, 4): helion.Config(...), # TODO: replace with your autotuned config + (1, 2560, 2048, 4): helion.Config(...), # TODO: replace with your autotuned config + (1, 2560, 4096, 4): helion.Config(...), # TODO: replace with your autotuned config +} + + # NOTE: This is an intentionally inefficient baseline implementation. -@helion.kernel( - static_shapes=True, - config=helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), -) -def conv1d_kernel( - x_pad: torch.Tensor, # (B, D, L) zero-padded input - w: torch.Tensor, # (D, W) filter coefficients - b: torch.Tensor, # (D,) additive offset -) -> torch.Tensor: - B = x_pad.size(0) - D = x_pad.size(1) - L = x_pad.size(2) - W = hl.specialize(w.size(1)) - N = L - W + 1 - - y = torch.empty(B, D, N, dtype=x_pad.dtype, device=x_pad.device) - - for rb, rd, rs in hl.tile([B, D, N], block_size=[1, None, None]): - bi = rb.begin - acc1 = hl.zeros([rd, rs], dtype=torch.float32) - acc2 = hl.zeros([rd, rs], dtype=torch.float32) - acc3 = hl.zeros([rd, rs], dtype=torch.float32) - for j in range(W): - c1 = w[rd, j].to(torch.float32) - x1 = hl.load(x_pad, [bi, rd, rs.index + j]).to(torch.float32) - acc1 = acc1 + x1 * c1[:, None] - c2 = w[rd, j].to(torch.float32) - x2 = hl.load(x_pad, [bi, rd, rs.index + j]).to(torch.float32) - acc2 = acc2 + x2 * c2[:, None] - c3 = w[rd, j].to(torch.float32) - x3 = hl.load(x_pad, [bi, rd, rs.index + j]).to(torch.float32) - acc3 = acc3 + x3 * c3[:, None] - acc = (acc1 + acc2 + acc3) / 3.0 - acc = acc + b[rd].to(torch.float32)[:, None] - y[rb, rd, rs] = acc[None, :, :].to(y.dtype) - - return y +def _make_kernel(config: helion.Config): + @helion.kernel(static_shapes=True, config=config) + def conv1d_kernel( + x_pad: torch.Tensor, # (B, D, L) zero-padded input + w: torch.Tensor, # (D, W) filter coefficients + b: torch.Tensor, # (D,) additive offset + ) -> torch.Tensor: + B = x_pad.size(0) + D = x_pad.size(1) + L = x_pad.size(2) + W = hl.specialize(w.size(1)) + N = L - W + 1 + + y = torch.empty(B, D, N, dtype=x_pad.dtype, device=x_pad.device) + + for rb, rd, rs in hl.tile([B, D, N], block_size=[1, None, None]): + bi = rb.begin + acc1 = hl.zeros([rd, rs], dtype=torch.float32) + acc2 = hl.zeros([rd, rs], dtype=torch.float32) + acc3 = hl.zeros([rd, rs], dtype=torch.float32) + for j in range(W): + c1 = w[rd, j].to(torch.float32) + x1 = hl.load(x_pad, [bi, rd, rs.index + j]).to(torch.float32) + acc1 = acc1 + x1 * c1[:, None] + c2 = w[rd, j].to(torch.float32) + x2 = hl.load(x_pad, [bi, rd, rs.index + j]).to(torch.float32) + acc2 = acc2 + x2 * c2[:, None] + c3 = w[rd, j].to(torch.float32) + x3 = hl.load(x_pad, [bi, rd, rs.index + j]).to(torch.float32) + acc3 = acc3 + x3 * c3[:, None] + acc = (acc1 + acc2 + acc3) / 3.0 + acc = acc + b[rd].to(torch.float32)[:, None] + y[rb, rd, rs] = acc[None, :, :].to(y.dtype) + + return y + + return conv1d_kernel + + +_KERNELS = {shape: _make_kernel(cfg) for shape, cfg in SHAPE_CONFIGS.items()} def custom_kernel(data: input_t) -> output_t: x, weight, bias = data + B, D, S = x.shape W = weight.shape[1] - pad_zeros = torch.zeros(x.shape[0], x.shape[1], W - 1, dtype=x.dtype, device=x.device) + kernel = _KERNELS[(B, D, S, W)] + pad_zeros = torch.zeros(B, D, W - 1, dtype=x.dtype, device=x.device) padded = torch.cat([pad_zeros, x], dim=2) - return conv1d_kernel(padded, weight, bias) + return kernel(padded, weight, bias) diff --git a/problems/helion/fp8_quant_py/submission.py b/problems/helion/fp8_quant_py/submission.py index e3108d51..1fc324e7 100644 --- a/problems/helion/fp8_quant_py/submission.py +++ b/problems/helion/fp8_quant_py/submission.py @@ -5,52 +5,63 @@ import helion.language as hl from pathlib import Path -COFIG_DICT={ - "block_sizes": [1], - "num_warps": 1, - "num_stages": 1, + +# Per-shape configs: map (num_tokens, hidden_dim, group_size) to optimized helion.Config objects. +# Autotune locally for each shape, then paste the best config here. +SHAPE_CONFIGS: dict[tuple, helion.Config] = { + # Test shapes + (1, 256, 64): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + (4, 512, 128): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + (16, 1024, 64): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + (1, 4096, 128): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + (8, 4096, 128): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + # Benchmark shapes + # (1, 4096, 128) already covered above + (16, 4096, 128): helion.Config(...), # TODO: replace with your autotuned config + (256, 4096, 128): helion.Config(...), # TODO: replace with your autotuned config + (256, 8192, 128): helion.Config(...), # TODO: replace with your autotuned config + (4096, 7168, 128): helion.Config(...), # TODO: replace with your autotuned config } -ACF_FILE = "booster_pack/fp8_group_quant_0.acf" -if Path(ACF_FILE).exists(): - print(f"Using ACF file: {ACF_FILE}") - COFIG_DICT["advanced_controls_file"] = ACF_FILE # NOTE: This is an intentionally inefficient baseline implementation. -@helion.kernel( - static_shapes=True, - config=helion.Config(**COFIG_DICT), -) -def normalize_to_range( - data: torch.Tensor, # [N, G] input rows - scales_out: torch.Tensor, # [N] output normalization factors -) -> torch.Tensor: - nrows = data.size(0) - ncols = hl.specialize(data.size(1)) - MAX_VAL = 448.0 - - qout = torch.empty(nrows, ncols, dtype=torch.float32, device=data.device) - - for rr in hl.tile(nrows): - row = data[rr, :].to(torch.float32) - - abs1 = torch.abs(row) - amax1 = torch.amax(abs1, -1) - abs2 = torch.abs(row) - amax2 = torch.amax(abs2, -1) - abs3 = torch.abs(row) - amax3 = torch.amax(abs3, -1) - amax = (amax1 + amax2 + amax3) / 3.0 - amax = torch.clamp(amax, min=1e-10) - scale = amax / MAX_VAL - - q1 = row / scale[:, None] - q2 = row / scale[:, None] - q3 = row / scale[:, None] - qout[rr, :] = (q1 + q2 + q3) / 3.0 - scales_out[rr] = scale - - return qout +def _make_kernel(config: helion.Config): + @helion.kernel(static_shapes=True, config=config) + def normalize_to_range( + data: torch.Tensor, # [N, G] input rows + scales_out: torch.Tensor, # [N] output normalization factors + ) -> torch.Tensor: + nrows = data.size(0) + ncols = hl.specialize(data.size(1)) + MAX_VAL = 448.0 + + qout = torch.empty(nrows, ncols, dtype=torch.float32, device=data.device) + + for rr in hl.tile(nrows): + row = data[rr, :].to(torch.float32) + + abs1 = torch.abs(row) + amax1 = torch.amax(abs1, -1) + abs2 = torch.abs(row) + amax2 = torch.amax(abs2, -1) + abs3 = torch.abs(row) + amax3 = torch.amax(abs3, -1) + amax = (amax1 + amax2 + amax3) / 3.0 + amax = torch.clamp(amax, min=1e-10) + scale = amax / MAX_VAL + + q1 = row / scale[:, None] + q2 = row / scale[:, None] + q3 = row / scale[:, None] + qout[rr, :] = (q1 + q2 + q3) / 3.0 + scales_out[rr] = scale + + return qout + + return normalize_to_range + + +_KERNELS = {shape: _make_kernel(cfg) for shape, cfg in SHAPE_CONFIGS.items()} def custom_kernel(data: input_t) -> output_t: @@ -60,10 +71,12 @@ def custom_kernel(data: input_t) -> output_t: gsz = H // G N = T * G + kernel = _KERNELS[(T, H, gsz)] + flat_in = x.reshape(N, gsz) flat_s = x_s.reshape(N) - flat_q = normalize_to_range(flat_in, flat_s) + flat_q = kernel(flat_in, flat_s) x_q[...] = flat_q.reshape(T, H) x_s[...] = flat_s.reshape(T, G) diff --git a/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py b/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py index 528a61cc..a24f3014 100644 --- a/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py +++ b/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py @@ -5,65 +5,88 @@ import helion.language as hl +# Per-shape configs: map (B, T, H, K, V) to optimized helion.Config objects. +# Autotune locally for each shape, then paste the best config here. +SHAPE_CONFIGS: dict[tuple, helion.Config] = { + # Test shapes + (1, 64, 2, 64, 64): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + (2, 128, 4, 64, 64): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + (1, 256, 4, 64, 128): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + # Benchmark shapes + (1, 64, 1, 64, 64): helion.Config(...), # TODO: replace with your autotuned config + (2, 512, 3, 64, 64): helion.Config(...), # TODO: replace with your autotuned config + (2, 1024, 3, 64, 64): helion.Config(...), # TODO: replace with your autotuned config + (3, 1024, 4, 100, 100): helion.Config(...), # TODO: replace with your autotuned config + (4, 1024, 4, 128, 128): helion.Config(...), # TODO: replace with your autotuned config + (2, 1536, 4, 128, 128): helion.Config(...), # TODO: replace with your autotuned config + (4, 2048, 8, 64, 64): helion.Config(...), # TODO: replace with your autotuned config +} + + # NOTE: This is an intentionally inefficient baseline implementation. -@helion.kernel( - static_shapes=True, - dot_precision="ieee", - config=helion.Config(block_sizes=[], num_warps=1, num_stages=1), -) -def chunk_state_pass( - k: torch.Tensor, # [B, T, H, K] - w: torch.Tensor, # [B, T, H, K] - u: torch.Tensor, # [B, T, H, V] - g: torch.Tensor, # [B, T, H] -) -> tuple[torch.Tensor, torch.Tensor]: - B, T, H, K = k.shape - V = u.shape[-1] - C = 64 - K = hl.specialize(K) - V = hl.specialize(V) +def _make_kernel(config: helion.Config): + @helion.kernel(static_shapes=True, dot_precision="ieee", config=config) + def chunk_state_pass( + k: torch.Tensor, # [B, T, H, K] + w: torch.Tensor, # [B, T, H, K] + u: torch.Tensor, # [B, T, H, V] + g: torch.Tensor, # [B, T, H] + ) -> tuple[torch.Tensor, torch.Tensor]: + B, T, H, K = k.shape + V = u.shape[-1] + C = 64 + K = hl.specialize(K) + V = hl.specialize(V) + + NT = (T + C - 1) // C + h_out = torch.empty(B, NT, H, K, V, dtype=k.dtype, device=k.device) + v_out = torch.empty_like(u) - NT = (T + C - 1) // C - h_out = torch.empty(B, NT, H, K, V, dtype=k.dtype, device=k.device) - v_out = torch.empty_like(u) + BH = B * H - BH = B * H + for flat, tv in hl.tile([BH, V], block_size=[1, 8]): + b_idx = flat.begin // H + h_idx = flat.begin % H + state = hl.zeros([K, tv], dtype=torch.float32) - for flat, tv in hl.tile([BH, V], block_size=[1, 8]): - b_idx = flat.begin // H - h_idx = flat.begin % H - state = hl.zeros([K, tv], dtype=torch.float32) + for tc in hl.tile(T, block_size=C): + chunk_idx = tc.begin // C + t_end = min(tc.begin + C, T) - 1 - for tc in hl.tile(T, block_size=C): - chunk_idx = tc.begin // C - t_end = min(tc.begin + C, T) - 1 + h_out[b_idx, chunk_idx, h_idx, :, tv] = state.to(k.dtype) - h_out[b_idx, chunk_idx, h_idx, :, tv] = state.to(k.dtype) + proj1 = hl.dot( + w[b_idx, tc, h_idx, :], state, out_dtype=torch.float32 + ) + proj2 = hl.dot( + w[b_idx, tc, h_idx, :], state, out_dtype=torch.float32 + ) + proj = (proj1 + proj2) * 0.5 + diff = u[b_idx, tc, h_idx, tv].to(torch.float32) - proj + v_out[b_idx, tc, h_idx, tv] = diff.to(u.dtype) - proj1 = hl.dot( - w[b_idx, tc, h_idx, :], state, out_dtype=torch.float32 - ) - proj2 = hl.dot( - w[b_idx, tc, h_idx, :], state, out_dtype=torch.float32 - ) - proj = (proj1 + proj2) * 0.5 - diff = u[b_idx, tc, h_idx, tv].to(torch.float32) - proj - v_out[b_idx, tc, h_idx, tv] = diff.to(u.dtype) + g_end = g[b_idx, t_end, h_idx].to(torch.float32) + g_t = g[b_idx, tc, h_idx].to(torch.float32) + valid = tc.index < T + alpha = torch.where(valid, torch.exp(g_end - g_t), 0.0) + k_adj = k[b_idx, tc, h_idx, :] * alpha[:, None] - g_end = g[b_idx, t_end, h_idx].to(torch.float32) - g_t = g[b_idx, tc, h_idx].to(torch.float32) - valid = tc.index < T - alpha = torch.where(valid, torch.exp(g_end - g_t), 0.0) - k_adj = k[b_idx, tc, h_idx, :] * alpha[:, None] + state = state * torch.exp(g_end) + upd1 = hl.dot(k_adj.T, diff, out_dtype=torch.float32) + upd2 = hl.dot(k_adj.T, diff, out_dtype=torch.float32) + state = state + (upd1 + upd2) * 0.5 - state = state * torch.exp(g_end) - upd1 = hl.dot(k_adj.T, diff, out_dtype=torch.float32) - upd2 = hl.dot(k_adj.T, diff, out_dtype=torch.float32) - state = state + (upd1 + upd2) * 0.5 + return h_out, v_out - return h_out, v_out + return chunk_state_pass + + +_KERNELS = {shape: _make_kernel(cfg) for shape, cfg in SHAPE_CONFIGS.items()} def custom_kernel(data: input_t) -> output_t: k, w, u, g = data - return chunk_state_pass(k, w, u, g) + B, T, H, K = k.shape + V = u.shape[-1] + kernel = _KERNELS[(B, T, H, K, V)] + return kernel(k, w, u, g) diff --git a/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py b/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py index 8e2a2f53..f90e054a 100644 --- a/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py +++ b/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py @@ -5,58 +5,81 @@ import helion.language as hl +# Per-shape configs: map (B, T, H, K, V) to optimized helion.Config objects. +# Autotune locally for each shape, then paste the best config here. +SHAPE_CONFIGS: dict[tuple, helion.Config] = { + # Test shapes + (1, 64, 2, 64, 64): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + (2, 128, 4, 64, 64): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + (1, 256, 4, 64, 128): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + # Benchmark shapes + (1, 64, 1, 64, 64): helion.Config(...), # TODO: replace with your autotuned config + (2, 512, 3, 64, 64): helion.Config(...), # TODO: replace with your autotuned config + (2, 1024, 3, 64, 64): helion.Config(...), # TODO: replace with your autotuned config + (3, 1024, 4, 100, 100): helion.Config(...), # TODO: replace with your autotuned config + (4, 1024, 4, 128, 128): helion.Config(...), # TODO: replace with your autotuned config + (2, 1536, 4, 128, 128): helion.Config(...), # TODO: replace with your autotuned config + (4, 2048, 8, 64, 64): helion.Config(...), # TODO: replace with your autotuned config +} + + # NOTE: This is an intentionally inefficient baseline implementation. -@helion.kernel( - static_shapes=True, - dot_precision="ieee", - config=helion.Config(block_sizes=[], num_warps=1, num_stages=1), -) -def gated_chunk_attn( - q: torch.Tensor, # [B, T, H, K] - k: torch.Tensor, # [B, T, H, K] - v: torch.Tensor, # [B, T, H, V] - h: torch.Tensor, # [B, NT, H, K, V] - g: torch.Tensor, # [B, T, H] - scale: float, -) -> torch.Tensor: - B, T, H, K = q.shape - V = v.shape[-1] - C = 64 - K = hl.specialize(K) - V = hl.specialize(V) +def _make_kernel(config: helion.Config): + @helion.kernel(static_shapes=True, dot_precision="ieee", config=config) + def gated_chunk_attn( + q: torch.Tensor, # [B, T, H, K] + k: torch.Tensor, # [B, T, H, K] + v: torch.Tensor, # [B, T, H, V] + h: torch.Tensor, # [B, NT, H, K, V] + g: torch.Tensor, # [B, T, H] + scale: float, + ) -> torch.Tensor: + B, T, H, K = q.shape + V = v.shape[-1] + C = 64 + K = hl.specialize(K) + V = hl.specialize(V) + + out = torch.empty_like(v) - out = torch.empty_like(v) + BH = B * H + for flat_bh, tile_t in hl.tile([BH, T], block_size=[1, C]): + b_idx = flat_bh.begin // H + h_idx = flat_bh.begin % H + c_idx = tile_t.begin // C - BH = B * H - for flat_bh, tile_t in hl.tile([BH, T], block_size=[1, C]): - b_idx = flat_bh.begin // H - h_idx = flat_bh.begin % H - c_idx = tile_t.begin // C + g_vals = g[b_idx, tile_t, h_idx] + q_s = q[b_idx, tile_t, h_idx, :] * torch.exp(g_vals)[:, None] + k_s = k[b_idx, tile_t, h_idx, :] * torch.exp(-g_vals)[:, None] - g_vals = g[b_idx, tile_t, h_idx] - q_s = q[b_idx, tile_t, h_idx, :] * torch.exp(g_vals)[:, None] - k_s = k[b_idx, tile_t, h_idx, :] * torch.exp(-g_vals)[:, None] + sim1 = hl.dot(q_s, k_s.T) + sim2 = hl.dot(q_s, k_s.T) + sim = (sim1 + sim2) * 0.5 + idx = hl.arange(tile_t.block_size) + mask = idx[:, None] >= idx[None, :] + sim = torch.where(mask, sim, 0.0) + local1 = hl.dot(sim.to(v.dtype), v[b_idx, tile_t, h_idx, :]) + local2 = hl.dot(sim.to(v.dtype), v[b_idx, tile_t, h_idx, :]) + local_out = (local1 + local2) * 0.5 - sim1 = hl.dot(q_s, k_s.T) - sim2 = hl.dot(q_s, k_s.T) - sim = (sim1 + sim2) * 0.5 - idx = hl.arange(tile_t.block_size) - mask = idx[:, None] >= idx[None, :] - sim = torch.where(mask, sim, 0.0) - local1 = hl.dot(sim.to(v.dtype), v[b_idx, tile_t, h_idx, :]) - local2 = hl.dot(sim.to(v.dtype), v[b_idx, tile_t, h_idx, :]) - local_out = (local1 + local2) * 0.5 + glob1 = hl.dot(q_s, h[b_idx, c_idx, h_idx, :, :]) + glob2 = hl.dot(q_s, h[b_idx, c_idx, h_idx, :, :]) + global_out = (glob1 + glob2) * 0.5 - glob1 = hl.dot(q_s, h[b_idx, c_idx, h_idx, :, :]) - glob2 = hl.dot(q_s, h[b_idx, c_idx, h_idx, :, :]) - global_out = (glob1 + glob2) * 0.5 + out[b_idx, tile_t, h_idx, :] = ((global_out + local_out) * scale).to(out.dtype) - out[b_idx, tile_t, h_idx, :] = ((global_out + local_out) * scale).to(out.dtype) + return out - return out + return gated_chunk_attn + + +_KERNELS = {shape: _make_kernel(cfg) for shape, cfg in SHAPE_CONFIGS.items()} def custom_kernel(data: input_t) -> output_t: q, k, v_new, h, g = data - scale = q.shape[-1] ** -0.5 - return gated_chunk_attn(q, k, v_new, h, g, scale) + B, T, H, K = q.shape + V = v_new.shape[-1] + scale = K ** -0.5 + kernel = _KERNELS[(B, T, H, K, V)] + return kernel(q, k, v_new, h, g, scale) diff --git a/problems/helion/gated_deltanet_recompute_w_u_py/submission.py b/problems/helion/gated_deltanet_recompute_w_u_py/submission.py index 918f519a..555eb02c 100644 --- a/problems/helion/gated_deltanet_recompute_w_u_py/submission.py +++ b/problems/helion/gated_deltanet_recompute_w_u_py/submission.py @@ -5,68 +5,91 @@ import helion.language as hl +# Per-shape configs: map (B, T, H, K, V) to optimized helion.Config objects. +# Autotune locally for each shape, then paste the best config here. +SHAPE_CONFIGS: dict[tuple, helion.Config] = { + # Test shapes + (1, 64, 2, 64, 64): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + (2, 128, 4, 64, 64): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + (1, 256, 4, 64, 128): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + # Benchmark shapes + (1, 64, 1, 64, 64): helion.Config(...), # TODO: replace with your autotuned config + (2, 512, 3, 64, 64): helion.Config(...), # TODO: replace with your autotuned config + (2, 1024, 3, 64, 64): helion.Config(...), # TODO: replace with your autotuned config + (3, 1024, 4, 100, 100): helion.Config(...), # TODO: replace with your autotuned config + (4, 1024, 4, 128, 128): helion.Config(...), # TODO: replace with your autotuned config + (2, 1536, 4, 128, 128): helion.Config(...), # TODO: replace with your autotuned config + (4, 2048, 8, 64, 64): helion.Config(...), # TODO: replace with your autotuned config +} + + # NOTE: This is an intentionally inefficient baseline implementation. -@helion.kernel( - static_shapes=True, - dot_precision="ieee", - config=helion.Config(block_sizes=[], num_warps=1, num_stages=1), -) -def project_kv( - k: torch.Tensor, # [B, T, H, K] - v: torch.Tensor, # [B, T, H, V] - beta: torch.Tensor, # [B, T, H] - A: torch.Tensor, # [B, T, H, BT] - g: torch.Tensor, # [B, T, H] -) -> tuple[torch.Tensor, torch.Tensor]: - B, T, H, K = k.shape - V = v.shape[-1] - C = hl.specialize(A.shape[-1]) - K = hl.specialize(K) - V = hl.specialize(V) +def _make_kernel(config: helion.Config): + @helion.kernel(static_shapes=True, dot_precision="ieee", config=config) + def project_kv( + k: torch.Tensor, # [B, T, H, K] + v: torch.Tensor, # [B, T, H, V] + beta: torch.Tensor, # [B, T, H] + A: torch.Tensor, # [B, T, H, BT] + g: torch.Tensor, # [B, T, H] + ) -> tuple[torch.Tensor, torch.Tensor]: + B, T, H, K = k.shape + V = v.shape[-1] + C = hl.specialize(A.shape[-1]) + K = hl.specialize(K) + V = hl.specialize(V) + + w_out = torch.empty_like(k) + u_out = torch.empty_like(v) - w_out = torch.empty_like(k) - u_out = torch.empty_like(v) + BH = B * H + for flat_bh, rt in hl.tile([BH, T], block_size=[1, C]): + b_idx = flat_bh.begin // H + h_idx = flat_bh.begin % H - BH = B * H - for flat_bh, rt in hl.tile([BH, T], block_size=[1, C]): - b_idx = flat_bh.begin // H - h_idx = flat_bh.begin % H + w_acc1 = hl.zeros([rt, K], dtype=torch.float32) + u_acc1 = hl.zeros([rt, V], dtype=torch.float32) + w_acc2 = hl.zeros([rt, K], dtype=torch.float32) + u_acc2 = hl.zeros([rt, V], dtype=torch.float32) - w_acc1 = hl.zeros([rt, K], dtype=torch.float32) - u_acc1 = hl.zeros([rt, V], dtype=torch.float32) - w_acc2 = hl.zeros([rt, K], dtype=torch.float32) - u_acc2 = hl.zeros([rt, V], dtype=torch.float32) + for ci in range(C): + t_ci = rt.begin + ci + a_col = A[b_idx, rt, h_idx, ci].to(torch.float32) + coeff_ci = beta[b_idx, t_ci, h_idx].to(torch.float32) + decay_ci = torch.exp(g[b_idx, t_ci, h_idx].to(torch.float32)) - for ci in range(C): - t_ci = rt.begin + ci - a_col = A[b_idx, rt, h_idx, ci].to(torch.float32) - coeff_ci = beta[b_idx, t_ci, h_idx].to(torch.float32) - decay_ci = torch.exp(g[b_idx, t_ci, h_idx].to(torch.float32)) + k_ci = k[b_idx, t_ci, h_idx, :].to(torch.float32) + v_ci = v[b_idx, t_ci, h_idx, :].to(torch.float32) - k_ci = k[b_idx, t_ci, h_idx, :].to(torch.float32) - v_ci = v[b_idx, t_ci, h_idx, :].to(torch.float32) + w_acc1 = w_acc1 + a_col[:, None] * (k_ci * coeff_ci * decay_ci)[None, :] + u_acc1 = u_acc1 + a_col[:, None] * (v_ci * coeff_ci)[None, :] - w_acc1 = w_acc1 + a_col[:, None] * (k_ci * coeff_ci * decay_ci)[None, :] - u_acc1 = u_acc1 + a_col[:, None] * (v_ci * coeff_ci)[None, :] + for ci in range(C - 1, -1, -1): + t_ci = rt.begin + ci + a_col = A[b_idx, rt, h_idx, ci].to(torch.float32) + coeff_ci = beta[b_idx, t_ci, h_idx].to(torch.float32) + decay_ci = torch.exp(g[b_idx, t_ci, h_idx].to(torch.float32)) - for ci in range(C - 1, -1, -1): - t_ci = rt.begin + ci - a_col = A[b_idx, rt, h_idx, ci].to(torch.float32) - coeff_ci = beta[b_idx, t_ci, h_idx].to(torch.float32) - decay_ci = torch.exp(g[b_idx, t_ci, h_idx].to(torch.float32)) + k_ci = k[b_idx, t_ci, h_idx, :].to(torch.float32) + v_ci = v[b_idx, t_ci, h_idx, :].to(torch.float32) - k_ci = k[b_idx, t_ci, h_idx, :].to(torch.float32) - v_ci = v[b_idx, t_ci, h_idx, :].to(torch.float32) + w_acc2 = w_acc2 + a_col[:, None] * (k_ci * coeff_ci * decay_ci)[None, :] + u_acc2 = u_acc2 + a_col[:, None] * (v_ci * coeff_ci)[None, :] - w_acc2 = w_acc2 + a_col[:, None] * (k_ci * coeff_ci * decay_ci)[None, :] - u_acc2 = u_acc2 + a_col[:, None] * (v_ci * coeff_ci)[None, :] + w_out[b_idx, rt, h_idx, :] = ((w_acc1 + w_acc2) * 0.5).to(k.dtype) + u_out[b_idx, rt, h_idx, :] = ((u_acc1 + u_acc2) * 0.5).to(v.dtype) - w_out[b_idx, rt, h_idx, :] = ((w_acc1 + w_acc2) * 0.5).to(k.dtype) - u_out[b_idx, rt, h_idx, :] = ((u_acc1 + u_acc2) * 0.5).to(v.dtype) + return w_out, u_out - return w_out, u_out + return project_kv + + +_KERNELS = {shape: _make_kernel(cfg) for shape, cfg in SHAPE_CONFIGS.items()} def custom_kernel(data: input_t) -> output_t: k, v, beta, A, g = data - return project_kv(k, v, beta, A, g) + B, T, H, K = k.shape + V = v.shape[-1] + kernel = _KERNELS[(B, T, H, K, V)] + return kernel(k, v, beta, A, g) diff --git a/problems/helion/template.py b/problems/helion/template.py index 4aec6a6c..37d04820 100644 --- a/problems/helion/template.py +++ b/problems/helion/template.py @@ -1,5 +1,31 @@ from task import input_t, output_t +import torch +import helion +import helion.language as hl + + +# Per-shape configs: map input shape tuples to optimized helion.Config objects. +# Autotune locally for each shape, then paste the best config here. +# Include all test and benchmark shapes from task.yml. +SHAPE_CONFIGS: dict[tuple, helion.Config] = { + # (shape_dim_1, shape_dim_2, ...): helion.Config(...), # TODO: replace with your config +} + + +def _make_kernel(config: helion.Config): + @helion.kernel(static_shapes=True, config=config) + def kernel(...) -> ...: + # Your Helion kernel implementation + ... + + return kernel + + +_KERNELS = {shape: _make_kernel(cfg) for shape, cfg in SHAPE_CONFIGS.items()} def custom_kernel(data: input_t) -> output_t: + # Extract shape key from input tensors to select the right kernel + # shape_key = (...) + # kernel = _KERNELS[shape_key] pass From 7cce3d01ba23bafb3774b1766abf972e00086423 Mon Sep 17 00:00:00 2001 From: Will Feng Date: Fri, 13 Mar 2026 17:11:35 -0700 Subject: [PATCH 2/5] Add commented-out ACF usage hint to all Helion submissions Each submission now shows the matching ACF file path as a commented example so participants know which booster pack files to try. Co-Authored-By: Claude Opus 4.6 (1M context) --- problems/helion/causal_conv1d_py/submission.py | 5 +++++ problems/helion/fp8_quant_py/submission.py | 5 +++++ problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py | 5 +++++ problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py | 5 +++++ .../helion/gated_deltanet_recompute_w_u_py/submission.py | 5 +++++ 5 files changed, 25 insertions(+) diff --git a/problems/helion/causal_conv1d_py/submission.py b/problems/helion/causal_conv1d_py/submission.py index 53060a80..118874de 100644 --- a/problems/helion/causal_conv1d_py/submission.py +++ b/problems/helion/causal_conv1d_py/submission.py @@ -23,6 +23,11 @@ } +# Optional: add advanced_controls_file to your Config for extra performance (see docs). +# Autotune with autotune_search_acf to find the best ACF, then hardcode it: +# helion.Config(..., advanced_controls_file="/opt/booster_pack/causal_conv_0.acf") + + # NOTE: This is an intentionally inefficient baseline implementation. def _make_kernel(config: helion.Config): @helion.kernel(static_shapes=True, config=config) diff --git a/problems/helion/fp8_quant_py/submission.py b/problems/helion/fp8_quant_py/submission.py index 1fc324e7..b116e205 100644 --- a/problems/helion/fp8_quant_py/submission.py +++ b/problems/helion/fp8_quant_py/submission.py @@ -24,6 +24,11 @@ } +# Optional: add advanced_controls_file to your Config for extra performance (see docs). +# Autotune with autotune_search_acf to find the best ACF, then hardcode it: +# helion.Config(..., advanced_controls_file="/opt/booster_pack/fp8_group_quant_0.acf") + + # NOTE: This is an intentionally inefficient baseline implementation. def _make_kernel(config: helion.Config): @helion.kernel(static_shapes=True, config=config) diff --git a/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py b/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py index a24f3014..c00a4945 100644 --- a/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py +++ b/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py @@ -23,6 +23,11 @@ } +# Optional: add advanced_controls_file to your Config for extra performance (see docs). +# Autotune with autotune_search_acf to find the best ACF, then hardcode it: +# helion.Config(..., advanced_controls_file="/opt/booster_pack/chunk_fwd_h_0.acf") + + # NOTE: This is an intentionally inefficient baseline implementation. def _make_kernel(config: helion.Config): @helion.kernel(static_shapes=True, dot_precision="ieee", config=config) diff --git a/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py b/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py index f90e054a..31c49b38 100644 --- a/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py +++ b/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py @@ -23,6 +23,11 @@ } +# Optional: add advanced_controls_file to your Config for extra performance (see docs). +# Autotune with autotune_search_acf to find the best ACF, then hardcode it: +# helion.Config(..., advanced_controls_file="/opt/booster_pack/chunk_fwd_o_0.acf") + + # NOTE: This is an intentionally inefficient baseline implementation. def _make_kernel(config: helion.Config): @helion.kernel(static_shapes=True, dot_precision="ieee", config=config) diff --git a/problems/helion/gated_deltanet_recompute_w_u_py/submission.py b/problems/helion/gated_deltanet_recompute_w_u_py/submission.py index 555eb02c..8b002a1c 100644 --- a/problems/helion/gated_deltanet_recompute_w_u_py/submission.py +++ b/problems/helion/gated_deltanet_recompute_w_u_py/submission.py @@ -23,6 +23,11 @@ } +# Optional: add advanced_controls_file to your Config for extra performance (see docs). +# Autotune with autotune_search_acf to find the best ACF, then hardcode it: +# helion.Config(..., advanced_controls_file="/opt/booster_pack/recompute_w_u_fwd_0.acf") + + # NOTE: This is an intentionally inefficient baseline implementation. def _make_kernel(config: helion.Config): @helion.kernel(static_shapes=True, dot_precision="ieee", config=config) From 18d7e487882a08d1ba1ab56d856b454801ab3177 Mon Sep 17 00:00:00 2001 From: Will Feng Date: Fri, 13 Mar 2026 17:19:49 -0700 Subject: [PATCH 3/5] Rename inner kernel functions to 'kernel' in all submissions Co-Authored-By: Claude Opus 4.6 (1M context) --- problems/helion/causal_conv1d_py/submission.py | 4 ++-- problems/helion/fp8_quant_py/submission.py | 4 ++-- problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py | 4 ++-- problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py | 4 ++-- problems/helion/gated_deltanet_recompute_w_u_py/submission.py | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/problems/helion/causal_conv1d_py/submission.py b/problems/helion/causal_conv1d_py/submission.py index 118874de..ecaca557 100644 --- a/problems/helion/causal_conv1d_py/submission.py +++ b/problems/helion/causal_conv1d_py/submission.py @@ -31,7 +31,7 @@ # NOTE: This is an intentionally inefficient baseline implementation. def _make_kernel(config: helion.Config): @helion.kernel(static_shapes=True, config=config) - def conv1d_kernel( + def kernel( x_pad: torch.Tensor, # (B, D, L) zero-padded input w: torch.Tensor, # (D, W) filter coefficients b: torch.Tensor, # (D,) additive offset @@ -65,7 +65,7 @@ def conv1d_kernel( return y - return conv1d_kernel + return kernel _KERNELS = {shape: _make_kernel(cfg) for shape, cfg in SHAPE_CONFIGS.items()} diff --git a/problems/helion/fp8_quant_py/submission.py b/problems/helion/fp8_quant_py/submission.py index b116e205..f5773328 100644 --- a/problems/helion/fp8_quant_py/submission.py +++ b/problems/helion/fp8_quant_py/submission.py @@ -32,7 +32,7 @@ # NOTE: This is an intentionally inefficient baseline implementation. def _make_kernel(config: helion.Config): @helion.kernel(static_shapes=True, config=config) - def normalize_to_range( + def kernel( data: torch.Tensor, # [N, G] input rows scales_out: torch.Tensor, # [N] output normalization factors ) -> torch.Tensor: @@ -63,7 +63,7 @@ def normalize_to_range( return qout - return normalize_to_range + return kernel _KERNELS = {shape: _make_kernel(cfg) for shape, cfg in SHAPE_CONFIGS.items()} diff --git a/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py b/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py index c00a4945..d0663435 100644 --- a/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py +++ b/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py @@ -31,7 +31,7 @@ # NOTE: This is an intentionally inefficient baseline implementation. def _make_kernel(config: helion.Config): @helion.kernel(static_shapes=True, dot_precision="ieee", config=config) - def chunk_state_pass( + def kernel( k: torch.Tensor, # [B, T, H, K] w: torch.Tensor, # [B, T, H, K] u: torch.Tensor, # [B, T, H, V] @@ -83,7 +83,7 @@ def chunk_state_pass( return h_out, v_out - return chunk_state_pass + return kernel _KERNELS = {shape: _make_kernel(cfg) for shape, cfg in SHAPE_CONFIGS.items()} diff --git a/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py b/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py index 31c49b38..5fc31fec 100644 --- a/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py +++ b/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py @@ -31,7 +31,7 @@ # NOTE: This is an intentionally inefficient baseline implementation. def _make_kernel(config: helion.Config): @helion.kernel(static_shapes=True, dot_precision="ieee", config=config) - def gated_chunk_attn( + def kernel( q: torch.Tensor, # [B, T, H, K] k: torch.Tensor, # [B, T, H, K] v: torch.Tensor, # [B, T, H, V] @@ -75,7 +75,7 @@ def gated_chunk_attn( return out - return gated_chunk_attn + return kernel _KERNELS = {shape: _make_kernel(cfg) for shape, cfg in SHAPE_CONFIGS.items()} diff --git a/problems/helion/gated_deltanet_recompute_w_u_py/submission.py b/problems/helion/gated_deltanet_recompute_w_u_py/submission.py index 8b002a1c..73913bd2 100644 --- a/problems/helion/gated_deltanet_recompute_w_u_py/submission.py +++ b/problems/helion/gated_deltanet_recompute_w_u_py/submission.py @@ -31,7 +31,7 @@ # NOTE: This is an intentionally inefficient baseline implementation. def _make_kernel(config: helion.Config): @helion.kernel(static_shapes=True, dot_precision="ieee", config=config) - def project_kv( + def kernel( k: torch.Tensor, # [B, T, H, K] v: torch.Tensor, # [B, T, H, V] beta: torch.Tensor, # [B, T, H] @@ -86,7 +86,7 @@ def project_kv( return w_out, u_out - return project_kv + return kernel _KERNELS = {shape: _make_kernel(cfg) for shape, cfg in SHAPE_CONFIGS.items()} From 7c083310500ba679181029cac3538276f4c974cc Mon Sep 17 00:00:00 2001 From: Will Feng Date: Fri, 13 Mar 2026 21:23:33 -0700 Subject: [PATCH 4/5] Replace placeholder helion.Config(...) with original baseline configs --- .../helion/causal_conv1d_py/submission.py | 20 +++++++++---------- problems/helion/fp8_quant_py/submission.py | 18 ++++++++--------- .../submission.py | 20 +++++++++---------- .../submission.py | 20 +++++++++---------- .../submission.py | 20 +++++++++---------- 5 files changed, 49 insertions(+), 49 deletions(-) diff --git a/problems/helion/causal_conv1d_py/submission.py b/problems/helion/causal_conv1d_py/submission.py index ecaca557..248fd31a 100644 --- a/problems/helion/causal_conv1d_py/submission.py +++ b/problems/helion/causal_conv1d_py/submission.py @@ -9,17 +9,17 @@ # Autotune locally for each shape, then paste the best config here. SHAPE_CONFIGS: dict[tuple, helion.Config] = { # Test shapes - (1, 64, 64, 4): helion.Config(...), # TODO: replace with default config or any config that passes correctness check - (2, 128, 128, 4): helion.Config(...), # TODO: replace with default config or any config that passes correctness check - (1, 256, 256, 3): helion.Config(...), # TODO: replace with default config or any config that passes correctness check - (1, 128, 64, 8): helion.Config(...), # TODO: replace with default config or any config that passes correctness check - (4, 64, 128, 4): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + (1, 64, 64, 4): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check + (2, 128, 128, 4): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check + (1, 256, 256, 3): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check + (1, 128, 64, 8): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check + (4, 64, 128, 4): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check # Benchmark shapes - (1, 768, 512, 4): helion.Config(...), # TODO: replace with your autotuned config - (1, 768, 2048, 4): helion.Config(...), # TODO: replace with your autotuned config - (1, 1536, 2048, 4): helion.Config(...), # TODO: replace with your autotuned config - (1, 2560, 2048, 4): helion.Config(...), # TODO: replace with your autotuned config - (1, 2560, 4096, 4): helion.Config(...), # TODO: replace with your autotuned config + (1, 768, 512, 4): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (1, 768, 2048, 4): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (1, 1536, 2048, 4): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (1, 2560, 2048, 4): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (1, 2560, 4096, 4): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: replace with your autotuned config } diff --git a/problems/helion/fp8_quant_py/submission.py b/problems/helion/fp8_quant_py/submission.py index f5773328..af82843a 100644 --- a/problems/helion/fp8_quant_py/submission.py +++ b/problems/helion/fp8_quant_py/submission.py @@ -10,17 +10,17 @@ # Autotune locally for each shape, then paste the best config here. SHAPE_CONFIGS: dict[tuple, helion.Config] = { # Test shapes - (1, 256, 64): helion.Config(...), # TODO: replace with default config or any config that passes correctness check - (4, 512, 128): helion.Config(...), # TODO: replace with default config or any config that passes correctness check - (16, 1024, 64): helion.Config(...), # TODO: replace with default config or any config that passes correctness check - (1, 4096, 128): helion.Config(...), # TODO: replace with default config or any config that passes correctness check - (8, 4096, 128): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + (1, 256, 64): helion.Config(block_sizes=[1], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check + (4, 512, 128): helion.Config(block_sizes=[1], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check + (16, 1024, 64): helion.Config(block_sizes=[1], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check + (1, 4096, 128): helion.Config(block_sizes=[1], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check + (8, 4096, 128): helion.Config(block_sizes=[1], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check # Benchmark shapes # (1, 4096, 128) already covered above - (16, 4096, 128): helion.Config(...), # TODO: replace with your autotuned config - (256, 4096, 128): helion.Config(...), # TODO: replace with your autotuned config - (256, 8192, 128): helion.Config(...), # TODO: replace with your autotuned config - (4096, 7168, 128): helion.Config(...), # TODO: replace with your autotuned config + (16, 4096, 128): helion.Config(block_sizes=[1], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (256, 4096, 128): helion.Config(block_sizes=[1], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (256, 8192, 128): helion.Config(block_sizes=[1], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (4096, 7168, 128): helion.Config(block_sizes=[1], num_warps=1, num_stages=1), # TODO: replace with your autotuned config } diff --git a/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py b/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py index d0663435..34ca5bff 100644 --- a/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py +++ b/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py @@ -9,17 +9,17 @@ # Autotune locally for each shape, then paste the best config here. SHAPE_CONFIGS: dict[tuple, helion.Config] = { # Test shapes - (1, 64, 2, 64, 64): helion.Config(...), # TODO: replace with default config or any config that passes correctness check - (2, 128, 4, 64, 64): helion.Config(...), # TODO: replace with default config or any config that passes correctness check - (1, 256, 4, 64, 128): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + (1, 64, 2, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check + (2, 128, 4, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check + (1, 256, 4, 64, 128): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check # Benchmark shapes - (1, 64, 1, 64, 64): helion.Config(...), # TODO: replace with your autotuned config - (2, 512, 3, 64, 64): helion.Config(...), # TODO: replace with your autotuned config - (2, 1024, 3, 64, 64): helion.Config(...), # TODO: replace with your autotuned config - (3, 1024, 4, 100, 100): helion.Config(...), # TODO: replace with your autotuned config - (4, 1024, 4, 128, 128): helion.Config(...), # TODO: replace with your autotuned config - (2, 1536, 4, 128, 128): helion.Config(...), # TODO: replace with your autotuned config - (4, 2048, 8, 64, 64): helion.Config(...), # TODO: replace with your autotuned config + (1, 64, 1, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (2, 512, 3, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (2, 1024, 3, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (3, 1024, 4, 100, 100): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (4, 1024, 4, 128, 128): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (2, 1536, 4, 128, 128): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (4, 2048, 8, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config } diff --git a/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py b/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py index 5fc31fec..70c17e67 100644 --- a/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py +++ b/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py @@ -9,17 +9,17 @@ # Autotune locally for each shape, then paste the best config here. SHAPE_CONFIGS: dict[tuple, helion.Config] = { # Test shapes - (1, 64, 2, 64, 64): helion.Config(...), # TODO: replace with default config or any config that passes correctness check - (2, 128, 4, 64, 64): helion.Config(...), # TODO: replace with default config or any config that passes correctness check - (1, 256, 4, 64, 128): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + (1, 64, 2, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check + (2, 128, 4, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check + (1, 256, 4, 64, 128): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check # Benchmark shapes - (1, 64, 1, 64, 64): helion.Config(...), # TODO: replace with your autotuned config - (2, 512, 3, 64, 64): helion.Config(...), # TODO: replace with your autotuned config - (2, 1024, 3, 64, 64): helion.Config(...), # TODO: replace with your autotuned config - (3, 1024, 4, 100, 100): helion.Config(...), # TODO: replace with your autotuned config - (4, 1024, 4, 128, 128): helion.Config(...), # TODO: replace with your autotuned config - (2, 1536, 4, 128, 128): helion.Config(...), # TODO: replace with your autotuned config - (4, 2048, 8, 64, 64): helion.Config(...), # TODO: replace with your autotuned config + (1, 64, 1, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (2, 512, 3, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (2, 1024, 3, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (3, 1024, 4, 100, 100): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (4, 1024, 4, 128, 128): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (2, 1536, 4, 128, 128): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (4, 2048, 8, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config } diff --git a/problems/helion/gated_deltanet_recompute_w_u_py/submission.py b/problems/helion/gated_deltanet_recompute_w_u_py/submission.py index 73913bd2..f507f21e 100644 --- a/problems/helion/gated_deltanet_recompute_w_u_py/submission.py +++ b/problems/helion/gated_deltanet_recompute_w_u_py/submission.py @@ -9,17 +9,17 @@ # Autotune locally for each shape, then paste the best config here. SHAPE_CONFIGS: dict[tuple, helion.Config] = { # Test shapes - (1, 64, 2, 64, 64): helion.Config(...), # TODO: replace with default config or any config that passes correctness check - (2, 128, 4, 64, 64): helion.Config(...), # TODO: replace with default config or any config that passes correctness check - (1, 256, 4, 64, 128): helion.Config(...), # TODO: replace with default config or any config that passes correctness check + (1, 64, 2, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check + (2, 128, 4, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check + (1, 256, 4, 64, 128): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check # Benchmark shapes - (1, 64, 1, 64, 64): helion.Config(...), # TODO: replace with your autotuned config - (2, 512, 3, 64, 64): helion.Config(...), # TODO: replace with your autotuned config - (2, 1024, 3, 64, 64): helion.Config(...), # TODO: replace with your autotuned config - (3, 1024, 4, 100, 100): helion.Config(...), # TODO: replace with your autotuned config - (4, 1024, 4, 128, 128): helion.Config(...), # TODO: replace with your autotuned config - (2, 1536, 4, 128, 128): helion.Config(...), # TODO: replace with your autotuned config - (4, 2048, 8, 64, 64): helion.Config(...), # TODO: replace with your autotuned config + (1, 64, 1, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (2, 512, 3, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (2, 1024, 3, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (3, 1024, 4, 100, 100): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (4, 1024, 4, 128, 128): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (2, 1536, 4, 128, 128): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config + (4, 2048, 8, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config } From ec9d47a23a211d91318b7984af5c9613c442605f Mon Sep 17 00:00:00 2001 From: Will Feng Date: Fri, 13 Mar 2026 21:30:43 -0700 Subject: [PATCH 5/5] Simplify TODO comments for test shape configs --- problems/helion/causal_conv1d_py/submission.py | 10 +++++----- problems/helion/fp8_quant_py/submission.py | 10 +++++----- .../helion/gated_deltanet_chunk_fwd_h_py/submission.py | 6 +++--- .../helion/gated_deltanet_chunk_fwd_o_py/submission.py | 6 +++--- .../gated_deltanet_recompute_w_u_py/submission.py | 6 +++--- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/problems/helion/causal_conv1d_py/submission.py b/problems/helion/causal_conv1d_py/submission.py index 248fd31a..92716763 100644 --- a/problems/helion/causal_conv1d_py/submission.py +++ b/problems/helion/causal_conv1d_py/submission.py @@ -9,11 +9,11 @@ # Autotune locally for each shape, then paste the best config here. SHAPE_CONFIGS: dict[tuple, helion.Config] = { # Test shapes - (1, 64, 64, 4): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check - (2, 128, 128, 4): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check - (1, 256, 256, 3): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check - (1, 128, 64, 8): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check - (4, 64, 128, 4): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check + (1, 64, 64, 4): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: use any config that passes correctness check + (2, 128, 128, 4): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: use any config that passes correctness check + (1, 256, 256, 3): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: use any config that passes correctness check + (1, 128, 64, 8): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: use any config that passes correctness check + (4, 64, 128, 4): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: use any config that passes correctness check # Benchmark shapes (1, 768, 512, 4): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: replace with your autotuned config (1, 768, 2048, 4): helion.Config(block_sizes=[1, 8], num_warps=1, num_stages=1), # TODO: replace with your autotuned config diff --git a/problems/helion/fp8_quant_py/submission.py b/problems/helion/fp8_quant_py/submission.py index af82843a..4b562fa9 100644 --- a/problems/helion/fp8_quant_py/submission.py +++ b/problems/helion/fp8_quant_py/submission.py @@ -10,11 +10,11 @@ # Autotune locally for each shape, then paste the best config here. SHAPE_CONFIGS: dict[tuple, helion.Config] = { # Test shapes - (1, 256, 64): helion.Config(block_sizes=[1], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check - (4, 512, 128): helion.Config(block_sizes=[1], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check - (16, 1024, 64): helion.Config(block_sizes=[1], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check - (1, 4096, 128): helion.Config(block_sizes=[1], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check - (8, 4096, 128): helion.Config(block_sizes=[1], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check + (1, 256, 64): helion.Config(block_sizes=[1], num_warps=1, num_stages=1), # TODO: use any config that passes correctness check + (4, 512, 128): helion.Config(block_sizes=[1], num_warps=1, num_stages=1), # TODO: use any config that passes correctness check + (16, 1024, 64): helion.Config(block_sizes=[1], num_warps=1, num_stages=1), # TODO: use any config that passes correctness check + (1, 4096, 128): helion.Config(block_sizes=[1], num_warps=1, num_stages=1), # TODO: use any config that passes correctness check + (8, 4096, 128): helion.Config(block_sizes=[1], num_warps=1, num_stages=1), # TODO: use any config that passes correctness check # Benchmark shapes # (1, 4096, 128) already covered above (16, 4096, 128): helion.Config(block_sizes=[1], num_warps=1, num_stages=1), # TODO: replace with your autotuned config diff --git a/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py b/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py index 34ca5bff..04e0ecfc 100644 --- a/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py +++ b/problems/helion/gated_deltanet_chunk_fwd_h_py/submission.py @@ -9,9 +9,9 @@ # Autotune locally for each shape, then paste the best config here. SHAPE_CONFIGS: dict[tuple, helion.Config] = { # Test shapes - (1, 64, 2, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check - (2, 128, 4, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check - (1, 256, 4, 64, 128): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check + (1, 64, 2, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: use any config that passes correctness check + (2, 128, 4, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: use any config that passes correctness check + (1, 256, 4, 64, 128): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: use any config that passes correctness check # Benchmark shapes (1, 64, 1, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config (2, 512, 3, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config diff --git a/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py b/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py index 70c17e67..0743521d 100644 --- a/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py +++ b/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py @@ -9,9 +9,9 @@ # Autotune locally for each shape, then paste the best config here. SHAPE_CONFIGS: dict[tuple, helion.Config] = { # Test shapes - (1, 64, 2, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check - (2, 128, 4, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check - (1, 256, 4, 64, 128): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check + (1, 64, 2, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: use any config that passes correctness check + (2, 128, 4, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: use any config that passes correctness check + (1, 256, 4, 64, 128): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: use any config that passes correctness check # Benchmark shapes (1, 64, 1, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config (2, 512, 3, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config diff --git a/problems/helion/gated_deltanet_recompute_w_u_py/submission.py b/problems/helion/gated_deltanet_recompute_w_u_py/submission.py index f507f21e..07fb0691 100644 --- a/problems/helion/gated_deltanet_recompute_w_u_py/submission.py +++ b/problems/helion/gated_deltanet_recompute_w_u_py/submission.py @@ -9,9 +9,9 @@ # Autotune locally for each shape, then paste the best config here. SHAPE_CONFIGS: dict[tuple, helion.Config] = { # Test shapes - (1, 64, 2, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check - (2, 128, 4, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check - (1, 256, 4, 64, 128): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with default config or any config that passes correctness check + (1, 64, 2, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: use any config that passes correctness check + (2, 128, 4, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: use any config that passes correctness check + (1, 256, 4, 64, 128): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: use any config that passes correctness check # Benchmark shapes (1, 64, 1, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config (2, 512, 3, 64, 64): helion.Config(block_sizes=[], num_warps=1, num_stages=1), # TODO: replace with your autotuned config