gpu-mode · S1ro1 · Mar 14, 2026 · Mar 14, 2026 · Mar 14, 2026
diff --git a/problems/helion/gated_deltanet_chunk_fwd_h_py/reference.py b/problems/helion/gated_deltanet_chunk_fwd_h_py/reference.py
@@ -19,8 +19,9 @@ def _chunk_scaled_dot_kkt_fwd_eager(k, g_cumsum, beta, chunk_size):
     g_c = g_cumsum.float().reshape(B, NT, C, H).permute(0, 1, 3, 2)
     beta_c = beta.float().reshape(B, NT, C, H).permute(0, 1, 3, 2)
     kkt = k_c @ k_c.transpose(-1, -2)
-    g_diff = g_c.unsqueeze(-1) - g_c.unsqueeze(-2)
     strict_lower = torch.tril(torch.ones(C, C, device=k.device), diagonal=-1)
+    g_diff = g_c.unsqueeze(-1) - g_c.unsqueeze(-2)
+    g_diff = g_diff * strict_lower
     A = kkt * beta_c.unsqueeze(-1) * torch.exp(g_diff) * strict_lower
     return A.permute(0, 1, 3, 2, 4).reshape(B, T, H, C).to(torch.float32)
 

diff --git a/problems/helion/gated_deltanet_chunk_fwd_o_py/reference.py b/problems/helion/gated_deltanet_chunk_fwd_o_py/reference.py
@@ -19,8 +19,9 @@ def _chunk_scaled_dot_kkt_fwd_eager(k, g_cumsum, beta, chunk_size):
     g_c = g_cumsum.float().reshape(B, NT, C, H).permute(0, 1, 3, 2)
     beta_c = beta.float().reshape(B, NT, C, H).permute(0, 1, 3, 2)
     kkt = k_c @ k_c.transpose(-1, -2)
-    g_diff = g_c.unsqueeze(-1) - g_c.unsqueeze(-2)
     strict_lower = torch.tril(torch.ones(C, C, device=k.device), diagonal=-1)
+    g_diff = g_c.unsqueeze(-1) - g_c.unsqueeze(-2)
+    g_diff = g_diff * strict_lower
     A = kkt * beta_c.unsqueeze(-1) * torch.exp(g_diff) * strict_lower
     return A.permute(0, 1, 3, 2, 4).reshape(B, T, H, C).to(torch.float32)
 
@@ -103,9 +104,11 @@ def ref_kernel(data: input_t) -> output_t:
     v_c = v_new.float().reshape(B, NT, C, H, V).permute(0, 1, 3, 2, 4)
     g_c = g.float().reshape(B, NT, C, H).permute(0, 1, 3, 2)
     o_inter = (q_c @ h.float()) * torch.exp(g_c).unsqueeze(-1)
-    qk = q_c @ k_c.transpose(-1, -2) * torch.exp(g_c.unsqueeze(-1) - g_c.unsqueeze(-2))
-    causal = torch.tril(torch.ones(C, C, device=q.device))
-    o = (o_inter + (qk * causal) @ v_c) * scale
+    causal = torch.tril(torch.ones(C, C, dtype=torch.bool, device=q.device))
+    g_diff = g_c.unsqueeze(-1) - g_c.unsqueeze(-2)
+    g_diff = torch.where(causal, g_diff, torch.zeros_like(g_diff))
+    qk = q_c @ k_c.transpose(-1, -2) * torch.exp(g_diff) * causal
+    o = (o_inter + qk @ v_c) * scale
     return o.permute(0, 1, 3, 2, 4).reshape(B, T, H, V).to(q.dtype)
 
 

diff --git a/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py b/problems/helion/gated_deltanet_chunk_fwd_o_py/submission.py
@@ -54,15 +54,20 @@ def kernel(
             c_idx = tile_t.begin // C
 
             g_vals = g[b_idx, tile_t, h_idx]
-            q_s = q[b_idx, tile_t, h_idx, :] * torch.exp(g_vals)[:, None]
-            k_s = k[b_idx, tile_t, h_idx, :] * torch.exp(-g_vals)[:, None]
+            q_tile = q[b_idx, tile_t, h_idx, :]
+            k_tile = k[b_idx, tile_t, h_idx, :]
+            v_tile = v[b_idx, tile_t, h_idx, :]
 
-            sim = hl.dot(q_s, k_s.T)
+            # intra-chunk: (q @ k^T) * exp(g_i - g_j), causally masked to j <= i (diagonal kept)
+            qk = hl.dot(q_tile, k_tile.T)
             idx = hl.arange(tile_t.block_size)
-            mask = idx[:, None] >= idx[None, :]
-            sim = torch.where(mask, sim, 0.0)
-            local_out = hl.dot(sim.to(v.dtype), v[b_idx, tile_t, h_idx, :])
+            g_diff = g_vals[:, None] - g_vals[None, :]
+            causal_mask = idx[:, None] >= idx[None, :]
+            sim = torch.where(causal_mask, qk * torch.exp(g_diff), 0.0)
+            local_out = hl.dot(sim.to(v.dtype), v_tile)
 
+            # inter-chunk: (q @ h) * exp(g)
+            q_s = q_tile * torch.exp(g_vals)[:, None]
             global_out = hl.dot(q_s, h[b_idx, c_idx, h_idx, :, :])
 
             out[b_idx, tile_t, h_idx, :] = ((global_out + local_out) * scale).to(out.dtype)

diff --git a/problems/helion/gated_deltanet_recompute_w_u_py/reference.py b/problems/helion/gated_deltanet_recompute_w_u_py/reference.py
@@ -19,8 +19,9 @@ def _chunk_scaled_dot_kkt_fwd_eager(k, g_cumsum, beta, chunk_size):
     g_c = g_cumsum.float().reshape(B, NT, C, H).permute(0, 1, 3, 2)
     beta_c = beta.float().reshape(B, NT, C, H).permute(0, 1, 3, 2)
     kkt = k_c @ k_c.transpose(-1, -2)
-    g_diff = g_c.unsqueeze(-1) - g_c.unsqueeze(-2)
     strict_lower = torch.tril(torch.ones(C, C, device=k.device), diagonal=-1)
+    g_diff = g_c.unsqueeze(-1) - g_c.unsqueeze(-2)
+    g_diff = g_diff * strict_lower
     A = kkt * beta_c.unsqueeze(-1) * torch.exp(g_diff) * strict_lower
     return A.permute(0, 1, 3, 2, 4).reshape(B, T, H, C).to(torch.float32)