From b29e3ac6dc211faf74b28c5bb7dbb52a17ef4787 Mon Sep 17 00:00:00 2001
From: David Berard
Date: Fri, 26 Sep 2025 17:13:29 -0700
Subject: [PATCH] [trimul] Calculate Cauchy distribution on GPU

While running eval.py locally, I observed super long iteration times for
any Cauchy distributions. Profiles show that most of the time is spent
on the CPU generating Cauchy distributions. For example, on an H100, I
saw a 100-iteration benchmarking run for a BS=2, dim=256, seqlen=128,
cauchy distribution take 13 seconds without this change, and <1s after
this change.

My suspicion is that the previous behavior could lead to some
submissions timing out if they have more variation than average.
---
 problems/bioml/trimul/reference.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/problems/bioml/trimul/reference.py b/problems/bioml/trimul/reference.py
index 86c1ee1c..e02653dd 100644
--- a/problems/bioml/trimul/reference.py
+++ b/problems/bioml/trimul/reference.py
@@ -134,7 +134,9 @@ def generate_input(
     # Generate input tensor based on distribution
     if distribution == "cauchy":
         # Heavier tail distribution
-        input_tensor = torch.distributions.Cauchy(0, 2).sample(
+        zero = torch.tensor(0.0, device="cuda")
+        two = torch.tensor(2.0, device="cuda")
+        input_tensor = torch.distributions.Cauchy(zero, two).sample(
             (batch_size, seq_len, seq_len, dim)
         ).to(device='cuda', dtype=torch.float32)
     else: # normal distribution
@@ -165,4 +167,4 @@ def generate_input(
 
     return (input_tensor, mask, weights, config)
 
-check_implementation = make_match_reference(ref_kernel, rtol=2e-2, atol=2e-2)
\ No newline at end of file
+check_implementation = make_match_reference(ref_kernel, rtol=2e-2, atol=2e-2)