From b29e3ac6dc211faf74b28c5bb7dbb52a17ef4787 Mon Sep 17 00:00:00 2001
From: David Berard
Date: Fri, 26 Sep 2025 17:13:29 -0700
Subject: [PATCH] [trimul] Calculate Cauchy distribution on GPU

While running eval.py locally, I observed super long iteration times for
any Cauchy distributions. Profiles show that most of the time is spent
on the CPU generating Cauchy distributions. For example, on an H100, I
saw a 100-iteration benchmarking run for a BS=2, dim=256, seqlen=128,
cauchy distribution take 13 seconds without this change, and <1s after
this change.

My suspicion is that the previous behavior could lead to some
submissions timing out if they have more variation than average.
---
 problems/bioml/trimul/reference.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/problems/bioml/trimul/reference.py b/problems/bioml/trimul/reference.py
index 86c1ee1c..e02653dd 100644
--- a/problems/bioml/trimul/reference.py
+++ b/problems/bioml/trimul/reference.py
@@ -134,7 +134,9 @@ def generate_input(
     # Generate input tensor based on distribution
     if distribution == "cauchy":
         # Heavier tail distribution
-        input_tensor = torch.distributions.Cauchy(0, 2).sample(
+        zero = torch.tensor(0.0, device="cuda")
+        two = torch.tensor(2.0, device="cuda")
+        input_tensor = torch.distributions.Cauchy(zero, two).sample(
             (batch_size, seq_len, seq_len, dim)
         ).to(device='cuda', dtype=torch.float32)
     else: # normal distribution
@@ -165,4 +167,4 @@ def generate_input(
 
     return (input_tensor, mask, weights, config)
 
-check_implementation = make_match_reference(ref_kernel, rtol=2e-2, atol=2e-2)
\ No newline at end of file
+check_implementation = make_match_reference(ref_kernel, rtol=2e-2, atol=2e-2)