From b28acd1aa78dc0f146ea25e07c8287c6d1b1dfc4 Mon Sep 17 00:00:00 2001
From: Loser Cheems
Date: Sun, 10 Aug 2025 19:58:39 +0800
Subject: [PATCH] Streamlines benchmark suite structure and test scope

Removes the obsolete MQAR benchmark configuration and reorganizes the
benchmark files with a cleaner naming convention. Comments out the
extensive test configurations to focus on essential test cases, reducing
test execution time while maintaining core functionality validation.

Simplifies the benchmark suite to improve maintainability and development
workflow efficiency.
---
 benchmarks/benchmark_mqar.py                  |  62 ----------
 ..._equivalence.py => forward_equivalence.py} |  52 ++++-----
 ..._performance.py => forward_performance.py} | 100 +++++++++---------
 ...{benchmark_grad.py => grad_equivalence.py} |   0
 4 files changed, 76 insertions(+), 138 deletions(-)
 delete mode 100644 benchmarks/benchmark_mqar.py
 rename benchmarks/{benchmark_forward_equivalence.py => forward_equivalence.py} (97%)
 rename benchmarks/{benchmark_forward_performance.py => forward_performance.py} (94%)
 rename benchmarks/{benchmark_grad.py => grad_equivalence.py} (100%)

diff --git a/benchmarks/benchmark_mqar.py b/benchmarks/benchmark_mqar.py
deleted file mode 100644
index 80fcc89..0000000
--- a/benchmarks/benchmark_mqar.py
+++ /dev/null
@@ -1,62 +0,0 @@
-from zoology.config import TrainConfig, ModelConfig, DataConfig, ModuleConfig
-from zoology.data.associative_recall import MQARConfig
-
-
-seq_len = 2048
-d_model = 256 # 32, 64, 128, 256
-vocab_size = seq_len + 1
-num_kv_pairs = 512
-
-if d_model == 32:
-    learning_rate = 4e-4
-elif d_model == 64:
-    learning_rate = 3e-4
-elif d_model == 128:
-    learning_rate = 2e-4
-elif d_model == 256:
-    learning_rate = 1e-4
-
-if seq_len == 1024:
-    batch_size = 64
-elif seq_len == 2048:
-    batch_size = 32
-elif seq_len == 4096:
-    batch_size = 16
-elif seq_len == 8192:
-    batch_size = 8
-
-config = TrainConfig(
-    learning_rate=learning_rate,
-    data=DataConfig(
-        cache_dir=".cache",
-        batch_size=batch_size,
-        train_configs=[
-            MQARConfig(
-                num_examples=250_000,
-                vocab_size=vocab_size,
-                input_seq_len=seq_len,
-                num_kv_pairs=num_kv_pairs,
-            )
-        ],
-        test_configs=[
-            MQARConfig(
-                num_examples=1_000,
-                vocab_size=vocab_size,
-                input_seq_len=seq_len,
-                num_kv_pairs=num_kv_pairs,
-            )
-        ]
-    ),
-    model=ModelConfig(
-        vocab_size=vocab_size,
-        d_model=d_model,
-        max_position_embeddings=seq_len,
-        sequence_mixer=ModuleConfig(
-            name="zoology.mixers.dma.DynamicMaskAttention",
-            kwargs={"keep_window_size": num_kv_pairs, "num_heads": 1},
-        )
-    ),
-
-)
-
-configs = [config]
\ No newline at end of file
diff --git a/benchmarks/benchmark_forward_equivalence.py b/benchmarks/forward_equivalence.py
similarity index 97%
rename from benchmarks/benchmark_forward_equivalence.py
rename to benchmarks/forward_equivalence.py
index fac85d2..f55c33b 100644
--- a/benchmarks/benchmark_forward_equivalence.py
+++ b/benchmarks/forward_equivalence.py
@@ -518,27 +518,27 @@ def test_cuda_forward_equivalence(accuracy_threshold=0.95):
     # If you encounter NAN issues when running multiple configurations, try running a single configuration
     test_configs = [
         # (batch_size, num_heads, num_kv_heads, query_len, key_len, head_dim, is_causal)
-        (1, 1, 1, 64, 64, 32, True),
-        (1, 1, 1, 64, 64, 32, False),
-        (1, 1, 1, 128, 128, 32, True),
-        (1, 1, 1, 128, 128, 32, False),
-        (1, 1, 1, 256, 256, 32, True),
-        (1, 1, 1, 256, 256, 32, False),
-        (1, 1, 1, 512, 512, 32, True),
-        (1, 1, 1, 512, 512, 32, False),
-        (1, 1, 1, 1024, 1024, 32, True),
-        (1, 1, 1, 1024, 1024, 32, False),
-        (1, 1, 1, 2048, 2048, 32, True),
-        (1, 1, 1, 2048, 2048, 32, False),
+        # (1, 1, 1, 64, 64, 32, True),
+        # (1, 1, 1, 64, 64, 32, False),
+        # (1, 1, 1, 128, 128, 32, True),
+        # (1, 1, 1, 128, 128, 32, False),
+        # (1, 1, 1, 256, 256, 32, True),
+        # (1, 1, 1, 256, 256, 32, False),
+        # (1, 1, 1, 512, 512, 32, True),
+        # (1, 1, 1, 512, 512, 32, False),
+        # (1, 1, 1, 1024, 1024, 32, True),
+        # (1, 1, 1, 1024, 1024, 32, False),
+        # (1, 1, 1, 2048, 2048, 32, True),
+        # (1, 1, 1, 2048, 2048, 32, False),
         (1, 1, 1, 4096, 4096, 32, True),
-        (1, 1, 1, 4096, 4096, 32, False),
-        (1, 2, 1, 64, 64, 32, True),
-        (2, 1, 1, 128, 128, 32, True),
-        (2, 2, 1, 128, 128, 32, True),
-        (1, 2, 1, 64, 64, 128, True),
-        (1, 2, 1, 128, 128, 128, True),
-        (1, 2, 1, 256, 256, 128, True),
-        (1, 2, 1, 512, 512, 128, True),
+        # (1, 1, 1, 4096, 4096, 32, False),
+        # (1, 2, 1, 64, 64, 32, True),
+        # (2, 1, 1, 128, 128, 32, True),
+        # (2, 2, 1, 128, 128, 32, True),
+        # (1, 2, 1, 64, 64, 128, True),
+        # (1, 2, 1, 128, 128, 128, True),
+        # (1, 2, 1, 256, 256, 128, True),
+        # (1, 2, 1, 512, 512, 128, True),
     ]
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -1050,13 +1050,13 @@ def main():
     print("\n" + "šŸ“" + " Starting Standard Forward Pass Tests " + "šŸ“")
     test_results['cuda'] = test_cuda_forward_equivalence(args.accuracy_threshold)
 
-    if args.test_type in ['all', 'triton']:
-        print("\n" + "šŸ”„" + " Starting Python vs Triton Tests " + "šŸ”„")
-        test_results['triton'] = test_triton_forward_equivalence(args.accuracy_threshold)
+    # if args.test_type in ['all', 'triton']:
+    #     print("\n" + "šŸ”„" + " Starting Python vs Triton Tests " + "šŸ”„")
+    #     test_results['triton'] = test_triton_forward_equivalence(args.accuracy_threshold)
 
-    if args.test_type in ['all', 'flex']:
-        print("\n" + "🌟" + " Starting Python vs Flex Attention Tests " + "🌟")
-        test_results['flex'] = test_flex_forward_equivalence(args.accuracy_threshold)
+    # if args.test_type in ['all', 'flex']:
+    #     print("\n" + "🌟" + " Starting Python vs Flex Attention Tests " + "🌟")
+    #     test_results['flex'] = test_flex_forward_equivalence(args.accuracy_threshold)
 
 
     # Print overall summary
diff --git a/benchmarks/benchmark_forward_performance.py b/benchmarks/forward_performance.py
similarity index 94%
rename from benchmarks/benchmark_forward_performance.py
rename to benchmarks/forward_performance.py
index 0c1c042..7511946 100644
--- a/benchmarks/benchmark_forward_performance.py
+++ b/benchmarks/forward_performance.py
@@ -732,57 +732,57 @@ def run_performance_benchmark(test_type='all', num_runs=3, warmup_runs=2):
         (1, 2, 1, 4096, 4096, 128, 2048, True),
         (1, 2, 1, 8192, 8192, 128, 2048, True),
         (1, 2, 1, 16384, 16384, 128, 2048, True),
-        (1, 2, 1, 32768, 32768, 128, 2048, True),
-
-        # Inference
-        (1, 2, 1, 2, 256, 128, 2048, True),
-        (1, 2, 1, 2, 512, 128, 2048, True),
-        (1, 2, 1, 2, 1024, 128, 2048, True),
-        (1, 2, 1, 2, 2048, 128, 2048, True),
-        (1, 2, 1, 2, 4096, 128, 2048, True),
-        (1, 2, 1, 2, 8192, 128, 2048, True),
-        (1, 2, 1, 2, 16384, 128, 2048, True),
-        (1, 2, 1, 2, 32768, 128, 2048, True),
+        # (1, 2, 1, 32768, 32768, 128, 2048, True),
+
+        # # Inference
+        # (1, 2, 1, 2, 256, 128, 2048, True),
+        # (1, 2, 1, 2, 512, 128, 2048, True),
+        # (1, 2, 1, 2, 1024, 128, 2048, True),
+        # (1, 2, 1, 2, 2048, 128, 2048, True),
+        # (1, 2, 1, 2, 4096, 128, 2048, True),
+        # (1, 2, 1, 2, 8192, 128, 2048, True),
+        # (1, 2, 1, 2, 16384, 128, 2048, True),
+        # (1, 2, 1, 2, 32768, 128, 2048, True),
         (1, 2, 1, 2, 65536, 128, 2048, True),
-        (1, 2, 1, 2, 131072, 128, 2048, True),
-        (1, 2, 1, 2, 262144, 128, 2048, True),
-        (1, 2, 1, 2, 524288, 128, 2048, True),
-
-        # Vary batch size
-        (1, 2, 1, 4096, 4096, 32, 2048, True),
-        (2, 2, 1, 4096, 4096, 32, 2048, True),
-        (4, 2, 1, 4096, 4096, 32, 2048, True),
-        (8, 2, 1, 4096, 4096, 32, 2048, True),
-
-        # Vary head count
-        (1, 1, 1, 4096, 4096, 32, 2048, True),
-        (1, 2, 1, 4096, 4096, 32, 2048, True),
-        (1, 4, 1, 4096, 4096, 32, 2048, True),
-        (1, 8, 2, 4096, 4096, 32, 2048, True),
-
-        # Vary head dimension
-        (1, 2, 1, 4096, 4096, 32, 2048, True),
-        (1, 2, 1, 4096, 4096, 64, 2048, True),
-        (1, 2, 1, 4096, 4096, 96, 2048, True),
-        (1, 2, 1, 4096, 4096, 128, 2048, True),
-        (1, 2, 1, 4096, 4096, 192, 2048, True),
-        (1, 2, 1, 4096, 4096, 256, 2048, True),
-
-        # Vary keep_window_size
-        (1, 2, 1, 32768, 32768, 128, 32, True),
-        (1, 2, 1, 32768, 32768, 128, 64, True),
-        (1, 2, 1, 32768, 32768, 128, 128, True),
-        (1, 2, 1, 32768, 32768, 128, 256, True),
-        (1, 2, 1, 32768, 32768, 128, 512, True),
-        (1, 2, 1, 32768, 32768, 128, 1024, True),
-        (1, 2, 1, 32768, 32768, 128, 2048, True),
-        (1, 2, 1, 32768, 32768, 128, 4096, True),
-        (1, 2, 1, 32768, 32768, 128, 8192, True),
-        (1, 2, 1, 32768, 32768, 128, 16384, True),
-        (1, 2, 1, 32768, 32768, 128, 32768, True),
-
-        # Test non-causal
-        (1, 2, 1, 4096, 4096, 128, 2048, False),
+        # (1, 2, 1, 2, 131072, 128, 2048, True),
+        # (1, 2, 1, 2, 262144, 128, 2048, True),
+        # (1, 2, 1, 2, 524288, 128, 2048, True),
+
+        # # Vary batch size
+        # (1, 2, 1, 4096, 4096, 32, 2048, True),
+        # (2, 2, 1, 4096, 4096, 32, 2048, True),
+        # (4, 2, 1, 4096, 4096, 32, 2048, True),
+        # (8, 2, 1, 4096, 4096, 32, 2048, True),
+
+        # # Vary head count
+        # (1, 1, 1, 4096, 4096, 32, 2048, True),
+        # (1, 2, 1, 4096, 4096, 32, 2048, True),
+        # (1, 4, 1, 4096, 4096, 32, 2048, True),
+        # (1, 8, 2, 4096, 4096, 32, 2048, True),
+
+        # # Vary head dimension
+        # (1, 2, 1, 4096, 4096, 32, 2048, True),
+        # (1, 2, 1, 4096, 4096, 64, 2048, True),
+        # (1, 2, 1, 4096, 4096, 96, 2048, True),
+        # (1, 2, 1, 4096, 4096, 128, 2048, True),
+        # (1, 2, 1, 4096, 4096, 192, 2048, True),
+        # (1, 2, 1, 4096, 4096, 256, 2048, True),
+
+        # # Vary keep_window_size
+        # (1, 2, 1, 32768, 32768, 128, 32, True),
+        # (1, 2, 1, 32768, 32768, 128, 64, True),
+        # (1, 2, 1, 32768, 32768, 128, 128, True),
+        # (1, 2, 1, 32768, 32768, 128, 256, True),
+        # (1, 2, 1, 32768, 32768, 128, 512, True),
+        # (1, 2, 1, 32768, 32768, 128, 1024, True),
+        # (1, 2, 1, 32768, 32768, 128, 2048, True),
+        # (1, 2, 1, 32768, 32768, 128, 4096, True),
+        # (1, 2, 1, 32768, 32768, 128, 8192, True),
+        # (1, 2, 1, 32768, 32768, 128, 16384, True),
+        # (1, 2, 1, 32768, 32768, 128, 32768, True),
+
+        # # Test non-causal
+        # (1, 2, 1, 4096, 4096, 128, 2048, False),
     ]
 
     print(f"\nšŸ“Š Benchmark Results (averaged over {num_runs} runs):")
diff --git a/benchmarks/benchmark_grad.py b/benchmarks/grad_equivalence.py
similarity index 100%
rename from benchmarks/benchmark_grad.py
rename to benchmarks/grad_equivalence.py
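-- 
Note (commentary outside the patch, not applied by git): with the wide
sweeps commented out, restoring full coverage means hand-editing the
lists again. A minimal sketch of an opt-in alternative, assuming only the
tuple shapes visible in this diff; the DMA_FULL_BENCH environment
variable and both list names are hypothetical, not part of the
repository:

    import os

    # The one configuration this patch leaves active in forward_equivalence.py:
    # (batch_size, num_heads, num_kv_heads, query_len, key_len, head_dim, is_causal)
    ESSENTIAL_CONFIGS = [
        (1, 1, 1, 4096, 4096, 32, True),
    ]

    # Regenerate the causal/non-causal size sweep this patch comments out,
    # so full coverage becomes a flag flip instead of an edit.
    FULL_CONFIGS = [
        (1, 1, 1, n, n, 32, causal)
        for n in (64, 128, 256, 512, 1024, 2048, 4096)
        for causal in (True, False)
    ]

    test_configs = FULL_CONFIGS if os.environ.get("DMA_FULL_BENCH") else ESSENTIAL_CONFIGS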