Merged
51 commits
dff25ca
Adds specialized causal attention kernel for bf16 hdim32
LoserCheems Jun 26, 2025
769a05f
Adds specialized kernel for 32-dim bfloat16 forward pass
LoserCheems Jun 26, 2025
d8f8f5c
Adds causal flash attention kernel for 32-dim heads
LoserCheems Jun 26, 2025
6083258
Adds specialized kernel for 32-dim heads with FP16
LoserCheems Jun 26, 2025
db4fc94
Adds specialized kernel for 64-dim causal attention
LoserCheems Jun 26, 2025
fb433fe
Add bfloat16 forward kernel for 64-dim heads on SM80
LoserCheems Jun 26, 2025
40fb47a
Adds specialized kernel for causal attention with 64-dim heads
LoserCheems Jun 26, 2025
4e776be
Adds FP16 forward kernel for 64-dim heads on SM80
LoserCheems Jun 26, 2025
18abb73
Adds specialized CUDA kernel for bfloat16 causal attention
LoserCheems Jun 26, 2025
abf0c5a
Adds specialized flash attention kernel for hdim96 bf16
LoserCheems Jun 26, 2025
fdf3a71
Adds FP16 causal forward kernel for 96 head dimension
LoserCheems Jun 26, 2025
e335a28
Adds Flash Attention forward kernel for 96-dim heads
LoserCheems Jun 26, 2025
c194663
Adds specialized kernel for bfloat16 causal attention
LoserCheems Jun 26, 2025
9e93777
Adds specialized kernel for bf16 hdim128 forward pass
LoserCheems Jun 26, 2025
92a3395
Adds specialized kernel for 128-dim causal attention
LoserCheems Jun 26, 2025
3fff603
Adds specialized FP16 kernel for head dimension 128
LoserCheems Jun 26, 2025
3c02549
Adds bfloat16 causal flash attention kernel for 192 head dim
LoserCheems Jun 26, 2025
dc3354d
Adds specialized CUDA kernel for bfloat16 head dimension 192
LoserCheems Jun 26, 2025
b829b5b
Adds FP16 causal flash attention kernel for 192 head dim
LoserCheems Jun 26, 2025
b0436da
Adds specialized Flash Attention kernel for 192 head dimension
LoserCheems Jun 26, 2025
c828cdb
Adds specialized CUDA kernel for bfloat16 causal attention
LoserCheems Jun 26, 2025
f9ce0bc
Adds flash attention forward kernel for head dimension 256
LoserCheems Jun 26, 2025
d969390
Adds FP16 causal flash attention kernel for 256 head dim
LoserCheems Jun 26, 2025
86d3b37
Adds specialized kernel for head dimension 256 with FP16
LoserCheems Jun 26, 2025
b069e4b
Adds specialized kernel for 32-dim BF16 causal attention
LoserCheems Jun 26, 2025
67c9c36
Adds split kernel for bfloat16 hdim32 forward pass
LoserCheems Jun 26, 2025
07777f4
Adds split kernel for FP16 causal attention
LoserCheems Jun 26, 2025
860c579
Adds auto-generated kernel for FP16 SM80 hdim32
LoserCheems Jun 26, 2025
090c8be
Adds split kernel for bfloat16 causal attention
LoserCheems Jun 26, 2025
c67b1ef
Adds split kernel for bfloat16 head dimension 64
LoserCheems Jun 26, 2025
c331e5b
Adds CUDA kernel for 64-dim causal attention
LoserCheems Jun 26, 2025
2d715f1
Adds split kernel file for 64-dim heads with FP16
LoserCheems Jun 26, 2025
5baaedb
Adds specialized kernel for 96-dim causal attention
LoserCheems Jun 26, 2025
8d63d45
Adds split kernel for bfloat16 head dimension 96
LoserCheems Jun 26, 2025
fa967e6
Adds specialized kernel for hdim96 fp16 causal attention
LoserCheems Jun 26, 2025
26b953b
Adds split kernel for head dimension 96 with FP16
LoserCheems Jun 26, 2025
2a4c14b
Adds split kernel for bf16 causal attention
LoserCheems Jun 26, 2025
d8ff35a
Adds split kernel for bf16 hdim128 on SM80
LoserCheems Jun 26, 2025
6aa467c
Adds specialized CUDA kernel for FP16 causal attention
LoserCheems Jun 26, 2025
34a6fe1
Adds split kernel for fp16 hdim128 on SM80
LoserCheems Jun 26, 2025
32943f1
Adds specialized kernel for head dimension 192 with bfloat16
LoserCheems Jun 26, 2025
3699225
Adds specialized kernel for head dimension 192 with bfloat16
LoserCheems Jun 26, 2025
fc4c893
Adds specialized kernel for head dimension 192 with FP16
LoserCheems Jun 26, 2025
7eff592
Adds split kernel for head dimension 192 with FP16
LoserCheems Jun 26, 2025
f4dcef6
Adds specialized kernel for hdim256 bf16 causal attention
LoserCheems Jun 26, 2025
3460dc7
Adds split kernel for head dimension 256 with bfloat16
LoserCheems Jun 26, 2025
b5fb04a
Adds specialized kernel for 256-dim causal attention
LoserCheems Jun 26, 2025
dae5441
Adds split kernel for head dimension 256 with FP16
LoserCheems Jun 26, 2025
9a17410
Enables fwd_split kernel generation and fixes data types
LoserCheems Jun 26, 2025
1421c1b
Changes is_causal field type from bool to str
LoserCheems Jun 26, 2025
51daf66
Changes return type from List to Generator for memory efficiency
LoserCheems Jun 26, 2025
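The last three commits touch the generator script rather than the generated kernels: fwd_split generation is enabled, the is_causal field becomes a string, and kernel enumeration returns a Generator instead of a List. A minimal sketch of what that combination might look like in generate_kernels.py — the names and structure here are assumptions, not the actual script:

```python
# Sketch only: illustrates the changes described by commits 9a17410,
# 1421c1b, and 51daf66; not the real generate_kernels.py.
import itertools
from dataclasses import dataclass
from typing import Generator

DTYPE_MAP = {"fp16": "cutlass::half_t", "bf16": "cutlass::bfloat16_t"}
HEAD_DIMENSIONS = [32, 64, 96, 128, 192, 256]
SM_VERSIONS = [80]

@dataclass
class Kernel:
    sm: int
    dtype: str
    head_dim: int
    is_causal: str   # "true" / "false" as strings (cf. 1421c1b)
    direction: str   # "fwd" or "fwd_split" (cf. 9a17410)

    @property
    def filename(self) -> str:
        split = "_split" if self.direction == "fwd_split" else ""
        causal = "_causal" if self.is_causal == "true" else ""
        return (f"flash_fwd{split}_hdim{self.head_dim}_"
                f"{self.dtype}{causal}_sm{self.sm}.cu")

def get_all_kernels() -> Generator[Kernel, None, None]:
    # Yields kernels lazily instead of materializing a List (cf. 51daf66).
    for direction in ("fwd", "fwd_split"):
        for dtype, head_dim, is_causal, sm in itertools.product(
                DTYPE_MAP, HEAD_DIMENSIONS, ("false", "true"), SM_VERSIONS):
            yield Kernel(sm=sm, dtype=dtype, head_dim=head_dim,
                         is_causal=is_causal, direction=direction)
```

With two dtypes, six head dimensions, two causal settings, and two directions, this enumerates the full matrix of single-purpose translation units that make up the diff below.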
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim128_bf16_causal_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::bfloat16_t, 128, true>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim128<cutlass::bfloat16_t, true>(params, stream);
}

} // namespace FLASH_NAMESPACE
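Every generated forward file below follows this exact shape: a fourteen-line translation unit that explicitly specializes run_mha_fwd_ for one (dtype, head_dim, is_causal) triple and forwards to the matching launch-template helper. A hypothetical sketch of the rendering step that could emit such a file — the template text mirrors the file above, but the function and constant names are illustrative, not taken from generate_kernels.py:

```python
# Hypothetical rendering step for the auto-generated files in this diff.
KERNEL_IMPL_TEMPLATE_FWD = """\
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {{

template<>
void run_mha_fwd_<{dtype}, {head_dim}, {is_causal}>(Flash_fwd_params &params, cudaStream_t stream) {{
    run_mha_fwd_hdim{head_dim}<{dtype}, {is_causal}>(params, stream);
}}

}} // namespace FLASH_NAMESPACE
"""

def render_fwd_kernel(dtype_cpp: str, head_dim: int, is_causal: str) -> str:
    return KERNEL_IMPL_TEMPLATE_FWD.format(
        dtype=dtype_cpp, head_dim=head_dim, is_causal=is_causal)

# Reproduces csrc/src/flash_fwd_hdim128_bf16_causal_sm80.cu above:
print(render_fwd_kernel("cutlass::bfloat16_t", 128, "true"))
```

Keeping is_causal as the string "true"/"false" lets it drop straight into both the filename and the template argument, which is presumably why commit 1421c1b changes the field type from bool to str.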
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim128_bf16_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::bfloat16_t, 128, false>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim128<cutlass::bfloat16_t, false>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim128_fp16_causal_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::half_t, 128, true>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim128<cutlass::half_t, true>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim128_fp16_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::half_t, 128, false>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim128<cutlass::half_t, false>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim192_bf16_causal_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::bfloat16_t, 192, true>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim192<cutlass::bfloat16_t, true>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim192_bf16_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::bfloat16_t, 192, false>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim192<cutlass::bfloat16_t, false>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim192_fp16_causal_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::half_t, 192, true>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim192<cutlass::half_t, true>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim192_fp16_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::half_t, 192, false>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim192<cutlass::half_t, false>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim256_bf16_causal_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::bfloat16_t, 256, true>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim256<cutlass::bfloat16_t, true>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim256_bf16_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::bfloat16_t, 256, false>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim256<cutlass::bfloat16_t, false>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim256_fp16_causal_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::half_t, 256, true>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim256<cutlass::half_t, true>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim256_fp16_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::half_t, 256, false>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim256<cutlass::half_t, false>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim32_bf16_causal_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::bfloat16_t, 32, true>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim32<cutlass::bfloat16_t, true>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim32_bf16_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::bfloat16_t, 32, false>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim32<cutlass::bfloat16_t, false>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim32_fp16_causal_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::half_t, 32, true>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim32<cutlass::half_t, true>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim32_fp16_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::half_t, 32, false>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim32<cutlass::half_t, false>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim64_bf16_causal_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::bfloat16_t, 64, true>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim64<cutlass::bfloat16_t, true>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim64_bf16_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::bfloat16_t, 64, false>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim64<cutlass::bfloat16_t, false>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim64_fp16_causal_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::half_t, 64, true>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim64<cutlass::half_t, true>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim64_fp16_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::half_t, 64, false>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim64<cutlass::half_t, false>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim96_bf16_causal_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::bfloat16_t, 96, true>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim96<cutlass::bfloat16_t, true>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim96_bf16_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::bfloat16_t, 96, false>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim96<cutlass::bfloat16_t, false>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim96_fp16_causal_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::half_t, 96, true>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim96<cutlass::half_t, true>(params, stream);
}

} // namespace FLASH_NAMESPACE
14 changes: 14 additions & 0 deletions csrc/src/flash_fwd_hdim96_fp16_sm80.cu
@@ -0,0 +1,14 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template<>
void run_mha_fwd_<cutlass::half_t, 96, false>(Flash_fwd_params &params, cudaStream_t stream) {
run_mha_fwd_hdim96<cutlass::half_t, false>(params, stream);
}

} // namespace FLASH_NAMESPACE
11 changes: 11 additions & 0 deletions csrc/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu
@@ -0,0 +1,11 @@
// Copyright (c) 2025, Jingze Shi and Tri Dao.
// Splitting the different head dimensions to different files to speed up compilation.
// This file is auto-generated. See "generate_kernels.py"
#include "namespace_config.h"
#include "flash_fwd_launch_template.h"

namespace FLASH_NAMESPACE {

template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 128, true>(Flash_fwd_params &params, cudaStream_t stream);

} // namespace FLASH_NAMESPACE
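Note the shape difference from the forward files above: the split-KV variants use a one-line explicit template instantiation of run_mha_fwd_splitkv_dispatch rather than an explicit specialization with a forwarding body, which is why these files are 11 lines instead of 14.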