diff --git a/csrc/src/flash_fwd_hdim128_bf16_causal_sm80.cu b/csrc/src/flash_fwd_hdim128_bf16_causal_sm80.cu
new file mode 100644
index 0000000..2bacfd3
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim128_bf16_causal_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::bfloat16_t, 128, true>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim128<cutlass::bfloat16_t, true>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim128_bf16_sm80.cu b/csrc/src/flash_fwd_hdim128_bf16_sm80.cu
new file mode 100644
index 0000000..bc6f103
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim128_bf16_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::bfloat16_t, 128, false>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim128<cutlass::bfloat16_t, false>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim128_fp16_causal_sm80.cu b/csrc/src/flash_fwd_hdim128_fp16_causal_sm80.cu
new file mode 100644
index 0000000..48c2d89
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim128_fp16_causal_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::half_t, 128, true>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim128<cutlass::half_t, true>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim128_fp16_sm80.cu b/csrc/src/flash_fwd_hdim128_fp16_sm80.cu
new file mode 100644
index 0000000..c67fdce
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim128_fp16_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::half_t, 128, false>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim128<cutlass::half_t, false>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim192_bf16_causal_sm80.cu b/csrc/src/flash_fwd_hdim192_bf16_causal_sm80.cu
new file mode 100644
index 0000000..e957ce0
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim192_bf16_causal_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::bfloat16_t, 192, true>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim192<cutlass::bfloat16_t, true>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim192_bf16_sm80.cu b/csrc/src/flash_fwd_hdim192_bf16_sm80.cu
new file mode 100644
index 0000000..f53f7f5
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim192_bf16_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::bfloat16_t, 192, false>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim192<cutlass::bfloat16_t, false>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim192_fp16_causal_sm80.cu b/csrc/src/flash_fwd_hdim192_fp16_causal_sm80.cu
new file mode 100644
index 0000000..805e548
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim192_fp16_causal_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::half_t, 192, true>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim192<cutlass::half_t, true>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim192_fp16_sm80.cu b/csrc/src/flash_fwd_hdim192_fp16_sm80.cu
new file mode 100644
index 0000000..87a6565
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim192_fp16_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::half_t, 192, false>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim192<cutlass::half_t, false>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim256_bf16_causal_sm80.cu b/csrc/src/flash_fwd_hdim256_bf16_causal_sm80.cu
new file mode 100644
index 0000000..ded9ab1
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim256_bf16_causal_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::bfloat16_t, 256, true>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim256<cutlass::bfloat16_t, true>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim256_bf16_sm80.cu b/csrc/src/flash_fwd_hdim256_bf16_sm80.cu
new file mode 100644
index 0000000..c3d9e04
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim256_bf16_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::bfloat16_t, 256, false>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim256<cutlass::bfloat16_t, false>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim256_fp16_causal_sm80.cu b/csrc/src/flash_fwd_hdim256_fp16_causal_sm80.cu
new file mode 100644
index 0000000..8780ed4
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim256_fp16_causal_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::half_t, 256, true>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim256<cutlass::half_t, true>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim256_fp16_sm80.cu b/csrc/src/flash_fwd_hdim256_fp16_sm80.cu
new file mode 100644
index 0000000..293e001
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim256_fp16_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::half_t, 256, false>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim256<cutlass::half_t, false>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim32_bf16_causal_sm80.cu b/csrc/src/flash_fwd_hdim32_bf16_causal_sm80.cu
new file mode 100644
index 0000000..f50910b
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim32_bf16_causal_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::bfloat16_t, 32, true>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim32<cutlass::bfloat16_t, true>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim32_bf16_sm80.cu b/csrc/src/flash_fwd_hdim32_bf16_sm80.cu
new file mode 100644
index 0000000..6127386
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim32_bf16_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::bfloat16_t, 32, false>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim32<cutlass::bfloat16_t, false>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim32_fp16_causal_sm80.cu b/csrc/src/flash_fwd_hdim32_fp16_causal_sm80.cu
new file mode 100644
index 0000000..845ab35
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim32_fp16_causal_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::half_t, 32, true>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim32<cutlass::half_t, true>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim32_fp16_sm80.cu b/csrc/src/flash_fwd_hdim32_fp16_sm80.cu
new file mode 100644
index 0000000..94a1301
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim32_fp16_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::half_t, 32, false>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim32<cutlass::half_t, false>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim64_bf16_causal_sm80.cu b/csrc/src/flash_fwd_hdim64_bf16_causal_sm80.cu
new file mode 100644
index 0000000..f3d4e94
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim64_bf16_causal_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::bfloat16_t, 64, true>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim64<cutlass::bfloat16_t, true>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim64_bf16_sm80.cu b/csrc/src/flash_fwd_hdim64_bf16_sm80.cu
new file mode 100644
index 0000000..a11cc79
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim64_bf16_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::bfloat16_t, 64, false>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim64<cutlass::bfloat16_t, false>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim64_fp16_causal_sm80.cu b/csrc/src/flash_fwd_hdim64_fp16_causal_sm80.cu
new file mode 100644
index 0000000..1681867
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim64_fp16_causal_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::half_t, 64, true>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim64<cutlass::half_t, true>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim64_fp16_sm80.cu b/csrc/src/flash_fwd_hdim64_fp16_sm80.cu
new file mode 100644
index 0000000..7b7d184
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim64_fp16_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::half_t, 64, false>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim64<cutlass::half_t, false>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim96_bf16_causal_sm80.cu b/csrc/src/flash_fwd_hdim96_bf16_causal_sm80.cu
new file mode 100644
index 0000000..ad62e87
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim96_bf16_causal_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::bfloat16_t, 96, true>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim96<cutlass::bfloat16_t, true>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim96_bf16_sm80.cu b/csrc/src/flash_fwd_hdim96_bf16_sm80.cu
new file mode 100644
index 0000000..92c4344
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim96_bf16_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::bfloat16_t, 96, false>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim96<cutlass::bfloat16_t, false>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim96_fp16_causal_sm80.cu b/csrc/src/flash_fwd_hdim96_fp16_causal_sm80.cu
new file mode 100644
index 0000000..36db0da
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim96_fp16_causal_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::half_t, 96, true>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim96<cutlass::half_t, true>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_hdim96_fp16_sm80.cu b/csrc/src/flash_fwd_hdim96_fp16_sm80.cu
new file mode 100644
index 0000000..50040b5
--- /dev/null
+++ b/csrc/src/flash_fwd_hdim96_fp16_sm80.cu
@@ -0,0 +1,14 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template<>
+void run_mha_fwd_<cutlass::half_t, 96, false>(Flash_fwd_params &params, cudaStream_t stream) {
+    run_mha_fwd_hdim96<cutlass::half_t, false>(params, stream);
+}
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu
new file mode 100644
index 0000000..b4118c7
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 128, true>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim128_bf16_sm80.cu b/csrc/src/flash_fwd_split_hdim128_bf16_sm80.cu
new file mode 100644
index 0000000..cdcfbe9
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim128_bf16_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 128, false>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu
new file mode 100644
index 0000000..71e415d
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 128, true>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim128_fp16_sm80.cu b/csrc/src/flash_fwd_split_hdim128_fp16_sm80.cu
new file mode 100644
index 0000000..df4febe
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim128_fp16_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 128, false>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu
new file mode 100644
index 0000000..83c8f8a
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 192, true>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim192_bf16_sm80.cu b/csrc/src/flash_fwd_split_hdim192_bf16_sm80.cu
new file mode 100644
index 0000000..d3bbf47
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim192_bf16_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 192, false>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu
new file mode 100644
index 0000000..5652982
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 192, true>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim192_fp16_sm80.cu b/csrc/src/flash_fwd_split_hdim192_fp16_sm80.cu
new file mode 100644
index 0000000..edac2b6
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim192_fp16_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 192, false>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu
new file mode 100644
index 0000000..28ab7ad
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 256, true>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim256_bf16_sm80.cu b/csrc/src/flash_fwd_split_hdim256_bf16_sm80.cu
new file mode 100644
index 0000000..751035e
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim256_bf16_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 256, false>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu
new file mode 100644
index 0000000..502b5cc
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 256, true>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim256_fp16_sm80.cu b/csrc/src/flash_fwd_split_hdim256_fp16_sm80.cu
new file mode 100644
index 0000000..3153e17
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim256_fp16_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 256, false>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu
new file mode 100644
index 0000000..9910f63
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 32, true>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim32_bf16_sm80.cu b/csrc/src/flash_fwd_split_hdim32_bf16_sm80.cu
new file mode 100644
index 0000000..d498fea
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim32_bf16_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 32, false>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim32_fp16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim32_fp16_causal_sm80.cu
new file mode 100644
index 0000000..a5a713a
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim32_fp16_causal_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 32, true>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim32_fp16_sm80.cu b/csrc/src/flash_fwd_split_hdim32_fp16_sm80.cu
new file mode 100644
index 0000000..4cfc36a
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim32_fp16_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 32, false>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu
new file mode 100644
index 0000000..e89cb76
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 64, true>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim64_bf16_sm80.cu b/csrc/src/flash_fwd_split_hdim64_bf16_sm80.cu
new file mode 100644
index 0000000..8d72e93
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim64_bf16_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 64, false>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim64_fp16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim64_fp16_causal_sm80.cu
new file mode 100644
index 0000000..76ba0c8
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim64_fp16_causal_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 64, true>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim64_fp16_sm80.cu b/csrc/src/flash_fwd_split_hdim64_fp16_sm80.cu
new file mode 100644
index 0000000..ab07719
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim64_fp16_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 64, false>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu
new file mode 100644
index 0000000..8d44e28
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 96, true>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim96_bf16_sm80.cu b/csrc/src/flash_fwd_split_hdim96_bf16_sm80.cu
new file mode 100644
index 0000000..252b468
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim96_bf16_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 96, false>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim96_fp16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim96_fp16_causal_sm80.cu
new file mode 100644
index 0000000..3eb97b7
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim96_fp16_causal_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 96, true>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/flash_fwd_split_hdim96_fp16_sm80.cu b/csrc/src/flash_fwd_split_hdim96_fp16_sm80.cu
new file mode 100644
index 0000000..367e12b
--- /dev/null
+++ b/csrc/src/flash_fwd_split_hdim96_fp16_sm80.cu
@@ -0,0 +1,11 @@
+// Copyright (c) 2025, Jingze Shi and Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+#include "namespace_config.h"
+#include "flash_fwd_launch_template.h"
+
+namespace FLASH_NAMESPACE {
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 96, false>(Flash_fwd_params &params, cudaStream_t stream);
+
+} // namespace FLASH_NAMESPACE
\ No newline at end of file
diff --git a/csrc/src/generate_kernels.py b/csrc/src/generate_kernels.py
index 54160ec..a359fe3 100644
--- a/csrc/src/generate_kernels.py
+++ b/csrc/src/generate_kernels.py
@@ -1,9 +1,8 @@
 import argparse
 import itertools
-import os
 from dataclasses import dataclass
 from pathlib import Path
-from typing import List, Optional
+from typing import Optional, Generator
 
 DTYPE_MAP = {
     "fp16": "cutlass::half_t",
@@ -61,7 +60,7 @@ def template(self) -> str:
         template_funcs = {
             "fwd": get_fwd_template,
             # "bwd": get_bwd_template,
-            # "fwd_split": get_fwd_split_template
+            "fwd_split": get_fwd_split_template
         }
         template_func = template_funcs[self.direction]
         return template_func().format(
@@ -74,8 +73,9 @@ def template(self) -> str:
     def filename(self) -> str:
         return f"flash_{self.direction}_hdim{self.head_dim}_{self.dtype}_{'causal_' if self.is_causal == 'true' else ''}sm{self.sm}.cu"
 
-def get_all_kernels() -> List[Kernel]:
-    for direction in ["fwd"]: #, "fwd_split", "bwd"]:
+def get_all_kernels() -> Generator[Kernel, None, None]:
+    # for direction in ["fwd", "fwd_split", "bwd"]:
+    for direction in ["fwd", "fwd_split"]:
         for dtype, head_dim, is_causal, sm in itertools.product(DTYPE_MAP.keys(), HEAD_DIMENSIONS, IS_CAUSAL, SM):
             yield Kernel(sm=sm, dtype=dtype, head_dim=head_dim, is_causal=is_causal, direction=direction)