From dff25ca8cd7a2cc31a21dd565e903e3eba64b152 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:38:53 +0800 Subject: [PATCH 01/51] Adds specialized causal attention kernel for bf16 hdim32 Creates dedicated CUDA kernel implementation for causal multi-head attention with bfloat16 precision and head dimension 32. Separates kernel specializations into individual files to improve compilation performance by reducing template instantiation overhead. --- csrc/src/flash_fwd_hdim32_bf16_causal_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim32_bf16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_hdim32_bf16_causal_sm80.cu b/csrc/src/flash_fwd_hdim32_bf16_causal_sm80.cu new file mode 100644 index 0000000..f50910b --- /dev/null +++ b/csrc/src/flash_fwd_hdim32_bf16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 769a05f40ff4a4da833d1a62ed135b5c614cfe1a Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:39:05 +0800 Subject: [PATCH 02/51] Adds specialized kernel for 32-dim bfloat16 forward pass Introduces a dedicated compilation unit for bfloat16 data type with 32-dimensional head size to improve build performance by parallelizing kernel compilation across multiple files. The template specialization delegates to the appropriate implementation while maintaining the existing interface. --- csrc/src/flash_fwd_hdim32_bf16_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim32_bf16_sm80.cu diff --git a/csrc/src/flash_fwd_hdim32_bf16_sm80.cu b/csrc/src/flash_fwd_hdim32_bf16_sm80.cu new file mode 100644 index 0000000..6127386 --- /dev/null +++ b/csrc/src/flash_fwd_hdim32_bf16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From d8f8f5c691092f4df53522acc7e24c3b3be80b7c Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:39:16 +0800 Subject: [PATCH 03/51] Adds causal flash attention kernel for 32-dim heads Implements specialized template instantiation for causal multi-head attention with 32-dimensional heads using half precision. Separates kernel implementations into dedicated files to improve compilation performance by reducing template instantiation overhead. 
--- csrc/src/flash_fwd_hdim32_fp16_causal_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim32_fp16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_hdim32_fp16_causal_sm80.cu b/csrc/src/flash_fwd_hdim32_fp16_causal_sm80.cu new file mode 100644 index 0000000..845ab35 --- /dev/null +++ b/csrc/src/flash_fwd_hdim32_fp16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 60832581f90af4a14fbdaa906bfc9b2b481ca39a Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:39:24 +0800 Subject: [PATCH 04/51] Adds specialized kernel for 32-dim heads with FP16 Introduces a dedicated compilation unit for flash attention forward pass with 32-dimensional heads using half precision on SM80 architecture. Splits kernel implementations into separate files to improve compilation speed by reducing template instantiation overhead. --- csrc/src/flash_fwd_hdim32_fp16_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim32_fp16_sm80.cu diff --git a/csrc/src/flash_fwd_hdim32_fp16_sm80.cu b/csrc/src/flash_fwd_hdim32_fp16_sm80.cu new file mode 100644 index 0000000..94a1301 --- /dev/null +++ b/csrc/src/flash_fwd_hdim32_fp16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From db4fc941ace0d8d1da3c26e42e057d2af605ee10 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:39:35 +0800 Subject: [PATCH 05/51] Adds specialized kernel for 64-dim causal attention Implements template specialization for bfloat16 multi-head attention with 64 head dimensions and causal masking on SM80 architecture. Splits kernel implementations into separate files to improve compilation performance and enable parallel builds. --- csrc/src/flash_fwd_hdim64_bf16_causal_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim64_bf16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_hdim64_bf16_causal_sm80.cu b/csrc/src/flash_fwd_hdim64_bf16_causal_sm80.cu new file mode 100644 index 0000000..f3d4e94 --- /dev/null +++ b/csrc/src/flash_fwd_hdim64_bf16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From fb433fe0fba30fbcb763308cc6592030ec9db5bf Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:39:51 +0800 Subject: [PATCH 06/51] Add bfloat16 forward kernel for 64-dim heads on SM80 Introduces specialized kernel implementation to improve compilation times by splitting different head dimensions into separate files. The kernel handles multi-head attention forward pass with 64-dimensional heads using bfloat16 precision for SM80 architecture. --- csrc/src/flash_fwd_hdim64_bf16_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim64_bf16_sm80.cu diff --git a/csrc/src/flash_fwd_hdim64_bf16_sm80.cu b/csrc/src/flash_fwd_hdim64_bf16_sm80.cu new file mode 100644 index 0000000..a11cc79 --- /dev/null +++ b/csrc/src/flash_fwd_hdim64_bf16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 40fb47a657c420efea6f922c42ed9a4c4c8f9556 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:40:00 +0800 Subject: [PATCH 07/51] Adds specialized kernel for causal attention with 64-dim heads Implements template specialization for FP16 causal multi-head attention with 64-dimensional heads on SM80 architecture. Splits kernel implementations into separate files to improve compilation speed and maintainability. --- csrc/src/flash_fwd_hdim64_fp16_causal_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim64_fp16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_hdim64_fp16_causal_sm80.cu b/csrc/src/flash_fwd_hdim64_fp16_causal_sm80.cu new file mode 100644 index 0000000..1681867 --- /dev/null +++ b/csrc/src/flash_fwd_hdim64_fp16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 4e776bedd390c802478391bb75db393a3a775b51 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:40:08 +0800 Subject: [PATCH 08/51] Adds FP16 forward kernel for 64-dim heads on SM80 Introduces specialized CUDA kernel for multi-head attention forward pass with 64-dimensional heads using half-precision floating point on SM80 architecture. Splits kernel implementations by head dimension to accelerate compilation process as noted in the auto-generated template specialization. 
--- csrc/src/flash_fwd_hdim64_fp16_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim64_fp16_sm80.cu diff --git a/csrc/src/flash_fwd_hdim64_fp16_sm80.cu b/csrc/src/flash_fwd_hdim64_fp16_sm80.cu new file mode 100644 index 0000000..7b7d184 --- /dev/null +++ b/csrc/src/flash_fwd_hdim64_fp16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 18abb73170c97a5d6940f4e9aeeae3e8b30d2647 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:40:18 +0800 Subject: [PATCH 09/51] Adds specialized CUDA kernel for bfloat16 causal attention Implements template specialization for multi-head attention forward pass with 96-dimensional heads using bfloat16 precision and causal masking. Splits kernel implementations across separate files to improve compilation performance for SM80 architecture. --- csrc/src/flash_fwd_hdim96_bf16_causal_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim96_bf16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_hdim96_bf16_causal_sm80.cu b/csrc/src/flash_fwd_hdim96_bf16_causal_sm80.cu new file mode 100644 index 0000000..ad62e87 --- /dev/null +++ b/csrc/src/flash_fwd_hdim96_bf16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From abf0c5aa727f2189f54bf8460a60810ee99a9908 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:40:27 +0800 Subject: [PATCH 10/51] Adds specialized flash attention kernel for hdim96 bf16 Implements template specialization for bfloat16 data type with head dimension 96 to support modular kernel compilation. Splits kernel implementations across separate files to reduce compilation time and improve build efficiency. --- csrc/src/flash_fwd_hdim96_bf16_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim96_bf16_sm80.cu diff --git a/csrc/src/flash_fwd_hdim96_bf16_sm80.cu b/csrc/src/flash_fwd_hdim96_bf16_sm80.cu new file mode 100644 index 0000000..92c4344 --- /dev/null +++ b/csrc/src/flash_fwd_hdim96_bf16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From fdf3a711ca591e1e53c04f4619400b7b42a6ff96 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:40:36 +0800 Subject: [PATCH 11/51] Adds FP16 causal forward kernel for 96 head dimension Introduces specialized CUDA kernel implementation for multi-head attention forward pass with 96-dimensional heads using half-precision floating point and causal masking. Splits kernel implementations by head dimension into separate compilation units to improve build performance and reduce compilation time. --- csrc/src/flash_fwd_hdim96_fp16_causal_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim96_fp16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_hdim96_fp16_causal_sm80.cu b/csrc/src/flash_fwd_hdim96_fp16_causal_sm80.cu new file mode 100644 index 0000000..36db0da --- /dev/null +++ b/csrc/src/flash_fwd_hdim96_fp16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From e335a28967fd3441ed236e70a9aca22b62907ae7 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:40:46 +0800 Subject: [PATCH 12/51] Adds Flash Attention forward kernel for 96-dim heads Implements specialized template instantiation for 96-dimensional head size using half precision on SM80 architecture. Separates kernel implementations by head dimension to improve compilation performance as noted in the auto-generated file structure. --- csrc/src/flash_fwd_hdim96_fp16_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim96_fp16_sm80.cu diff --git a/csrc/src/flash_fwd_hdim96_fp16_sm80.cu b/csrc/src/flash_fwd_hdim96_fp16_sm80.cu new file mode 100644 index 0000000..50040b5 --- /dev/null +++ b/csrc/src/flash_fwd_hdim96_fp16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From c194663764d5cc3c959946d3a445482e257ecf00 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:40:56 +0800 Subject: [PATCH 13/51] Adds specialized kernel for bfloat16 causal attention Introduces auto-generated CUDA kernel implementation for multi-head attention forward pass with 128-dimensional heads using bfloat16 precision and causal masking. Separates kernel specializations into individual files to improve compilation performance by reducing template instantiation overhead. 
--- csrc/src/flash_fwd_hdim128_bf16_causal_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim128_bf16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_hdim128_bf16_causal_sm80.cu b/csrc/src/flash_fwd_hdim128_bf16_causal_sm80.cu new file mode 100644 index 0000000..2bacfd3 --- /dev/null +++ b/csrc/src/flash_fwd_hdim128_bf16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 9e93777cd6a1c03e960fa83ecd0ee0cb4ceec2fa Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:41:04 +0800 Subject: [PATCH 14/51] Adds specialized kernel for bf16 hdim128 forward pass Introduces a dedicated compilation unit for bfloat16 multi-head attention forward kernels with head dimension 128 on SM80 architecture. Splits kernel implementations into separate files to reduce compilation time and improve build parallelization. --- csrc/src/flash_fwd_hdim128_bf16_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim128_bf16_sm80.cu diff --git a/csrc/src/flash_fwd_hdim128_bf16_sm80.cu b/csrc/src/flash_fwd_hdim128_bf16_sm80.cu new file mode 100644 index 0000000..bc6f103 --- /dev/null +++ b/csrc/src/flash_fwd_hdim128_bf16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 92a3395ff7452a90d5bc80865eed56179a82e43c Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:41:12 +0800 Subject: [PATCH 15/51] Adds specialized kernel for 128-dim causal attention Splits compilation by creating dedicated file for FP16 causal attention with 128 head dimensions on SM80 architecture. Improves build performance by isolating template instantiation into separate compilation unit. --- csrc/src/flash_fwd_hdim128_fp16_causal_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim128_fp16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_hdim128_fp16_causal_sm80.cu b/csrc/src/flash_fwd_hdim128_fp16_causal_sm80.cu new file mode 100644 index 0000000..48c2d89 --- /dev/null +++ b/csrc/src/flash_fwd_hdim128_fp16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 3fff6034c21d0fb70d406622966a4a7bc96c1499 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:41:27 +0800 Subject: [PATCH 16/51] Adds specialized FP16 kernel for head dimension 128 Introduces dedicated compilation unit for SM80 architecture to optimize build times by splitting kernel implementations across separate files. Implements template specialization for half-precision floating point operations with 128-dimensional attention heads. --- csrc/src/flash_fwd_hdim128_fp16_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim128_fp16_sm80.cu diff --git a/csrc/src/flash_fwd_hdim128_fp16_sm80.cu b/csrc/src/flash_fwd_hdim128_fp16_sm80.cu new file mode 100644 index 0000000..c67fdce --- /dev/null +++ b/csrc/src/flash_fwd_hdim128_fp16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 3c02549179fa571db605ccf55398f5d8be92e1cc Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:41:36 +0800 Subject: [PATCH 17/51] Adds bfloat16 causal flash attention kernel for 192 head dim Implements specialized CUDA kernel for multi-head attention forward pass with bfloat16 precision, 192 head dimension, and causal masking. Separates kernel implementations by head dimension to improve compilation speed and enables SM80 GPU architecture optimization. --- csrc/src/flash_fwd_hdim192_bf16_causal_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim192_bf16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_hdim192_bf16_causal_sm80.cu b/csrc/src/flash_fwd_hdim192_bf16_causal_sm80.cu new file mode 100644 index 0000000..e957ce0 --- /dev/null +++ b/csrc/src/flash_fwd_hdim192_bf16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From dc3354dc14ac12072bb324a9769dd151dca7f6d8 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:41:46 +0800 Subject: [PATCH 18/51] Adds specialized CUDA kernel for bfloat16 head dimension 192 Implements template specialization for multi-head attention forward pass with bfloat16 precision and 192 head dimensions on SM80 architecture. Splits kernel implementations into separate files to improve compilation speed and enables targeted optimization for specific configurations. 
--- csrc/src/flash_fwd_hdim192_bf16_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim192_bf16_sm80.cu diff --git a/csrc/src/flash_fwd_hdim192_bf16_sm80.cu b/csrc/src/flash_fwd_hdim192_bf16_sm80.cu new file mode 100644 index 0000000..f53f7f5 --- /dev/null +++ b/csrc/src/flash_fwd_hdim192_bf16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From b829b5bd27a8ef4dfe89177e432ec54ef1ec9d5f Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:41:57 +0800 Subject: [PATCH 19/51] Adds FP16 causal flash attention kernel for 192 head dim Implements specialized CUDA kernel for flash attention forward pass with 192-dimensional heads using half-precision floating point and causal masking. Splits kernel implementations by head dimension to improve compilation speed and follows auto-generation pattern for SM80 architecture. --- csrc/src/flash_fwd_hdim192_fp16_causal_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim192_fp16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_hdim192_fp16_causal_sm80.cu b/csrc/src/flash_fwd_hdim192_fp16_causal_sm80.cu new file mode 100644 index 0000000..805e548 --- /dev/null +++ b/csrc/src/flash_fwd_hdim192_fp16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From b0436da28a26e732149844bbdf3965e071e4ca70 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:42:06 +0800 Subject: [PATCH 20/51] Adds specialized Flash Attention kernel for 192 head dimension Implements template specialization for half-precision floating point operations with 192-dimensional attention heads on SM80 architecture. Splits kernel implementations across separate files to improve compilation performance as noted in the auto-generated code structure. --- csrc/src/flash_fwd_hdim192_fp16_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim192_fp16_sm80.cu diff --git a/csrc/src/flash_fwd_hdim192_fp16_sm80.cu b/csrc/src/flash_fwd_hdim192_fp16_sm80.cu new file mode 100644 index 0000000..87a6565 --- /dev/null +++ b/csrc/src/flash_fwd_hdim192_fp16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From c828cdbcb03eb68c201b5d8b5d732fe752406204 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:42:22 +0800 Subject: [PATCH 21/51] Adds specialized CUDA kernel for bfloat16 causal attention Introduces a new auto-generated CUDA file that implements a specialized forward pass kernel for multi-head attention with bfloat16 precision, 256 head dimensions, and causal masking on SM80 architecture. Splits kernel implementations across separate files to improve compilation performance and maintainability. --- csrc/src/flash_fwd_hdim256_bf16_causal_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim256_bf16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_hdim256_bf16_causal_sm80.cu b/csrc/src/flash_fwd_hdim256_bf16_causal_sm80.cu new file mode 100644 index 0000000..ded9ab1 --- /dev/null +++ b/csrc/src/flash_fwd_hdim256_bf16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From f9ce0bc6bd91ed4cc219a1b46025eb63d02fe362 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:42:31 +0800 Subject: [PATCH 22/51] Adds flash attention forward kernel for head dimension 256 Creates specialized CUDA kernel for bfloat16 flash attention with 256-dimensional heads on SM80 architecture. Splits kernel implementations by head dimension to improve compilation speed as noted in the auto-generated file structure. --- csrc/src/flash_fwd_hdim256_bf16_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim256_bf16_sm80.cu diff --git a/csrc/src/flash_fwd_hdim256_bf16_sm80.cu b/csrc/src/flash_fwd_hdim256_bf16_sm80.cu new file mode 100644 index 0000000..c3d9e04 --- /dev/null +++ b/csrc/src/flash_fwd_hdim256_bf16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From d969390be674519cd7796b35cf336231d5444fe3 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:42:39 +0800 Subject: [PATCH 23/51] Adds FP16 causal flash attention kernel for 256 head dim Introduces specialized CUDA kernel implementation for flash attention forward pass with 256-dimensional heads using half-precision floating point and causal masking. Supports SM80 architecture and follows the pattern of splitting different head dimensions into separate files to improve compilation performance. 
--- csrc/src/flash_fwd_hdim256_fp16_causal_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim256_fp16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_hdim256_fp16_causal_sm80.cu b/csrc/src/flash_fwd_hdim256_fp16_causal_sm80.cu new file mode 100644 index 0000000..8780ed4 --- /dev/null +++ b/csrc/src/flash_fwd_hdim256_fp16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 86d3b377a614da2bc3faf8879540d21ffce4e149 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:42:48 +0800 Subject: [PATCH 24/51] Adds specialized kernel for head dimension 256 with FP16 Splits kernel implementation into separate file to improve compilation speed for flash attention forward pass with 256 head dimensions using half precision on SM80 architecture. The template specialization enables optimized execution path for this specific configuration while maintaining modularity in the codebase. --- csrc/src/flash_fwd_hdim256_fp16_sm80.cu | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 csrc/src/flash_fwd_hdim256_fp16_sm80.cu diff --git a/csrc/src/flash_fwd_hdim256_fp16_sm80.cu b/csrc/src/flash_fwd_hdim256_fp16_sm80.cu new file mode 100644 index 0000000..293e001 --- /dev/null +++ b/csrc/src/flash_fwd_hdim256_fp16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file From b069e4bcfd5403b7746c2ffd90c9e6bf101b18f0 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:42:56 +0800 Subject: [PATCH 25/51] Adds specialized kernel for 32-dim BF16 causal attention Splits flash attention kernels by head dimension to reduce compilation time. Creates dedicated instantiation for 32-dimensional heads with bfloat16 precision and causal masking on SM80 architecture. --- csrc/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu new file mode 100644 index 0000000..9910f63 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 67c9c36566627da007867f6fe98cefd578acf661 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:43:06 +0800 Subject: [PATCH 26/51] Adds split kernel for bfloat16 hdim32 forward pass Introduces auto-generated CUDA kernel specialization to improve compilation performance by splitting different head dimensions into separate files. Implements template instantiation for split-k attention forward pass with 32-dimensional heads using bfloat16 precision on SM80 architecture. --- csrc/src/flash_fwd_split_hdim32_bf16_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim32_bf16_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim32_bf16_sm80.cu b/csrc/src/flash_fwd_split_hdim32_bf16_sm80.cu new file mode 100644 index 0000000..d498fea --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim32_bf16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 07777f4d1edd3fd805e25923b0d7fcd276d1b240 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:43:14 +0800 Subject: [PATCH 27/51] Adds split kernel for FP16 causal attention Introduces specialized kernel file for head dimension 32 with FP16 precision and causal masking to accelerate compilation times. Splits kernel instantiations across separate files as part of compilation optimization strategy. --- csrc/src/flash_fwd_split_hdim32_fp16_causal_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim32_fp16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim32_fp16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim32_fp16_causal_sm80.cu new file mode 100644 index 0000000..a5a713a --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim32_fp16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 860c579eb46f20122a5fbfc40a37ac22f29f15d8 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:43:38 +0800 Subject: [PATCH 28/51] Adds auto-generated kernel for FP16 SM80 hdim32 Splits head dimension 32 flash attention kernel into separate file to improve compilation speed. Auto-generated template instantiation for half precision on SM80 architecture with split key-value dispatch. 
--- csrc/src/flash_fwd_split_hdim32_fp16_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim32_fp16_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim32_fp16_sm80.cu b/csrc/src/flash_fwd_split_hdim32_fp16_sm80.cu new file mode 100644 index 0000000..4cfc36a --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim32_fp16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 090c8be42733556f29290d097a0757ed2a130e57 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:43:54 +0800 Subject: [PATCH 29/51] Adds split kernel for bfloat16 causal attention Introduces specialized kernel file for head dimension 64 with bfloat16 precision and causal masking to improve compilation performance. Separates kernel instantiations into dedicated files to reduce build times and enable parallel compilation of different attention configurations. --- csrc/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu new file mode 100644 index 0000000..e89cb76 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From c67b1efc75e6d528c97cde4a37677db3f3829a70 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:44:06 +0800 Subject: [PATCH 30/51] Adds split kernel for bfloat16 head dimension 64 Splits flash attention forward kernels by head dimension to reduce compilation time. Creates dedicated kernel instantiation for bfloat16 with 64-dimensional heads on SM80 architecture. --- csrc/src/flash_fwd_split_hdim64_bf16_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim64_bf16_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim64_bf16_sm80.cu b/csrc/src/flash_fwd_split_hdim64_bf16_sm80.cu new file mode 100644 index 0000000..8d72e93 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim64_bf16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From c331e5b4922f60c8050905b3ca7d4dedbefec439 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:44:44 +0800 Subject: [PATCH 31/51] Adds CUDA kernel for 64-dim causal attention Splits flash attention kernels by head dimension to improve compilation performance. Creates specialized kernel instantiation for 64-dimensional heads with causal masking on SM80 architecture using half precision. --- csrc/src/flash_fwd_split_hdim64_fp16_causal_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim64_fp16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim64_fp16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim64_fp16_causal_sm80.cu new file mode 100644 index 0000000..76ba0c8 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim64_fp16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 2d715f1b0395815187d2989bca16bc0e05161054 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:46:03 +0800 Subject: [PATCH 32/51] Adds split kernel file for 64-dim heads with FP16 Introduces a new compilation unit specifically for 64-dimensional head kernels using half-precision floating point on SM80 architecture. Splits kernel implementations across separate files to reduce compilation time and improve build efficiency for the flash attention forward pass. --- csrc/src/flash_fwd_split_hdim64_fp16_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim64_fp16_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim64_fp16_sm80.cu b/csrc/src/flash_fwd_split_hdim64_fp16_sm80.cu new file mode 100644 index 0000000..ab07719 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim64_fp16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 5baaedb85cd4bbc852a691d7178e399e9ebcc64b Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:46:11 +0800 Subject: [PATCH 33/51] Adds specialized kernel for 96-dim causal attention Splits head dimension 96 into separate compilation unit to improve build times. Uses bfloat16 precision with causal masking for SM80 architecture. 
--- csrc/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu new file mode 100644 index 0000000..8d44e28 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 8d63d454e05eaad91b2f2ad36577493e67c6e432 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:46:20 +0800 Subject: [PATCH 34/51] Adds split kernel for bfloat16 head dimension 96 Introduces a dedicated compilation unit for bfloat16 kernels with head dimension 96 to improve build performance. Splits kernel instantiations across separate files to reduce compilation time and memory usage during builds. --- csrc/src/flash_fwd_split_hdim96_bf16_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim96_bf16_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim96_bf16_sm80.cu b/csrc/src/flash_fwd_split_hdim96_bf16_sm80.cu new file mode 100644 index 0000000..252b468 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim96_bf16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From fa967e6404089f6713ee27c0931af12009536105 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:46:44 +0800 Subject: [PATCH 35/51] Adds specialized kernel for hdim96 fp16 causal attention Splits head dimension 96 implementation into separate compilation unit to improve build performance. Template instantiation targets SM80 architecture with half precision and causal masking enabled. --- csrc/src/flash_fwd_split_hdim96_fp16_causal_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim96_fp16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim96_fp16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim96_fp16_causal_sm80.cu new file mode 100644 index 0000000..3eb97b7 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim96_fp16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 26b953bf7fe5cfe036f7bb7d9268d07817bc3ea5 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:51:17 +0800 Subject: [PATCH 36/51] Adds split kernel for head dimension 96 with FP16 Introduces specialized kernel instantiation for 96-dimensional heads using half precision to enable faster compilation through file splitting strategy. Improves build performance by isolating specific head dimension and precision combinations into separate compilation units. --- csrc/src/flash_fwd_split_hdim96_fp16_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim96_fp16_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim96_fp16_sm80.cu b/csrc/src/flash_fwd_split_hdim96_fp16_sm80.cu new file mode 100644 index 0000000..367e12b --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim96_fp16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 2a4c14ba9dbe044cfb64dfb4d4269fb29ad9b4ff Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:51:30 +0800 Subject: [PATCH 37/51] Adds split kernel for bf16 causal attention Introduces specialized kernel file for bfloat16 data type with 128 head dimension and causal masking targeting SM80 architecture. Separates kernel implementations into individual files to improve compilation performance by reducing build times through parallel compilation and selective recompilation. --- csrc/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu new file mode 100644 index 0000000..b4118c7 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From d8ff35ab2e25b352c4e2f784dd71eaded7bcb049 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:51:48 +0800 Subject: [PATCH 38/51] Adds split kernel for bf16 hdim128 on SM80 Splits kernel compilation by head dimension to improve build times. Auto-generated file contains template instantiation for bfloat16 with 128 head dimensions targeting SM80 architecture. 
--- csrc/src/flash_fwd_split_hdim128_bf16_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim128_bf16_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim128_bf16_sm80.cu b/csrc/src/flash_fwd_split_hdim128_bf16_sm80.cu new file mode 100644 index 0000000..cdcfbe9 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim128_bf16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 6aa467c253210aad86ac63bbd7438aa6f0eafec9 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:51:57 +0800 Subject: [PATCH 39/51] Adds specialized CUDA kernel for FP16 causal attention Introduces auto-generated kernel file for head dimension 128 with FP16 precision and causal masking on SM80 architecture. Splits kernel implementations into separate files to accelerate compilation times by reducing template instantiation overhead per compilation unit. --- csrc/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu new file mode 100644 index 0000000..71e415d --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 34a6fe1adcf825b5bc1e045797b67cf60474a339 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:52:07 +0800 Subject: [PATCH 40/51] Adds split kernel for fp16 hdim128 on SM80 Introduces a new compilation unit for FlashAttention forward pass kernels with specific parameters (fp16, head dimension 128, SM80 architecture). Separates kernel instantiations into individual files to reduce compilation time and improve build parallelization. --- csrc/src/flash_fwd_split_hdim128_fp16_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim128_fp16_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim128_fp16_sm80.cu b/csrc/src/flash_fwd_split_hdim128_fp16_sm80.cu new file mode 100644 index 0000000..df4febe --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim128_fp16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 32943f139a4104868fceff27c0ed44d040861a13 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:52:16 +0800 Subject: [PATCH 41/51] Adds specialized kernel for head dimension 192 with bfloat16 Introduces a dedicated CUDA kernel file for flash attention forward pass with specific parameters: head dimension 192, bfloat16 data type, causal masking, and SM80 architecture. Splits kernel implementations into separate files to improve compilation performance by reducing compilation time through modular organization. --- csrc/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu new file mode 100644 index 0000000..83c8f8a --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 36992257e13c2ce3e0ed43e2dae317f7f36b8122 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:52:24 +0800 Subject: [PATCH 42/51] Adds specialized kernel for head dimension 192 with bfloat16 Introduces template specialization to improve compilation performance by splitting different head dimensions into separate files. Supports SM80 architecture with split-kv dispatch functionality. --- csrc/src/flash_fwd_split_hdim192_bf16_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim192_bf16_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim192_bf16_sm80.cu b/csrc/src/flash_fwd_split_hdim192_bf16_sm80.cu new file mode 100644 index 0000000..d3bbf47 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim192_bf16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From fc4c89310e9afba5c37587f3bfc4624522e9f8d0 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:52:33 +0800 Subject: [PATCH 43/51] Adds specialized kernel for head dimension 192 with FP16 Introduces dedicated CUDA kernel file for flash attention forward pass with 192 head dimensions, FP16 precision, and causal masking on SM80. Splits kernel implementations into separate files to improve compilation speed and reduce build times for the flash attention library. 
--- csrc/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu new file mode 100644 index 0000000..5652982 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 7eff59259f913d4968f3c4b4097695033c8fccd6 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:52:41 +0800 Subject: [PATCH 44/51] Adds split kernel for head dimension 192 with FP16 Introduces a specialized kernel file for head dimension 192 using half precision to improve compilation performance through file splitting. The template instantiation targets SM80 architecture and supports split-KV dispatch functionality. --- csrc/src/flash_fwd_split_hdim192_fp16_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim192_fp16_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim192_fp16_sm80.cu b/csrc/src/flash_fwd_split_hdim192_fp16_sm80.cu new file mode 100644 index 0000000..edac2b6 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim192_fp16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From f4dcef6baa74aa6532b58e4a5d295347aaa63021 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:52:49 +0800 Subject: [PATCH 45/51] Adds specialized kernel for hdim256 bf16 causal attention Splits flash attention kernels by head dimension to improve compilation speed. Creates dedicated compilation unit for 256-dimensional bfloat16 causal attention on SM80 architecture. --- csrc/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu new file mode 100644 index 0000000..28ab7ad --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 3460dc775bab32331171d16ddebe9c89fd534b82 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:53:02 +0800 Subject: [PATCH 46/51] Adds split kernel for head dimension 256 with bfloat16 Creates a dedicated compilation unit for the specific configuration of head dimension 256 using bfloat16 precision on SM80 architecture. Improves build performance by separating kernel instantiations across multiple files, reducing compilation time for the flash attention implementation. --- csrc/src/flash_fwd_split_hdim256_bf16_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim256_bf16_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim256_bf16_sm80.cu b/csrc/src/flash_fwd_split_hdim256_bf16_sm80.cu new file mode 100644 index 0000000..751035e --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim256_bf16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From b5fb04a6a9986593f1a3e6592c8592def66a058d Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:53:12 +0800 Subject: [PATCH 47/51] Adds specialized kernel for 256-dim causal attention Introduces a dedicated CUDA kernel file for flash attention forward pass with 256 head dimensions and causal masking on SM80 architecture. Splits kernel implementations into separate files to accelerate compilation times by enabling parallel compilation of different head dimension variants. --- csrc/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu new file mode 100644 index 0000000..502b5cc --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From dae54414d0d6cdc5c85c584ef01b7d5a9cb42bb6 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:53:22 +0800 Subject: [PATCH 48/51] Adds split kernel for head dimension 256 with FP16 Splits flash attention forward kernels by head dimension to reduce compilation time. Creates dedicated compilation unit for 256-dimensional heads using half precision on SM80 architecture. 
--- csrc/src/flash_fwd_split_hdim256_fp16_sm80.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 csrc/src/flash_fwd_split_hdim256_fp16_sm80.cu diff --git a/csrc/src/flash_fwd_split_hdim256_fp16_sm80.cu b/csrc/src/flash_fwd_split_hdim256_fp16_sm80.cu new file mode 100644 index 0000000..3153e17 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim256_fp16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file From 9a174108de1397b241478959b4ed189335186fb2 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 10:53:36 +0800 Subject: [PATCH 49/51] Enables fwd_split kernel generation and fixes data types Corrects is_causal field type from string to boolean for proper type safety. Uncomments fwd_split template function to enable forward split kernel generation. Extends kernel generation to include both fwd and fwd_split directions instead of only fwd. Removes unused os import to clean up dependencies. --- csrc/src/generate_kernels.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/csrc/src/generate_kernels.py b/csrc/src/generate_kernels.py index 54160ec..0074794 100644 --- a/csrc/src/generate_kernels.py +++ b/csrc/src/generate_kernels.py @@ -1,6 +1,5 @@ import argparse import itertools -import os from dataclasses import dataclass from pathlib import Path from typing import List, Optional @@ -53,7 +52,7 @@ class Kernel: sm: int dtype: str head_dim: int - is_causal: str + is_causal: bool direction: str @property @@ -61,7 +60,7 @@ def template(self) -> str: template_funcs = { "fwd": get_fwd_template, # "bwd": get_bwd_template, - # "fwd_split": get_fwd_split_template + "fwd_split": get_fwd_split_template } template_func = template_funcs[self.direction] return template_func().format( @@ -75,7 +74,8 @@ def filename(self) -> str: return f"flash_{self.direction}_hdim{self.head_dim}_{self.dtype}_{'causal_' if self.is_causal == 'true' else ''}sm{self.sm}.cu" def get_all_kernels() -> List[Kernel]: - for direction in ["fwd"]: #, "fwd_split", "bwd"]: + # for direction in ["fwd", "fwd_split", "bwd"]: + for direction in ["fwd", "fwd_split"]: for dtype, head_dim, is_causal, sm in itertools.product(DTYPE_MAP.keys(), HEAD_DIMENSIONS, IS_CAUSAL, SM): yield Kernel(sm=sm, dtype=dtype, head_dim=head_dim, is_causal=is_causal, direction=direction) From 1421c1bae7b3a9431c050d6a2f03235b764a2e23 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 11:00:20 +0800 Subject: [PATCH 50/51] Changes is_causal field type from bool to str Updates the data type of the is_causal field in the Kernel class to support string-based causal configurations instead of simple boolean values. This change enables more flexible causal masking options beyond just enabled/disabled states. 
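Note: a likely reason the string type is restored here is that the generator both compares is_causal against the literal 'true' when building filenames and, presumably, substitutes the value straight into the C++ source, where Python's True/False would not be valid boolean literals. The snippet below is a minimal sketch of that failure mode, assuming IS_CAUSAL holds the strings "false" and "true"; the stub string is illustrative, not the template the script actually uses.

# String values pass straight through into valid C++; a Python bool would not.
IS_CAUSAL = ["false", "true"]

stub = "template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 192, {IS_CAUSAL}>(Flash_fwd_params &params, cudaStream_t stream);"

print(stub.format(IS_CAUSAL="true"))   # renders ..., 192, true>  -- valid C++
print(stub.format(IS_CAUSAL=True))     # renders ..., 192, True>  -- not a C++ literal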
--- csrc/src/generate_kernels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/src/generate_kernels.py b/csrc/src/generate_kernels.py index 0074794..082ccfd 100644 --- a/csrc/src/generate_kernels.py +++ b/csrc/src/generate_kernels.py @@ -52,7 +52,7 @@ class Kernel: sm: int dtype: str head_dim: int - is_causal: bool + is_causal: str direction: str @property From 51daf66b67efdae6a18ddcf74969973ba6b81479 Mon Sep 17 00:00:00 2001 From: Loser Cheems Date: Thu, 26 Jun 2025 11:03:49 +0800 Subject: [PATCH 51/51] Changes return type from List to Generator for memory efficiency Improves memory usage by returning a generator instead of materializing all kernel objects in memory at once. Updates import statement to remove unused List type and add Generator type. --- csrc/src/generate_kernels.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/src/generate_kernels.py b/csrc/src/generate_kernels.py index 082ccfd..a359fe3 100644 --- a/csrc/src/generate_kernels.py +++ b/csrc/src/generate_kernels.py @@ -2,7 +2,7 @@ import itertools from dataclasses import dataclass from pathlib import Path -from typing import List, Optional +from typing import Optional, Generator DTYPE_MAP = { "fp16": "cutlass::half_t", @@ -73,7 +73,7 @@ def template(self) -> str: def filename(self) -> str: return f"flash_{self.direction}_hdim{self.head_dim}_{self.dtype}_{'causal_' if self.is_causal == 'true' else ''}sm{self.sm}.cu" -def get_all_kernels() -> List[Kernel]: +def get_all_kernels() -> Generator[Kernel, None, None]: # for direction in ["fwd", "fwd_split", "bwd"]: for direction in ["fwd", "fwd_split"]: for dtype, head_dim, is_causal, sm in itertools.product(DTYPE_MAP.keys(), HEAD_DIMENSIONS, IS_CAUSAL, SM):
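Note: with the return type changed to a Generator, callers can stream kernels and write each generated file as it is produced instead of materializing the whole list first. The self-contained sketch below shows that usage; the DTYPES, HEAD_DIMENSIONS, IS_CAUSAL and SM values are illustrative stand-ins, not necessarily the lists defined in generate_kernels.py.

from dataclasses import dataclass
from typing import Generator
import itertools

# Illustrative configuration lists; the real script defines its own.
DTYPES = ["fp16", "bf16"]
HEAD_DIMENSIONS = [32, 64, 96, 128, 192, 256]
IS_CAUSAL = ["false", "true"]
SM = [80]

@dataclass
class Kernel:
    sm: int
    dtype: str
    head_dim: int
    is_causal: str
    direction: str

def get_all_kernels() -> Generator[Kernel, None, None]:
    # Yield one Kernel at a time rather than building a list up front.
    for direction in ["fwd", "fwd_split"]:
        for dtype, head_dim, is_causal, sm in itertools.product(DTYPES, HEAD_DIMENSIONS, IS_CAUSAL, SM):
            yield Kernel(sm=sm, dtype=dtype, head_dim=head_dim, is_causal=is_causal, direction=direction)

# Consume lazily: only one Kernel object is alive at a time.
for kernel in get_all_kernels():
    pass  # e.g. render the kernel's template and write it to its .cu file here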