diff --git a/csrc/src/flash_fwd_hdim128_bf16_causal_sm80.cu b/csrc/src/flash_fwd_hdim128_bf16_causal_sm80.cu new file mode 100644 index 0000000..2bacfd3 --- /dev/null +++ b/csrc/src/flash_fwd_hdim128_bf16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim128_bf16_sm80.cu b/csrc/src/flash_fwd_hdim128_bf16_sm80.cu new file mode 100644 index 0000000..bc6f103 --- /dev/null +++ b/csrc/src/flash_fwd_hdim128_bf16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim128_fp16_causal_sm80.cu b/csrc/src/flash_fwd_hdim128_fp16_causal_sm80.cu new file mode 100644 index 0000000..48c2d89 --- /dev/null +++ b/csrc/src/flash_fwd_hdim128_fp16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim128_fp16_sm80.cu b/csrc/src/flash_fwd_hdim128_fp16_sm80.cu new file mode 100644 index 0000000..c67fdce --- /dev/null +++ b/csrc/src/flash_fwd_hdim128_fp16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim192_bf16_causal_sm80.cu b/csrc/src/flash_fwd_hdim192_bf16_causal_sm80.cu new file mode 100644 index 0000000..e957ce0 --- /dev/null +++ b/csrc/src/flash_fwd_hdim192_bf16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim192_bf16_sm80.cu b/csrc/src/flash_fwd_hdim192_bf16_sm80.cu new file mode 100644 index 0000000..f53f7f5 --- /dev/null +++ b/csrc/src/flash_fwd_hdim192_bf16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim192_fp16_causal_sm80.cu b/csrc/src/flash_fwd_hdim192_fp16_causal_sm80.cu new file mode 100644 index 0000000..805e548 --- /dev/null +++ b/csrc/src/flash_fwd_hdim192_fp16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim192_fp16_sm80.cu b/csrc/src/flash_fwd_hdim192_fp16_sm80.cu new file mode 100644 index 0000000..87a6565 --- /dev/null +++ b/csrc/src/flash_fwd_hdim192_fp16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim256_bf16_causal_sm80.cu b/csrc/src/flash_fwd_hdim256_bf16_causal_sm80.cu new file mode 100644 index 0000000..ded9ab1 --- /dev/null +++ b/csrc/src/flash_fwd_hdim256_bf16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim256_bf16_sm80.cu b/csrc/src/flash_fwd_hdim256_bf16_sm80.cu new file mode 100644 index 0000000..c3d9e04 --- /dev/null +++ b/csrc/src/flash_fwd_hdim256_bf16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim256_fp16_causal_sm80.cu b/csrc/src/flash_fwd_hdim256_fp16_causal_sm80.cu new file mode 100644 index 0000000..8780ed4 --- /dev/null +++ b/csrc/src/flash_fwd_hdim256_fp16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim256_fp16_sm80.cu b/csrc/src/flash_fwd_hdim256_fp16_sm80.cu new file mode 100644 index 0000000..293e001 --- /dev/null +++ b/csrc/src/flash_fwd_hdim256_fp16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim32_bf16_causal_sm80.cu b/csrc/src/flash_fwd_hdim32_bf16_causal_sm80.cu new file mode 100644 index 0000000..f50910b --- /dev/null +++ b/csrc/src/flash_fwd_hdim32_bf16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim32_bf16_sm80.cu b/csrc/src/flash_fwd_hdim32_bf16_sm80.cu new file mode 100644 index 0000000..6127386 --- /dev/null +++ b/csrc/src/flash_fwd_hdim32_bf16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim32_fp16_causal_sm80.cu b/csrc/src/flash_fwd_hdim32_fp16_causal_sm80.cu new file mode 100644 index 0000000..845ab35 --- /dev/null +++ b/csrc/src/flash_fwd_hdim32_fp16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim32_fp16_sm80.cu b/csrc/src/flash_fwd_hdim32_fp16_sm80.cu new file mode 100644 index 0000000..94a1301 --- /dev/null +++ b/csrc/src/flash_fwd_hdim32_fp16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim64_bf16_causal_sm80.cu b/csrc/src/flash_fwd_hdim64_bf16_causal_sm80.cu new file mode 100644 index 0000000..f3d4e94 --- /dev/null +++ b/csrc/src/flash_fwd_hdim64_bf16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim64_bf16_sm80.cu b/csrc/src/flash_fwd_hdim64_bf16_sm80.cu new file mode 100644 index 0000000..a11cc79 --- /dev/null +++ b/csrc/src/flash_fwd_hdim64_bf16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim64_fp16_causal_sm80.cu b/csrc/src/flash_fwd_hdim64_fp16_causal_sm80.cu new file mode 100644 index 0000000..1681867 --- /dev/null +++ b/csrc/src/flash_fwd_hdim64_fp16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim64_fp16_sm80.cu b/csrc/src/flash_fwd_hdim64_fp16_sm80.cu new file mode 100644 index 0000000..7b7d184 --- /dev/null +++ b/csrc/src/flash_fwd_hdim64_fp16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim96_bf16_causal_sm80.cu b/csrc/src/flash_fwd_hdim96_bf16_causal_sm80.cu new file mode 100644 index 0000000..ad62e87 --- /dev/null +++ b/csrc/src/flash_fwd_hdim96_bf16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim96_bf16_sm80.cu b/csrc/src/flash_fwd_hdim96_bf16_sm80.cu new file mode 100644 index 0000000..92c4344 --- /dev/null +++ b/csrc/src/flash_fwd_hdim96_bf16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim96_fp16_causal_sm80.cu b/csrc/src/flash_fwd_hdim96_fp16_causal_sm80.cu new file mode 100644 index 0000000..36db0da --- /dev/null +++ b/csrc/src/flash_fwd_hdim96_fp16_causal_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_hdim96_fp16_sm80.cu b/csrc/src/flash_fwd_hdim96_fp16_sm80.cu new file mode 100644 index 0000000..50040b5 --- /dev/null +++ b/csrc/src/flash_fwd_hdim96_fp16_sm80.cu @@ -0,0 +1,14 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu new file mode 100644 index 0000000..b4118c7 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim128_bf16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim128_bf16_sm80.cu b/csrc/src/flash_fwd_split_hdim128_bf16_sm80.cu new file mode 100644 index 0000000..cdcfbe9 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim128_bf16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu new file mode 100644 index 0000000..71e415d --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim128_fp16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim128_fp16_sm80.cu b/csrc/src/flash_fwd_split_hdim128_fp16_sm80.cu new file mode 100644 index 0000000..df4febe --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim128_fp16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu new file mode 100644 index 0000000..83c8f8a --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim192_bf16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim192_bf16_sm80.cu b/csrc/src/flash_fwd_split_hdim192_bf16_sm80.cu new file mode 100644 index 0000000..d3bbf47 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim192_bf16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu new file mode 100644 index 0000000..5652982 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim192_fp16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim192_fp16_sm80.cu b/csrc/src/flash_fwd_split_hdim192_fp16_sm80.cu new file mode 100644 index 0000000..edac2b6 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim192_fp16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu new file mode 100644 index 0000000..28ab7ad --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim256_bf16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim256_bf16_sm80.cu b/csrc/src/flash_fwd_split_hdim256_bf16_sm80.cu new file mode 100644 index 0000000..751035e --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim256_bf16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu new file mode 100644 index 0000000..502b5cc --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim256_fp16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim256_fp16_sm80.cu b/csrc/src/flash_fwd_split_hdim256_fp16_sm80.cu new file mode 100644 index 0000000..3153e17 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim256_fp16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu new file mode 100644 index 0000000..9910f63 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim32_bf16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim32_bf16_sm80.cu b/csrc/src/flash_fwd_split_hdim32_bf16_sm80.cu new file mode 100644 index 0000000..d498fea --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim32_bf16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim32_fp16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim32_fp16_causal_sm80.cu new file mode 100644 index 0000000..a5a713a --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim32_fp16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim32_fp16_sm80.cu b/csrc/src/flash_fwd_split_hdim32_fp16_sm80.cu new file mode 100644 index 0000000..4cfc36a --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim32_fp16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu new file mode 100644 index 0000000..e89cb76 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim64_bf16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim64_bf16_sm80.cu b/csrc/src/flash_fwd_split_hdim64_bf16_sm80.cu new file mode 100644 index 0000000..8d72e93 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim64_bf16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim64_fp16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim64_fp16_causal_sm80.cu new file mode 100644 index 0000000..76ba0c8 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim64_fp16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim64_fp16_sm80.cu b/csrc/src/flash_fwd_split_hdim64_fp16_sm80.cu new file mode 100644 index 0000000..ab07719 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim64_fp16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu new file mode 100644 index 0000000..8d44e28 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim96_bf16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim96_bf16_sm80.cu b/csrc/src/flash_fwd_split_hdim96_bf16_sm80.cu new file mode 100644 index 0000000..252b468 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim96_bf16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim96_fp16_causal_sm80.cu b/csrc/src/flash_fwd_split_hdim96_fp16_causal_sm80.cu new file mode 100644 index 0000000..3eb97b7 --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim96_fp16_causal_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/flash_fwd_split_hdim96_fp16_sm80.cu b/csrc/src/flash_fwd_split_hdim96_fp16_sm80.cu new file mode 100644 index 0000000..367e12b --- /dev/null +++ b/csrc/src/flash_fwd_split_hdim96_fp16_sm80.cu @@ -0,0 +1,11 @@ +// Copyright (c) 2025, Jingze Shi and Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" +#include "namespace_config.h" +#include "flash_fwd_launch_template.h" + +namespace FLASH_NAMESPACE { + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); + +} // namespace FLASH_NAMESPACE \ No newline at end of file diff --git a/csrc/src/generate_kernels.py b/csrc/src/generate_kernels.py index 54160ec..a359fe3 100644 --- a/csrc/src/generate_kernels.py +++ b/csrc/src/generate_kernels.py @@ -1,9 +1,8 @@ import argparse import itertools -import os from dataclasses import dataclass from pathlib import Path -from typing import List, Optional +from typing import Optional, Generator DTYPE_MAP = { "fp16": "cutlass::half_t", @@ -61,7 +60,7 @@ def template(self) -> str: template_funcs = { "fwd": get_fwd_template, # "bwd": get_bwd_template, - # "fwd_split": get_fwd_split_template + "fwd_split": get_fwd_split_template } template_func = template_funcs[self.direction] return template_func().format( @@ -74,8 +73,9 @@ def template(self) -> str: def filename(self) -> str: return f"flash_{self.direction}_hdim{self.head_dim}_{self.dtype}_{'causal_' if self.is_causal == 'true' else ''}sm{self.sm}.cu" -def get_all_kernels() -> List[Kernel]: - for direction in ["fwd"]: #, "fwd_split", "bwd"]: +def get_all_kernels() -> Generator[Kernel, None, None]: + # for direction in ["fwd", "fwd_split", "bwd"]: + for direction in ["fwd", "fwd_split"]: for dtype, head_dim, is_causal, sm in itertools.product(DTYPE_MAP.keys(), HEAD_DIMENSIONS, IS_CAUSAL, SM): yield Kernel(sm=sm, dtype=dtype, head_dim=head_dim, is_causal=is_causal, direction=direction)