From 698c9f24187b990e35c3b73a8067e5387e6ddbd4 Mon Sep 17 00:00:00 2001 From: zhang hui Date: Sat, 22 Nov 2025 14:56:20 +0800 Subject: [PATCH 1/7] enable mmf for rdna4 --- ggml/src/ggml-cuda/mmf.cu | 2 +- ggml/src/ggml-cuda/mmf.cuh | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/mmf.cu b/ggml/src/ggml-cuda/mmf.cu index 5c51a22256a..be2ad1c6b65 100644 --- a/ggml/src/ggml-cuda/mmf.cu +++ b/ggml/src/ggml-cuda/mmf.cu @@ -151,7 +151,7 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const return false; } } else { - if (src1_ncols > 16 || GGML_CUDA_CC_IS_RDNA4(cc)) { + if (src1_ncols > 16) { return false; } } diff --git a/ggml/src/ggml-cuda/mmf.cuh b/ggml/src/ggml-cuda/mmf.cuh index c2a0a2e42fe..18de0dc04a2 100644 --- a/ggml/src/ggml-cuda/mmf.cuh +++ b/ggml/src/ggml-cuda/mmf.cuh @@ -8,6 +8,19 @@ using namespace ggml_cuda_mma; #define MMF_ROWS_PER_BLOCK 32 +// TODO: submit a bug to rocm compiler, remove this when the bug is fixed. +// force rocm compiler to use more register and unroll code for mul_mat_f. +#if defined(RDNA4) +#define MMF_REGISTER_UNROLL_FOR_RDNA \ +do { \ + if (blockIdx.z == -1) { \ + NO_DEVICE_CODE; \ + } \ +} while(0) +#else +#define MMF_REGISTER_UNROLL_FOR_RDNA +#endif // defined(RDNA4) + struct mmf_ids_data { const int32_t * ids_src_compact = nullptr; const int32_t * ids_dst_compact = nullptr; @@ -153,6 +166,7 @@ static __global__ void mul_mat_f( #pragma unroll for (int k0 = 0; k0 < warp_size; k0 += tile_A::J) { load_ldmatrix(A[itA][k0/tile_A::J], tile_xy + k0, tile_k_padded); + MMF_REGISTER_UNROLL_FOR_RDNA; } } @@ -191,6 +205,7 @@ static __global__ void mul_mat_f( for (int k0 = 0; k0 < warp_size; k0 += tile_B::J) { tile_B B; load_ldmatrix(B, tile_xy + k0, tile_k_padded); + MMF_REGISTER_UNROLL_FOR_RDNA; #pragma unroll for (int itA = 0; itA < ntA; ++itA) { mma(C[itA][itB], A[itA][k0/tile_B::J], B); From 99b92bd6653cc8593607f641e44606391691792f Mon Sep 17 00:00:00 2001 From: zhang hui Date: Sat, 22 Nov 2025 15:31:32 +0800 Subject: [PATCH 2/7] move some mmvf to mmf --- ggml/src/ggml-cuda/mmvf.cu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu index 6238ce7ebd7..b9ef371e0dc 100644 --- a/ggml/src/ggml-cuda/mmvf.cu +++ b/ggml/src/ggml-cuda/mmvf.cu @@ -765,7 +765,7 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0 return ne11 <= 8; } else if (GGML_CUDA_CC_IS_AMD(cc)) { if (fp16_mma_hardware_available(cc)) { - if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) { + if (GGML_CUDA_CC_IS_RDNA3(cc)) { return ne11 <= 5; } return ne11 <= 2; @@ -788,6 +788,9 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0 return ne11 <= 8; } else if (GGML_CUDA_CC_IS_AMD(cc)) { if (bf16_mma_hardware_available(cc)) { + if (GGML_CUDA_CC_IS_RDNA4(cc)) { + return ne11 <= 2; + } return ne11 <= 3; } return ne11 <= 8; From db9ae8b6b4738a5def5b393caa1611d52133e9b5 Mon Sep 17 00:00:00 2001 From: zhang hui Date: Tue, 25 Nov 2025 17:06:40 +0800 Subject: [PATCH 3/7] revert lds128 for wmma loading --- ggml/src/ggml-cuda/mma.cuh | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index caa08b360b5..8300bde0ff2 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh @@ -441,13 +441,8 @@ namespace ggml_cuda_mma { int64_t * xi = (int64_t *) t.x; const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I)); xi[0] = xs[0]; - }else if constexpr (I == 16 && J == 8) { - int64_t * xi = (int64_t *) t.x; - const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I)); - xi[0] = xs[0]; - - const int64_t * xs1 = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I) + 2); - xi[1] = xs1[0]; + }else if constexpr (I == 16 && J == 8 && (std::is_same_v || std::is_same_v)) { + ggml_cuda_memcpy_1(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); }else{ NO_DEVICE_CODE; } From 900c2a944d57798c8cd99490c9b5fc19d83b4274 Mon Sep 17 00:00:00 2001 From: zhang hui Date: Wed, 26 Nov 2025 14:31:11 +0800 Subject: [PATCH 4/7] Revert "revert lds128 for wmma loading" This reverts commit db9ae8b6b4738a5def5b393caa1611d52133e9b5. --- ggml/src/ggml-cuda/mma.cuh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 8300bde0ff2..caa08b360b5 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh @@ -441,8 +441,13 @@ namespace ggml_cuda_mma { int64_t * xi = (int64_t *) t.x; const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I)); xi[0] = xs[0]; - }else if constexpr (I == 16 && J == 8 && (std::is_same_v || std::is_same_v)) { - ggml_cuda_memcpy_1(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); + }else if constexpr (I == 16 && J == 8) { + int64_t * xi = (int64_t *) t.x; + const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I)); + xi[0] = xs[0]; + + const int64_t * xs1 = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I) + 2); + xi[1] = xs1[0]; }else{ NO_DEVICE_CODE; } From 806ca35deaae7858563b47412b162d374b1205be Mon Sep 17 00:00:00 2001 From: zhang hui Date: Thu, 27 Nov 2025 14:10:26 +0800 Subject: [PATCH 5/7] Revert "enable mmf for rdna4" This reverts commit 698c9f24187b990e35c3b73a8067e5387e6ddbd4. --- ggml/src/ggml-cuda/mmf.cu | 2 +- ggml/src/ggml-cuda/mmf.cuh | 15 --------------- 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/ggml/src/ggml-cuda/mmf.cu b/ggml/src/ggml-cuda/mmf.cu index be2ad1c6b65..5c51a22256a 100644 --- a/ggml/src/ggml-cuda/mmf.cu +++ b/ggml/src/ggml-cuda/mmf.cu @@ -151,7 +151,7 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const return false; } } else { - if (src1_ncols > 16) { + if (src1_ncols > 16 || GGML_CUDA_CC_IS_RDNA4(cc)) { return false; } } diff --git a/ggml/src/ggml-cuda/mmf.cuh b/ggml/src/ggml-cuda/mmf.cuh index 18de0dc04a2..c2a0a2e42fe 100644 --- a/ggml/src/ggml-cuda/mmf.cuh +++ b/ggml/src/ggml-cuda/mmf.cuh @@ -8,19 +8,6 @@ using namespace ggml_cuda_mma; #define MMF_ROWS_PER_BLOCK 32 -// TODO: submit a bug to rocm compiler, remove this when the bug is fixed. -// force rocm compiler to use more register and unroll code for mul_mat_f. -#if defined(RDNA4) -#define MMF_REGISTER_UNROLL_FOR_RDNA \ -do { \ - if (blockIdx.z == -1) { \ - NO_DEVICE_CODE; \ - } \ -} while(0) -#else -#define MMF_REGISTER_UNROLL_FOR_RDNA -#endif // defined(RDNA4) - struct mmf_ids_data { const int32_t * ids_src_compact = nullptr; const int32_t * ids_dst_compact = nullptr; @@ -166,7 +153,6 @@ static __global__ void mul_mat_f( #pragma unroll for (int k0 = 0; k0 < warp_size; k0 += tile_A::J) { load_ldmatrix(A[itA][k0/tile_A::J], tile_xy + k0, tile_k_padded); - MMF_REGISTER_UNROLL_FOR_RDNA; } } @@ -205,7 +191,6 @@ static __global__ void mul_mat_f( for (int k0 = 0; k0 < warp_size; k0 += tile_B::J) { tile_B B; load_ldmatrix(B, tile_xy + k0, tile_k_padded); - MMF_REGISTER_UNROLL_FOR_RDNA; #pragma unroll for (int itA = 0; itA < ntA; ++itA) { mma(C[itA][itB], A[itA][k0/tile_B::J], B); From e27e8a1111f80b753d95a5dcc6dc1cc59a412934 Mon Sep 17 00:00:00 2001 From: zhang hui Date: Thu, 27 Nov 2025 14:10:35 +0800 Subject: [PATCH 6/7] Revert "move some mmvf to mmf" This reverts commit 99b92bd6653cc8593607f641e44606391691792f. --- ggml/src/ggml-cuda/mmvf.cu | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu index b9ef371e0dc..6238ce7ebd7 100644 --- a/ggml/src/ggml-cuda/mmvf.cu +++ b/ggml/src/ggml-cuda/mmvf.cu @@ -765,7 +765,7 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0 return ne11 <= 8; } else if (GGML_CUDA_CC_IS_AMD(cc)) { if (fp16_mma_hardware_available(cc)) { - if (GGML_CUDA_CC_IS_RDNA3(cc)) { + if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) { return ne11 <= 5; } return ne11 <= 2; @@ -788,9 +788,6 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0 return ne11 <= 8; } else if (GGML_CUDA_CC_IS_AMD(cc)) { if (bf16_mma_hardware_available(cc)) { - if (GGML_CUDA_CC_IS_RDNA4(cc)) { - return ne11 <= 2; - } return ne11 <= 3; } return ne11 <= 8; From 8041802dcafcfdbc5db3fd8d53ca292a7a536198 Mon Sep 17 00:00:00 2001 From: zhang hui Date: Thu, 27 Nov 2025 14:17:59 +0800 Subject: [PATCH 7/7] enable mul_mat for rdna4 --- ggml/src/ggml-cuda/mmf.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/mmf.cu b/ggml/src/ggml-cuda/mmf.cu index 5c51a22256a..be2ad1c6b65 100644 --- a/ggml/src/ggml-cuda/mmf.cu +++ b/ggml/src/ggml-cuda/mmf.cu @@ -151,7 +151,7 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const return false; } } else { - if (src1_ncols > 16 || GGML_CUDA_CC_IS_RDNA4(cc)) { + if (src1_ncols > 16) { return false; } }