From 698c9f24187b990e35c3b73a8067e5387e6ddbd4 Mon Sep 17 00:00:00 2001
From: zhang hui <you@example.com>
Date: Sat, 22 Nov 2025 14:56:20 +0800
Subject: [PATCH 1/7] enable mmf for rdna4

---
 ggml/src/ggml-cuda/mmf.cu  |  2 +-
 ggml/src/ggml-cuda/mmf.cuh | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/mmf.cu b/ggml/src/ggml-cuda/mmf.cu
index 5c51a22256a..be2ad1c6b65 100644
--- a/ggml/src/ggml-cuda/mmf.cu
+++ b/ggml/src/ggml-cuda/mmf.cu
@@ -151,7 +151,7 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
             return false;
         }
     } else {
-        if (src1_ncols > 16 || GGML_CUDA_CC_IS_RDNA4(cc)) {
+        if (src1_ncols > 16) {
             return false;
         }
     }
diff --git a/ggml/src/ggml-cuda/mmf.cuh b/ggml/src/ggml-cuda/mmf.cuh
index c2a0a2e42fe..18de0dc04a2 100644
--- a/ggml/src/ggml-cuda/mmf.cuh
+++ b/ggml/src/ggml-cuda/mmf.cuh
@@ -8,6 +8,19 @@ using namespace ggml_cuda_mma;
 
 #define MMF_ROWS_PER_BLOCK 32
 
+// TODO: submit a bug to rocm compiler, remove this when the bug is fixed.
+// force rocm compiler to use more register and unroll code for mul_mat_f.
+#if defined(RDNA4)
+#define MMF_REGISTER_UNROLL_FOR_RDNA \
+do {                                 \
+    if (blockIdx.z == -1) {          \
+        NO_DEVICE_CODE;              \
+    }                                \
+} while(0)
+#else
+#define MMF_REGISTER_UNROLL_FOR_RDNA
+#endif // defined(RDNA4)
+
 struct mmf_ids_data {
     const int32_t * ids_src_compact = nullptr;
     const int32_t * ids_dst_compact = nullptr;
@@ -153,6 +166,7 @@ static __global__ void mul_mat_f(
 #pragma unroll
             for (int k0 = 0; k0 < warp_size; k0 += tile_A::J) {
                 load_ldmatrix(A[itA][k0/tile_A::J], tile_xy + k0, tile_k_padded);
+                MMF_REGISTER_UNROLL_FOR_RDNA;
             }
         }
 
@@ -191,6 +205,7 @@ static __global__ void mul_mat_f(
             for (int k0 = 0; k0 < warp_size; k0 += tile_B::J) {
                 tile_B B;
                 load_ldmatrix(B, tile_xy + k0, tile_k_padded);
+                MMF_REGISTER_UNROLL_FOR_RDNA;
 #pragma unroll
                 for (int itA = 0; itA < ntA; ++itA) {
                     mma(C[itA][itB], A[itA][k0/tile_B::J], B);

From 99b92bd6653cc8593607f641e44606391691792f Mon Sep 17 00:00:00 2001
From: zhang hui <you@example.com>
Date: Sat, 22 Nov 2025 15:31:32 +0800
Subject: [PATCH 2/7] move some mmvf to mmf

---
 ggml/src/ggml-cuda/mmvf.cu | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu
index 6238ce7ebd7..b9ef371e0dc 100644
--- a/ggml/src/ggml-cuda/mmvf.cu
+++ b/ggml/src/ggml-cuda/mmvf.cu
@@ -765,7 +765,7 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0
                 return ne11 <= 8;
             } else if (GGML_CUDA_CC_IS_AMD(cc)) {
                 if (fp16_mma_hardware_available(cc)) {
-                    if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
+                    if (GGML_CUDA_CC_IS_RDNA3(cc)) {
                         return ne11 <= 5;
                     }
                     return ne11 <= 2;
@@ -788,6 +788,9 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0
                 return ne11 <= 8;
             } else if (GGML_CUDA_CC_IS_AMD(cc)) {
                 if (bf16_mma_hardware_available(cc)) {
+                    if (GGML_CUDA_CC_IS_RDNA4(cc)) {
+                        return ne11 <= 2;
+                    }
                     return ne11 <= 3;
                 }
                 return ne11 <= 8;

From db9ae8b6b4738a5def5b393caa1611d52133e9b5 Mon Sep 17 00:00:00 2001
From: zhang hui <you@example.com>
Date: Tue, 25 Nov 2025 17:06:40 +0800
Subject: [PATCH 3/7] revert lds128 for wmma loading

---
 ggml/src/ggml-cuda/mma.cuh | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh
index caa08b360b5..8300bde0ff2 100644
--- a/ggml/src/ggml-cuda/mma.cuh
+++ b/ggml/src/ggml-cuda/mma.cuh
@@ -441,13 +441,8 @@ namespace ggml_cuda_mma {
             int64_t * xi = (int64_t *) t.x;
             const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
             xi[0] = xs[0];
-        }else if constexpr (I == 16 && J == 8) {
-            int64_t * xi = (int64_t *) t.x;
-            const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I));
-            xi[0] = xs[0];
-
-            const int64_t * xs1 = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I) + 2);
-            xi[1] = xs1[0];
+        }else if constexpr (I == 16 && J == 8 && (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>)) {
+            ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
         }else{
             NO_DEVICE_CODE;
         }

From 900c2a944d57798c8cd99490c9b5fc19d83b4274 Mon Sep 17 00:00:00 2001
From: zhang hui <you@example.com>
Date: Wed, 26 Nov 2025 14:31:11 +0800
Subject: [PATCH 4/7] Revert "revert lds128 for wmma loading"

This reverts commit db9ae8b6b4738a5def5b393caa1611d52133e9b5.
---
 ggml/src/ggml-cuda/mma.cuh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh
index 8300bde0ff2..caa08b360b5 100644
--- a/ggml/src/ggml-cuda/mma.cuh
+++ b/ggml/src/ggml-cuda/mma.cuh
@@ -441,8 +441,13 @@ namespace ggml_cuda_mma {
             int64_t * xi = (int64_t *) t.x;
             const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
             xi[0] = xs[0];
-        }else if constexpr (I == 16 && J == 8 && (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>)) {
-            ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
+        }else if constexpr (I == 16 && J == 8) {
+            int64_t * xi = (int64_t *) t.x;
+            const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I));
+            xi[0] = xs[0];
+
+            const int64_t * xs1 = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I) + 2);
+            xi[1] = xs1[0];
         }else{
             NO_DEVICE_CODE;
         }

From 806ca35deaae7858563b47412b162d374b1205be Mon Sep 17 00:00:00 2001
From: zhang hui <you@example.com>
Date: Thu, 27 Nov 2025 14:10:26 +0800
Subject: [PATCH 5/7] Revert "enable mmf for rdna4"

This reverts commit 698c9f24187b990e35c3b73a8067e5387e6ddbd4.
---
 ggml/src/ggml-cuda/mmf.cu  |  2 +-
 ggml/src/ggml-cuda/mmf.cuh | 15 ---------------
 2 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/ggml/src/ggml-cuda/mmf.cu b/ggml/src/ggml-cuda/mmf.cu
index be2ad1c6b65..5c51a22256a 100644
--- a/ggml/src/ggml-cuda/mmf.cu
+++ b/ggml/src/ggml-cuda/mmf.cu
@@ -151,7 +151,7 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
             return false;
         }
     } else {
-        if (src1_ncols > 16) {
+        if (src1_ncols > 16 || GGML_CUDA_CC_IS_RDNA4(cc)) {
             return false;
         }
     }
diff --git a/ggml/src/ggml-cuda/mmf.cuh b/ggml/src/ggml-cuda/mmf.cuh
index 18de0dc04a2..c2a0a2e42fe 100644
--- a/ggml/src/ggml-cuda/mmf.cuh
+++ b/ggml/src/ggml-cuda/mmf.cuh
@@ -8,19 +8,6 @@ using namespace ggml_cuda_mma;
 
 #define MMF_ROWS_PER_BLOCK 32
 
-// TODO: submit a bug to rocm compiler, remove this when the bug is fixed.
-// force rocm compiler to use more register and unroll code for mul_mat_f.
-#if defined(RDNA4)
-#define MMF_REGISTER_UNROLL_FOR_RDNA \
-do {                                 \
-    if (blockIdx.z == -1) {          \
-        NO_DEVICE_CODE;              \
-    }                                \
-} while(0)
-#else
-#define MMF_REGISTER_UNROLL_FOR_RDNA
-#endif // defined(RDNA4)
-
 struct mmf_ids_data {
     const int32_t * ids_src_compact = nullptr;
     const int32_t * ids_dst_compact = nullptr;
@@ -166,7 +153,6 @@ static __global__ void mul_mat_f(
 #pragma unroll
             for (int k0 = 0; k0 < warp_size; k0 += tile_A::J) {
                 load_ldmatrix(A[itA][k0/tile_A::J], tile_xy + k0, tile_k_padded);
-                MMF_REGISTER_UNROLL_FOR_RDNA;
             }
         }
 
@@ -205,7 +191,6 @@ static __global__ void mul_mat_f(
             for (int k0 = 0; k0 < warp_size; k0 += tile_B::J) {
                 tile_B B;
                 load_ldmatrix(B, tile_xy + k0, tile_k_padded);
-                MMF_REGISTER_UNROLL_FOR_RDNA;
 #pragma unroll
                 for (int itA = 0; itA < ntA; ++itA) {
                     mma(C[itA][itB], A[itA][k0/tile_B::J], B);

From e27e8a1111f80b753d95a5dcc6dc1cc59a412934 Mon Sep 17 00:00:00 2001
From: zhang hui <you@example.com>
Date: Thu, 27 Nov 2025 14:10:35 +0800
Subject: [PATCH 6/7] Revert "move some mmvf to mmf"

This reverts commit 99b92bd6653cc8593607f641e44606391691792f.
---
 ggml/src/ggml-cuda/mmvf.cu | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu
index b9ef371e0dc..6238ce7ebd7 100644
--- a/ggml/src/ggml-cuda/mmvf.cu
+++ b/ggml/src/ggml-cuda/mmvf.cu
@@ -765,7 +765,7 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0
                 return ne11 <= 8;
             } else if (GGML_CUDA_CC_IS_AMD(cc)) {
                 if (fp16_mma_hardware_available(cc)) {
-                    if (GGML_CUDA_CC_IS_RDNA3(cc)) {
+                    if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
                         return ne11 <= 5;
                     }
                     return ne11 <= 2;
@@ -788,9 +788,6 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0
                 return ne11 <= 8;
             } else if (GGML_CUDA_CC_IS_AMD(cc)) {
                 if (bf16_mma_hardware_available(cc)) {
-                    if (GGML_CUDA_CC_IS_RDNA4(cc)) {
-                        return ne11 <= 2;
-                    }
                     return ne11 <= 3;
                 }
                 return ne11 <= 8;

From 8041802dcafcfdbc5db3fd8d53ca292a7a536198 Mon Sep 17 00:00:00 2001
From: zhang hui <you@example.com>
Date: Thu, 27 Nov 2025 14:17:59 +0800
Subject: [PATCH 7/7] enable mul_mat for rdna4

---
 ggml/src/ggml-cuda/mmf.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/mmf.cu b/ggml/src/ggml-cuda/mmf.cu
index 5c51a22256a..be2ad1c6b65 100644
--- a/ggml/src/ggml-cuda/mmf.cu
+++ b/ggml/src/ggml-cuda/mmf.cu
@@ -151,7 +151,7 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
             return false;
         }
     } else {
-        if (src1_ncols > 16 || GGML_CUDA_CC_IS_RDNA4(cc)) {
+        if (src1_ncols > 16) {
             return false;
         }
     }