From 5bca5cc6f6ccb903dd69ad3edd698a07d277e732 Mon Sep 17 00:00:00 2001 From: Li Date: Mon, 10 Nov 2025 22:38:37 -0800 Subject: [PATCH 1/2] opencl: use subgrroup reduce for reduction in rms_norm_mul --- ggml/src/ggml-opencl/ggml-opencl.cpp | 2 +- ggml/src/ggml-opencl/kernels/rms_norm.cl | 30 ++++++++++++++++-------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 465272fab9092..576d2f59229b4 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -5682,7 +5682,7 @@ static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor * CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2)); CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3)); CL_CHECK(clSetKernelArg(kernel, 23, sizeof(float), &eps)); - CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*nth/sgs, NULL)); + CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*sgs, NULL)); backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } diff --git a/ggml/src/ggml-opencl/kernels/rms_norm.cl b/ggml/src/ggml-opencl/kernels/rms_norm.cl index ecd053cb4c1ce..36a4cda7dd80a 100644 --- a/ggml/src/ggml-opencl/kernels/rms_norm.cl +++ b/ggml/src/ggml-opencl/kernels/rms_norm.cl @@ -134,6 +134,10 @@ kernel void kernel_rms_norm_mul( src1 = src1 + offset1; dst = dst + offsetd; + if (get_sub_group_id() == 0) { + sum[get_sub_group_local_id()] = 0.0f; + } + int i03 = get_group_id(2); int i02 = get_group_id(1); int i01 = get_group_id(0); @@ -148,24 +152,30 @@ kernel void kernel_rms_norm_mul( sumf += dot(x[i00], x[i00]); } sumf = sub_group_reduce_add(sumf); + + barrier(CLK_LOCAL_MEM_FENCE); + if (get_sub_group_local_id() == 0) { sum[get_sub_group_id()] = sumf; } barrier(CLK_LOCAL_MEM_FENCE); - for (uint i = get_local_size(0) / get_max_sub_group_size() / 2; i > 0; i /= 2) { - if (get_local_id(0) < i) { - sum[get_local_id(0)] += sum[get_local_id(0) + i]; - } - } - if (get_local_id(0) == 0) { - sum[0] /= ne00; - } + //for (uint i = get_local_size(0) / get_max_sub_group_size() / 2; i > 0; i /= 2) { + // if (get_local_id(0) < i) { + // sum[get_local_id(0)] += sum[get_local_id(0) + i]; + // } + //} + //if (get_local_id(0) == 0) { + // sum[0] /= ne00; + //} - barrier(CLK_LOCAL_MEM_FENCE); + //barrier(CLK_LOCAL_MEM_FENCE); + + sumf = sum[get_sub_group_local_id()]; + sumf = sub_group_reduce_add(sumf); - float mean = sum[0]; + float mean = sumf / ne00; float scale = 1.0f/sqrt(mean + eps); global float4 * y = (global float4 *) (dst + i03*nb3 + i02*nb2 + i01*nb1); From 696343e6080260e0f2ca703328ece4b3eb9f1568 Mon Sep 17 00:00:00 2001 From: Li He Date: Wed, 12 Nov 2025 23:16:09 -0800 Subject: [PATCH 2/2] opencl: add comment about workgroup size --- ggml/src/ggml-opencl/kernels/rms_norm.cl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ggml/src/ggml-opencl/kernels/rms_norm.cl b/ggml/src/ggml-opencl/kernels/rms_norm.cl index 36a4cda7dd80a..4b18d17d6f8fd 100644 --- a/ggml/src/ggml-opencl/kernels/rms_norm.cl +++ b/ggml/src/ggml-opencl/kernels/rms_norm.cl @@ -134,6 +134,11 @@ kernel void kernel_rms_norm_mul( src1 = src1 + offset1; dst = dst + offsetd; + // The size of sum is sizeof(float)*subgroup_size. + // Each subgroup writes its partial sum to this array. + // So the number of subgroups per workgroup for this kernel cannot exceed the subgroup size. + // This is generally true - + // for subgroup size 64, workgroup size should be less than 4096 (the max is usually 1024). if (get_sub_group_id() == 0) { sum[get_sub_group_local_id()] = 0.0f; }