From b56681191313ee1c78e7717374e2ec8210972616 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 15 Oct 2025 19:04:48 +0800 Subject: [PATCH 1/7] Add buffer label and enable dawn-specific toggles to turn off some checks --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 56 ++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 05e16cd432ad3..b4558a9e3f1d2 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -309,10 +309,12 @@ struct ggml_backend_webgpu_context { struct ggml_backend_webgpu_buffer_context { webgpu_context webgpu_ctx; wgpu::Buffer buffer; + std::string label; - ggml_backend_webgpu_buffer_context(webgpu_context ctx, wgpu::Buffer buf) : + ggml_backend_webgpu_buffer_context(webgpu_context ctx, wgpu::Buffer buf, std::string lbl) : webgpu_ctx(std::move(ctx)), - buffer(std::move(buf)) {} + buffer(std::move(buf)), + label(std::move(lbl)) {} }; /* End struct definitions */ @@ -1336,11 +1338,11 @@ static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffe WEBGPU_CPU_PROFILE_TOTAL_START(memset_tensor); - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " - << offset << ", " << size << ")"); - ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buf_ctx->label << ", " << tensor << ", " << value + << ", " << offset << ", " << size << ")"); + size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; // This is a trick to set all bytes of a u32 to the same 1 byte value. 
@@ -1354,12 +1356,13 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, const void * data, size_t offset, size_t size) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " - << offset << ", " << size << ")"); WEBGPU_CPU_PROFILE_TOTAL_START(set_tensor); ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_set_tensor(" << buf_ctx->label << ", " << tensor << ", " << data + << ", " << offset << ", " << size << ")"); + size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; webgpu_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, (size / 4) * 4); @@ -1397,12 +1400,12 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, void * data, size_t offset, size_t size) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " - << offset << ", " << size << ")"); WEBGPU_CPU_PROFILE_TOTAL_START(get_tensor); - ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; - webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; - wgpu::Device device = webgpu_ctx->device; + ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buf_ctx->label << ", " << tensor << ", " << data + << ", " << offset << ", " << size << ")"); + webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; + wgpu::Device device = webgpu_ctx->device; size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; @@ -1473,16 +1476,20 @@ static const char * ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer static ggml_backend_buffer_t 
ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_alloc_buffer(" << size << ")"); + static std::atomic buffer_count; + int buffer_id = buffer_count++; + std::string buf_name = "tensor_buf" + std::to_string(buffer_id); + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_alloc_buffer_" << buffer_id << ": " << size << " bytes"); ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); wgpu::Buffer buf; ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, buf, (size + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1), wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst, - "allocated_buffer"); + buf_name.c_str()); - ggml_backend_webgpu_buffer_context * buf_ctx = new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf); + ggml_backend_webgpu_buffer_context * buf_ctx = + new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf, buf_name); return ggml_backend_buffer_init(buft, ggml_backend_webgpu_buffer_interface, buf_ctx, size); } @@ -2129,6 +2136,15 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t required_features.push_back(wgpu::FeatureName::TimestampQuery); #endif + const char * const deviceEnabledToggles[] = { "skip_validation", "disable_robustness", "disable_workgroup_init", + "disable_polyfills_on_integer_div_and_mod" }; + const char * const deviceDisabledToggles[] = { "timestamp_quantization" }; + wgpu::DawnTogglesDescriptor deviceTogglesDesc; + deviceTogglesDesc.enabledToggles = deviceEnabledToggles; + deviceTogglesDesc.enabledToggleCount = 4; + deviceTogglesDesc.disabledToggles = deviceDisabledToggles; + deviceTogglesDesc.disabledToggleCount = 1; + wgpu::DeviceDescriptor dev_desc; dev_desc.requiredLimits = &ctx->limits; dev_desc.requiredFeatures = required_features.data(); @@ -2146,6 +2162,7 @@ static ggml_backend_dev_t 
ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t GGML_ABORT("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast(reason), std::string(message).c_str()); }); + dev_desc.nextInChain = &deviceTogglesDesc; ctx->instance.WaitAny(ctx->adapter.RequestDevice( &dev_desc, wgpu::CallbackMode::AllowSpontaneous, [ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { @@ -2243,11 +2260,18 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() { ctx.name = GGML_WEBGPU_NAME; ctx.device_count = 1; + const char * const instanceEnabledToggles[] = { "allow_unsafe_apis" }; + + wgpu::DawnTogglesDescriptor instanceTogglesDesc; + instanceTogglesDesc.enabledToggles = instanceEnabledToggles; + instanceTogglesDesc.enabledToggleCount = 1; wgpu::InstanceDescriptor instance_descriptor{}; std::vector instance_features = { wgpu::InstanceFeatureName::TimedWaitAny }; instance_descriptor.requiredFeatures = instance_features.data(); instance_descriptor.requiredFeatureCount = instance_features.size(); - webgpu_ctx->instance = wgpu::CreateInstance(&instance_descriptor); + instance_descriptor.nextInChain = &instanceTogglesDesc; + + webgpu_ctx->instance = wgpu::CreateInstance(&instance_descriptor); GGML_ASSERT(webgpu_ctx->instance != nullptr); static ggml_backend_reg reg = { From f2e187c7f26cdf0f4a9284db8ecbb2d74ffb7213 Mon Sep 17 00:00:00 2001 From: neha-ha <137219201+neha-ha@users.noreply.github.com> Date: Mon, 27 Oct 2025 14:48:39 -0500 Subject: [PATCH 2/7] Minor set_rows optimization (#4) * updated optimization, fixed errors * non vectorized version now dispatches one thread per element * Simplify * Change logic for set_rows pipelines --------- Co-authored-by: Neha Abbas Co-authored-by: Neha Abbas Co-authored-by: Reese Levine --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 25 +++++++--- .../{set_rows.wgsl => set_rows.tmpl.wgsl} | 46 ++++++++++++++++--- 2 files changed, 58 insertions(+), 13 deletions(-) rename ggml/src/ggml-webgpu/wgsl-shaders/{set_rows.wgsl => 
set_rows.tmpl.wgsl} (68%) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index b4558a9e3f1d2..353c7729bd1f8 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -248,7 +248,7 @@ struct webgpu_context_struct { webgpu_pipeline memset_pipeline; webgpu_pipeline mul_mat_pipeline[30][2]; - webgpu_pipeline set_rows_pipeline; + webgpu_pipeline set_rows_pipeline[1][2]; // dst->type, vectorized (0 for vectorized, 1 for non vectorized) webgpu_pipeline get_rows_pipeline[30]; webgpu_pipeline get_rows_f32_no_vec_pipeline; webgpu_pipeline cpy_pipeline[2][2]; // src type, dst type @@ -766,10 +766,21 @@ static std::optional ggml_webgpu_set_rows(webgpu_context & ctx, { .binding = 3, .buffer = error_bufs.dev_buf, .offset = 0, .size = error_bufs.dev_buf.GetSize() } }; - size_t max_wg_size = ctx->max_wg_size_x; - uint32_t wg_x = (src->ne[1] * src->ne[2] * src->ne[3] + max_wg_size - 1) / max_wg_size; + size_t max_wg_size = ctx->max_wg_size_x; + + int vectorized = src->ne[0] % 4 == 0; + webgpu_pipeline pipeline = ctx->set_rows_pipeline[0][vectorized]; + // if not evenly divisble by 4, use the non-vectorized version + uint32_t threads; + if (vectorized) { + threads = (src->ne[1] * src->ne[2] * src->ne[3]) * (src->ne[0] / 4); + } else { + threads = src->ne[0] * src->ne[1] * src->ne[2] * src->ne[3]; + } + + uint32_t wg_x = (threads + max_wg_size - 1) / max_wg_size; - return ggml_backend_webgpu_build(ctx, ctx->set_rows_pipeline, params, entries, wg_x, error_bufs); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, error_bufs); } static webgpu_command ggml_webgpu_get_rows(webgpu_context & ctx, @@ -1620,8 +1631,10 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { } static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) { - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->set_rows_pipeline, wgsl_set_rows, "set_rows", - 
ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x)); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->set_rows_pipeline[0][0], wgsl_set_rows_f16, + "set_rows_f16", ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x)); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->set_rows_pipeline[0][1], wgsl_set_rows_f16_vec, + "set_rows_f16_vec", ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x)); } static void ggml_webgpu_init_get_rows_pipeline(webgpu_context & webgpu_ctx) { diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl similarity index 68% rename from ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl rename to ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl index 3567713dc215c..4a6d819d3b145 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl @@ -1,13 +1,38 @@ +#define(VARIANTS) + +[ + { + "SHADER_SUFFIX": "f16_vec", + "REPLS": { + "TYPE" : "vec4", + "DST_TYPE": "vec4", + "VEC_SIZE": 4 + } + }, + { + "SHADER_SUFFIX": "f16", + "REPLS": { + "TYPE" : "f32", + "DST_TYPE": "f16", + "VEC_SIZE": 1 + } + } +] + +#end(VARIANTS) + +#define(SHADER) + enable f16; @group(0) @binding(0) -var src: array; +var src: array<{{TYPE}}>; @group(0) @binding(1) var idx: array; @group(0) @binding(2) -var dst: array; +var dst: array<{{DST_TYPE}}>; @group(0) @binding(3) var error: atomic; @@ -47,10 +72,14 @@ var params: Params; override wg_size: u32; @compute @workgroup_size(wg_size) fn main(@builtin(global_invocation_id) gid: vec3) { - if (gid.x >= params.n_rows * params.ne2 * params.ne3) { + if (gid.x >= (params.ne3 * params.ne2 * params.n_rows * params.ne0) / {{VEC_SIZE}}) { return; } - var i = gid.x; + + // getting the row from gid + let elems_per_row = params.ne0 / {{VEC_SIZE}}; + var i = gid.x / elems_per_row; + let i_src3 = i / (params.ne2 * params.n_rows); i = i % (params.ne2 * params.n_rows); @@ -75,7 +104,10 @@ fn 
main(@builtin(global_invocation_id) gid: vec3) { let i_dst_row = params.offset_dst + idx_high_val * params.stride_dst1 + i_src2 * params.stride_dst2 + i_src3 * params.stride_dst3; let i_src_row = params.offset_src + i_src1 * params.stride_src1 + i_src2 * params.stride_src2 + i_src3 * params.stride_src3; - for (var i: u32 = 0; i < params.ne0; i++) { - dst[i_dst_row + i] = f16(src[i_src_row + i]); - } + // starts at what element of that row? + let col_idx = (gid.x % elems_per_row); + dst[i_dst_row/{{VEC_SIZE}} + col_idx] = {{DST_TYPE}}(src[i_src_row/{{VEC_SIZE}} + col_idx]); } + +#end(SHADER) + From 51aae63b49034ff8171ab623f66ac47a3441628a Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 27 Oct 2025 13:00:19 -0700 Subject: [PATCH 3/7] Comment on dawn toggles --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 353c7729bd1f8..b4a9f6d579b89 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -2149,6 +2149,10 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t required_features.push_back(wgpu::FeatureName::TimestampQuery); #endif + // Enable Dawn-specific toggles to increase native performance + // TODO: Don't enable for WASM builds, they won't have an effect anyways + // TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these, + // only for native performance? 
const char * const deviceEnabledToggles[] = { "skip_validation", "disable_robustness", "disable_workgroup_init", "disable_polyfills_on_integer_div_and_mod" }; const char * const deviceDisabledToggles[] = { "timestamp_quantization" }; From f0cfae49d637ea48b901a72c0b344ba9393f38e8 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 27 Oct 2025 15:48:00 -0700 Subject: [PATCH 4/7] Remove some comments --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 3 +-- ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index b4a9f6d579b89..70e3013537b2d 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -248,7 +248,7 @@ struct webgpu_context_struct { webgpu_pipeline memset_pipeline; webgpu_pipeline mul_mat_pipeline[30][2]; - webgpu_pipeline set_rows_pipeline[1][2]; // dst->type, vectorized (0 for vectorized, 1 for non vectorized) + webgpu_pipeline set_rows_pipeline[1][2]; // dst->type, vectorized webgpu_pipeline get_rows_pipeline[30]; webgpu_pipeline get_rows_f32_no_vec_pipeline; webgpu_pipeline cpy_pipeline[2][2]; // src type, dst type @@ -770,7 +770,6 @@ static std::optional ggml_webgpu_set_rows(webgpu_context & ctx, int vectorized = src->ne[0] % 4 == 0; webgpu_pipeline pipeline = ctx->set_rows_pipeline[0][vectorized]; - // if not evenly divisble by 4, use the non-vectorized version uint32_t threads; if (vectorized) { threads = (src->ne[1] * src->ne[2] * src->ne[3]) * (src->ne[0] / 4); diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl index 4a6d819d3b145..fca3be6bc27ed 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl @@ -104,7 +104,6 @@ fn main(@builtin(global_invocation_id) gid: vec3) { let i_dst_row = params.offset_dst + idx_high_val * 
params.stride_dst1 + i_src2 * params.stride_dst2 + i_src3 * params.stride_dst3; let i_src_row = params.offset_src + i_src1 * params.stride_src1 + i_src2 * params.stride_src2 + i_src3 * params.stride_src3; - // starts at what element of that row? let col_idx = (gid.x % elems_per_row); dst[i_dst_row/{{VEC_SIZE}} + col_idx] = {{DST_TYPE}}(src[i_src_row/{{VEC_SIZE}} + col_idx]); } From ed710b36f51ab3f53fa13db15c1685dc8678a32a Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Fri, 31 Oct 2025 17:35:00 -0700 Subject: [PATCH 5/7] Implement overlap binary operators --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 179 ++++++--- .../ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl | 369 +++++++++++++++--- .../ggml-webgpu/wgsl-shaders/binary_head.tmpl | 45 --- tests/test-backend-ops.cpp | 2 +- 4 files changed, 436 insertions(+), 159 deletions(-) delete mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 70e3013537b2d..f6b939e140415 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -252,10 +252,10 @@ struct webgpu_context_struct { webgpu_pipeline get_rows_pipeline[30]; webgpu_pipeline get_rows_f32_no_vec_pipeline; webgpu_pipeline cpy_pipeline[2][2]; // src type, dst type - webgpu_pipeline add_pipeline[2][2]; // type, inplace - webgpu_pipeline sub_pipeline[2][2]; // type, inplace - webgpu_pipeline mul_pipeline[2][2]; // type, inplace - webgpu_pipeline div_pipeline[2][2]; // type, inplace + webgpu_pipeline add_pipeline[2][2][2]; // type, inplace, overlap + webgpu_pipeline sub_pipeline[2][2][2]; // type, inplace, overlap + webgpu_pipeline mul_pipeline[2][2][2]; // type, inplace, overlap + webgpu_pipeline div_pipeline[2][2][2]; // type, inplace, overlap webgpu_pipeline rms_norm_pipeline[2]; // inplace webgpu_pipeline rope_pipeline[2][2][2]; // type, ff, inplace webgpu_pipeline glu_pipeline[7][2][2]; // glu-op, type, split @@ -677,9 +677,12 @@ 
static size_t ggml_webgpu_tensor_align_offset(webgpu_context & ctx, ggml_tensor return offset & ~(ctx->limits.minStorageBufferOffsetAlignment - 1); } +static size_t ggml_webgpu_tensor_align_binding_size(size_t size) { + return (size + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1); +} + static size_t ggml_webgpu_tensor_binding_size(webgpu_context & ctx, ggml_tensor * t) { - return (ggml_nbytes(t) + ggml_webgpu_tensor_misalignment(ctx, t) + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & - ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1); + return ggml_webgpu_tensor_align_binding_size(ggml_nbytes(t) + ggml_webgpu_tensor_misalignment(ctx, t)); } // Used to determine if two tensors are the same for in-place operations @@ -688,6 +691,12 @@ static bool ggml_webgpu_tensor_equal(ggml_tensor * a, ggml_tensor * b) { (ggml_webgpu_tensor_offset(a) == ggml_webgpu_tensor_offset(b)); } +static bool ggml_webgpu_tensor_overlap(ggml_tensor * a, ggml_tensor * b) { + return (ggml_webgpu_tensor_buf(a).Get() == ggml_webgpu_tensor_buf(b).Get()) && + ggml_webgpu_tensor_offset(a) < (ggml_webgpu_tensor_offset(b) + ggml_nbytes(b)) && + ggml_webgpu_tensor_offset(b) < (ggml_webgpu_tensor_offset(a) + ggml_nbytes(a)); +} + static webgpu_command ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { uint32_t ne = (uint32_t) ggml_nelements(dst); @@ -870,16 +879,27 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx, return ggml_backend_webgpu_build(ctx, ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x); } -static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * dst, - webgpu_pipeline & pipeline, - bool inplace) { +template +static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * dst, + webgpu_pipeline (&pipelines)[a][b][c]) { + int inplace = ggml_webgpu_tensor_equal(src0, dst); + int overlap 
= ggml_webgpu_tensor_overlap(src0, src1); + webgpu_pipeline pipeline = pipelines[dst->type][inplace][overlap]; + + uint32_t src1_offset = ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type); + if (overlap) { + // when overlapped, bind a single buffer covering both src0 and src1 + // TODO: Do other operations need this? + src1_offset = (uint32_t) ((ggml_webgpu_tensor_offset(src1) - ggml_webgpu_tensor_align_offset(ctx, src0)) / + ggml_type_size(src1->type)); + } std::vector params = { (uint32_t) ggml_nelements(dst), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)), - (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)), + src1_offset, (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), (uint32_t) (src1->nb[0] / ggml_type_size(src1->type)), (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)), @@ -894,25 +914,36 @@ static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx, (uint32_t) src1->ne[3], }; + size_t src0_binding_size = ggml_webgpu_tensor_binding_size(ctx, src0); + if (overlap) { + const uint64_t base_align = ggml_webgpu_tensor_align_offset(ctx, src0); + // assume end of src1 is >= end of src0 + const uint64_t max_end = ggml_webgpu_tensor_offset(src1) + ggml_nbytes(src1); + src0_binding_size = ggml_webgpu_tensor_align_binding_size(max_end - base_align); + } std::vector entries = { { .binding = 0, .buffer = ggml_webgpu_tensor_buf(src0), .offset = ggml_webgpu_tensor_align_offset(ctx, src0), - .size = ggml_webgpu_tensor_binding_size(ctx, src0) }, - { .binding = 1, - .buffer = ggml_webgpu_tensor_buf(src1), - .offset = ggml_webgpu_tensor_align_offset(ctx, src1), - .size = ggml_webgpu_tensor_binding_size(ctx, src1) } + .size = src0_binding_size } }; + uint32_t binding_num = 1; + if (!overlap) { + entries.push_back({ .binding = binding_num, + .buffer = ggml_webgpu_tensor_buf(src1), + .offset = ggml_webgpu_tensor_align_offset(ctx, src1), + 
.size = ggml_webgpu_tensor_binding_size(ctx, src1) }); + binding_num++; + } if (!inplace) { - entries.push_back({ .binding = 2, + entries.push_back({ .binding = binding_num, .buffer = ggml_webgpu_tensor_buf(dst), .offset = ggml_webgpu_tensor_align_offset(ctx, dst), .size = ggml_webgpu_tensor_binding_size(ctx, dst) }); } - size_t max_wg_size = ctx->max_wg_size_x; - uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size; + size_t max_wg_size = ctx->max_wg_size_x; + uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size; return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } @@ -1232,25 +1263,13 @@ static std::optional ggml_webgpu_encode_node(webgpu_context ctx, case GGML_OP_MUL_MAT: return ggml_webgpu_mul_mat(ctx, src0, src1, node); case GGML_OP_ADD: - { - int inplace = ggml_webgpu_tensor_equal(src0, node); - return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_pipeline[node->type][inplace], inplace); - } + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_pipeline); case GGML_OP_SUB: - { - int inplace = ggml_webgpu_tensor_equal(src0, node); - return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->sub_pipeline[node->type][inplace], inplace); - } + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->sub_pipeline); case GGML_OP_MUL: - { - int inplace = ggml_webgpu_tensor_equal(src0, node); - return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_pipeline[node->type][inplace], inplace); - } + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_pipeline); case GGML_OP_DIV: - { - int inplace = ggml_webgpu_tensor_equal(src0, node); - return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->div_pipeline[node->type][inplace], inplace); - } + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->div_pipeline); case GGML_OP_RMS_NORM: return ggml_webgpu_rms_norm(ctx, src0, node); case GGML_OP_ROPE: @@ -1700,50 +1719,82 @@ static void ggml_webgpu_init_cpy_pipeline(webgpu_context & 
webgpu_ctx) { static void ggml_webgpu_init_add_pipeline(webgpu_context & webgpu_ctx) { std::vector constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][0], wgsl_add_f32, "add_f32", - constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][0], wgsl_add_f16, "add_f16", - constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][1], wgsl_add_f32_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][0][0], wgsl_add_f32, + "add_f32", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][0][0], wgsl_add_f16, + "add_f16", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][1][0], wgsl_add_f32_inplace, "add_f32_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][1], wgsl_add_f16_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][1][0], wgsl_add_f16_inplace, "add_f16_inplace", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][0][1], wgsl_add_f32_overlap, + "add_f32_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][1][1], + wgsl_add_f32_inplace_overlap, "add_f32_inplace_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][0][1], wgsl_add_f16_overlap, + "add_f16_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][1][1], + wgsl_add_f16_inplace_overlap, "add_f16_inplace_overlap", constants); } static void ggml_webgpu_init_sub_pipeline(webgpu_context & webgpu_ctx) { std::vector constants = 
ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][0], wgsl_sub_f32, "sub_f32", - constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][0], wgsl_sub_f16, "sub_f16", - constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][1], wgsl_sub_f32_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][0][0], wgsl_sub_f32, + "sub_f32", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][0][0], wgsl_sub_f16, + "sub_f16", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][1][0], wgsl_sub_f32_inplace, "sub_f32_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][1], wgsl_sub_f16_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][1][0], wgsl_sub_f16_inplace, "sub_f16_inplace", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][0][1], wgsl_sub_f32_overlap, + "sub_f32_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][1][1], + wgsl_sub_f32_inplace_overlap, "sub_f32_inplace_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][0][1], wgsl_sub_f16_overlap, + "sub_f16_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][1][1], + wgsl_sub_f16_inplace_overlap, "sub_f16_inplace_overlap", constants); } static void ggml_webgpu_init_mul_pipeline(webgpu_context & webgpu_ctx) { std::vector constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x); - ggml_webgpu_create_pipeline(webgpu_ctx->device, 
webgpu_ctx->mul_pipeline[GGML_TYPE_F32][0], wgsl_mul_f32, "mul_f32", - constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][0], wgsl_mul_f16, "mul_f16", - constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][1], wgsl_mul_f32_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][0][0], wgsl_mul_f32, + "mul_f32", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][0][0], wgsl_mul_f16, + "mul_f16", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][1][0], wgsl_mul_f32_inplace, "mul_f32_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][1], wgsl_mul_f16_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][1][0], wgsl_mul_f16_inplace, "mul_f16_inplace", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][0][1], wgsl_mul_f32_overlap, + "mul_f32_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][1][1], + wgsl_mul_f32_inplace_overlap, "mul_f32_inplace_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][0][1], wgsl_mul_f16_overlap, + "mul_f16_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][1][1], + wgsl_mul_f16_inplace_overlap, "mul_f16_inplace_overlap", constants); } static void ggml_webgpu_init_div_pipeline(webgpu_context & webgpu_ctx) { std::vector constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][0], wgsl_div_f32, "div_f32", - constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, 
webgpu_ctx->div_pipeline[GGML_TYPE_F16][0], wgsl_div_f16, "div_f16", - constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][1], wgsl_div_f32_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][0][0], wgsl_div_f32, + "div_f32", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][0][0], wgsl_div_f16, + "div_f16", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][1][0], wgsl_div_f32_inplace, "div_f32_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][1], wgsl_div_f16_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][1][0], wgsl_div_f16_inplace, "div_f16_inplace", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][0][1], wgsl_div_f32_overlap, + "div_f32_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][1][1], + wgsl_div_f32_inplace_overlap, "div_f32_inplace_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][0][1], wgsl_div_f16_overlap, + "div_f16_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][1][1], + wgsl_div_f16_inplace_overlap, "div_f16_inplace_overlap", constants); } static void ggml_webgpu_init_rms_norm_pipeline(webgpu_context & webgpu_ctx) { @@ -2152,9 +2203,9 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t // TODO: Don't enable for WASM builds, they won't have an effect anyways // TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these, // only for native performance? 
- const char * const deviceEnabledToggles[] = { "skip_validation", "disable_robustness", "disable_workgroup_init", - "disable_polyfills_on_integer_div_and_mod" }; - const char * const deviceDisabledToggles[] = { "timestamp_quantization" }; + const char * const deviceEnabledToggles[] = { "disable_robustness", "disable_workgroup_init", + "disable_polyfills_on_integer_div_and_mod" }; + const char * const deviceDisabledToggles[] = { "timestamp_quantization" }; wgpu::DawnTogglesDescriptor deviceTogglesDesc; deviceTogglesDesc.enabledToggles = deviceEnabledToggles; - deviceTogglesDesc.enabledToggleCount = 4; + deviceTogglesDesc.enabledToggleCount = 3; diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl index 1ce4d83fa8e50..5143a1bbf17f8 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl @@ -5,129 +5,354 @@ "SHADER_NAME": "add_f32", "REPLS": { "TYPE" : "f32", - "OP": "+" + "SRC1_BUF": "src1", + "DST_BUF": "dst", + "OP": "+", + "PARAMS_BINDING": 3 }, "DECLS": ["NOT_INPLACE"] }, + { + "SHADER_NAME": "add_f32_inplace", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src1", + "DST_BUF": "src0", + "OP": "+", + "PARAMS_BINDING": 2 + }, + "DECLS": ["INPLACE"] + }, + { + "SHADER_NAME": "add_f32_overlap", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src0", + "DST_BUF": "dst", + "OP": "+", + "PARAMS_BINDING": 2 + }, + "DECLS": ["OVERLAP"] + }, + { + "SHADER_NAME": "add_f32_inplace_overlap", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src0", + "DST_BUF": "src0", + "OP": "+", + "PARAMS_BINDING": 1 + }, + "DECLS": ["INPLACE_OVERLAP"] + }, { "SHADER_NAME": "add_f16", "REPLS": { "TYPE" : "f16", - "OP": "+" + "SRC1_BUF": "src1", + "DST_BUF": "dst", + "OP": "+", + "PARAMS_BINDING": 3 }, "DECLS": ["NOT_INPLACE"] }, { - "SHADER_NAME": "add_f32_inplace", + "SHADER_NAME": "add_f16_inplace", "REPLS": { - "TYPE" : "f32", - "OP": "+" + "TYPE" : "f16", + "SRC1_BUF": "src1", + "DST_BUF": "src0", + "OP": "+", + 
"PARAMS_BINDING": 2 }, "DECLS": ["INPLACE"] }, { - "SHADER_NAME": "add_f16_inplace", + "SHADER_NAME": "add_f16_overlap", "REPLS": { "TYPE" : "f16", - "OP": "+" + "SRC1_BUF": "src0", + "DST_BUF": "dst", + "OP": "+", + "PARAMS_BINDING": 2 }, - "DECLS": ["INPLACE"] + "DECLS": ["OVERLAP"] + }, + { + "SHADER_NAME": "add_f16_inplace_overlap", + "REPLS": { + "TYPE" : "f16", + "SRC1_BUF": "src0", + "DST_BUF": "src0", + "OP": "+", + "PARAMS_BINDING": 1 + }, + "DECLS": ["INPLACE_OVERLAP"] }, { "SHADER_NAME": "mul_f32", "REPLS": { "TYPE" : "f32", - "OP": "*" + "SRC1_BUF": "src1", + "DST_BUF": "dst", + "OP": "*", + "PARAMS_BINDING": 3 }, "DECLS": ["NOT_INPLACE"] }, + { + "SHADER_NAME": "mul_f32_inplace", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src1", + "DST_BUF": "src0", + "OP": "*", + "PARAMS_BINDING": 2 + }, + "DECLS": ["INPLACE"] + }, + { + "SHADER_NAME": "mul_f32_overlap", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src0", + "DST_BUF": "dst", + "OP": "*", + "PARAMS_BINDING": 2 + }, + "DECLS": ["OVERLAP"] + }, + { + "SHADER_NAME": "mul_f32_inplace_overlap", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src0", + "DST_BUF": "src0", + "OP": "*", + "PARAMS_BINDING": 1 + }, + "DECLS": ["INPLACE_OVERLAP"] + }, { "SHADER_NAME": "mul_f16", "REPLS": { "TYPE" : "f16", - "OP": "*" + "SRC1_BUF": "src1", + "DST_BUF": "dst", + "OP": "*", + "PARAMS_BINDING": 3 }, "DECLS": ["NOT_INPLACE"] }, { - "SHADER_NAME": "mul_f32_inplace", + "SHADER_NAME": "mul_f16_inplace", "REPLS": { - "TYPE" : "f32", - "OP": "*" + "TYPE" : "f16", + "SRC1_BUF": "src1", + "DST_BUF": "src0", + "OP": "*", + "PARAMS_BINDING": 2 }, "DECLS": ["INPLACE"] }, { - "SHADER_NAME": "mul_f16_inplace", + "SHADER_NAME": "mul_f16_overlap", "REPLS": { "TYPE" : "f16", - "OP": "*" + "SRC1_BUF": "src0", + "DST_BUF": "dst", + "OP": "*", + "PARAMS_BINDING": 2 }, - "DECLS": ["INPLACE"] + "DECLS": ["OVERLAP"] + }, + { + "SHADER_NAME": "mul_f16_inplace_overlap", + "REPLS": { + "TYPE" : "f16", + "SRC1_BUF": "src0", + "DST_BUF": 
"src0", + "OP": "*", + "PARAMS_BINDING": 1 + }, + "DECLS": ["INPLACE_OVERLAP"] }, { "SHADER_NAME": "sub_f32", "REPLS": { "TYPE" : "f32", - "OP": "-" + "SRC1_BUF": "src1", + "DST_BUF": "dst", + "OP": "-", + "PARAMS_BINDING": 3 }, "DECLS": ["NOT_INPLACE"] }, + { + "SHADER_NAME": "sub_f32_inplace", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src1", + "DST_BUF": "src0", + "OP": "-", + "PARAMS_BINDING": 2 + }, + "DECLS": ["INPLACE"] + }, + { + "SHADER_NAME": "sub_f32_overlap", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src0", + "DST_BUF": "dst", + "OP": "-", + "PARAMS_BINDING": 2 + }, + "DECLS": ["OVERLAP"] + }, + { + "SHADER_NAME": "sub_f32_inplace_overlap", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src0", + "DST_BUF": "src0", + "OP": "-", + "PARAMS_BINDING": 1 + }, + "DECLS": ["INPLACE_OVERLAP"] + }, { "SHADER_NAME": "sub_f16", "REPLS": { "TYPE" : "f16", - "OP": "-" + "SRC1_BUF": "src1", + "DST_BUF": "dst", + "OP": "-", + "PARAMS_BINDING": 3 }, "DECLS": ["NOT_INPLACE"] }, { - "SHADER_NAME": "sub_f32_inplace", + "SHADER_NAME": "sub_f16_inplace", "REPLS": { - "TYPE" : "f32", - "OP": "-" + "TYPE" : "f16", + "SRC1_BUF": "src1", + "DST_BUF": "src0", + "OP": "-", + "PARAMS_BINDING": 2 }, "DECLS": ["INPLACE"] }, { - "SHADER_NAME": "sub_f16_inplace", + "SHADER_NAME": "sub_f16_overlap", "REPLS": { "TYPE" : "f16", - "OP": "-" + "SRC1_BUF": "src0", + "DST_BUF": "dst", + "OP": "-", + "PARAMS_BINDING": 2 }, - "DECLS": ["INPLACE"] + "DECLS": ["OVERLAP"] + }, + { + "SHADER_NAME": "sub_f16_inplace_overlap", + "REPLS": { + "TYPE" : "f16", + "SRC1_BUF": "src0", + "DST_BUF": "src0", + "OP": "-", + "PARAMS_BINDING": 1 + }, + "DECLS": ["INPLACE_OVERLAP"] }, + { "SHADER_NAME": "div_f32", "REPLS": { "TYPE" : "f32", - "OP": "/" + "SRC1_BUF": "src1", + "DST_BUF": "dst", + "OP": "/", + "PARAMS_BINDING": 3 }, "DECLS": ["NOT_INPLACE"] }, + { + "SHADER_NAME": "div_f32_inplace", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src1", + "DST_BUF": "src0", + "OP": "/", + "PARAMS_BINDING": 2 + 
}, + "DECLS": ["INPLACE"] + }, + { + "SHADER_NAME": "div_f32_overlap", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src0", + "DST_BUF": "dst", + "OP": "/", + "PARAMS_BINDING": 2 + }, + "DECLS": ["OVERLAP"] + }, + { + "SHADER_NAME": "div_f32_inplace_overlap", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src0", + "DST_BUF": "src0", + "OP": "/", + "PARAMS_BINDING": 1 + }, + "DECLS": ["INPLACE_OVERLAP"] + }, { "SHADER_NAME": "div_f16", "REPLS": { "TYPE" : "f16", - "OP": "/" + "SRC1_BUF": "src1", + "DST_BUF": "dst", + "OP": "/", + "PARAMS_BINDING": 3 }, "DECLS": ["NOT_INPLACE"] }, { - "SHADER_NAME": "div_f32_inplace", + "SHADER_NAME": "div_f16_inplace", "REPLS": { - "TYPE" : "f32", - "OP": "/" + "TYPE" : "f16", + "SRC1_BUF": "src1", + "DST_BUF": "src0", + "OP": "/", + "PARAMS_BINDING": 2 }, "DECLS": ["INPLACE"] }, { - "SHADER_NAME": "div_f16_inplace", + "SHADER_NAME": "div_f16_overlap", "REPLS": { "TYPE" : "f16", - "OP": "/" + "SRC1_BUF": "src0", + "DST_BUF": "dst", + "OP": "/", + "PARAMS_BINDING": 2 }, - "DECLS": ["INPLACE"] + "DECLS": ["OVERLAP"] + }, + { + "SHADER_NAME": "div_f16_inplace_overlap", + "REPLS": { + "TYPE" : "f16", + "SRC1_BUF": "src0", + "DST_BUF": "src0", + "OP": "/", + "PARAMS_BINDING": 1 + }, + "DECLS": ["INPLACE_OVERLAP"] } ] @@ -137,43 +362,89 @@ #decl(NOT_INPLACE) -fn update(dst_i: u32, src0_i: u32, src1_i: u32) { - dst[dst_i] = src0[src0_i] {{OP}} src1[src1_i]; -} +@group(0) @binding(1) +var src1: array<{{TYPE}}>; @group(0) @binding(2) var dst: array<{{TYPE}}>; -@group(0) @binding(3) -var params: Params; - #enddecl(NOT_INPLACE) #decl(INPLACE) -fn update(dst_i: u32, src0_i: u32, src1_i: u32) { - src0[dst_i] = src0[src0_i] {{OP}} src1[src1_i]; -} - -@group(0) @binding(2) -var params: Params; +@group(0) @binding(1) +var src1: array<{{TYPE}}>; #enddecl(INPLACE) -#end(DECLS) +#decl(OVERLAP) +@group(0) @binding(1) +var dst: array<{{TYPE}}>; + +#enddecl(OVERLAP) + +#decl(INPLACE_OVERLAP) + +#enddecl(INPLACE_OVERLAP) + +#end(DECLS) #define(SHADER) 
enable f16; -#include "binary_head.tmpl" +struct Params { + ne: u32, + + // offsets in elements + offset_src0: u32, + offset_src1: u32, + offset_dst: u32, + + stride_src1_0: u32, + stride_src1_1: u32, + stride_src1_2: u32, + stride_src1_3: u32, + + a_ne0: u32, + a_ne1: u32, + a_ne2: u32, + + b_ne0: u32, + b_ne1: u32, + b_ne2: u32, + b_ne3: u32, +}; + +fn src1_index(_i: u32) -> u32 { + var i = _i; + let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0); + i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0); + let a_i2 = i / (params.a_ne1 * params.a_ne0); + i = i % (params.a_ne1 * params.a_ne0); + let a_i1 = i / params.a_ne0; + let a_i0 = i % params.a_ne0; + + // handle repetition of b + // index loops back to the beginning and repeats after elements are exhausted = modulo + let b_i0 = a_i0 % params.b_ne0; + let b_i1 = a_i1 % params.b_ne1; + let b_i2 = a_i2 % params.b_ne2; + let b_i3 = a_i3 % params.b_ne3; + + // compute index for position in b's flat array + return b_i0 * params.stride_src1_0 + + b_i1 * params.stride_src1_1 + + b_i2 * params.stride_src1_2 + + b_i3 * params.stride_src1_3; +} @group(0) @binding(0) var src0: array<{{TYPE}}>; -@group(0) @binding(1) -var src1: array<{{TYPE}}>; +@group(0) @binding({{PARAMS_BINDING}}) +var params: Params; DECLS @@ -181,7 +452,7 @@ override wg_size: u32; @compute @workgroup_size(wg_size) fn main(@builtin(global_invocation_id) gid: vec3) { if (gid.x < params.ne) { - update(params.offset_dst + gid.x, params.offset_src0 + gid.x, params.offset_src1 + src1_index(gid.x)); + {{DST_BUF}}[params.offset_dst + gid.x] = src0[params.offset_src0 + gid.x] {{OP}} {{SRC1_BUF}}[params.offset_src1 + src1_index(gid.x)]; } } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl b/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl deleted file mode 100644 index 4b254f468d69e..0000000000000 --- a/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +++ /dev/null @@ -1,45 +0,0 @@ -struct Params { - ne: u32, - - // offsets in 
elements - offset_src0: u32, - offset_src1: u32, - offset_dst: u32, - - stride_src1_0: u32, - stride_src1_1: u32, - stride_src1_2: u32, - stride_src1_3: u32, - - a_ne0: u32, - a_ne1: u32, - a_ne2: u32, - - b_ne0: u32, - b_ne1: u32, - b_ne2: u32, - b_ne3: u32, -}; - -fn src1_index(_i: u32) -> u32 { - var i = _i; - let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0); - i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0); - let a_i2 = i / (params.a_ne1 * params.a_ne0); - i = i % (params.a_ne1 * params.a_ne0); - let a_i1 = i / params.a_ne0; - let a_i0 = i % params.a_ne0; - - // handle repetition of b - // index loops back to the beginning and repeats after elements are exhausted = modulo - let b_i0 = a_i0 % params.b_ne0; - let b_i1 = a_i1 % params.b_ne1; - let b_i2 = a_i2 % params.b_ne2; - let b_i3 = a_i3 % params.b_ne3; - - // compute index for position in b's flat array - return b_i0 * params.stride_src1_0 + - b_i1 * params.stride_src1_1 + - b_i2 * params.stride_src1_2 + - b_i3 * params.stride_src1_3; -} diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 04fa1b62d3b4d..0d2cbc530f5fb 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -4840,7 +4840,7 @@ struct test_moe_expert_reduce : public test_case { std::vector expert_views(n_expert_used); for (int64_t i = 0; i < n_expert_used; ++i) { - expert_views[i] = ggml_view_2d(ctx, weighted, n_embd, n_tokens, weighted->nb[2], i * weighted->nb[1]); + expert_views[i] = ggml_view_2d(ctx, weighted, n_embd, n_tokens, weighted->nb[1], i * weighted->nb[1]); std::string name = "expert_view_" + std::to_string(i); ggml_set_name(expert_views[i], name.c_str()); From b319672348ba1b3a6dbfad98b2e0c7a3a6367351 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Sat, 1 Nov 2025 21:33:41 -0700 Subject: [PATCH 6/7] Revert "Implement overlap binary operators" This reverts commit ed710b36f51ab3f53fa13db15c1685dc8678a32a. 
--- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 179 +++------ .../ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl | 369 +++--------------- .../ggml-webgpu/wgsl-shaders/binary_head.tmpl | 45 +++ tests/test-backend-ops.cpp | 2 +- 4 files changed, 159 insertions(+), 436 deletions(-) create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index f6b939e140415..70e3013537b2d 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -252,10 +252,10 @@ struct webgpu_context_struct { webgpu_pipeline get_rows_pipeline[30]; webgpu_pipeline get_rows_f32_no_vec_pipeline; webgpu_pipeline cpy_pipeline[2][2]; // src type, dst type - webgpu_pipeline add_pipeline[2][2][2]; // type, inplace, overlap - webgpu_pipeline sub_pipeline[2][2][2]; // type, inplace, overlap - webgpu_pipeline mul_pipeline[2][2][2]; // type, inplace, overlap - webgpu_pipeline div_pipeline[2][2][2]; // type, inplace, overlap + webgpu_pipeline add_pipeline[2][2]; // type, inplace + webgpu_pipeline sub_pipeline[2][2]; // type, inplace + webgpu_pipeline mul_pipeline[2][2]; // type, inplace + webgpu_pipeline div_pipeline[2][2]; // type, inplace webgpu_pipeline rms_norm_pipeline[2]; // inplace webgpu_pipeline rope_pipeline[2][2][2]; // type, ff, inplace webgpu_pipeline glu_pipeline[7][2][2]; // glu-op, type, split @@ -677,12 +677,9 @@ static size_t ggml_webgpu_tensor_align_offset(webgpu_context & ctx, ggml_tensor return offset & ~(ctx->limits.minStorageBufferOffsetAlignment - 1); } -static size_t ggml_webgpu_tensor_align_binding_size(size_t size) { - return (size + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1); -} - static size_t ggml_webgpu_tensor_binding_size(webgpu_context & ctx, ggml_tensor * t) { - return ggml_webgpu_tensor_align_binding_size(ggml_nbytes(t) + ggml_webgpu_tensor_misalignment(ctx, t)); + return (ggml_nbytes(t) + 
ggml_webgpu_tensor_misalignment(ctx, t) + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & + ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1); } // Used to determine if two tensors are the same for in-place operations @@ -691,12 +688,6 @@ static bool ggml_webgpu_tensor_equal(ggml_tensor * a, ggml_tensor * b) { (ggml_webgpu_tensor_offset(a) == ggml_webgpu_tensor_offset(b)); } -static bool ggml_webgpu_tensor_overlap(ggml_tensor * a, ggml_tensor * b) { - return (ggml_webgpu_tensor_buf(a).Get() == ggml_webgpu_tensor_buf(b).Get()) && - ggml_webgpu_tensor_offset(a) < (ggml_webgpu_tensor_offset(b) + ggml_nbytes(b)) && - ggml_webgpu_tensor_offset(b) < (ggml_webgpu_tensor_offset(a) + ggml_nbytes(a)); -} - static webgpu_command ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { uint32_t ne = (uint32_t) ggml_nelements(dst); @@ -879,27 +870,16 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx, return ggml_backend_webgpu_build(ctx, ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x); } -template -static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * dst, - webgpu_pipeline (&pipelines)[a][b][c]) { - int inplace = ggml_webgpu_tensor_equal(src0, dst); - int overlap = ggml_webgpu_tensor_overlap(src0, src1); - webgpu_pipeline pipeline = pipelines[dst->type][inplace][overlap]; - - uint32_t src1_offset = ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type); - if (overlap) { - // when overlapped, bind a single buffer covering both src0 and src1 - // TODO: Do other operations need this? 
- src1_offset = (uint32_t) ((ggml_webgpu_tensor_offset(src1) - ggml_webgpu_tensor_align_offset(ctx, src0)) / - ggml_type_size(src1->type)); - } +static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * dst, + webgpu_pipeline & pipeline, + bool inplace) { std::vector params = { (uint32_t) ggml_nelements(dst), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)), - src1_offset, + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), (uint32_t) (src1->nb[0] / ggml_type_size(src1->type)), (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)), @@ -914,36 +894,25 @@ static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx, (uint32_t) src1->ne[3], }; - size_t src0_binding_size = ggml_webgpu_tensor_binding_size(ctx, src0); - if (overlap) { - const uint64_t base_align = ggml_webgpu_tensor_align_offset(ctx, src0); - // assume end of src1 is >= end of src0 - const uint64_t max_end = ggml_webgpu_tensor_offset(src1) + ggml_nbytes(src1); - src0_binding_size = ggml_webgpu_tensor_align_binding_size(max_end - base_align); - } std::vector entries = { { .binding = 0, .buffer = ggml_webgpu_tensor_buf(src0), .offset = ggml_webgpu_tensor_align_offset(ctx, src0), - .size = src0_binding_size } + .size = ggml_webgpu_tensor_binding_size(ctx, src0) }, + { .binding = 1, + .buffer = ggml_webgpu_tensor_buf(src1), + .offset = ggml_webgpu_tensor_align_offset(ctx, src1), + .size = ggml_webgpu_tensor_binding_size(ctx, src1) } }; - uint32_t binding_num = 1; - if (!overlap) { - entries.push_back({ .binding = binding_num, - .buffer = ggml_webgpu_tensor_buf(src1), - .offset = ggml_webgpu_tensor_align_offset(ctx, src1), - .size = ggml_webgpu_tensor_binding_size(ctx, src1) }); - binding_num++; - } if (!inplace) { - entries.push_back({ .binding = binding_num, + entries.push_back({ 
.binding = 2, .buffer = ggml_webgpu_tensor_buf(dst), .offset = ggml_webgpu_tensor_align_offset(ctx, dst), .size = ggml_webgpu_tensor_binding_size(ctx, dst) }); } - size_t max_wg_size = ctx->max_wg_size_x; - uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size; + size_t max_wg_size = ctx->max_wg_size_x; + uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size; return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } @@ -1263,13 +1232,25 @@ static std::optional ggml_webgpu_encode_node(webgpu_context ctx, case GGML_OP_MUL_MAT: return ggml_webgpu_mul_mat(ctx, src0, src1, node); case GGML_OP_ADD: - return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_pipeline); + { + int inplace = ggml_webgpu_tensor_equal(src0, node); + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_pipeline[node->type][inplace], inplace); + } case GGML_OP_SUB: - return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->sub_pipeline); + { + int inplace = ggml_webgpu_tensor_equal(src0, node); + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->sub_pipeline[node->type][inplace], inplace); + } case GGML_OP_MUL: - return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_pipeline); + { + int inplace = ggml_webgpu_tensor_equal(src0, node); + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_pipeline[node->type][inplace], inplace); + } case GGML_OP_DIV: - return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->div_pipeline); + { + int inplace = ggml_webgpu_tensor_equal(src0, node); + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->div_pipeline[node->type][inplace], inplace); + } case GGML_OP_RMS_NORM: return ggml_webgpu_rms_norm(ctx, src0, node); case GGML_OP_ROPE: @@ -1719,82 +1700,50 @@ static void ggml_webgpu_init_cpy_pipeline(webgpu_context & webgpu_ctx) { static void ggml_webgpu_init_add_pipeline(webgpu_context & webgpu_ctx) { std::vector constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x); - 
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][0][0], wgsl_add_f32, - "add_f32", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][0][0], wgsl_add_f16, - "add_f16", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][1][0], wgsl_add_f32_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][0], wgsl_add_f32, "add_f32", + constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][0], wgsl_add_f16, "add_f16", + constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][1], wgsl_add_f32_inplace, "add_f32_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][1][0], wgsl_add_f16_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][1], wgsl_add_f16_inplace, "add_f16_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][0][1], wgsl_add_f32_overlap, - "add_f32_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][1][1], - wgsl_add_f32_inplace_overlap, "add_f32_inplace_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][0][1], wgsl_add_f16_overlap, - "add_f16_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][1][1], - wgsl_add_f16_inplace_overlap, "add_f16_inplace_overlap", constants); } static void ggml_webgpu_init_sub_pipeline(webgpu_context & webgpu_ctx) { std::vector constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][0][0], wgsl_sub_f32, - "sub_f32", constants); - 
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][0][0], wgsl_sub_f16, - "sub_f16", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][1][0], wgsl_sub_f32_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][0], wgsl_sub_f32, "sub_f32", + constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][0], wgsl_sub_f16, "sub_f16", + constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][1], wgsl_sub_f32_inplace, "sub_f32_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][1][0], wgsl_sub_f16_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][1], wgsl_sub_f16_inplace, "sub_f16_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][0][1], wgsl_sub_f32_overlap, - "sub_f32_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][1][1], - wgsl_sub_f32_inplace_overlap, "sub_f32_inplace_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][0][1], wgsl_sub_f16_overlap, - "sub_f16_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][1][1], - wgsl_sub_f16_inplace_overlap, "sub_f16_inplace_overlap", constants); } static void ggml_webgpu_init_mul_pipeline(webgpu_context & webgpu_ctx) { std::vector constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][0][0], wgsl_mul_f32, - "mul_f32", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][0][0], wgsl_mul_f16, - "mul_f16", constants); - 
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][1][0], wgsl_mul_f32_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][0], wgsl_mul_f32, "mul_f32", + constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][0], wgsl_mul_f16, "mul_f16", + constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][1], wgsl_mul_f32_inplace, "mul_f32_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][1][0], wgsl_mul_f16_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][1], wgsl_mul_f16_inplace, "mul_f16_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][0][1], wgsl_mul_f32_overlap, - "mul_f32_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][1][1], - wgsl_mul_f32_inplace_overlap, "mul_f32_inplace_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][0][1], wgsl_mul_f16_overlap, - "mul_f16_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][1][1], - wgsl_mul_f16_inplace_overlap, "mul_f16_inplace_overlap", constants); } static void ggml_webgpu_init_div_pipeline(webgpu_context & webgpu_ctx) { std::vector constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][0][0], wgsl_div_f32, - "div_f32", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][0][0], wgsl_div_f16, - "div_f16", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][1][0], wgsl_div_f32_inplace, + 
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][0], wgsl_div_f32, "div_f32", + constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][0], wgsl_div_f16, "div_f16", + constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][1], wgsl_div_f32_inplace, "div_f32_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][1][0], wgsl_div_f16_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][1], wgsl_div_f16_inplace, "div_f16_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][0][1], wgsl_div_f32_overlap, - "div_f32_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][1][1], - wgsl_div_f32_inplace_overlap, "div_f32_inplace_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][0][1], wgsl_div_f16_overlap, - "div_f16_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][1][1], - wgsl_div_f16_inplace_overlap, "div_f16_inplace_overlap", constants); } static void ggml_webgpu_init_rms_norm_pipeline(webgpu_context & webgpu_ctx) { @@ -2203,9 +2152,9 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t // TODO: Don't enable for WASM builds, they won't have an effect anyways // TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these, // only for native performance? 
- const char * const deviceEnabledToggles[] = { "disable_robustness", "disable_workgroup_init", - "disable_polyfills_on_integer_div_and_mod" }; - const char * const deviceDisabledToggles[] = { "timestamp_quantization" }; + const char * const deviceEnabledToggles[] = { "skip_validation", "disable_robustness", "disable_workgroup_init", + "disable_polyfills_on_integer_div_and_mod" }; + const char * const deviceDisabledToggles[] = { "timestamp_quantization" }; wgpu::DawnTogglesDescriptor deviceTogglesDesc; deviceTogglesDesc.enabledToggles = deviceEnabledToggles; deviceTogglesDesc.enabledToggleCount = 4; diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl index 5143a1bbf17f8..1ce4d83fa8e50 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl @@ -5,354 +5,129 @@ "SHADER_NAME": "add_f32", "REPLS": { "TYPE" : "f32", - "SRC1_BUF": "src1", - "DST_BUF": "dst", - "OP": "+", - "PARAMS_BINDING": 3 + "OP": "+" }, "DECLS": ["NOT_INPLACE"] }, - { - "SHADER_NAME": "add_f32_inplace", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src1", - "DST_BUF": "src0", - "OP": "+", - "PARAMS_BINDING": 2 - }, - "DECLS": ["INPLACE"] - }, - { - "SHADER_NAME": "add_f32_overlap", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src0", - "DST_BUF": "dst", - "OP": "+", - "PARAMS_BINDING": 2 - }, - "DECLS": ["OVERLAP"] - }, - { - "SHADER_NAME": "add_f32_inplace_overlap", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src0", - "DST_BUF": "src0", - "OP": "+", - "PARAMS_BINDING": 1 - }, - "DECLS": ["INPLACE_OVERLAP"] - }, { "SHADER_NAME": "add_f16", "REPLS": { "TYPE" : "f16", - "SRC1_BUF": "src1", - "DST_BUF": "dst", - "OP": "+", - "PARAMS_BINDING": 3 + "OP": "+" }, "DECLS": ["NOT_INPLACE"] }, { - "SHADER_NAME": "add_f16_inplace", + "SHADER_NAME": "add_f32_inplace", "REPLS": { - "TYPE" : "f16", - "SRC1_BUF": "src1", - "DST_BUF": "src0", - "OP": "+", - "PARAMS_BINDING": 2 + "TYPE" 
: "f32", + "OP": "+" }, "DECLS": ["INPLACE"] }, { - "SHADER_NAME": "add_f16_overlap", - "REPLS": { - "TYPE" : "f16", - "SRC1_BUF": "src0", - "DST_BUF": "dst", - "OP": "+", - "PARAMS_BINDING": 2 - }, - "DECLS": ["OVERLAP"] - }, - { - "SHADER_NAME": "add_f16_inplace_overlap", + "SHADER_NAME": "add_f16_inplace", "REPLS": { "TYPE" : "f16", - "SRC1_BUF": "src0", - "DST_BUF": "src0", - "OP": "+", - "PARAMS_BINDING": 1 + "OP": "+" }, - "DECLS": ["INPLACE_OVERLAP"] + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "mul_f32", "REPLS": { "TYPE" : "f32", - "SRC1_BUF": "src1", - "DST_BUF": "dst", - "OP": "*", - "PARAMS_BINDING": 3 + "OP": "*" }, "DECLS": ["NOT_INPLACE"] }, - { - "SHADER_NAME": "mul_f32_inplace", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src1", - "DST_BUF": "src0", - "OP": "*", - "PARAMS_BINDING": 2 - }, - "DECLS": ["INPLACE"] - }, - { - "SHADER_NAME": "mul_f32_overlap", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src0", - "DST_BUF": "dst", - "OP": "*", - "PARAMS_BINDING": 2 - }, - "DECLS": ["OVERLAP"] - }, - { - "SHADER_NAME": "mul_f32_inplace_overlap", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src0", - "DST_BUF": "src0", - "OP": "*", - "PARAMS_BINDING": 1 - }, - "DECLS": ["INPLACE_OVERLAP"] - }, { "SHADER_NAME": "mul_f16", "REPLS": { "TYPE" : "f16", - "SRC1_BUF": "src1", - "DST_BUF": "dst", - "OP": "*", - "PARAMS_BINDING": 3 + "OP": "*" }, "DECLS": ["NOT_INPLACE"] }, { - "SHADER_NAME": "mul_f16_inplace", + "SHADER_NAME": "mul_f32_inplace", "REPLS": { - "TYPE" : "f16", - "SRC1_BUF": "src1", - "DST_BUF": "src0", - "OP": "*", - "PARAMS_BINDING": 2 + "TYPE" : "f32", + "OP": "*" }, "DECLS": ["INPLACE"] }, { - "SHADER_NAME": "mul_f16_overlap", - "REPLS": { - "TYPE" : "f16", - "SRC1_BUF": "src0", - "DST_BUF": "dst", - "OP": "*", - "PARAMS_BINDING": 2 - }, - "DECLS": ["OVERLAP"] - }, - { - "SHADER_NAME": "mul_f16_inplace_overlap", + "SHADER_NAME": "mul_f16_inplace", "REPLS": { "TYPE" : "f16", - "SRC1_BUF": "src0", - "DST_BUF": "src0", - "OP": "*", - 
"PARAMS_BINDING": 1 + "OP": "*" }, - "DECLS": ["INPLACE_OVERLAP"] + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "sub_f32", "REPLS": { "TYPE" : "f32", - "SRC1_BUF": "src1", - "DST_BUF": "dst", - "OP": "-", - "PARAMS_BINDING": 3 + "OP": "-" }, "DECLS": ["NOT_INPLACE"] }, - { - "SHADER_NAME": "sub_f32_inplace", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src1", - "DST_BUF": "src0", - "OP": "-", - "PARAMS_BINDING": 2 - }, - "DECLS": ["INPLACE"] - }, - { - "SHADER_NAME": "sub_f32_overlap", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src0", - "DST_BUF": "dst", - "OP": "-", - "PARAMS_BINDING": 2 - }, - "DECLS": ["OVERLAP"] - }, - { - "SHADER_NAME": "sub_f32_inplace_overlap", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src0", - "DST_BUF": "src0", - "OP": "-", - "PARAMS_BINDING": 1 - }, - "DECLS": ["INPLACE_OVERLAP"] - }, { "SHADER_NAME": "sub_f16", "REPLS": { "TYPE" : "f16", - "SRC1_BUF": "src1", - "DST_BUF": "dst", - "OP": "-", - "PARAMS_BINDING": 3 + "OP": "-" }, "DECLS": ["NOT_INPLACE"] }, { - "SHADER_NAME": "sub_f16_inplace", + "SHADER_NAME": "sub_f32_inplace", "REPLS": { - "TYPE" : "f16", - "SRC1_BUF": "src1", - "DST_BUF": "src0", - "OP": "-", - "PARAMS_BINDING": 2 + "TYPE" : "f32", + "OP": "-" }, "DECLS": ["INPLACE"] }, { - "SHADER_NAME": "sub_f16_overlap", - "REPLS": { - "TYPE" : "f16", - "SRC1_BUF": "src0", - "DST_BUF": "dst", - "OP": "-", - "PARAMS_BINDING": 2 - }, - "DECLS": ["OVERLAP"] - }, - { - "SHADER_NAME": "sub_f16_inplace_overlap", + "SHADER_NAME": "sub_f16_inplace", "REPLS": { "TYPE" : "f16", - "SRC1_BUF": "src0", - "DST_BUF": "src0", - "OP": "-", - "PARAMS_BINDING": 1 + "OP": "-" }, - "DECLS": ["INPLACE_OVERLAP"] + "DECLS": ["INPLACE"] }, - { "SHADER_NAME": "div_f32", "REPLS": { "TYPE" : "f32", - "SRC1_BUF": "src1", - "DST_BUF": "dst", - "OP": "/", - "PARAMS_BINDING": 3 + "OP": "/" }, "DECLS": ["NOT_INPLACE"] }, - { - "SHADER_NAME": "div_f32_inplace", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src1", - "DST_BUF": "src0", - "OP": "/", - 
"PARAMS_BINDING": 2 - }, - "DECLS": ["INPLACE"] - }, - { - "SHADER_NAME": "div_f32_overlap", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src0", - "DST_BUF": "dst", - "OP": "/", - "PARAMS_BINDING": 2 - }, - "DECLS": ["OVERLAP"] - }, - { - "SHADER_NAME": "div_f32_inplace_overlap", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src0", - "DST_BUF": "src0", - "OP": "/", - "PARAMS_BINDING": 1 - }, - "DECLS": ["INPLACE_OVERLAP"] - }, { "SHADER_NAME": "div_f16", "REPLS": { "TYPE" : "f16", - "SRC1_BUF": "src1", - "DST_BUF": "dst", - "OP": "/", - "PARAMS_BINDING": 3 + "OP": "/" }, "DECLS": ["NOT_INPLACE"] }, { - "SHADER_NAME": "div_f16_inplace", + "SHADER_NAME": "div_f32_inplace", "REPLS": { - "TYPE" : "f16", - "SRC1_BUF": "src1", - "DST_BUF": "src0", - "OP": "/", - "PARAMS_BINDING": 2 + "TYPE" : "f32", + "OP": "/" }, "DECLS": ["INPLACE"] }, { - "SHADER_NAME": "div_f16_overlap", - "REPLS": { - "TYPE" : "f16", - "SRC1_BUF": "src0", - "DST_BUF": "dst", - "OP": "/", - "PARAMS_BINDING": 2 - }, - "DECLS": ["OVERLAP"] - }, - { - "SHADER_NAME": "div_f16_inplace_overlap", + "SHADER_NAME": "div_f16_inplace", "REPLS": { "TYPE" : "f16", - "SRC1_BUF": "src0", - "DST_BUF": "src0", - "OP": "/", - "PARAMS_BINDING": 1 + "OP": "/" }, - "DECLS": ["INPLACE_OVERLAP"] + "DECLS": ["INPLACE"] } ] @@ -362,89 +137,43 @@ #decl(NOT_INPLACE) -@group(0) @binding(1) -var src1: array<{{TYPE}}>; +fn update(dst_i: u32, src0_i: u32, src1_i: u32) { + dst[dst_i] = src0[src0_i] {{OP}} src1[src1_i]; +} @group(0) @binding(2) var dst: array<{{TYPE}}>; +@group(0) @binding(3) +var params: Params; + #enddecl(NOT_INPLACE) #decl(INPLACE) -@group(0) @binding(1) -var src1: array<{{TYPE}}>; - -#enddecl(INPLACE) - -#decl(OVERLAP) - -@group(0) @binding(1) -var dst: array<{{TYPE}}>; - -#enddecl(OVERLAP) +fn update(dst_i: u32, src0_i: u32, src1_i: u32) { + src0[dst_i] = src0[src0_i] {{OP}} src1[src1_i]; +} -#decl(INPLACE_OVERLAP) +@group(0) @binding(2) +var params: Params; -#enddecl(INPLACE_OVERLAP) +#enddecl(INPLACE) 
#end(DECLS) + #define(SHADER) enable f16; -struct Params { - ne: u32, - - // offsets in elements - offset_src0: u32, - offset_src1: u32, - offset_dst: u32, - - stride_src1_0: u32, - stride_src1_1: u32, - stride_src1_2: u32, - stride_src1_3: u32, - - a_ne0: u32, - a_ne1: u32, - a_ne2: u32, - - b_ne0: u32, - b_ne1: u32, - b_ne2: u32, - b_ne3: u32, -}; - -fn src1_index(_i: u32) -> u32 { - var i = _i; - let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0); - i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0); - let a_i2 = i / (params.a_ne1 * params.a_ne0); - i = i % (params.a_ne1 * params.a_ne0); - let a_i1 = i / params.a_ne0; - let a_i0 = i % params.a_ne0; - - // handle repetition of b - // index loops back to the beginning and repeats after elements are exhausted = modulo - let b_i0 = a_i0 % params.b_ne0; - let b_i1 = a_i1 % params.b_ne1; - let b_i2 = a_i2 % params.b_ne2; - let b_i3 = a_i3 % params.b_ne3; - - // compute index for position in b's flat array - return b_i0 * params.stride_src1_0 + - b_i1 * params.stride_src1_1 + - b_i2 * params.stride_src1_2 + - b_i3 * params.stride_src1_3; -} +#include "binary_head.tmpl" @group(0) @binding(0) var src0: array<{{TYPE}}>; -@group(0) @binding({{PARAMS_BINDING}}) -var params: Params; +@group(0) @binding(1) +var src1: array<{{TYPE}}>; DECLS @@ -452,7 +181,7 @@ override wg_size: u32; @compute @workgroup_size(wg_size) fn main(@builtin(global_invocation_id) gid: vec3) { if (gid.x < params.ne) { - {{DST_BUF}}[params.offset_dst + gid.x] = src0[params.offset_src0 + gid.x] {{OP}} {{SRC1_BUF}}[params.offset_src1 + src1_index(gid.x)]; + update(params.offset_dst + gid.x, params.offset_src0 + gid.x, params.offset_src1 + src1_index(gid.x)); } } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl b/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl new file mode 100644 index 0000000000000..4b254f468d69e --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl @@ -0,0 +1,45 @@ +struct Params { + ne: u32, 
+ + // offsets in elements + offset_src0: u32, + offset_src1: u32, + offset_dst: u32, + + stride_src1_0: u32, + stride_src1_1: u32, + stride_src1_2: u32, + stride_src1_3: u32, + + a_ne0: u32, + a_ne1: u32, + a_ne2: u32, + + b_ne0: u32, + b_ne1: u32, + b_ne2: u32, + b_ne3: u32, +}; + +fn src1_index(_i: u32) -> u32 { + var i = _i; + let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0); + i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0); + let a_i2 = i / (params.a_ne1 * params.a_ne0); + i = i % (params.a_ne1 * params.a_ne0); + let a_i1 = i / params.a_ne0; + let a_i0 = i % params.a_ne0; + + // handle repetition of b + // index loops back to the beginning and repeats after elements are exhausted = modulo + let b_i0 = a_i0 % params.b_ne0; + let b_i1 = a_i1 % params.b_ne1; + let b_i2 = a_i2 % params.b_ne2; + let b_i3 = a_i3 % params.b_ne3; + + // compute index for position in b's flat array + return b_i0 * params.stride_src1_0 + + b_i1 * params.stride_src1_1 + + b_i2 * params.stride_src1_2 + + b_i3 * params.stride_src1_3; +} diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 0d2cbc530f5fb..04fa1b62d3b4d 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -4840,7 +4840,7 @@ struct test_moe_expert_reduce : public test_case { std::vector expert_views(n_expert_used); for (int64_t i = 0; i < n_expert_used; ++i) { - expert_views[i] = ggml_view_2d(ctx, weighted, n_embd, n_tokens, weighted->nb[1], i * weighted->nb[1]); + expert_views[i] = ggml_view_2d(ctx, weighted, n_embd, n_tokens, weighted->nb[2], i * weighted->nb[1]); std::string name = "expert_view_" + std::to_string(i); ggml_set_name(expert_views[i], name.c_str()); From 9a029e48a28248d6b11e4fd3bc6cd2dfa8d33bcf Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Sat, 1 Nov 2025 21:36:21 -0700 Subject: [PATCH 7/7] Disable support for non-contiguous binary_op tensors and leave note for future support --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 4 +++- 1 file changed, 3 
insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 70e3013537b2d..1a15756731580 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -1969,8 +1969,10 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const case GGML_OP_SUB: case GGML_OP_MUL: case GGML_OP_DIV: + // TODO: support non-contiguous tensors, e.g. for MOE_EXPERT_REDUCE + // see https://github.com/ggml-org/llama.cpp/pull/16857 supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type) && - (src1->type == op->type); + (src1->type == op->type) && ggml_is_contiguous(src0) && ggml_is_contiguous(src1); break; case GGML_OP_CPY: case GGML_OP_CONT: