From b56681191313ee1c78e7717374e2ec8210972616 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 15 Oct 2025 19:04:48 +0800 Subject: [PATCH 1/7] Add buffer label and enable dawn-specific toggles to turn off some checks --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 56 ++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 05e16cd432ad3..b4558a9e3f1d2 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -309,10 +309,12 @@ struct ggml_backend_webgpu_context { struct ggml_backend_webgpu_buffer_context { webgpu_context webgpu_ctx; wgpu::Buffer buffer; + std::string label; - ggml_backend_webgpu_buffer_context(webgpu_context ctx, wgpu::Buffer buf) : + ggml_backend_webgpu_buffer_context(webgpu_context ctx, wgpu::Buffer buf, std::string lbl) : webgpu_ctx(std::move(ctx)), - buffer(std::move(buf)) {} + buffer(std::move(buf)), + label(std::move(lbl)) {} }; /* End struct definitions */ @@ -1336,11 +1338,11 @@ static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffe WEBGPU_CPU_PROFILE_TOTAL_START(memset_tensor); - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " - << offset << ", " << size << ")"); - ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buf_ctx->label << ", " << tensor << ", " << value + << ", " << offset << ", " << size << ")"); + size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; // This is a trick to set all bytes of a u32 to the same 1 byte value. 
@@ -1354,12 +1356,13 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, const void * data, size_t offset, size_t size) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " - << offset << ", " << size << ")"); WEBGPU_CPU_PROFILE_TOTAL_START(set_tensor); ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_set_tensor(" << buf_ctx->label << ", " << tensor << ", " << data + << ", " << offset << ", " << size << ")"); + size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; webgpu_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, (size / 4) * 4); @@ -1397,12 +1400,12 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, void * data, size_t offset, size_t size) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " - << offset << ", " << size << ")"); WEBGPU_CPU_PROFILE_TOTAL_START(get_tensor); - ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; - webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; - wgpu::Device device = webgpu_ctx->device; + ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buf_ctx->label << ", " << tensor << ", " << data + << ", " << offset << ", " << size << ")"); + webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; + wgpu::Device device = webgpu_ctx->device; size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; @@ -1473,16 +1476,20 @@ static const char * ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer static ggml_backend_buffer_t 
ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_alloc_buffer(" << size << ")"); + static std::atomic buffer_count; + int buffer_id = buffer_count++; + std::string buf_name = "tensor_buf" + std::to_string(buffer_id); + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_alloc_buffer_" << buffer_id << ": " << size << " bytes"); ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); wgpu::Buffer buf; ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, buf, (size + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1), wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst, - "allocated_buffer"); + buf_name.c_str()); - ggml_backend_webgpu_buffer_context * buf_ctx = new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf); + ggml_backend_webgpu_buffer_context * buf_ctx = + new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf, buf_name); return ggml_backend_buffer_init(buft, ggml_backend_webgpu_buffer_interface, buf_ctx, size); } @@ -2129,6 +2136,15 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t required_features.push_back(wgpu::FeatureName::TimestampQuery); #endif + const char * const deviceEnabledToggles[] = { "skip_validation", "disable_robustness", "disable_workgroup_init", + "disable_polyfills_on_integer_div_and_mod" }; + const char * const deviceDisabledToggles[] = { "timestamp_quantization" }; + wgpu::DawnTogglesDescriptor deviceTogglesDesc; + deviceTogglesDesc.enabledToggles = deviceEnabledToggles; + deviceTogglesDesc.enabledToggleCount = 4; + deviceTogglesDesc.disabledToggles = deviceDisabledToggles; + deviceTogglesDesc.disabledToggleCount = 1; + wgpu::DeviceDescriptor dev_desc; dev_desc.requiredLimits = &ctx->limits; dev_desc.requiredFeatures = required_features.data(); @@ -2146,6 +2162,7 @@ static ggml_backend_dev_t 
ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t GGML_ABORT("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast(reason), std::string(message).c_str()); }); + dev_desc.nextInChain = &deviceTogglesDesc; ctx->instance.WaitAny(ctx->adapter.RequestDevice( &dev_desc, wgpu::CallbackMode::AllowSpontaneous, [ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { @@ -2243,11 +2260,18 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() { ctx.name = GGML_WEBGPU_NAME; ctx.device_count = 1; + const char * const instanceEnabledToggles[] = { "allow_unsafe_apis" }; + + wgpu::DawnTogglesDescriptor instanceTogglesDesc; + instanceTogglesDesc.enabledToggles = instanceEnabledToggles; + instanceTogglesDesc.enabledToggleCount = 1; wgpu::InstanceDescriptor instance_descriptor{}; std::vector instance_features = { wgpu::InstanceFeatureName::TimedWaitAny }; instance_descriptor.requiredFeatures = instance_features.data(); instance_descriptor.requiredFeatureCount = instance_features.size(); - webgpu_ctx->instance = wgpu::CreateInstance(&instance_descriptor); + instance_descriptor.nextInChain = &instanceTogglesDesc; + + webgpu_ctx->instance = wgpu::CreateInstance(&instance_descriptor); GGML_ASSERT(webgpu_ctx->instance != nullptr); static ggml_backend_reg reg = { From f2e187c7f26cdf0f4a9284db8ecbb2d74ffb7213 Mon Sep 17 00:00:00 2001 From: neha-ha <137219201+neha-ha@users.noreply.github.com> Date: Mon, 27 Oct 2025 14:48:39 -0500 Subject: [PATCH 2/7] Minor set_rows optimization (#4) * updated optimization, fixed errors * non vectorized version now dispatches one thread per element * Simplify * Change logic for set_rows pipelines --------- Co-authored-by: Neha Abbas Co-authored-by: Neha Abbas Co-authored-by: Reese Levine --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 25 +++++++--- .../{set_rows.wgsl => set_rows.tmpl.wgsl} | 46 ++++++++++++++++--- 2 files changed, 58 insertions(+), 13 deletions(-) rename ggml/src/ggml-webgpu/wgsl-shaders/{set_rows.wgsl => 
set_rows.tmpl.wgsl} (68%) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index b4558a9e3f1d2..353c7729bd1f8 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -248,7 +248,7 @@ struct webgpu_context_struct { webgpu_pipeline memset_pipeline; webgpu_pipeline mul_mat_pipeline[30][2]; - webgpu_pipeline set_rows_pipeline; + webgpu_pipeline set_rows_pipeline[1][2]; // dst->type, vectorized (0 for vectorized, 1 for non vectorized) webgpu_pipeline get_rows_pipeline[30]; webgpu_pipeline get_rows_f32_no_vec_pipeline; webgpu_pipeline cpy_pipeline[2][2]; // src type, dst type @@ -766,10 +766,21 @@ static std::optional ggml_webgpu_set_rows(webgpu_context & ctx, { .binding = 3, .buffer = error_bufs.dev_buf, .offset = 0, .size = error_bufs.dev_buf.GetSize() } }; - size_t max_wg_size = ctx->max_wg_size_x; - uint32_t wg_x = (src->ne[1] * src->ne[2] * src->ne[3] + max_wg_size - 1) / max_wg_size; + size_t max_wg_size = ctx->max_wg_size_x; + + int vectorized = src->ne[0] % 4 == 0; + webgpu_pipeline pipeline = ctx->set_rows_pipeline[0][vectorized]; + // if not evenly divisble by 4, use the non-vectorized version + uint32_t threads; + if (vectorized) { + threads = (src->ne[1] * src->ne[2] * src->ne[3]) * (src->ne[0] / 4); + } else { + threads = src->ne[0] * src->ne[1] * src->ne[2] * src->ne[3]; + } + + uint32_t wg_x = (threads + max_wg_size - 1) / max_wg_size; - return ggml_backend_webgpu_build(ctx, ctx->set_rows_pipeline, params, entries, wg_x, error_bufs); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, error_bufs); } static webgpu_command ggml_webgpu_get_rows(webgpu_context & ctx, @@ -1620,8 +1631,10 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { } static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) { - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->set_rows_pipeline, wgsl_set_rows, "set_rows", - 
ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x)); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->set_rows_pipeline[0][0], wgsl_set_rows_f16, + "set_rows_f16", ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x)); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->set_rows_pipeline[0][1], wgsl_set_rows_f16_vec, + "set_rows_f16_vec", ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x)); } static void ggml_webgpu_init_get_rows_pipeline(webgpu_context & webgpu_ctx) { diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl similarity index 68% rename from ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl rename to ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl index 3567713dc215c..4a6d819d3b145 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl @@ -1,13 +1,38 @@ +#define(VARIANTS) + +[ + { + "SHADER_SUFFIX": "f16_vec", + "REPLS": { + "TYPE" : "vec4", + "DST_TYPE": "vec4", + "VEC_SIZE": 4 + } + }, + { + "SHADER_SUFFIX": "f16", + "REPLS": { + "TYPE" : "f32", + "DST_TYPE": "f16", + "VEC_SIZE": 1 + } + } +] + +#end(VARIANTS) + +#define(SHADER) + enable f16; @group(0) @binding(0) -var src: array; +var src: array<{{TYPE}}>; @group(0) @binding(1) var idx: array; @group(0) @binding(2) -var dst: array; +var dst: array<{{DST_TYPE}}>; @group(0) @binding(3) var error: atomic; @@ -47,10 +72,14 @@ var params: Params; override wg_size: u32; @compute @workgroup_size(wg_size) fn main(@builtin(global_invocation_id) gid: vec3) { - if (gid.x >= params.n_rows * params.ne2 * params.ne3) { + if (gid.x >= (params.ne3 * params.ne2 * params.n_rows * params.ne0) / {{VEC_SIZE}}) { return; } - var i = gid.x; + + // getting the row from gid + let elems_per_row = params.ne0 / {{VEC_SIZE}}; + var i = gid.x / elems_per_row; + let i_src3 = i / (params.ne2 * params.n_rows); i = i % (params.ne2 * params.n_rows); @@ -75,7 +104,10 @@ fn 
main(@builtin(global_invocation_id) gid: vec3) { let i_dst_row = params.offset_dst + idx_high_val * params.stride_dst1 + i_src2 * params.stride_dst2 + i_src3 * params.stride_dst3; let i_src_row = params.offset_src + i_src1 * params.stride_src1 + i_src2 * params.stride_src2 + i_src3 * params.stride_src3; - for (var i: u32 = 0; i < params.ne0; i++) { - dst[i_dst_row + i] = f16(src[i_src_row + i]); - } + // starts at what element of that row? + let col_idx = (gid.x % elems_per_row); + dst[i_dst_row/{{VEC_SIZE}} + col_idx] = {{DST_TYPE}}(src[i_src_row/{{VEC_SIZE}} + col_idx]); } + +#end(SHADER) + From 51aae63b49034ff8171ab623f66ac47a3441628a Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 27 Oct 2025 13:00:19 -0700 Subject: [PATCH 3/7] Comment on dawn toggles --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 353c7729bd1f8..b4a9f6d579b89 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -2149,6 +2149,10 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t required_features.push_back(wgpu::FeatureName::TimestampQuery); #endif + // Enable Dawn-specific toggles to increase native performance + // TODO: Don't enable for WASM builds, they won't have an effect anyways + // TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these, + // only for native performance? 
const char * const deviceEnabledToggles[] = { "skip_validation", "disable_robustness", "disable_workgroup_init", "disable_polyfills_on_integer_div_and_mod" }; const char * const deviceDisabledToggles[] = { "timestamp_quantization" }; From f0cfae49d637ea48b901a72c0b344ba9393f38e8 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 27 Oct 2025 15:48:00 -0700 Subject: [PATCH 4/7] Remove some comments --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 3 +-- ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index b4a9f6d579b89..70e3013537b2d 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -248,7 +248,7 @@ struct webgpu_context_struct { webgpu_pipeline memset_pipeline; webgpu_pipeline mul_mat_pipeline[30][2]; - webgpu_pipeline set_rows_pipeline[1][2]; // dst->type, vectorized (0 for vectorized, 1 for non vectorized) + webgpu_pipeline set_rows_pipeline[1][2]; // dst->type, vectorized webgpu_pipeline get_rows_pipeline[30]; webgpu_pipeline get_rows_f32_no_vec_pipeline; webgpu_pipeline cpy_pipeline[2][2]; // src type, dst type @@ -770,7 +770,6 @@ static std::optional ggml_webgpu_set_rows(webgpu_context & ctx, int vectorized = src->ne[0] % 4 == 0; webgpu_pipeline pipeline = ctx->set_rows_pipeline[0][vectorized]; - // if not evenly divisble by 4, use the non-vectorized version uint32_t threads; if (vectorized) { threads = (src->ne[1] * src->ne[2] * src->ne[3]) * (src->ne[0] / 4); diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl index 4a6d819d3b145..fca3be6bc27ed 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl @@ -104,7 +104,6 @@ fn main(@builtin(global_invocation_id) gid: vec3) { let i_dst_row = params.offset_dst + idx_high_val * 
params.stride_dst1 + i_src2 * params.stride_dst2 + i_src3 * params.stride_dst3; let i_src_row = params.offset_src + i_src1 * params.stride_src1 + i_src2 * params.stride_src2 + i_src3 * params.stride_src3; - // starts at what element of that row? let col_idx = (gid.x % elems_per_row); dst[i_dst_row/{{VEC_SIZE}} + col_idx] = {{DST_TYPE}}(src[i_src_row/{{VEC_SIZE}} + col_idx]); } From ed710b36f51ab3f53fa13db15c1685dc8678a32a Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Fri, 31 Oct 2025 17:35:00 -0700 Subject: [PATCH 5/7] Implement overlap binary operators --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 179 ++++++--- .../ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl | 369 +++++++++++++++--- .../ggml-webgpu/wgsl-shaders/binary_head.tmpl | 45 --- tests/test-backend-ops.cpp | 2 +- 4 files changed, 436 insertions(+), 159 deletions(-) delete mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 70e3013537b2d..f6b939e140415 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -252,10 +252,10 @@ struct webgpu_context_struct { webgpu_pipeline get_rows_pipeline[30]; webgpu_pipeline get_rows_f32_no_vec_pipeline; webgpu_pipeline cpy_pipeline[2][2]; // src type, dst type - webgpu_pipeline add_pipeline[2][2]; // type, inplace - webgpu_pipeline sub_pipeline[2][2]; // type, inplace - webgpu_pipeline mul_pipeline[2][2]; // type, inplace - webgpu_pipeline div_pipeline[2][2]; // type, inplace + webgpu_pipeline add_pipeline[2][2][2]; // type, inplace, overlap + webgpu_pipeline sub_pipeline[2][2][2]; // type, inplace, overlap + webgpu_pipeline mul_pipeline[2][2][2]; // type, inplace, overlap + webgpu_pipeline div_pipeline[2][2][2]; // type, inplace, overlap webgpu_pipeline rms_norm_pipeline[2]; // inplace webgpu_pipeline rope_pipeline[2][2][2]; // type, ff, inplace webgpu_pipeline glu_pipeline[7][2][2]; // glu-op, type, split @@ -677,9 +677,12 @@ 
static size_t ggml_webgpu_tensor_align_offset(webgpu_context & ctx, ggml_tensor return offset & ~(ctx->limits.minStorageBufferOffsetAlignment - 1); } +static size_t ggml_webgpu_tensor_align_binding_size(size_t size) { + return (size + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1); +} + static size_t ggml_webgpu_tensor_binding_size(webgpu_context & ctx, ggml_tensor * t) { - return (ggml_nbytes(t) + ggml_webgpu_tensor_misalignment(ctx, t) + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & - ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1); + return ggml_webgpu_tensor_align_binding_size(ggml_nbytes(t) + ggml_webgpu_tensor_misalignment(ctx, t)); } // Used to determine if two tensors are the same for in-place operations @@ -688,6 +691,12 @@ static bool ggml_webgpu_tensor_equal(ggml_tensor * a, ggml_tensor * b) { (ggml_webgpu_tensor_offset(a) == ggml_webgpu_tensor_offset(b)); } +static bool ggml_webgpu_tensor_overlap(ggml_tensor * a, ggml_tensor * b) { + return (ggml_webgpu_tensor_buf(a).Get() == ggml_webgpu_tensor_buf(b).Get()) && + ggml_webgpu_tensor_offset(a) < (ggml_webgpu_tensor_offset(b) + ggml_nbytes(b)) && + ggml_webgpu_tensor_offset(b) < (ggml_webgpu_tensor_offset(a) + ggml_nbytes(a)); +} + static webgpu_command ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { uint32_t ne = (uint32_t) ggml_nelements(dst); @@ -870,16 +879,27 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx, return ggml_backend_webgpu_build(ctx, ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x); } -static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * dst, - webgpu_pipeline & pipeline, - bool inplace) { +template +static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * dst, + webgpu_pipeline (&pipelines)[a][b][c]) { + int inplace = ggml_webgpu_tensor_equal(src0, dst); + int overlap 
= ggml_webgpu_tensor_overlap(src0, src1); + webgpu_pipeline pipeline = pipelines[dst->type][inplace][overlap]; + + uint32_t src1_offset = ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type); + if (overlap) { + // when overlapped, bind a single buffer covering both src0 and src1 + // TODO: Do other operations need this? + src1_offset = (uint32_t) ((ggml_webgpu_tensor_offset(src1) - ggml_webgpu_tensor_align_offset(ctx, src0)) / + ggml_type_size(src1->type)); + } std::vector params = { (uint32_t) ggml_nelements(dst), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)), - (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)), + src1_offset, (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), (uint32_t) (src1->nb[0] / ggml_type_size(src1->type)), (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)), @@ -894,25 +914,36 @@ static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx, (uint32_t) src1->ne[3], }; + size_t src0_binding_size = ggml_webgpu_tensor_binding_size(ctx, src0); + if (overlap) { + const uint64_t base_align = ggml_webgpu_tensor_align_offset(ctx, src0); + // assume end of src1 is >= end of src0 + const uint64_t max_end = ggml_webgpu_tensor_offset(src1) + ggml_nbytes(src1); + src0_binding_size = ggml_webgpu_tensor_align_binding_size(max_end - base_align); + } std::vector entries = { { .binding = 0, .buffer = ggml_webgpu_tensor_buf(src0), .offset = ggml_webgpu_tensor_align_offset(ctx, src0), - .size = ggml_webgpu_tensor_binding_size(ctx, src0) }, - { .binding = 1, - .buffer = ggml_webgpu_tensor_buf(src1), - .offset = ggml_webgpu_tensor_align_offset(ctx, src1), - .size = ggml_webgpu_tensor_binding_size(ctx, src1) } + .size = src0_binding_size } }; + uint32_t binding_num = 1; + if (!overlap) { + entries.push_back({ .binding = binding_num, + .buffer = ggml_webgpu_tensor_buf(src1), + .offset = ggml_webgpu_tensor_align_offset(ctx, src1), + 
.size = ggml_webgpu_tensor_binding_size(ctx, src1) }); + binding_num++; + } if (!inplace) { - entries.push_back({ .binding = 2, + entries.push_back({ .binding = binding_num, .buffer = ggml_webgpu_tensor_buf(dst), .offset = ggml_webgpu_tensor_align_offset(ctx, dst), .size = ggml_webgpu_tensor_binding_size(ctx, dst) }); } - size_t max_wg_size = ctx->max_wg_size_x; - uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size; + size_t max_wg_size = ctx->max_wg_size_x; + uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size; return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } @@ -1232,25 +1263,13 @@ static std::optional ggml_webgpu_encode_node(webgpu_context ctx, case GGML_OP_MUL_MAT: return ggml_webgpu_mul_mat(ctx, src0, src1, node); case GGML_OP_ADD: - { - int inplace = ggml_webgpu_tensor_equal(src0, node); - return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_pipeline[node->type][inplace], inplace); - } + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_pipeline); case GGML_OP_SUB: - { - int inplace = ggml_webgpu_tensor_equal(src0, node); - return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->sub_pipeline[node->type][inplace], inplace); - } + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->sub_pipeline); case GGML_OP_MUL: - { - int inplace = ggml_webgpu_tensor_equal(src0, node); - return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_pipeline[node->type][inplace], inplace); - } + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_pipeline); case GGML_OP_DIV: - { - int inplace = ggml_webgpu_tensor_equal(src0, node); - return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->div_pipeline[node->type][inplace], inplace); - } + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->div_pipeline); case GGML_OP_RMS_NORM: return ggml_webgpu_rms_norm(ctx, src0, node); case GGML_OP_ROPE: @@ -1700,50 +1719,82 @@ static void ggml_webgpu_init_cpy_pipeline(webgpu_context & 
webgpu_ctx) { static void ggml_webgpu_init_add_pipeline(webgpu_context & webgpu_ctx) { std::vector constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][0], wgsl_add_f32, "add_f32", - constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][0], wgsl_add_f16, "add_f16", - constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][1], wgsl_add_f32_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][0][0], wgsl_add_f32, + "add_f32", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][0][0], wgsl_add_f16, + "add_f16", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][1][0], wgsl_add_f32_inplace, "add_f32_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][1], wgsl_add_f16_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][1][0], wgsl_add_f16_inplace, "add_f16_inplace", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][0][1], wgsl_add_f32_overlap, + "add_f32_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][1][1], + wgsl_add_f32_inplace_overlap, "add_f32_inplace_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][0][1], wgsl_add_f16_overlap, + "add_f16_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][1][1], + wgsl_add_f16_inplace_overlap, "add_f16_inplace_overlap", constants); } static void ggml_webgpu_init_sub_pipeline(webgpu_context & webgpu_ctx) { std::vector constants = 
ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][0], wgsl_sub_f32, "sub_f32", - constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][0], wgsl_sub_f16, "sub_f16", - constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][1], wgsl_sub_f32_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][0][0], wgsl_sub_f32, + "sub_f32", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][0][0], wgsl_sub_f16, + "sub_f16", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][1][0], wgsl_sub_f32_inplace, "sub_f32_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][1], wgsl_sub_f16_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][1][0], wgsl_sub_f16_inplace, "sub_f16_inplace", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][0][1], wgsl_sub_f32_overlap, + "sub_f32_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][1][1], + wgsl_sub_f32_inplace_overlap, "sub_f32_inplace_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][0][1], wgsl_sub_f16_overlap, + "sub_f16_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][1][1], + wgsl_sub_f16_inplace_overlap, "sub_f16_inplace_overlap", constants); } static void ggml_webgpu_init_mul_pipeline(webgpu_context & webgpu_ctx) { std::vector constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x); - ggml_webgpu_create_pipeline(webgpu_ctx->device, 
webgpu_ctx->mul_pipeline[GGML_TYPE_F32][0], wgsl_mul_f32, "mul_f32", - constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][0], wgsl_mul_f16, "mul_f16", - constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][1], wgsl_mul_f32_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][0][0], wgsl_mul_f32, + "mul_f32", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][0][0], wgsl_mul_f16, + "mul_f16", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][1][0], wgsl_mul_f32_inplace, "mul_f32_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][1], wgsl_mul_f16_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][1][0], wgsl_mul_f16_inplace, "mul_f16_inplace", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][0][1], wgsl_mul_f32_overlap, + "mul_f32_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][1][1], + wgsl_mul_f32_inplace_overlap, "mul_f32_inplace_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][0][1], wgsl_mul_f16_overlap, + "mul_f16_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][1][1], + wgsl_mul_f16_inplace_overlap, "mul_f16_inplace_overlap", constants); } static void ggml_webgpu_init_div_pipeline(webgpu_context & webgpu_ctx) { std::vector constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][0], wgsl_div_f32, "div_f32", - constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, 
webgpu_ctx->div_pipeline[GGML_TYPE_F16][0], wgsl_div_f16, "div_f16", - constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][1], wgsl_div_f32_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][0][0], wgsl_div_f32, + "div_f32", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][0][0], wgsl_div_f16, + "div_f16", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][1][0], wgsl_div_f32_inplace, "div_f32_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][1], wgsl_div_f16_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][1][0], wgsl_div_f16_inplace, "div_f16_inplace", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][0][1], wgsl_div_f32_overlap, + "div_f32_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][1][1], + wgsl_div_f32_inplace_overlap, "div_f32_inplace_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][0][1], wgsl_div_f16_overlap, + "div_f16_overlap", constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][1][1], + wgsl_div_f16_inplace_overlap, "div_f16_inplace_overlap", constants); } static void ggml_webgpu_init_rms_norm_pipeline(webgpu_context & webgpu_ctx) { @@ -2152,9 +2203,9 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t // TODO: Don't enable for WASM builds, they won't have an effect anyways // TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these, // only for native performance? 
- const char * const deviceEnabledToggles[] = { "skip_validation", "disable_robustness", "disable_workgroup_init", - "disable_polyfills_on_integer_div_and_mod" }; - const char * const deviceDisabledToggles[] = { "timestamp_quantization" }; + const char * const deviceEnabledToggles[] = { "disable_robustness", "disable_workgroup_init", + "disable_polyfills_on_integer_div_and_mod" }; + const char * const deviceDisabledToggles[] = { "timestamp_quantization" }; wgpu::DawnTogglesDescriptor deviceTogglesDesc; deviceTogglesDesc.enabledToggles = deviceEnabledToggles; - deviceTogglesDesc.enabledToggleCount = 4; + deviceTogglesDesc.enabledToggleCount = 3; diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl index 1ce4d83fa8e50..5143a1bbf17f8 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl @@ -5,129 +5,354 @@ "SHADER_NAME": "add_f32", "REPLS": { "TYPE" : "f32", - "OP": "+" + "SRC1_BUF": "src1", + "DST_BUF": "dst", + "OP": "+", + "PARAMS_BINDING": 3 }, "DECLS": ["NOT_INPLACE"] }, + { + "SHADER_NAME": "add_f32_inplace", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src1", + "DST_BUF": "src0", + "OP": "+", + "PARAMS_BINDING": 2 + }, + "DECLS": ["INPLACE"] + }, + { + "SHADER_NAME": "add_f32_overlap", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src0", + "DST_BUF": "dst", + "OP": "+", + "PARAMS_BINDING": 2 + }, + "DECLS": ["OVERLAP"] + }, + { + "SHADER_NAME": "add_f32_inplace_overlap", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src0", + "DST_BUF": "src0", + "OP": "+", + "PARAMS_BINDING": 1 + }, + "DECLS": ["INPLACE_OVERLAP"] + }, { "SHADER_NAME": "add_f16", "REPLS": { "TYPE" : "f16", - "OP": "+" + "SRC1_BUF": "src1", + "DST_BUF": "dst", + "OP": "+", + "PARAMS_BINDING": 3 }, "DECLS": ["NOT_INPLACE"] }, { - "SHADER_NAME": "add_f32_inplace", + "SHADER_NAME": "add_f16_inplace", "REPLS": { - "TYPE" : "f32", - "OP": "+" + "TYPE" : "f16", + "SRC1_BUF": "src1", + "DST_BUF": "src0", + "OP": "+", + 
"PARAMS_BINDING": 2 }, "DECLS": ["INPLACE"] }, { - "SHADER_NAME": "add_f16_inplace", + "SHADER_NAME": "add_f16_overlap", "REPLS": { "TYPE" : "f16", - "OP": "+" + "SRC1_BUF": "src0", + "DST_BUF": "dst", + "OP": "+", + "PARAMS_BINDING": 2 }, - "DECLS": ["INPLACE"] + "DECLS": ["OVERLAP"] + }, + { + "SHADER_NAME": "add_f16_inplace_overlap", + "REPLS": { + "TYPE" : "f16", + "SRC1_BUF": "src0", + "DST_BUF": "src0", + "OP": "+", + "PARAMS_BINDING": 1 + }, + "DECLS": ["INPLACE_OVERLAP"] }, { "SHADER_NAME": "mul_f32", "REPLS": { "TYPE" : "f32", - "OP": "*" + "SRC1_BUF": "src1", + "DST_BUF": "dst", + "OP": "*", + "PARAMS_BINDING": 3 }, "DECLS": ["NOT_INPLACE"] }, + { + "SHADER_NAME": "mul_f32_inplace", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src1", + "DST_BUF": "src0", + "OP": "*", + "PARAMS_BINDING": 2 + }, + "DECLS": ["INPLACE"] + }, + { + "SHADER_NAME": "mul_f32_overlap", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src0", + "DST_BUF": "dst", + "OP": "*", + "PARAMS_BINDING": 2 + }, + "DECLS": ["OVERLAP"] + }, + { + "SHADER_NAME": "mul_f32_inplace_overlap", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src0", + "DST_BUF": "src0", + "OP": "*", + "PARAMS_BINDING": 1 + }, + "DECLS": ["INPLACE_OVERLAP"] + }, { "SHADER_NAME": "mul_f16", "REPLS": { "TYPE" : "f16", - "OP": "*" + "SRC1_BUF": "src1", + "DST_BUF": "dst", + "OP": "*", + "PARAMS_BINDING": 3 }, "DECLS": ["NOT_INPLACE"] }, { - "SHADER_NAME": "mul_f32_inplace", + "SHADER_NAME": "mul_f16_inplace", "REPLS": { - "TYPE" : "f32", - "OP": "*" + "TYPE" : "f16", + "SRC1_BUF": "src1", + "DST_BUF": "src0", + "OP": "*", + "PARAMS_BINDING": 2 }, "DECLS": ["INPLACE"] }, { - "SHADER_NAME": "mul_f16_inplace", + "SHADER_NAME": "mul_f16_overlap", "REPLS": { "TYPE" : "f16", - "OP": "*" + "SRC1_BUF": "src0", + "DST_BUF": "dst", + "OP": "*", + "PARAMS_BINDING": 2 }, - "DECLS": ["INPLACE"] + "DECLS": ["OVERLAP"] + }, + { + "SHADER_NAME": "mul_f16_inplace_overlap", + "REPLS": { + "TYPE" : "f16", + "SRC1_BUF": "src0", + "DST_BUF": 
"src0", + "OP": "*", + "PARAMS_BINDING": 1 + }, + "DECLS": ["INPLACE_OVERLAP"] }, { "SHADER_NAME": "sub_f32", "REPLS": { "TYPE" : "f32", - "OP": "-" + "SRC1_BUF": "src1", + "DST_BUF": "dst", + "OP": "-", + "PARAMS_BINDING": 3 }, "DECLS": ["NOT_INPLACE"] }, + { + "SHADER_NAME": "sub_f32_inplace", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src1", + "DST_BUF": "src0", + "OP": "-", + "PARAMS_BINDING": 2 + }, + "DECLS": ["INPLACE"] + }, + { + "SHADER_NAME": "sub_f32_overlap", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src0", + "DST_BUF": "dst", + "OP": "-", + "PARAMS_BINDING": 2 + }, + "DECLS": ["OVERLAP"] + }, + { + "SHADER_NAME": "sub_f32_inplace_overlap", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src0", + "DST_BUF": "src0", + "OP": "-", + "PARAMS_BINDING": 1 + }, + "DECLS": ["INPLACE_OVERLAP"] + }, { "SHADER_NAME": "sub_f16", "REPLS": { "TYPE" : "f16", - "OP": "-" + "SRC1_BUF": "src1", + "DST_BUF": "dst", + "OP": "-", + "PARAMS_BINDING": 3 }, "DECLS": ["NOT_INPLACE"] }, { - "SHADER_NAME": "sub_f32_inplace", + "SHADER_NAME": "sub_f16_inplace", "REPLS": { - "TYPE" : "f32", - "OP": "-" + "TYPE" : "f16", + "SRC1_BUF": "src1", + "DST_BUF": "src0", + "OP": "-", + "PARAMS_BINDING": 2 }, "DECLS": ["INPLACE"] }, { - "SHADER_NAME": "sub_f16_inplace", + "SHADER_NAME": "sub_f16_overlap", "REPLS": { "TYPE" : "f16", - "OP": "-" + "SRC1_BUF": "src0", + "DST_BUF": "dst", + "OP": "-", + "PARAMS_BINDING": 2 }, - "DECLS": ["INPLACE"] + "DECLS": ["OVERLAP"] + }, + { + "SHADER_NAME": "sub_f16_inplace_overlap", + "REPLS": { + "TYPE" : "f16", + "SRC1_BUF": "src0", + "DST_BUF": "src0", + "OP": "-", + "PARAMS_BINDING": 1 + }, + "DECLS": ["INPLACE_OVERLAP"] }, + { "SHADER_NAME": "div_f32", "REPLS": { "TYPE" : "f32", - "OP": "/" + "SRC1_BUF": "src1", + "DST_BUF": "dst", + "OP": "/", + "PARAMS_BINDING": 3 }, "DECLS": ["NOT_INPLACE"] }, + { + "SHADER_NAME": "div_f32_inplace", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src1", + "DST_BUF": "src0", + "OP": "/", + "PARAMS_BINDING": 2 + 
}, + "DECLS": ["INPLACE"] + }, + { + "SHADER_NAME": "div_f32_overlap", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src0", + "DST_BUF": "dst", + "OP": "/", + "PARAMS_BINDING": 2 + }, + "DECLS": ["OVERLAP"] + }, + { + "SHADER_NAME": "div_f32_inplace_overlap", + "REPLS": { + "TYPE" : "f32", + "SRC1_BUF": "src0", + "DST_BUF": "src0", + "OP": "/", + "PARAMS_BINDING": 1 + }, + "DECLS": ["INPLACE_OVERLAP"] + }, { "SHADER_NAME": "div_f16", "REPLS": { "TYPE" : "f16", - "OP": "/" + "SRC1_BUF": "src1", + "DST_BUF": "dst", + "OP": "/", + "PARAMS_BINDING": 3 }, "DECLS": ["NOT_INPLACE"] }, { - "SHADER_NAME": "div_f32_inplace", + "SHADER_NAME": "div_f16_inplace", "REPLS": { - "TYPE" : "f32", - "OP": "/" + "TYPE" : "f16", + "SRC1_BUF": "src1", + "DST_BUF": "src0", + "OP": "/", + "PARAMS_BINDING": 2 }, "DECLS": ["INPLACE"] }, { - "SHADER_NAME": "div_f16_inplace", + "SHADER_NAME": "div_f16_overlap", "REPLS": { "TYPE" : "f16", - "OP": "/" + "SRC1_BUF": "src0", + "DST_BUF": "dst", + "OP": "/", + "PARAMS_BINDING": 2 }, - "DECLS": ["INPLACE"] + "DECLS": ["OVERLAP"] + }, + { + "SHADER_NAME": "div_f16_inplace_overlap", + "REPLS": { + "TYPE" : "f16", + "SRC1_BUF": "src0", + "DST_BUF": "src0", + "OP": "/", + "PARAMS_BINDING": 1 + }, + "DECLS": ["INPLACE_OVERLAP"] } ] @@ -137,43 +362,89 @@ #decl(NOT_INPLACE) -fn update(dst_i: u32, src0_i: u32, src1_i: u32) { - dst[dst_i] = src0[src0_i] {{OP}} src1[src1_i]; -} +@group(0) @binding(1) +var src1: array<{{TYPE}}>; @group(0) @binding(2) var dst: array<{{TYPE}}>; -@group(0) @binding(3) -var params: Params; - #enddecl(NOT_INPLACE) #decl(INPLACE) -fn update(dst_i: u32, src0_i: u32, src1_i: u32) { - src0[dst_i] = src0[src0_i] {{OP}} src1[src1_i]; -} - -@group(0) @binding(2) -var params: Params; +@group(0) @binding(1) +var src1: array<{{TYPE}}>; #enddecl(INPLACE) -#end(DECLS) +#decl(OVERLAP) +@group(0) @binding(1) +var dst: array<{{TYPE}}>; + +#enddecl(OVERLAP) + +#decl(INPLACE_OVERLAP) + +#enddecl(INPLACE_OVERLAP) + +#end(DECLS) #define(SHADER) 
enable f16; -#include "binary_head.tmpl" +struct Params { + ne: u32, + + // offsets in elements + offset_src0: u32, + offset_src1: u32, + offset_dst: u32, + + stride_src1_0: u32, + stride_src1_1: u32, + stride_src1_2: u32, + stride_src1_3: u32, + + a_ne0: u32, + a_ne1: u32, + a_ne2: u32, + + b_ne0: u32, + b_ne1: u32, + b_ne2: u32, + b_ne3: u32, +}; + +fn src1_index(_i: u32) -> u32 { + var i = _i; + let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0); + i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0); + let a_i2 = i / (params.a_ne1 * params.a_ne0); + i = i % (params.a_ne1 * params.a_ne0); + let a_i1 = i / params.a_ne0; + let a_i0 = i % params.a_ne0; + + // handle repetition of b + // index loops back to the beginning and repeats after elements are exhausted = modulo + let b_i0 = a_i0 % params.b_ne0; + let b_i1 = a_i1 % params.b_ne1; + let b_i2 = a_i2 % params.b_ne2; + let b_i3 = a_i3 % params.b_ne3; + + // compute index for position in b's flat array + return b_i0 * params.stride_src1_0 + + b_i1 * params.stride_src1_1 + + b_i2 * params.stride_src1_2 + + b_i3 * params.stride_src1_3; +} @group(0) @binding(0) var src0: array<{{TYPE}}>; -@group(0) @binding(1) -var src1: array<{{TYPE}}>; +@group(0) @binding({{PARAMS_BINDING}}) +var params: Params; DECLS @@ -181,7 +452,7 @@ override wg_size: u32; @compute @workgroup_size(wg_size) fn main(@builtin(global_invocation_id) gid: vec3) { if (gid.x < params.ne) { - update(params.offset_dst + gid.x, params.offset_src0 + gid.x, params.offset_src1 + src1_index(gid.x)); + {{DST_BUF}}[params.offset_dst + gid.x] = src0[params.offset_src0 + gid.x] {{OP}} {{SRC1_BUF}}[params.offset_src1 + src1_index(gid.x)]; } } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl b/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl deleted file mode 100644 index 4b254f468d69e..0000000000000 --- a/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +++ /dev/null @@ -1,45 +0,0 @@ -struct Params { - ne: u32, - - // offsets in 
elements - offset_src0: u32, - offset_src1: u32, - offset_dst: u32, - - stride_src1_0: u32, - stride_src1_1: u32, - stride_src1_2: u32, - stride_src1_3: u32, - - a_ne0: u32, - a_ne1: u32, - a_ne2: u32, - - b_ne0: u32, - b_ne1: u32, - b_ne2: u32, - b_ne3: u32, -}; - -fn src1_index(_i: u32) -> u32 { - var i = _i; - let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0); - i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0); - let a_i2 = i / (params.a_ne1 * params.a_ne0); - i = i % (params.a_ne1 * params.a_ne0); - let a_i1 = i / params.a_ne0; - let a_i0 = i % params.a_ne0; - - // handle repetition of b - // index loops back to the beginning and repeats after elements are exhausted = modulo - let b_i0 = a_i0 % params.b_ne0; - let b_i1 = a_i1 % params.b_ne1; - let b_i2 = a_i2 % params.b_ne2; - let b_i3 = a_i3 % params.b_ne3; - - // compute index for position in b's flat array - return b_i0 * params.stride_src1_0 + - b_i1 * params.stride_src1_1 + - b_i2 * params.stride_src1_2 + - b_i3 * params.stride_src1_3; -} diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 04fa1b62d3b4d..0d2cbc530f5fb 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -4840,7 +4840,7 @@ struct test_moe_expert_reduce : public test_case { std::vector expert_views(n_expert_used); for (int64_t i = 0; i < n_expert_used; ++i) { - expert_views[i] = ggml_view_2d(ctx, weighted, n_embd, n_tokens, weighted->nb[2], i * weighted->nb[1]); + expert_views[i] = ggml_view_2d(ctx, weighted, n_embd, n_tokens, weighted->nb[1], i * weighted->nb[1]); std::string name = "expert_view_" + std::to_string(i); ggml_set_name(expert_views[i], name.c_str()); From b319672348ba1b3a6dbfad98b2e0c7a3a6367351 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Sat, 1 Nov 2025 21:33:41 -0700 Subject: [PATCH 6/7] Revert "Implement overlap binary operators" This reverts commit ed710b36f51ab3f53fa13db15c1685dc8678a32a. 
--- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 179 +++------ .../ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl | 369 +++--------------- .../ggml-webgpu/wgsl-shaders/binary_head.tmpl | 45 +++ tests/test-backend-ops.cpp | 2 +- 4 files changed, 159 insertions(+), 436 deletions(-) create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index f6b939e140415..70e3013537b2d 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -252,10 +252,10 @@ struct webgpu_context_struct { webgpu_pipeline get_rows_pipeline[30]; webgpu_pipeline get_rows_f32_no_vec_pipeline; webgpu_pipeline cpy_pipeline[2][2]; // src type, dst type - webgpu_pipeline add_pipeline[2][2][2]; // type, inplace, overlap - webgpu_pipeline sub_pipeline[2][2][2]; // type, inplace, overlap - webgpu_pipeline mul_pipeline[2][2][2]; // type, inplace, overlap - webgpu_pipeline div_pipeline[2][2][2]; // type, inplace, overlap + webgpu_pipeline add_pipeline[2][2]; // type, inplace + webgpu_pipeline sub_pipeline[2][2]; // type, inplace + webgpu_pipeline mul_pipeline[2][2]; // type, inplace + webgpu_pipeline div_pipeline[2][2]; // type, inplace webgpu_pipeline rms_norm_pipeline[2]; // inplace webgpu_pipeline rope_pipeline[2][2][2]; // type, ff, inplace webgpu_pipeline glu_pipeline[7][2][2]; // glu-op, type, split @@ -677,12 +677,9 @@ static size_t ggml_webgpu_tensor_align_offset(webgpu_context & ctx, ggml_tensor return offset & ~(ctx->limits.minStorageBufferOffsetAlignment - 1); } -static size_t ggml_webgpu_tensor_align_binding_size(size_t size) { - return (size + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1); -} - static size_t ggml_webgpu_tensor_binding_size(webgpu_context & ctx, ggml_tensor * t) { - return ggml_webgpu_tensor_align_binding_size(ggml_nbytes(t) + ggml_webgpu_tensor_misalignment(ctx, t)); + return (ggml_nbytes(t) + 
ggml_webgpu_tensor_misalignment(ctx, t) + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & + ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1); } // Used to determine if two tensors are the same for in-place operations @@ -691,12 +688,6 @@ static bool ggml_webgpu_tensor_equal(ggml_tensor * a, ggml_tensor * b) { (ggml_webgpu_tensor_offset(a) == ggml_webgpu_tensor_offset(b)); } -static bool ggml_webgpu_tensor_overlap(ggml_tensor * a, ggml_tensor * b) { - return (ggml_webgpu_tensor_buf(a).Get() == ggml_webgpu_tensor_buf(b).Get()) && - ggml_webgpu_tensor_offset(a) < (ggml_webgpu_tensor_offset(b) + ggml_nbytes(b)) && - ggml_webgpu_tensor_offset(b) < (ggml_webgpu_tensor_offset(a) + ggml_nbytes(a)); -} - static webgpu_command ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { uint32_t ne = (uint32_t) ggml_nelements(dst); @@ -879,27 +870,16 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx, return ggml_backend_webgpu_build(ctx, ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x); } -template -static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * dst, - webgpu_pipeline (&pipelines)[a][b][c]) { - int inplace = ggml_webgpu_tensor_equal(src0, dst); - int overlap = ggml_webgpu_tensor_overlap(src0, src1); - webgpu_pipeline pipeline = pipelines[dst->type][inplace][overlap]; - - uint32_t src1_offset = ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type); - if (overlap) { - // when overlapped, bind a single buffer covering both src0 and src1 - // TODO: Do other operations need this? 
- src1_offset = (uint32_t) ((ggml_webgpu_tensor_offset(src1) - ggml_webgpu_tensor_align_offset(ctx, src0)) / - ggml_type_size(src1->type)); - } +static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * dst, + webgpu_pipeline & pipeline, + bool inplace) { std::vector params = { (uint32_t) ggml_nelements(dst), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)), - src1_offset, + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), (uint32_t) (src1->nb[0] / ggml_type_size(src1->type)), (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)), @@ -914,36 +894,25 @@ static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx, (uint32_t) src1->ne[3], }; - size_t src0_binding_size = ggml_webgpu_tensor_binding_size(ctx, src0); - if (overlap) { - const uint64_t base_align = ggml_webgpu_tensor_align_offset(ctx, src0); - // assume end of src1 is >= end of src0 - const uint64_t max_end = ggml_webgpu_tensor_offset(src1) + ggml_nbytes(src1); - src0_binding_size = ggml_webgpu_tensor_align_binding_size(max_end - base_align); - } std::vector entries = { { .binding = 0, .buffer = ggml_webgpu_tensor_buf(src0), .offset = ggml_webgpu_tensor_align_offset(ctx, src0), - .size = src0_binding_size } + .size = ggml_webgpu_tensor_binding_size(ctx, src0) }, + { .binding = 1, + .buffer = ggml_webgpu_tensor_buf(src1), + .offset = ggml_webgpu_tensor_align_offset(ctx, src1), + .size = ggml_webgpu_tensor_binding_size(ctx, src1) } }; - uint32_t binding_num = 1; - if (!overlap) { - entries.push_back({ .binding = binding_num, - .buffer = ggml_webgpu_tensor_buf(src1), - .offset = ggml_webgpu_tensor_align_offset(ctx, src1), - .size = ggml_webgpu_tensor_binding_size(ctx, src1) }); - binding_num++; - } if (!inplace) { - entries.push_back({ .binding = binding_num, + entries.push_back({ 
.binding = 2, .buffer = ggml_webgpu_tensor_buf(dst), .offset = ggml_webgpu_tensor_align_offset(ctx, dst), .size = ggml_webgpu_tensor_binding_size(ctx, dst) }); } - size_t max_wg_size = ctx->max_wg_size_x; - uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size; + size_t max_wg_size = ctx->max_wg_size_x; + uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size; return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } @@ -1263,13 +1232,25 @@ static std::optional ggml_webgpu_encode_node(webgpu_context ctx, case GGML_OP_MUL_MAT: return ggml_webgpu_mul_mat(ctx, src0, src1, node); case GGML_OP_ADD: - return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_pipeline); + { + int inplace = ggml_webgpu_tensor_equal(src0, node); + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_pipeline[node->type][inplace], inplace); + } case GGML_OP_SUB: - return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->sub_pipeline); + { + int inplace = ggml_webgpu_tensor_equal(src0, node); + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->sub_pipeline[node->type][inplace], inplace); + } case GGML_OP_MUL: - return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_pipeline); + { + int inplace = ggml_webgpu_tensor_equal(src0, node); + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_pipeline[node->type][inplace], inplace); + } case GGML_OP_DIV: - return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->div_pipeline); + { + int inplace = ggml_webgpu_tensor_equal(src0, node); + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->div_pipeline[node->type][inplace], inplace); + } case GGML_OP_RMS_NORM: return ggml_webgpu_rms_norm(ctx, src0, node); case GGML_OP_ROPE: @@ -1719,82 +1700,50 @@ static void ggml_webgpu_init_cpy_pipeline(webgpu_context & webgpu_ctx) { static void ggml_webgpu_init_add_pipeline(webgpu_context & webgpu_ctx) { std::vector constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x); - 
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][0][0], wgsl_add_f32, - "add_f32", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][0][0], wgsl_add_f16, - "add_f16", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][1][0], wgsl_add_f32_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][0], wgsl_add_f32, "add_f32", + constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][0], wgsl_add_f16, "add_f16", + constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][1], wgsl_add_f32_inplace, "add_f32_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][1][0], wgsl_add_f16_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][1], wgsl_add_f16_inplace, "add_f16_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][0][1], wgsl_add_f32_overlap, - "add_f32_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F32][1][1], - wgsl_add_f32_inplace_overlap, "add_f32_inplace_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][0][1], wgsl_add_f16_overlap, - "add_f16_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->add_pipeline[GGML_TYPE_F16][1][1], - wgsl_add_f16_inplace_overlap, "add_f16_inplace_overlap", constants); } static void ggml_webgpu_init_sub_pipeline(webgpu_context & webgpu_ctx) { std::vector constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][0][0], wgsl_sub_f32, - "sub_f32", constants); - 
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][0][0], wgsl_sub_f16, - "sub_f16", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][1][0], wgsl_sub_f32_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][0], wgsl_sub_f32, "sub_f32", + constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][0], wgsl_sub_f16, "sub_f16", + constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][1], wgsl_sub_f32_inplace, "sub_f32_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][1][0], wgsl_sub_f16_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][1], wgsl_sub_f16_inplace, "sub_f16_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][0][1], wgsl_sub_f32_overlap, - "sub_f32_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F32][1][1], - wgsl_sub_f32_inplace_overlap, "sub_f32_inplace_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][0][1], wgsl_sub_f16_overlap, - "sub_f16_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->sub_pipeline[GGML_TYPE_F16][1][1], - wgsl_sub_f16_inplace_overlap, "sub_f16_inplace_overlap", constants); } static void ggml_webgpu_init_mul_pipeline(webgpu_context & webgpu_ctx) { std::vector constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][0][0], wgsl_mul_f32, - "mul_f32", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][0][0], wgsl_mul_f16, - "mul_f16", constants); - 
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][1][0], wgsl_mul_f32_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][0], wgsl_mul_f32, "mul_f32", + constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][0], wgsl_mul_f16, "mul_f16", + constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][1], wgsl_mul_f32_inplace, "mul_f32_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][1][0], wgsl_mul_f16_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][1], wgsl_mul_f16_inplace, "mul_f16_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][0][1], wgsl_mul_f32_overlap, - "mul_f32_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F32][1][1], - wgsl_mul_f32_inplace_overlap, "mul_f32_inplace_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][0][1], wgsl_mul_f16_overlap, - "mul_f16_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_pipeline[GGML_TYPE_F16][1][1], - wgsl_mul_f16_inplace_overlap, "mul_f16_inplace_overlap", constants); } static void ggml_webgpu_init_div_pipeline(webgpu_context & webgpu_ctx) { std::vector constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][0][0], wgsl_div_f32, - "div_f32", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][0][0], wgsl_div_f16, - "div_f16", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][1][0], wgsl_div_f32_inplace, + 
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][0], wgsl_div_f32, "div_f32", + constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][0], wgsl_div_f16, "div_f16", + constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][1], wgsl_div_f32_inplace, "div_f32_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][1][0], wgsl_div_f16_inplace, + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][1], wgsl_div_f16_inplace, "div_f16_inplace", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][0][1], wgsl_div_f32_overlap, - "div_f32_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F32][1][1], - wgsl_div_f32_inplace_overlap, "div_f32_inplace_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][0][1], wgsl_div_f16_overlap, - "div_f16_overlap", constants); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->div_pipeline[GGML_TYPE_F16][1][1], - wgsl_div_f16_inplace_overlap, "div_f16_inplace_overlap", constants); } static void ggml_webgpu_init_rms_norm_pipeline(webgpu_context & webgpu_ctx) { @@ -2203,9 +2152,9 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t // TODO: Don't enable for WASM builds, they won't have an effect anyways // TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these, // only for native performance? 
- const char * const deviceEnabledToggles[] = { "disable_robustness", "disable_workgroup_init", - "disable_polyfills_on_integer_div_and_mod" }; - const char * const deviceDisabledToggles[] = { "timestamp_quantization" }; + const char * const deviceEnabledToggles[] = { "skip_validation", "disable_robustness", "disable_workgroup_init", + "disable_polyfills_on_integer_div_and_mod" }; + const char * const deviceDisabledToggles[] = { "timestamp_quantization" }; wgpu::DawnTogglesDescriptor deviceTogglesDesc; deviceTogglesDesc.enabledToggles = deviceEnabledToggles; deviceTogglesDesc.enabledToggleCount = 4; diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl index 5143a1bbf17f8..1ce4d83fa8e50 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl @@ -5,354 +5,129 @@ "SHADER_NAME": "add_f32", "REPLS": { "TYPE" : "f32", - "SRC1_BUF": "src1", - "DST_BUF": "dst", - "OP": "+", - "PARAMS_BINDING": 3 + "OP": "+" }, "DECLS": ["NOT_INPLACE"] }, - { - "SHADER_NAME": "add_f32_inplace", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src1", - "DST_BUF": "src0", - "OP": "+", - "PARAMS_BINDING": 2 - }, - "DECLS": ["INPLACE"] - }, - { - "SHADER_NAME": "add_f32_overlap", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src0", - "DST_BUF": "dst", - "OP": "+", - "PARAMS_BINDING": 2 - }, - "DECLS": ["OVERLAP"] - }, - { - "SHADER_NAME": "add_f32_inplace_overlap", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src0", - "DST_BUF": "src0", - "OP": "+", - "PARAMS_BINDING": 1 - }, - "DECLS": ["INPLACE_OVERLAP"] - }, { "SHADER_NAME": "add_f16", "REPLS": { "TYPE" : "f16", - "SRC1_BUF": "src1", - "DST_BUF": "dst", - "OP": "+", - "PARAMS_BINDING": 3 + "OP": "+" }, "DECLS": ["NOT_INPLACE"] }, { - "SHADER_NAME": "add_f16_inplace", + "SHADER_NAME": "add_f32_inplace", "REPLS": { - "TYPE" : "f16", - "SRC1_BUF": "src1", - "DST_BUF": "src0", - "OP": "+", - "PARAMS_BINDING": 2 + "TYPE" 
: "f32", + "OP": "+" }, "DECLS": ["INPLACE"] }, { - "SHADER_NAME": "add_f16_overlap", - "REPLS": { - "TYPE" : "f16", - "SRC1_BUF": "src0", - "DST_BUF": "dst", - "OP": "+", - "PARAMS_BINDING": 2 - }, - "DECLS": ["OVERLAP"] - }, - { - "SHADER_NAME": "add_f16_inplace_overlap", + "SHADER_NAME": "add_f16_inplace", "REPLS": { "TYPE" : "f16", - "SRC1_BUF": "src0", - "DST_BUF": "src0", - "OP": "+", - "PARAMS_BINDING": 1 + "OP": "+" }, - "DECLS": ["INPLACE_OVERLAP"] + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "mul_f32", "REPLS": { "TYPE" : "f32", - "SRC1_BUF": "src1", - "DST_BUF": "dst", - "OP": "*", - "PARAMS_BINDING": 3 + "OP": "*" }, "DECLS": ["NOT_INPLACE"] }, - { - "SHADER_NAME": "mul_f32_inplace", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src1", - "DST_BUF": "src0", - "OP": "*", - "PARAMS_BINDING": 2 - }, - "DECLS": ["INPLACE"] - }, - { - "SHADER_NAME": "mul_f32_overlap", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src0", - "DST_BUF": "dst", - "OP": "*", - "PARAMS_BINDING": 2 - }, - "DECLS": ["OVERLAP"] - }, - { - "SHADER_NAME": "mul_f32_inplace_overlap", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src0", - "DST_BUF": "src0", - "OP": "*", - "PARAMS_BINDING": 1 - }, - "DECLS": ["INPLACE_OVERLAP"] - }, { "SHADER_NAME": "mul_f16", "REPLS": { "TYPE" : "f16", - "SRC1_BUF": "src1", - "DST_BUF": "dst", - "OP": "*", - "PARAMS_BINDING": 3 + "OP": "*" }, "DECLS": ["NOT_INPLACE"] }, { - "SHADER_NAME": "mul_f16_inplace", + "SHADER_NAME": "mul_f32_inplace", "REPLS": { - "TYPE" : "f16", - "SRC1_BUF": "src1", - "DST_BUF": "src0", - "OP": "*", - "PARAMS_BINDING": 2 + "TYPE" : "f32", + "OP": "*" }, "DECLS": ["INPLACE"] }, { - "SHADER_NAME": "mul_f16_overlap", - "REPLS": { - "TYPE" : "f16", - "SRC1_BUF": "src0", - "DST_BUF": "dst", - "OP": "*", - "PARAMS_BINDING": 2 - }, - "DECLS": ["OVERLAP"] - }, - { - "SHADER_NAME": "mul_f16_inplace_overlap", + "SHADER_NAME": "mul_f16_inplace", "REPLS": { "TYPE" : "f16", - "SRC1_BUF": "src0", - "DST_BUF": "src0", - "OP": "*", - 
"PARAMS_BINDING": 1 + "OP": "*" }, - "DECLS": ["INPLACE_OVERLAP"] + "DECLS": ["INPLACE"] }, { "SHADER_NAME": "sub_f32", "REPLS": { "TYPE" : "f32", - "SRC1_BUF": "src1", - "DST_BUF": "dst", - "OP": "-", - "PARAMS_BINDING": 3 + "OP": "-" }, "DECLS": ["NOT_INPLACE"] }, - { - "SHADER_NAME": "sub_f32_inplace", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src1", - "DST_BUF": "src0", - "OP": "-", - "PARAMS_BINDING": 2 - }, - "DECLS": ["INPLACE"] - }, - { - "SHADER_NAME": "sub_f32_overlap", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src0", - "DST_BUF": "dst", - "OP": "-", - "PARAMS_BINDING": 2 - }, - "DECLS": ["OVERLAP"] - }, - { - "SHADER_NAME": "sub_f32_inplace_overlap", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src0", - "DST_BUF": "src0", - "OP": "-", - "PARAMS_BINDING": 1 - }, - "DECLS": ["INPLACE_OVERLAP"] - }, { "SHADER_NAME": "sub_f16", "REPLS": { "TYPE" : "f16", - "SRC1_BUF": "src1", - "DST_BUF": "dst", - "OP": "-", - "PARAMS_BINDING": 3 + "OP": "-" }, "DECLS": ["NOT_INPLACE"] }, { - "SHADER_NAME": "sub_f16_inplace", + "SHADER_NAME": "sub_f32_inplace", "REPLS": { - "TYPE" : "f16", - "SRC1_BUF": "src1", - "DST_BUF": "src0", - "OP": "-", - "PARAMS_BINDING": 2 + "TYPE" : "f32", + "OP": "-" }, "DECLS": ["INPLACE"] }, { - "SHADER_NAME": "sub_f16_overlap", - "REPLS": { - "TYPE" : "f16", - "SRC1_BUF": "src0", - "DST_BUF": "dst", - "OP": "-", - "PARAMS_BINDING": 2 - }, - "DECLS": ["OVERLAP"] - }, - { - "SHADER_NAME": "sub_f16_inplace_overlap", + "SHADER_NAME": "sub_f16_inplace", "REPLS": { "TYPE" : "f16", - "SRC1_BUF": "src0", - "DST_BUF": "src0", - "OP": "-", - "PARAMS_BINDING": 1 + "OP": "-" }, - "DECLS": ["INPLACE_OVERLAP"] + "DECLS": ["INPLACE"] }, - { "SHADER_NAME": "div_f32", "REPLS": { "TYPE" : "f32", - "SRC1_BUF": "src1", - "DST_BUF": "dst", - "OP": "/", - "PARAMS_BINDING": 3 + "OP": "/" }, "DECLS": ["NOT_INPLACE"] }, - { - "SHADER_NAME": "div_f32_inplace", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src1", - "DST_BUF": "src0", - "OP": "/", - 
"PARAMS_BINDING": 2 - }, - "DECLS": ["INPLACE"] - }, - { - "SHADER_NAME": "div_f32_overlap", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src0", - "DST_BUF": "dst", - "OP": "/", - "PARAMS_BINDING": 2 - }, - "DECLS": ["OVERLAP"] - }, - { - "SHADER_NAME": "div_f32_inplace_overlap", - "REPLS": { - "TYPE" : "f32", - "SRC1_BUF": "src0", - "DST_BUF": "src0", - "OP": "/", - "PARAMS_BINDING": 1 - }, - "DECLS": ["INPLACE_OVERLAP"] - }, { "SHADER_NAME": "div_f16", "REPLS": { "TYPE" : "f16", - "SRC1_BUF": "src1", - "DST_BUF": "dst", - "OP": "/", - "PARAMS_BINDING": 3 + "OP": "/" }, "DECLS": ["NOT_INPLACE"] }, { - "SHADER_NAME": "div_f16_inplace", + "SHADER_NAME": "div_f32_inplace", "REPLS": { - "TYPE" : "f16", - "SRC1_BUF": "src1", - "DST_BUF": "src0", - "OP": "/", - "PARAMS_BINDING": 2 + "TYPE" : "f32", + "OP": "/" }, "DECLS": ["INPLACE"] }, { - "SHADER_NAME": "div_f16_overlap", - "REPLS": { - "TYPE" : "f16", - "SRC1_BUF": "src0", - "DST_BUF": "dst", - "OP": "/", - "PARAMS_BINDING": 2 - }, - "DECLS": ["OVERLAP"] - }, - { - "SHADER_NAME": "div_f16_inplace_overlap", + "SHADER_NAME": "div_f16_inplace", "REPLS": { "TYPE" : "f16", - "SRC1_BUF": "src0", - "DST_BUF": "src0", - "OP": "/", - "PARAMS_BINDING": 1 + "OP": "/" }, - "DECLS": ["INPLACE_OVERLAP"] + "DECLS": ["INPLACE"] } ] @@ -362,89 +137,43 @@ #decl(NOT_INPLACE) -@group(0) @binding(1) -var src1: array<{{TYPE}}>; +fn update(dst_i: u32, src0_i: u32, src1_i: u32) { + dst[dst_i] = src0[src0_i] {{OP}} src1[src1_i]; +} @group(0) @binding(2) var dst: array<{{TYPE}}>; +@group(0) @binding(3) +var params: Params; + #enddecl(NOT_INPLACE) #decl(INPLACE) -@group(0) @binding(1) -var src1: array<{{TYPE}}>; - -#enddecl(INPLACE) - -#decl(OVERLAP) - -@group(0) @binding(1) -var dst: array<{{TYPE}}>; - -#enddecl(OVERLAP) +fn update(dst_i: u32, src0_i: u32, src1_i: u32) { + src0[dst_i] = src0[src0_i] {{OP}} src1[src1_i]; +} -#decl(INPLACE_OVERLAP) +@group(0) @binding(2) +var params: Params; -#enddecl(INPLACE_OVERLAP) +#enddecl(INPLACE) 
#end(DECLS) + #define(SHADER) enable f16; -struct Params { - ne: u32, - - // offsets in elements - offset_src0: u32, - offset_src1: u32, - offset_dst: u32, - - stride_src1_0: u32, - stride_src1_1: u32, - stride_src1_2: u32, - stride_src1_3: u32, - - a_ne0: u32, - a_ne1: u32, - a_ne2: u32, - - b_ne0: u32, - b_ne1: u32, - b_ne2: u32, - b_ne3: u32, -}; - -fn src1_index(_i: u32) -> u32 { - var i = _i; - let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0); - i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0); - let a_i2 = i / (params.a_ne1 * params.a_ne0); - i = i % (params.a_ne1 * params.a_ne0); - let a_i1 = i / params.a_ne0; - let a_i0 = i % params.a_ne0; - - // handle repetition of b - // index loops back to the beginning and repeats after elements are exhausted = modulo - let b_i0 = a_i0 % params.b_ne0; - let b_i1 = a_i1 % params.b_ne1; - let b_i2 = a_i2 % params.b_ne2; - let b_i3 = a_i3 % params.b_ne3; - - // compute index for position in b's flat array - return b_i0 * params.stride_src1_0 + - b_i1 * params.stride_src1_1 + - b_i2 * params.stride_src1_2 + - b_i3 * params.stride_src1_3; -} +#include "binary_head.tmpl" @group(0) @binding(0) var src0: array<{{TYPE}}>; -@group(0) @binding({{PARAMS_BINDING}}) -var params: Params; +@group(0) @binding(1) +var src1: array<{{TYPE}}>; DECLS @@ -452,7 +181,7 @@ override wg_size: u32; @compute @workgroup_size(wg_size) fn main(@builtin(global_invocation_id) gid: vec3) { if (gid.x < params.ne) { - {{DST_BUF}}[params.offset_dst + gid.x] = src0[params.offset_src0 + gid.x] {{OP}} {{SRC1_BUF}}[params.offset_src1 + src1_index(gid.x)]; + update(params.offset_dst + gid.x, params.offset_src0 + gid.x, params.offset_src1 + src1_index(gid.x)); } } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl b/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl new file mode 100644 index 0000000000000..4b254f468d69e --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl @@ -0,0 +1,45 @@ +struct Params { + ne: u32, 
+ + // offsets in elements + offset_src0: u32, + offset_src1: u32, + offset_dst: u32, + + stride_src1_0: u32, + stride_src1_1: u32, + stride_src1_2: u32, + stride_src1_3: u32, + + a_ne0: u32, + a_ne1: u32, + a_ne2: u32, + + b_ne0: u32, + b_ne1: u32, + b_ne2: u32, + b_ne3: u32, +}; + +fn src1_index(_i: u32) -> u32 { + var i = _i; + let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0); + i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0); + let a_i2 = i / (params.a_ne1 * params.a_ne0); + i = i % (params.a_ne1 * params.a_ne0); + let a_i1 = i / params.a_ne0; + let a_i0 = i % params.a_ne0; + + // handle repetition of b + // index loops back to the beginning and repeats after elements are exhausted = modulo + let b_i0 = a_i0 % params.b_ne0; + let b_i1 = a_i1 % params.b_ne1; + let b_i2 = a_i2 % params.b_ne2; + let b_i3 = a_i3 % params.b_ne3; + + // compute index for position in b's flat array + return b_i0 * params.stride_src1_0 + + b_i1 * params.stride_src1_1 + + b_i2 * params.stride_src1_2 + + b_i3 * params.stride_src1_3; +} diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 0d2cbc530f5fb..04fa1b62d3b4d 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -4840,7 +4840,7 @@ struct test_moe_expert_reduce : public test_case { std::vector expert_views(n_expert_used); for (int64_t i = 0; i < n_expert_used; ++i) { - expert_views[i] = ggml_view_2d(ctx, weighted, n_embd, n_tokens, weighted->nb[1], i * weighted->nb[1]); + expert_views[i] = ggml_view_2d(ctx, weighted, n_embd, n_tokens, weighted->nb[2], i * weighted->nb[1]); std::string name = "expert_view_" + std::to_string(i); ggml_set_name(expert_views[i], name.c_str()); From 9a029e48a28248d6b11e4fd3bc6cd2dfa8d33bcf Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Sat, 1 Nov 2025 21:36:21 -0700 Subject: [PATCH 7/7] Disable support for non-contiguous binary_op tensors and leave note for future support --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 4 +++- 1 file changed, 3 
insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 70e3013537b2d..1a15756731580 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -1969,8 +1969,10 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const case GGML_OP_SUB: case GGML_OP_MUL: case GGML_OP_DIV: + // TODO: support non-contiguous tensors, e.g. for MOE_EXPERT_REDUCE + // see https://github.com/ggml-org/llama.cpp/pull/16857 supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type) && - (src1->type == op->type); + (src1->type == op->type) && ggml_is_contiguous(src0) && ggml_is_contiguous(src1); break; case GGML_OP_CPY: case GGML_OP_CONT: