From 5ae936a3bcdb6d3e950fb22b6df523ae9dbcd8ac Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 1 Oct 2025 09:34:25 -0700 Subject: [PATCH 01/15] Add profiling --- ggml/CMakeLists.txt | 2 + ggml/src/ggml-webgpu/CMakeLists.txt | 4 + ggml/src/ggml-webgpu/ggml-webgpu.cpp | 216 ++++++++++++++++++++++----- 3 files changed, 187 insertions(+), 35 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 6ce52ffc6698b..c4f83c6e88d7e 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -222,6 +222,8 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) option(GGML_WEBGPU "ggml: use WebGPU" OFF) option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF) +option(GGML_WEBGPU_PROFILE "ggml: enable WebGPU performance profiling" OFF) + option(GGML_ZDNN "ggml: use zDNN" OFF) option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF) diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index 78a985a4d167a..0f6d65e58035f 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -50,5 +50,9 @@ if (GGML_WEBGPU_DEBUG) target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_DEBUG=1) endif() +if (GGML_WEBGPU_PROFILE) + target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_PROFILE=1) +endif() + target_include_directories(ggml-webgpu PRIVATE ${SHADER_OUTPUT_DIR}) target_link_libraries(ggml-webgpu PRIVATE ${DawnWebGPU_TARGET}) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index e795ca3fd92fd..a8d4e058c7a8f 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -25,6 +25,19 @@ # define WEBGPU_LOG_DEBUG(msg) ((void) 0) #endif // GGML_WEBGPU_DEBUG +#ifdef GGML_WEBGPU_PROFILE +# define WEBGPU_CPU_PROFILE_START(id) auto cpu_start_##id = std::chrono::high_resolution_clock::now(); +# define WEBGPU_CPU_PROFILE_END(id) \ + auto cpu_end_##id = std::chrono::high_resolution_clock::now(); \ + double cpu_time_##id = std::chrono::duration(cpu_end_##id - cpu_start_##id).count(); \ + ctx->cpu_time_ms[#id] += cpu_time_##id; +# define WEBGPU_NUM_TIMESTAMP_QUERY_BUFS 100 +# define WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES 16 // e.g. 
enough for two timestamps +#else +# define WEBGPU_CPU_PROFILE_START(id) ((void) 0) +# define WEBGPU_CPU_PROFILE_END(id) ((void) 0) +#endif // GGML_WEBGPU_PROFILE + /* Constants */ #define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 16 @@ -112,6 +125,11 @@ struct webgpu_buf_pool { } }; +struct webgpu_pipeline { + wgpu::ComputePipeline pipeline; + std::string name; +}; + // All the base objects needed to run operations on a WebGPU device struct webgpu_context_struct { wgpu::Instance instance; @@ -129,21 +147,21 @@ struct webgpu_context_struct { webgpu_buf_pool param_buf_pool; webgpu_buf_pool set_rows_error_buf_pool; - wgpu::ComputePipeline memset_pipeline; - wgpu::ComputePipeline mul_mat_pipeline[30][2]; - wgpu::ComputePipeline set_rows_pipeline; - wgpu::ComputePipeline get_rows_pipeline[30]; - wgpu::ComputePipeline get_rows_f32_no_vec_pipeline; - wgpu::ComputePipeline cpy_pipeline[2][2]; // src type, dst type - wgpu::ComputePipeline add_pipeline[2][2]; // type, inplace - wgpu::ComputePipeline sub_pipeline[2][2]; // type, inplace - wgpu::ComputePipeline mul_pipeline[2][2]; // type, inplace - wgpu::ComputePipeline div_pipeline[2][2]; // type, inplace - wgpu::ComputePipeline rms_norm_pipeline[2]; // inplace - wgpu::ComputePipeline rope_pipeline[2][2][2]; // type, ff, inplace - wgpu::ComputePipeline glu_pipeline[7][2][2]; // glu-op, type, split - wgpu::ComputePipeline scale_pipeline[2]; // inplace - wgpu::ComputePipeline soft_max_pipeline[3][2][2]; // (no_mask, f32_mask, f16_mask), has_sink, inplace + webgpu_pipeline memset_pipeline; + webgpu_pipeline mul_mat_pipeline[30][2]; + webgpu_pipeline set_rows_pipeline; + webgpu_pipeline get_rows_pipeline[30]; + webgpu_pipeline get_rows_f32_no_vec_pipeline; + webgpu_pipeline cpy_pipeline[2][2]; // src type, dst type + webgpu_pipeline add_pipeline[2][2]; // type, inplace + webgpu_pipeline sub_pipeline[2][2]; // type, inplace + webgpu_pipeline mul_pipeline[2][2]; // type, inplace + webgpu_pipeline div_pipeline[2][2]; // type, inplace + webgpu_pipeline rms_norm_pipeline[2]; // inplace + webgpu_pipeline rope_pipeline[2][2][2]; // type, ff, inplace + webgpu_pipeline glu_pipeline[7][2][2]; // glu-op, type, split + webgpu_pipeline scale_pipeline[2]; // inplace + webgpu_pipeline soft_max_pipeline[3][2][2]; // (no_mask, f32_mask, f16_mask), has_sink, inplace size_t memset_bytes_per_thread; @@ -164,6 +182,20 @@ struct webgpu_context_struct { wgpu::Buffer debug_host_buf; wgpu::Buffer debug_dev_buf; #endif + +#ifdef GGML_WEBGPU_PROFILE + // Profiling: per-shader GPU time in ms + std::unordered_map shader_gpu_time_ms; + + // Profiling: labeled CPU time in ms + std::unordered_map cpu_time_ms; + + // Profiling: pool of timestamp query buffers (one per operation) + webgpu_buf_pool timestamp_query_buf_pool; + + // Profiling: staged timestamp buffers and their labels for batch submission + std::vector> staged_timestamp_bufs; +#endif }; typedef std::shared_ptr webgpu_context; @@ -199,7 +231,7 @@ struct ggml_backend_webgpu_buffer_context { /* WebGPU object initializations */ static void ggml_webgpu_create_pipeline(wgpu::Device & device, - wgpu::ComputePipeline & pipeline, + webgpu_pipeline & pipeline, const char * shader_code, const char * label, const std::vector & constants = {}) { @@ -222,7 +254,7 @@ static void ggml_webgpu_create_pipeline(wgpu::Device & pipeline_desc.compute.constants = constants.data(); pipeline_desc.compute.constantCount = constants.size(); } - pipeline = device.CreateComputePipeline(&pipeline_desc); + pipeline = { device.CreateComputePipeline(&pipeline_desc), 
label }; } static void ggml_webgpu_create_buffer(wgpu::Device & device, @@ -271,6 +303,28 @@ static void ggml_backend_webgpu_wait_on_submission(webgpu_context & ctx) { } } +#ifdef GGML_WEBGPU_PROFILE +static wgpu::FutureWaitInfo ggml_backend_webgpu_process_timestamps(webgpu_context & ctx, + webgpu_pool_bufs ts_bufs, + std::string label) { + wgpu::Future f = ts_bufs.host_buf.MapAsync( + wgpu::MapMode::Read, 0, ts_bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous, + [ctx, ts_bufs, label](wgpu::MapAsyncStatus status, wgpu::StringView message) { + if (status != wgpu::MapAsyncStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to map timestamp buffer: %s\n", std::string(message).c_str()); + } else { + const uint64_t * ts_data = (const uint64_t *) ts_bufs.host_buf.GetConstMappedRange(); + // WebGPU timestamps are in ticks; convert to ms using device timestamp period if available + double elapsed_ms = double(ts_data[1] - ts_data[0]) * 1e-6; // TODO: use actual timestamp period + ctx->shader_gpu_time_ms[label] += elapsed_ms; + // We can't unmap in here due to WebGPU reentrancy limitations. + ctx->timestamp_query_buf_pool.free_bufs({ ts_bufs }); + } + }); + return { f }; +} +#endif + static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) { std::lock_guard lock(ctx->mutex); WEBGPU_LOG_DEBUG("ggml_backend_webgpu_submit_queue()"); @@ -295,6 +349,10 @@ static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) { std::vector staged_param_bufs = std::move(ctx->staged_param_bufs); std::vector staged_set_row_error_bufs = std::move(ctx->staged_set_row_error_bufs); +#ifdef GGML_WEBGPU_PROFILE + std::vector> staged_timestamp_bufs = std::move(ctx->staged_timestamp_bufs); +#endif + // Free the staged parameter buffers once the submission completes wgpu::Future p_f = ctx->queue.OnSubmittedWorkDone( wgpu::CallbackMode::AllowSpontaneous, @@ -325,6 +383,14 @@ static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) { }); ctx->callback_futures.push_back({ f }); } + +#ifdef GGML_WEBGPU_PROFILE + for (auto & pair : staged_timestamp_bufs) { + auto & ts_bufs = pair.first; + const std::string & label = pair.second; + ctx->callback_futures.push_back(ggml_backend_webgpu_process_timestamps(ctx, ts_bufs, label)); + } +#endif } static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx, @@ -365,7 +431,7 @@ static void ggml_backend_webgpu_debug(webgpu_context & ctx) { #endif static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & ctx, - wgpu::ComputePipeline & pipeline, + webgpu_pipeline & pipeline, std::vector params, std::vector bind_group_entries, uint32_t wg_x, @@ -388,7 +454,7 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & .size = params_bufs.dev_buf.GetSize() }); wgpu::BindGroupDescriptor bind_group_desc; - bind_group_desc.layout = pipeline.GetBindGroupLayout(0); + bind_group_desc.layout = pipeline.pipeline.GetBindGroupLayout(0); bind_group_desc.entryCount = bind_group_entries.size(); bind_group_desc.entries = bind_group_entries.data(); if (bind_group_label) { @@ -398,11 +464,40 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder(); encoder.CopyBufferToBuffer(params_bufs.host_buf, 0, params_bufs.dev_buf, 0, params_bufs.dev_buf.GetSize()); + +#ifdef GGML_WEBGPU_PROFILE + // --- Profiling: GPU timestamp queries --- + // Allocate a timestamp query buffer (2 timestamps: start/end) + webgpu_pool_bufs ts_bufs = ctx->timestamp_query_buf_pool.alloc_bufs(); + if 
(ts_bufs.host_buf.GetMapState() == wgpu::BufferMapState::Mapped) { + ts_bufs.host_buf.Unmap(); + } + + // Create a query set for 2 timestamps + wgpu::QuerySetDescriptor ts_query_set_desc = {}; + ts_query_set_desc.type = wgpu::QueryType::Timestamp; + ts_query_set_desc.count = 2; + wgpu::QuerySet ts_query_set = ctx->device.CreateQuerySet(&ts_query_set_desc); + + wgpu::PassTimestampWrites ts_writes = { .querySet = ts_query_set, + .beginningOfPassWriteIndex = 0, + .endOfPassWriteIndex = 1 }; + wgpu::ComputePassDescriptor pass_desc = { .timestampWrites = &ts_writes }; + wgpu::ComputePassEncoder pass = encoder.BeginComputePass(&pass_desc); +#else wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); - pass.SetPipeline(pipeline); +#endif + pass.SetPipeline(pipeline.pipeline); pass.SetBindGroup(0, bind_group); pass.DispatchWorkgroups(wg_x, 1, 1); pass.End(); + +#ifdef GGML_WEBGPU_PROFILE + // Resolve the query set into the device buffer + encoder.ResolveQuerySet(ts_query_set, 0, 2, ts_bufs.dev_buf, 0); + encoder.CopyBufferToBuffer(ts_bufs.dev_buf, 0, ts_bufs.host_buf, 0, ts_bufs.host_buf.GetSize()); +#endif + wgpu::CommandBuffer commands = encoder.Finish(); if (submit_and_wait) { // Submit and wait immediately @@ -416,12 +511,20 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & ctx->param_buf_pool.free_bufs({ params_bufs }); }), UINT64_MAX); +#ifdef GGML_WEBGPU_PROFILE + ctx->instance.WaitAny(ggml_backend_webgpu_process_timestamps(ctx, ts_bufs, pipeline.name).future, UINT64_MAX); +#endif } else { // Lock the context mutex when pushing to the staging vectors. std::lock_guard lock(ctx->mutex); // Enqueue commands and only submit if we have enough staged commands ctx->staged_command_bufs.push_back(commands); ctx->staged_param_bufs.push_back(params_bufs); +#ifdef GGML_WEBGPU_PROFILE + // Store timestamp buffer and label for later processing + ctx->staged_timestamp_bufs.push_back(std::make_pair(ts_bufs, pipeline.name)); +#endif + if (ctx->staged_command_bufs.size() == WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) { ggml_backend_webgpu_submit_queue(ctx); ggml_backend_webgpu_wait_on_submission(ctx); @@ -456,8 +559,33 @@ static void ggml_backend_webgpu_free(ggml_backend_t backend) { ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *) backend->context; WEBGPU_LOG_DEBUG("ggml_backend_webgpu_free(" << ctx->name << ")"); - // TODO: cleanup +#ifdef GGML_WEBGPU_PROFILE + // Print detailed profiling information + std::cout << "\n[ggml_webgpu profiling summary]\n"; + double total_gpu = 0.0; + for (const auto & kv : ctx->webgpu_ctx->shader_gpu_time_ms) { + total_gpu += kv.second; + } + double total_cpu = 0.0; + for (const auto & kv : ctx->webgpu_ctx->cpu_time_ms) { + total_cpu += kv.second; + } + std::cout << "ggml_webgpu: total cpu time (all shaders): " << total_cpu << " ms\n"; + std::cout << "ggml_webgpu: total gpu time (all shaders): " << total_gpu << " ms\n"; + std::cout << "ggml_webgpu: gpu/cpu ratio: " << (total_cpu > 0.0 ? total_gpu / total_cpu : 0.0) << "\n"; + std::cout << "ggml_webgpu: cpu breakdown:\n"; + for (const auto & kv : ctx->webgpu_ctx->cpu_time_ms) { + double pct = (total_cpu > 0.0) ? (kv.second / total_cpu * 100.0) : 0.0; + std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << pct << "%)\n"; + } + std::cout << "\nggml_webgpu: gpu breakdown:\n"; + for (const auto & kv : ctx->webgpu_ctx->shader_gpu_time_ms) { + double pct = (total_gpu > 0.0) ? 
(kv.second / total_gpu * 100.0) : 0.0; + std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << pct << "%)\n"; + } +#else GGML_UNUSED(ctx); +#endif } static size_t ggml_webgpu_tensor_offset(const ggml_tensor * tensor) { @@ -610,7 +738,7 @@ static void ggml_webgpu_get_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t size_t max_wg_size = ctx->max_wg_size_x; uint32_t wg_x = (dst->ne[1] * dst->ne[2] * dst->ne[3] + max_wg_size - 1) / max_wg_size; - wgpu::ComputePipeline pipeline = ctx->get_rows_pipeline[src->type]; + webgpu_pipeline pipeline = ctx->get_rows_pipeline[src->type]; if (src->type == GGML_TYPE_F32 && dst->ne[0] % 4 != 0) { pipeline = ctx->get_rows_f32_no_vec_pipeline; } @@ -658,12 +786,12 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t ggml_op_name(dst->op)); } -static void ggml_webgpu_binary_op(webgpu_context & ctx, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * dst, - wgpu::ComputePipeline & pipeline, - bool inplace) { +static void ggml_webgpu_binary_op(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * dst, + webgpu_pipeline & pipeline, + bool inplace) { std::vector params = { (uint32_t) ggml_nelements(dst), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)), @@ -822,9 +950,9 @@ static void ggml_webgpu_rope(webgpu_context & ctx, .size = ggml_webgpu_tensor_binding_size(ctx, dst) }); } - wgpu::ComputePipeline pipeline = ctx->rope_pipeline[dst->type][has_freq_factor][inplace]; - size_t max_wg_size = ctx->max_wg_size_x; - uint32_t wg_x = (ggml_nelements(src0) / 2 + max_wg_size - 1) / max_wg_size; + webgpu_pipeline pipeline = ctx->rope_pipeline[dst->type][has_freq_factor][inplace]; + size_t max_wg_size = ctx->max_wg_size_x; + uint32_t wg_x = (ggml_nelements(src0) / 2 + max_wg_size - 1) / max_wg_size; ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op)); } @@ -875,9 +1003,9 @@ static void ggml_webgpu_glu(webgpu_context & ctx, ggml_tensor * src0, ggml_tenso .offset = ggml_webgpu_tensor_align_offset(ctx, dst), .size = ggml_webgpu_tensor_binding_size(ctx, dst) }); - wgpu::ComputePipeline pipeline = ctx->glu_pipeline[ggml_get_glu_op(dst)][dst->type][split]; - size_t max_wg_size = ctx->max_wg_size_x; - uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size; + webgpu_pipeline pipeline = ctx->glu_pipeline[ggml_get_glu_op(dst)][dst->type][split]; + size_t max_wg_size = ctx->max_wg_size_x; + uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size; ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op)); } @@ -1076,6 +1204,9 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str ggml_backend_webgpu_context * backend_ctx = static_cast(backend->context); webgpu_context ctx = backend_ctx->webgpu_ctx; + // --- Profiling: CPU timing for graph compute --- + WEBGPU_CPU_PROFILE_START(graph_compute); + for (int i = 0; i < cgraph->n_nodes; i++) { ggml_webgpu_encode_node(ctx, cgraph->nodes[i]); } @@ -1083,6 +1214,8 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str ggml_backend_webgpu_submit_queue(ctx); ggml_backend_webgpu_wait_on_submission(ctx); + WEBGPU_CPU_PROFILE_END(graph_compute); + return GGML_STATUS_SUCCESS; } @@ -1902,7 +2035,11 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t // Initialize device std::vector required_features = { 
wgpu::FeatureName::ShaderF16, wgpu::FeatureName::ImplicitDeviceSynchronization }; - wgpu::DeviceDescriptor dev_desc; +#ifdef GGML_WEBGPU_PROFILE + required_features.push_back(wgpu::FeatureName::TimestampQuery); +#endif + + wgpu::DeviceDescriptor dev_desc; dev_desc.requiredLimits = &ctx->limits; dev_desc.requiredFeatures = required_features.data(); dev_desc.requiredFeatureCount = required_features.size(); @@ -1939,6 +2076,15 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t ctx->param_buf_pool.init(ctx->device, WEBGPU_NUM_PARAM_BUFS, WEBGPU_PARAMS_BUF_SIZE_BYTES, wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform, wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite); + +#ifdef GGML_WEBGPU_PROFILE + // Initialize buffer pool for timestamp queries (profiling) + ctx->timestamp_query_buf_pool.init(ctx->device, WEBGPU_NUM_TIMESTAMP_QUERY_BUFS, + WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES, + wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc, + wgpu::BufferUsage::MapRead | wgpu::BufferUsage::CopyDst); +#endif + ctx->set_rows_error_buf_pool.init(ctx->device, WEBGPU_NUM_SET_ROWS_ERROR_BUFS, WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES, wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::Storage, wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead); From bd3d0807dc340621b3dee85b5416a48f84833a01 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 1 Oct 2025 10:53:30 -0700 Subject: [PATCH 02/15] More detailed profiling --- ggml/CMakeLists.txt | 3 +- ggml/src/ggml-webgpu/CMakeLists.txt | 8 +- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 145 +++++++++++++++++++-------- 3 files changed, 113 insertions(+), 43 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index c4f83c6e88d7e..67d02e73f6767 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -222,7 +222,8 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) option(GGML_WEBGPU "ggml: use WebGPU" OFF) option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF) -option(GGML_WEBGPU_PROFILE "ggml: enable WebGPU performance profiling" OFF) +option(GGML_WEBGPU_CPU_PROFILE "ggml: enable WebGPU profiling (CPU)" OFF) +option(GGML_WEBGPU_GPU_PROFILE "ggml: enable WebGPU profiling (GPU)" OFF) option(GGML_ZDNN "ggml: use zDNN" OFF) option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index 0f6d65e58035f..c6a95d5151245 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -50,8 +50,12 @@ if (GGML_WEBGPU_DEBUG) target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_DEBUG=1) endif() -if (GGML_WEBGPU_PROFILE) - target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_PROFILE=1) +if (GGML_WEBGPU_CPU_PROFILE) + target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_CPU_PROFILE=1) +endif() + +if (GGML_WEBGPU_GPU_PROFILE) + target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_GPU_PROFILE=1) endif() target_include_directories(ggml-webgpu PRIVATE ${SHADER_OUTPUT_DIR}) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index a8d4e058c7a8f..d26c185ff02b8 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -25,18 +25,37 @@ # define WEBGPU_LOG_DEBUG(msg) ((void) 0) #endif // GGML_WEBGPU_DEBUG -#ifdef GGML_WEBGPU_PROFILE -# define WEBGPU_CPU_PROFILE_START(id) auto cpu_start_##id = 
std::chrono::high_resolution_clock::now(); -# define WEBGPU_CPU_PROFILE_END(id) \ - auto cpu_end_##id = std::chrono::high_resolution_clock::now(); \ - double cpu_time_##id = std::chrono::duration(cpu_end_##id - cpu_start_##id).count(); \ - ctx->cpu_time_ms[#id] += cpu_time_##id; +#ifdef GGML_WEBGPU_CPU_PROFILE + // total timing (aggregated) +# define WEBGPU_CPU_PROFILE_TOTAL_START(id) \ + auto cpu_total_start_##id = std::chrono::high_resolution_clock::now(); + +# define WEBGPU_CPU_PROFILE_TOTAL_END(id, ctx) \ + auto cpu_total_end_##id = std::chrono::high_resolution_clock::now(); \ + double cpu_total_time_##id = \ + std::chrono::duration(cpu_total_end_##id - cpu_total_start_##id).count(); \ + (ctx)->cpu_time_ms[#id] += cpu_total_time_##id; + + // fine-grained timing (diagnostics only) +# define WEBGPU_CPU_PROFILE_DETAIL_START(id) \ + auto cpu_detail_start_##id = std::chrono::high_resolution_clock::now(); + +# define WEBGPU_CPU_PROFILE_DETAIL_END(id, ctx) \ + auto cpu_detail_end_##id = std::chrono::high_resolution_clock::now(); \ + double cpu_detail_time_##id = \ + std::chrono::duration(cpu_detail_end_##id - cpu_detail_start_##id).count(); \ + (ctx)->cpu_detail_ms[#id] += cpu_detail_time_##id; +#else +# define WEBGPU_CPU_PROFILE_TOTAL_START(id) +# define WEBGPU_CPU_PROFILE_TOTAL_END(id, ctx) +# define WEBGPU_CPU_PROFILE_DETAIL_START(id) +# define WEBGPU_CPU_PROFILE_DETAIL_END(id, ctx) +#endif // GGML_WEBGPU_CPU_PROFILE + +#ifdef GGML_WEBGPU_GPU_PROFILE # define WEBGPU_NUM_TIMESTAMP_QUERY_BUFS 100 # define WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES 16 // e.g. enough for two timestamps -#else -# define WEBGPU_CPU_PROFILE_START(id) ((void) 0) -# define WEBGPU_CPU_PROFILE_END(id) ((void) 0) -#endif // GGML_WEBGPU_PROFILE +#endif /* Constants */ @@ -183,13 +202,16 @@ struct webgpu_context_struct { wgpu::Buffer debug_dev_buf; #endif -#ifdef GGML_WEBGPU_PROFILE - // Profiling: per-shader GPU time in ms - std::unordered_map shader_gpu_time_ms; - - // Profiling: labeled CPU time in ms +#ifdef GGML_WEBGPU_CPU_PROFILE + // Profiling: labeled CPU time in ms (total) std::unordered_map cpu_time_ms; + // Profiling: detailed CPU time in ms + std::unordered_map cpu_detail_ms; +#endif +#ifdef GGML_WEBGPU_GPU_PROFILE + // Profiling: per-shader GPU time in ms + std::unordered_map shader_gpu_time_ms; // Profiling: pool of timestamp query buffers (one per operation) webgpu_buf_pool timestamp_query_buf_pool; @@ -303,7 +325,7 @@ static void ggml_backend_webgpu_wait_on_submission(webgpu_context & ctx) { } } -#ifdef GGML_WEBGPU_PROFILE +#ifdef GGML_WEBGPU_GPU_PROFILE static wgpu::FutureWaitInfo ggml_backend_webgpu_process_timestamps(webgpu_context & ctx, webgpu_pool_bufs ts_bufs, std::string label) { @@ -347,10 +369,13 @@ static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) { ctx->staged_command_bufs.clear(); std::vector staged_param_bufs = std::move(ctx->staged_param_bufs); + ctx->staged_param_bufs.clear(); std::vector staged_set_row_error_bufs = std::move(ctx->staged_set_row_error_bufs); + ctx->staged_set_row_error_bufs.clear(); -#ifdef GGML_WEBGPU_PROFILE +#ifdef GGML_WEBGPU_GPU_PROFILE std::vector> staged_timestamp_bufs = std::move(ctx->staged_timestamp_bufs); + ctx->staged_timestamp_bufs.clear(); #endif // Free the staged parameter buffers once the submission completes @@ -384,7 +409,7 @@ static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) { ctx->callback_futures.push_back({ f }); } -#ifdef GGML_WEBGPU_PROFILE +#ifdef GGML_WEBGPU_GPU_PROFILE for (auto & pair : staged_timestamp_bufs) { 
auto & ts_bufs = pair.first; const std::string & label = pair.second; @@ -465,19 +490,23 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder(); encoder.CopyBufferToBuffer(params_bufs.host_buf, 0, params_bufs.dev_buf, 0, params_bufs.dev_buf.GetSize()); -#ifdef GGML_WEBGPU_PROFILE +#ifdef GGML_WEBGPU_GPU_PROFILE // --- Profiling: GPU timestamp queries --- // Allocate a timestamp query buffer (2 timestamps: start/end) + WEBGPU_CPU_PROFILE_DETAIL_START(ts_bufs); webgpu_pool_bufs ts_bufs = ctx->timestamp_query_buf_pool.alloc_bufs(); if (ts_bufs.host_buf.GetMapState() == wgpu::BufferMapState::Mapped) { ts_bufs.host_buf.Unmap(); } + WEBGPU_CPU_PROFILE_DETAIL_END(ts_bufs, ctx); // Create a query set for 2 timestamps + WEBGPU_CPU_PROFILE_DETAIL_START(ts_query_set); wgpu::QuerySetDescriptor ts_query_set_desc = {}; ts_query_set_desc.type = wgpu::QueryType::Timestamp; ts_query_set_desc.count = 2; wgpu::QuerySet ts_query_set = ctx->device.CreateQuerySet(&ts_query_set_desc); + WEBGPU_CPU_PROFILE_DETAIL_END(ts_query_set, ctx); wgpu::PassTimestampWrites ts_writes = { .querySet = ts_query_set, .beginningOfPassWriteIndex = 0, @@ -492,10 +521,12 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & pass.DispatchWorkgroups(wg_x, 1, 1); pass.End(); -#ifdef GGML_WEBGPU_PROFILE +#ifdef GGML_WEBGPU_GPU_PROFILE // Resolve the query set into the device buffer + WEBGPU_CPU_PROFILE_DETAIL_START(resolve_query_set); encoder.ResolveQuerySet(ts_query_set, 0, 2, ts_bufs.dev_buf, 0); encoder.CopyBufferToBuffer(ts_bufs.dev_buf, 0, ts_bufs.host_buf, 0, ts_bufs.host_buf.GetSize()); + WEBGPU_CPU_PROFILE_DETAIL_END(resolve_query_set, ctx); #endif wgpu::CommandBuffer commands = encoder.Finish(); @@ -511,8 +542,10 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & ctx->param_buf_pool.free_bufs({ params_bufs }); }), UINT64_MAX); -#ifdef GGML_WEBGPU_PROFILE +#ifdef GGML_WEBGPU_GPU_PROFILE + WEBGPU_CPU_PROFILE_DETAIL_START(submit_wait_process_ts); ctx->instance.WaitAny(ggml_backend_webgpu_process_timestamps(ctx, ts_bufs, pipeline.name).future, UINT64_MAX); + WEBGPU_CPU_PROFILE_DETAIL_END(submit_wait_process_ts, ctx); #endif } else { // Lock the context mutex when pushing to the staging vectors. 
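For readers unfamiliar with the mechanism the hunks above build on, the GPU-side timing reduces to the following standalone sketch (illustrative only, not part of the patch): write two timestamps around a compute pass via a query set, resolve them into a QueryResolve buffer, copy that into a MapRead buffer, and convert the tick delta to milliseconds after mapping. The device/queue/instance handles and the profile_empty_pass wrapper are assumptions made for the sketch; the device must have wgpu::FeatureName::TimestampQuery enabled, and the 1 tick = 1 ns conversion mirrors the TODO left in patch 01.

#include <webgpu/webgpu_cpp.h>
#include <cstdint>

// Sketch: time one (empty) compute pass with WebGPU timestamp queries (Dawn C++ API).
static double profile_empty_pass(wgpu::Instance instance, wgpu::Device device, wgpu::Queue queue) {
    // 2-entry timestamp query set: index 0 = pass begin, index 1 = pass end.
    wgpu::QuerySetDescriptor qs_desc = {};
    qs_desc.type  = wgpu::QueryType::Timestamp;
    qs_desc.count = 2;
    wgpu::QuerySet query_set = device.CreateQuerySet(&qs_desc);

    // Buffer the query set is resolved into (ticks land here as uint64 values).
    wgpu::BufferDescriptor resolve_desc = {};
    resolve_desc.size  = 2 * sizeof(uint64_t); // 16 bytes, matches WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES
    resolve_desc.usage = wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc;
    wgpu::Buffer resolve_buf = device.CreateBuffer(&resolve_desc);

    // Host-mappable buffer for reading the ticks back.
    wgpu::BufferDescriptor read_desc = {};
    read_desc.size  = 2 * sizeof(uint64_t);
    read_desc.usage = wgpu::BufferUsage::MapRead | wgpu::BufferUsage::CopyDst;
    wgpu::Buffer read_buf = device.CreateBuffer(&read_desc);

    // Attach the query set to the compute pass so begin/end timestamps are written.
    wgpu::PassTimestampWrites ts_writes = { .querySet                  = query_set,
                                            .beginningOfPassWriteIndex = 0,
                                            .endOfPassWriteIndex       = 1 };
    wgpu::ComputePassDescriptor pass_desc = { .timestampWrites = &ts_writes };

    wgpu::CommandEncoder     encoder = device.CreateCommandEncoder();
    wgpu::ComputePassEncoder pass    = encoder.BeginComputePass(&pass_desc);
    // ... SetPipeline / SetBindGroup / DispatchWorkgroups would go here ...
    pass.End();
    encoder.ResolveQuerySet(query_set, 0, 2, resolve_buf, 0);
    encoder.CopyBufferToBuffer(resolve_buf, 0, read_buf, 0, read_desc.size);
    wgpu::CommandBuffer commands = encoder.Finish();
    queue.Submit(1, &commands);

    // Map the readback buffer and turn the two tick values into milliseconds.
    double elapsed_ms = 0.0;
    wgpu::Future f = read_buf.MapAsync(
        wgpu::MapMode::Read, 0, read_desc.size, wgpu::CallbackMode::AllowSpontaneous,
        [&](wgpu::MapAsyncStatus status, wgpu::StringView message) {
            if (status != wgpu::MapAsyncStatus::Success) {
                return;
            }
            const uint64_t * ts = (const uint64_t *) read_buf.GetConstMappedRange();
            elapsed_ms = double(ts[1] - ts[0]) * 1e-6; // assumes 1 tick == 1 ns
        });
    instance.WaitAny(f, UINT64_MAX);
    read_buf.Unmap(); // cannot unmap inside the callback (WebGPU reentrancy limitation)
    return elapsed_ms;
}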
@@ -520,14 +553,15 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & // Enqueue commands and only submit if we have enough staged commands ctx->staged_command_bufs.push_back(commands); ctx->staged_param_bufs.push_back(params_bufs); -#ifdef GGML_WEBGPU_PROFILE +#ifdef GGML_WEBGPU_GPU_PROFILE // Store timestamp buffer and label for later processing ctx->staged_timestamp_bufs.push_back(std::make_pair(ts_bufs, pipeline.name)); #endif if (ctx->staged_command_bufs.size() == WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) { + WEBGPU_CPU_PROFILE_DETAIL_START(encode_submit_queue); ggml_backend_webgpu_submit_queue(ctx); - ggml_backend_webgpu_wait_on_submission(ctx); + WEBGPU_CPU_PROFILE_DETAIL_END(encode_submit_queue, ctx); } } } @@ -559,31 +593,44 @@ static void ggml_backend_webgpu_free(ggml_backend_t backend) { ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *) backend->context; WEBGPU_LOG_DEBUG("ggml_backend_webgpu_free(" << ctx->name << ")"); -#ifdef GGML_WEBGPU_PROFILE - // Print detailed profiling information - std::cout << "\n[ggml_webgpu profiling summary]\n"; - double total_gpu = 0.0; - for (const auto & kv : ctx->webgpu_ctx->shader_gpu_time_ms) { - total_gpu += kv.second; - } +#ifdef GGML_WEBGPU_CPU_PROFILE + std::cout << "\n[ggml_webgpu cpu profiling summary]\n"; double total_cpu = 0.0; for (const auto & kv : ctx->webgpu_ctx->cpu_time_ms) { total_cpu += kv.second; } - std::cout << "ggml_webgpu: total cpu time (all shaders): " << total_cpu << " ms\n"; - std::cout << "ggml_webgpu: total gpu time (all shaders): " << total_gpu << " ms\n"; - std::cout << "ggml_webgpu: gpu/cpu ratio: " << (total_cpu > 0.0 ? total_gpu / total_cpu : 0.0) << "\n"; + std::cout << "ggml_webgpu: total cpu time: " << total_cpu << " ms\n"; std::cout << "ggml_webgpu: cpu breakdown:\n"; for (const auto & kv : ctx->webgpu_ctx->cpu_time_ms) { double pct = (total_cpu > 0.0) ? (kv.second / total_cpu * 100.0) : 0.0; std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << pct << "%)\n"; } + std::cout << "\nggml_webgpu: cpu detailed breakdown:\n"; + for (const auto & kv : ctx->webgpu_ctx->cpu_detail_ms) { + double pct = (total_cpu > 0.0) ? (kv.second / total_cpu * 100.0) : 0.0; + std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << pct << "%)\n"; + } +#endif + +#ifdef GGML_WEBGPU_GPU_PROFILE + std::cout << "\n[ggml_webgpu gpu profiling summary]\n"; + double total_gpu = 0.0; + for (const auto & kv : ctx->webgpu_ctx->shader_gpu_time_ms) { + total_gpu += kv.second; + } + std::cout << "ggml_webgpu: total gpu time (all shaders): " << total_gpu << " ms\n"; std::cout << "\nggml_webgpu: gpu breakdown:\n"; for (const auto & kv : ctx->webgpu_ctx->shader_gpu_time_ms) { double pct = (total_gpu > 0.0) ? (kv.second / total_gpu * 100.0) : 0.0; std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << pct << "%)\n"; } -#else +#endif + +#if defined(GGML_WEBGPU_CPU_PROFILE) && defined(GGML_WEBGPU_GPU_PROFILE) + std::cout << "ggml_webgpu: gpu/cpu ratio: " << (total_cpu > 0.0 ? 
total_gpu / total_cpu : 0.0) << "\n"; +#endif + +#if !defined(GGML_WEBGPU_CPU_PROFILE) && !defined(GGML_WEBGPU_GPU_PROFILE) GGML_UNUSED(ctx); #endif } @@ -1204,17 +1251,23 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str ggml_backend_webgpu_context * backend_ctx = static_cast(backend->context); webgpu_context ctx = backend_ctx->webgpu_ctx; - // --- Profiling: CPU timing for graph compute --- - WEBGPU_CPU_PROFILE_START(graph_compute); + WEBGPU_CPU_PROFILE_TOTAL_START(graph_compute); + WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_encode); for (int i = 0; i < cgraph->n_nodes; i++) { ggml_webgpu_encode_node(ctx, cgraph->nodes[i]); } + WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_encode, ctx); + WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_submit); ggml_backend_webgpu_submit_queue(ctx); + WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_submit, ctx); + + WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_wait); ggml_backend_webgpu_wait_on_submission(ctx); + WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_wait, ctx); - WEBGPU_CPU_PROFILE_END(graph_compute); + WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx); return GGML_STATUS_SUCCESS; } @@ -1262,6 +1315,8 @@ static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffe return; } + WEBGPU_CPU_PROFILE_TOTAL_START(memset_tensor); + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")"); @@ -1272,6 +1327,7 @@ static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffe // This is a trick to set all bytes of a u32 to the same 1 byte value. uint32_t val32 = (uint32_t) value * 0x01010101; ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, val32, total_offset, size); + WEBGPU_CPU_PROFILE_TOTAL_END(memset_tensor, buf_ctx->webgpu_ctx); } static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, @@ -1281,6 +1337,7 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, size_t size) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); + WEBGPU_CPU_PROFILE_TOTAL_START(set_tensor); ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; @@ -1305,6 +1362,7 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, // wait for WriteBuffer to complete ggml_backend_webgpu_wait_on_submission(webgpu_ctx); } + WEBGPU_CPU_PROFILE_TOTAL_END(set_tensor, webgpu_ctx); } static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, @@ -1314,7 +1372,7 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, size_t size) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); - + WEBGPU_CPU_PROFILE_TOTAL_START(get_tensor); ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; wgpu::Device device = webgpu_ctx->device; @@ -1354,12 +1412,15 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, // Copy the data from the mapped range to the output buffer std::memcpy(data, mapped_range, size); webgpu_ctx->get_tensor_staging_buf.Unmap(); + WEBGPU_CPU_PROFILE_TOTAL_END(get_tensor, webgpu_ctx); 
} static void ggml_backend_webgpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_clear(" << buffer << ", " << (uint32_t) value << ")"); + WEBGPU_CPU_PROFILE_TOTAL_START(clear); ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, value, 0, buffer->size); + WEBGPU_CPU_PROFILE_TOTAL_END(clear, buf_ctx->webgpu_ctx); } static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = { @@ -2009,6 +2070,8 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t GGML_ASSERT(index == 0); WEBGPU_LOG_DEBUG("ggml_backend_reg_get_device()"); + WEBGPU_CPU_PROFILE_TOTAL_START(reg_get_device); + ggml_backend_webgpu_reg_context * reg_ctx = static_cast(reg->context); webgpu_context ctx = reg_ctx->webgpu_ctx; @@ -2035,7 +2098,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t // Initialize device std::vector required_features = { wgpu::FeatureName::ShaderF16, wgpu::FeatureName::ImplicitDeviceSynchronization }; -#ifdef GGML_WEBGPU_PROFILE +#ifdef GGML_WEBGPU_GPU_PROFILE required_features.push_back(wgpu::FeatureName::TimestampQuery); #endif @@ -2077,7 +2140,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform, wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite); -#ifdef GGML_WEBGPU_PROFILE +#ifdef GGML_WEBGPU_GPU_PROFILE // Initialize buffer pool for timestamp queries (profiling) ctx->timestamp_query_buf_pool.init(ctx->device, WEBGPU_NUM_TIMESTAMP_QUERY_BUFS, WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES, @@ -2129,6 +2192,8 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t /* .reg = */ reg, /* .context = */ &device_ctx, }; + + WEBGPU_CPU_PROFILE_TOTAL_END(reg_get_device, ctx); return &device; } From 400a58dfafa6922134f19d1292458fb042804981 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 1 Oct 2025 15:51:24 -0700 Subject: [PATCH 03/15] Rework command submission to avoid global locks --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 422 ++++++++---------- .../wgsl-shaders/mul_mat.tmpl.wgsl | 2 +- 2 files changed, 184 insertions(+), 240 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index d26c185ff02b8..14e3d3243792d 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -26,23 +27,21 @@ #endif // GGML_WEBGPU_DEBUG #ifdef GGML_WEBGPU_CPU_PROFILE - // total timing (aggregated) -# define WEBGPU_CPU_PROFILE_TOTAL_START(id) \ - auto cpu_total_start_##id = std::chrono::high_resolution_clock::now(); +// total timing (aggregated) +# define WEBGPU_CPU_PROFILE_TOTAL_START(id) auto cpu_total_start_##id = std::chrono::high_resolution_clock::now(); -# define WEBGPU_CPU_PROFILE_TOTAL_END(id, ctx) \ - auto cpu_total_end_##id = std::chrono::high_resolution_clock::now(); \ - double cpu_total_time_##id = \ +# define WEBGPU_CPU_PROFILE_TOTAL_END(id, ctx) \ + auto cpu_total_end_##id = std::chrono::high_resolution_clock::now(); \ + double cpu_total_time_##id = \ std::chrono::duration(cpu_total_end_##id - cpu_total_start_##id).count(); \ (ctx)->cpu_time_ms[#id] += cpu_total_time_##id; - // fine-grained timing (diagnostics only) -# define WEBGPU_CPU_PROFILE_DETAIL_START(id) \ - auto cpu_detail_start_##id = 
std::chrono::high_resolution_clock::now(); +// fine-grained timing (diagnostics only) +# define WEBGPU_CPU_PROFILE_DETAIL_START(id) auto cpu_detail_start_##id = std::chrono::high_resolution_clock::now(); -# define WEBGPU_CPU_PROFILE_DETAIL_END(id, ctx) \ - auto cpu_detail_end_##id = std::chrono::high_resolution_clock::now(); \ - double cpu_detail_time_##id = \ +# define WEBGPU_CPU_PROFILE_DETAIL_END(id, ctx) \ + auto cpu_detail_end_##id = std::chrono::high_resolution_clock::now(); \ + double cpu_detail_time_##id = \ std::chrono::duration(cpu_detail_end_##id - cpu_detail_start_##id).count(); \ (ctx)->cpu_detail_ms[#id] += cpu_detail_time_##id; #else @@ -59,9 +58,9 @@ /* Constants */ -#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 16 +#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8 #define WEBGPU_WAIT_ANY_BATCH_SIZE 64 -#define WEBGPU_MUL_MAT_WG_SIZE 64 +#define WEBGPU_MUL_MAT_WG_SIZE 256 #define WEBGPU_NUM_PARAM_BUFS 100 #define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters #define WEBGPU_NUM_SET_ROWS_ERROR_BUFS 32 @@ -149,6 +148,16 @@ struct webgpu_pipeline { std::string name; }; +struct webgpu_command { + wgpu::CommandBuffer commands; + webgpu_pool_bufs params_bufs; + std::optional set_rows_error_bufs; +#ifdef GGML_WEBGPU_GPU_PROFILE + webgpu_pool_bufs timestamp_query_bufs; + std::string pipeline_name; +#endif +}; + // All the base objects needed to run operations on a WebGPU device struct webgpu_context_struct { wgpu::Instance instance; @@ -187,16 +196,6 @@ struct webgpu_context_struct { // Staging buffer for reading data from the GPU wgpu::Buffer get_tensor_staging_buf; - // Command buffers which need to be submitted - std::vector staged_command_bufs; - - // Parameter buffers associated with the staged command buffers - std::vector staged_param_bufs; - // Buffers associated with set_rows operations, used to store potential errors - std::vector staged_set_row_error_bufs; - - std::vector callback_futures; - #ifdef GGML_WEBGPU_DEBUG wgpu::Buffer debug_host_buf; wgpu::Buffer debug_dev_buf; @@ -213,10 +212,7 @@ struct webgpu_context_struct { // Profiling: per-shader GPU time in ms std::unordered_map shader_gpu_time_ms; // Profiling: pool of timestamp query buffers (one per operation) - webgpu_buf_pool timestamp_query_buf_pool; - - // Profiling: staged timestamp buffers and their labels for batch submission - std::vector> staged_timestamp_bufs; + webgpu_buf_pool timestamp_query_buf_pool; #endif }; @@ -301,27 +297,12 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device, /** WebGPU Actions */ // Wait for the queue to finish processing all submitted work -static void ggml_backend_webgpu_wait_on_submission(webgpu_context & ctx) { - std::lock_guard lock(ctx->mutex); - if (ctx->callback_futures.empty()) { - // no existing callbacks, wait on queue submission - ctx->instance.WaitAny( - ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::AllowSpontaneous, - [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { - if (status != wgpu::QueueWorkDoneStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", - std::string(message).c_str()); - } - }), - UINT64_MAX); - } else { - // WebGPU implementations may limit the number of futures that can be waited on at once, - // so wait in batches (64 is what Dawn supports). 
- for (size_t i = 0; i < ctx->callback_futures.size(); i += WEBGPU_WAIT_ANY_BATCH_SIZE) { - size_t end = std::min(i + WEBGPU_WAIT_ANY_BATCH_SIZE, ctx->callback_futures.size()); - ctx->instance.WaitAny(end - i, ctx->callback_futures.data() + i, UINT64_MAX); - } - ctx->callback_futures.clear(); +static void ggml_backend_webgpu_wait(webgpu_context & ctx, std::vector & wait_infos) { + // WebGPU implementations may limit the number of futures that can be waited on at once, + // so wait in batches (64 is what Dawn supports). + for (size_t i = 0; i < wait_infos.size(); i += WEBGPU_WAIT_ANY_BATCH_SIZE) { + size_t end = std::min(i + WEBGPU_WAIT_ANY_BATCH_SIZE, wait_infos.size()); + ctx->instance.WaitAny(end - i, wait_infos.data() + i, UINT64_MAX); } } @@ -347,77 +328,6 @@ static wgpu::FutureWaitInfo ggml_backend_webgpu_process_timestamps(webgpu_contex } #endif -static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) { - std::lock_guard lock(ctx->mutex); - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_submit_queue()"); - if (ctx->staged_command_bufs.empty()) { - // Nothing to submit - return; - } - ctx->queue.Submit(ctx->staged_command_bufs.size(), ctx->staged_command_bufs.data()); - - // If there are SET_ROWS operations in this submission, copy their error buffers to the host. - if (ctx->staged_set_row_error_bufs.size() > 0) { - wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder(); - for (auto & error_bufs : ctx->staged_set_row_error_bufs) { - // Copy the error buffer to the host buffer - encoder.CopyBufferToBuffer(error_bufs.dev_buf, 0, error_bufs.host_buf, 0, error_bufs.host_buf.GetSize()); - } - wgpu::CommandBuffer commands = encoder.Finish(); - ctx->queue.Submit(1, &commands); - } - - ctx->staged_command_bufs.clear(); - std::vector staged_param_bufs = std::move(ctx->staged_param_bufs); - ctx->staged_param_bufs.clear(); - std::vector staged_set_row_error_bufs = std::move(ctx->staged_set_row_error_bufs); - ctx->staged_set_row_error_bufs.clear(); - -#ifdef GGML_WEBGPU_GPU_PROFILE - std::vector> staged_timestamp_bufs = std::move(ctx->staged_timestamp_bufs); - ctx->staged_timestamp_bufs.clear(); -#endif - - // Free the staged parameter buffers once the submission completes - wgpu::Future p_f = ctx->queue.OnSubmittedWorkDone( - wgpu::CallbackMode::AllowSpontaneous, - [ctx, staged_param_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { - if (status != wgpu::QueueWorkDoneStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", std::string(message).c_str()); - } - // Free the staged buffers - ctx->param_buf_pool.free_bufs(staged_param_bufs); - }); - ctx->callback_futures.push_back({ p_f }); - - // Check for errrors in SET_ROWS operations - for (auto & error_bufs : staged_set_row_error_bufs) { - wgpu::Future f = error_bufs.host_buf.MapAsync( - wgpu::MapMode::Read, 0, error_bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous, - [ctx, error_bufs](wgpu::MapAsyncStatus status, wgpu::StringView message) { - if (status != wgpu::MapAsyncStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to map error buffer: %s\n", std::string(message).c_str()); - } else { - const uint32_t * error_data = (const uint32_t *) error_bufs.host_buf.GetConstMappedRange(); - if (*error_data) { - GGML_ABORT("ggml_webgpu: SET_ROWS index > 2^32, unsupported."); - } - // We can't unmap in here due to WebGPU reentrancy limitations. 
- ctx->set_rows_error_buf_pool.free_bufs({ error_bufs }); - } - }); - ctx->callback_futures.push_back({ f }); - } - -#ifdef GGML_WEBGPU_GPU_PROFILE - for (auto & pair : staged_timestamp_bufs) { - auto & ts_bufs = pair.first; - const std::string & label = pair.second; - ctx->callback_futures.push_back(ggml_backend_webgpu_process_timestamps(ctx, ts_bufs, label)); - } -#endif -} - static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx, wgpu::Buffer & buffer, wgpu::MapMode mode, @@ -438,7 +348,6 @@ static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx, // To use, add a bind group entry to the setup for the shader you are debugging, add the buffer and // debug statements in the shader, and then call this function after encoding the commands and submitting them. static void ggml_backend_webgpu_debug(webgpu_context & ctx) { - ggml_backend_webgpu_submit_queue(ctx); wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder(); encoder.CopyBufferToBuffer(ctx->debug_dev_buf, 0, ctx->debug_host_buf, 0, ctx->debug_host_buf.GetSize()); wgpu::CommandBuffer commands = encoder.Finish(); @@ -455,13 +364,73 @@ static void ggml_backend_webgpu_debug(webgpu_context & ctx) { } #endif -static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & ctx, - webgpu_pipeline & pipeline, - std::vector params, - std::vector bind_group_entries, - uint32_t wg_x, - const char * bind_group_label = nullptr, - bool submit_and_wait = false) { +static std::vector ggml_backend_webgpu_submit(webgpu_context ctx, + std::vector builds) { + std::vector commands; + std::vector params_bufs; + std::vector set_rows_error_bufs; +#ifdef GGML_WEBGPU_GPU_PROFILE + std::vector> pipeline_name_and_ts_bufs; +#endif + + for (const auto & build : builds) { + commands.push_back(build.commands); + params_bufs.push_back(build.params_bufs); + if (build.set_rows_error_bufs) { + set_rows_error_bufs.push_back(build.set_rows_error_bufs.value()); + } +#ifdef GGML_WEBGPU_GPU_PROFILE + pipeline_name_and_ts_bufs.push_back({ build.pipeline_name, build.timestamp_query_bufs }); +#endif + } + ctx->queue.Submit(commands.size(), commands.data()); + + std::vector wait_infos; + + wgpu::Future p_f = ctx->queue.OnSubmittedWorkDone( + wgpu::CallbackMode::AllowSpontaneous, + [ctx, params_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { + if (status != wgpu::QueueWorkDoneStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", std::string(message).c_str()); + } + // Free the staged buffers + ctx->param_buf_pool.free_bufs({ params_bufs }); + }); + wait_infos.push_back({ p_f }); + + for (const auto & bufs : set_rows_error_bufs) { + wgpu::Future f = bufs.host_buf.MapAsync( + wgpu::MapMode::Read, 0, bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous, + [ctx, bufs](wgpu::MapAsyncStatus status, wgpu::StringView message) { + if (status != wgpu::MapAsyncStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to map error buffer: %s\n", std::string(message).c_str()); + } else { + const uint32_t * error_data = (const uint32_t *) bufs.host_buf.GetConstMappedRange(); + if (*error_data) { + GGML_ABORT("ggml_webgpu: SET_ROWS index > 2^32, unsupported."); + } + // We can't unmap in here due to WebGPU reentrancy limitations. 
+ ctx->set_rows_error_buf_pool.free_bufs({ bufs }); + } + }); + wait_infos.push_back({ f }); + } + +#ifdef GGML_WEBGPU_GPU_PROFILE + for (const auto & name_and_bufs : pipeline_name_and_ts_bufs) { + wgpu::FutureWaitInfo f = ggml_backend_webgpu_process_timestamps(ctx, name_and_bufs.second, name_and_bufs.first); + wait_infos.push_back(f); + } +#endif + return wait_infos; +} + +static webgpu_command ggml_backend_webgpu_build(webgpu_context & ctx, + webgpu_pipeline & pipeline, + std::vector params, + std::vector bind_group_entries, + uint32_t wg_x, + std::optional set_rows_error_bufs = std::nullopt) { webgpu_pool_bufs params_bufs = ctx->param_buf_pool.alloc_bufs(); ggml_backend_webgpu_map_buffer(ctx, params_bufs.host_buf, wgpu::MapMode::Write, 0, params_bufs.host_buf.GetSize()); @@ -482,9 +451,7 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & bind_group_desc.layout = pipeline.pipeline.GetBindGroupLayout(0); bind_group_desc.entryCount = bind_group_entries.size(); bind_group_desc.entries = bind_group_entries.data(); - if (bind_group_label) { - bind_group_desc.label = bind_group_label; - } + bind_group_desc.label = pipeline.name.c_str(); wgpu::BindGroup bind_group = ctx->device.CreateBindGroup(&bind_group_desc); wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder(); @@ -493,20 +460,16 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & #ifdef GGML_WEBGPU_GPU_PROFILE // --- Profiling: GPU timestamp queries --- // Allocate a timestamp query buffer (2 timestamps: start/end) - WEBGPU_CPU_PROFILE_DETAIL_START(ts_bufs); webgpu_pool_bufs ts_bufs = ctx->timestamp_query_buf_pool.alloc_bufs(); if (ts_bufs.host_buf.GetMapState() == wgpu::BufferMapState::Mapped) { ts_bufs.host_buf.Unmap(); } - WEBGPU_CPU_PROFILE_DETAIL_END(ts_bufs, ctx); // Create a query set for 2 timestamps - WEBGPU_CPU_PROFILE_DETAIL_START(ts_query_set); wgpu::QuerySetDescriptor ts_query_set_desc = {}; ts_query_set_desc.type = wgpu::QueryType::Timestamp; ts_query_set_desc.count = 2; wgpu::QuerySet ts_query_set = ctx->device.CreateQuerySet(&ts_query_set_desc); - WEBGPU_CPU_PROFILE_DETAIL_END(ts_query_set, ctx); wgpu::PassTimestampWrites ts_writes = { .querySet = ts_query_set, .beginningOfPassWriteIndex = 0, @@ -523,47 +486,26 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & #ifdef GGML_WEBGPU_GPU_PROFILE // Resolve the query set into the device buffer - WEBGPU_CPU_PROFILE_DETAIL_START(resolve_query_set); encoder.ResolveQuerySet(ts_query_set, 0, 2, ts_bufs.dev_buf, 0); encoder.CopyBufferToBuffer(ts_bufs.dev_buf, 0, ts_bufs.host_buf, 0, ts_bufs.host_buf.GetSize()); - WEBGPU_CPU_PROFILE_DETAIL_END(resolve_query_set, ctx); #endif + // If there are SET_ROWS operations in this submission, copy their error buffers to the host. 
+ if (set_rows_error_bufs) { + encoder.CopyBufferToBuffer(set_rows_error_bufs->dev_buf, 0, set_rows_error_bufs->host_buf, 0, + set_rows_error_bufs->host_buf.GetSize()); + } + wgpu::CommandBuffer commands = encoder.Finish(); - if (submit_and_wait) { - // Submit and wait immediately - ctx->queue.Submit(1, &commands); - ctx->instance.WaitAny(ctx->queue.OnSubmittedWorkDone( - wgpu::CallbackMode::AllowSpontaneous, - [ctx, params_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { - if (status != wgpu::QueueWorkDoneStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", message.data); - } - ctx->param_buf_pool.free_bufs({ params_bufs }); - }), - UINT64_MAX); -#ifdef GGML_WEBGPU_GPU_PROFILE - WEBGPU_CPU_PROFILE_DETAIL_START(submit_wait_process_ts); - ctx->instance.WaitAny(ggml_backend_webgpu_process_timestamps(ctx, ts_bufs, pipeline.name).future, UINT64_MAX); - WEBGPU_CPU_PROFILE_DETAIL_END(submit_wait_process_ts, ctx); -#endif - } else { - // Lock the context mutex when pushing to the staging vectors. - std::lock_guard lock(ctx->mutex); - // Enqueue commands and only submit if we have enough staged commands - ctx->staged_command_bufs.push_back(commands); - ctx->staged_param_bufs.push_back(params_bufs); + webgpu_command result = {}; + result.commands = commands; + result.params_bufs = params_bufs; + result.set_rows_error_bufs = set_rows_error_bufs; #ifdef GGML_WEBGPU_GPU_PROFILE - // Store timestamp buffer and label for later processing - ctx->staged_timestamp_bufs.push_back(std::make_pair(ts_bufs, pipeline.name)); + result.timestamp_query_bufs = ts_bufs; + result.pipeline_name = pipeline.name; #endif - - if (ctx->staged_command_bufs.size() == WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) { - WEBGPU_CPU_PROFILE_DETAIL_START(encode_submit_queue); - ggml_backend_webgpu_submit_queue(ctx); - WEBGPU_CPU_PROFILE_DETAIL_END(encode_submit_queue, ctx); - } - } + return result; } static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx, @@ -577,7 +519,10 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx, }; size_t bytes_per_wg = ctx->max_wg_size_x * ctx->memset_bytes_per_thread; uint32_t wg_x = ((size + 3) + bytes_per_wg - 1) / bytes_per_wg; - ggml_backend_webgpu_build_and_enqueue(ctx, ctx->memset_pipeline, params, entries, wg_x, "MEMSET", true); + + webgpu_command command = ggml_backend_webgpu_build(ctx, ctx->memset_pipeline, params, entries, wg_x); + auto futures = ggml_backend_webgpu_submit(ctx, { command }); + ggml_backend_webgpu_wait(ctx, futures); } /** End WebGPU Actions */ @@ -605,7 +550,9 @@ static void ggml_backend_webgpu_free(ggml_backend_t backend) { double pct = (total_cpu > 0.0) ? (kv.second / total_cpu * 100.0) : 0.0; std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << pct << "%)\n"; } - std::cout << "\nggml_webgpu: cpu detailed breakdown:\n"; + if (ctx->webgpu_ctx->cpu_detail_ms.size() > 0) { + std::cout << "ggml_webgpu: cpu detailed breakdown:\n"; + } for (const auto & kv : ctx->webgpu_ctx->cpu_detail_ms) { double pct = (total_cpu > 0.0) ? 
(kv.second / total_cpu * 100.0) : 0.0; std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << pct << "%)\n"; @@ -665,7 +612,7 @@ static bool ggml_webgpu_tensor_equal(ggml_tensor * a, ggml_tensor * b) { (ggml_webgpu_tensor_offset(a) == ggml_webgpu_tensor_offset(b)); } -static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { +static webgpu_command ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { uint32_t ne = (uint32_t) ggml_nelements(dst); std::vector params = { @@ -694,14 +641,13 @@ static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor size_t max_wg_size = ctx->max_wg_size_x; uint32_t wg_x = (ne + max_wg_size - 1) / max_wg_size; - ggml_backend_webgpu_build_and_enqueue(ctx, ctx->cpy_pipeline[src->type][dst->type], params, entries, wg_x, - ggml_op_name(dst->op)); + return ggml_backend_webgpu_build(ctx, ctx->cpy_pipeline[src->type][dst->type], params, entries, wg_x); } -static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * idx, ggml_tensor * dst) { +static std::optional ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * idx, ggml_tensor * dst) { // For set rows specifically, we need to check if src and idx are empty tensors. if (ggml_is_empty(src) || ggml_is_empty(idx)) { - return; + return std::nullopt; } webgpu_pool_bufs error_bufs = ctx->set_rows_error_buf_pool.alloc_bufs(); @@ -744,13 +690,10 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t size_t max_wg_size = ctx->max_wg_size_x; uint32_t wg_x = (src->ne[1] * src->ne[2] * src->ne[3] + max_wg_size - 1) / max_wg_size; - std::lock_guard lock(ctx->mutex); - ctx->staged_set_row_error_bufs.push_back(error_bufs); - - ggml_backend_webgpu_build_and_enqueue(ctx, ctx->set_rows_pipeline, params, entries, wg_x, ggml_op_name(dst->op)); + return ggml_backend_webgpu_build(ctx, ctx->set_rows_pipeline, params, entries, wg_x, error_bufs); } -static void ggml_webgpu_get_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * idx, ggml_tensor * dst) { +static webgpu_command ggml_webgpu_get_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * idx, ggml_tensor * dst) { std::vector params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, idx) / ggml_type_size(idx->type)), @@ -789,10 +732,10 @@ static void ggml_webgpu_get_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t if (src->type == GGML_TYPE_F32 && dst->ne[0] % 4 != 0) { pipeline = ctx->get_rows_f32_no_vec_pipeline; } - ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op)); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) { +static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) { std::vector params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)), @@ -829,11 +772,10 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t uint32_t wg_x = (dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE; - ggml_backend_webgpu_build_and_enqueue(ctx, 
ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x, - ggml_op_name(dst->op)); + return ggml_backend_webgpu_build(ctx, ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x); } -static void ggml_webgpu_binary_op(webgpu_context & ctx, +static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, @@ -876,10 +818,10 @@ static void ggml_webgpu_binary_op(webgpu_context & ctx, size_t max_wg_size = ctx->max_wg_size_x; uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size; - ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op)); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static void ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { +static webgpu_command ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { int inplace = ggml_webgpu_tensor_equal(src, dst); std::vector params = { @@ -911,11 +853,10 @@ static void ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * src, ggml_t .size = ggml_webgpu_tensor_binding_size(ctx, dst) }); } - ggml_backend_webgpu_build_and_enqueue(ctx, ctx->rms_norm_pipeline[inplace], params, entries, ggml_nrows(src), - ggml_op_name(dst->op)); + return ggml_backend_webgpu_build(ctx, ctx->rms_norm_pipeline[inplace], params, entries, ggml_nrows(src)); } -static void ggml_webgpu_rope(webgpu_context & ctx, +static webgpu_command ggml_webgpu_rope(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * src2, @@ -1000,10 +941,10 @@ static void ggml_webgpu_rope(webgpu_context & ctx, webgpu_pipeline pipeline = ctx->rope_pipeline[dst->type][has_freq_factor][inplace]; size_t max_wg_size = ctx->max_wg_size_x; uint32_t wg_x = (ggml_nelements(src0) / 2 + max_wg_size - 1) / max_wg_size; - ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op)); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static void ggml_webgpu_glu(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) { +static webgpu_command ggml_webgpu_glu(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) { const int split = (src1 != nullptr); std::vector params = { @@ -1053,10 +994,10 @@ static void ggml_webgpu_glu(webgpu_context & ctx, ggml_tensor * src0, ggml_tenso webgpu_pipeline pipeline = ctx->glu_pipeline[ggml_get_glu_op(dst)][dst->type][split]; size_t max_wg_size = ctx->max_wg_size_x; uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size; - ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op)); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static void ggml_webgpu_scale(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { +static webgpu_command ggml_webgpu_scale(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { int inplace = ggml_webgpu_tensor_equal(src, dst); std::vector params = { @@ -1091,11 +1032,10 @@ static void ggml_webgpu_scale(webgpu_context & ctx, ggml_tensor * src, ggml_tens size_t max_wg_size = ctx->max_wg_size_x; uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size; - ggml_backend_webgpu_build_and_enqueue(ctx, ctx->scale_pipeline[inplace], params, entries, wg_x, - ggml_op_name(dst->op)); + return ggml_backend_webgpu_build(ctx, ctx->scale_pipeline[inplace], params, entries, wg_x); } 
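Taken together, the builders above (cpy, set_rows, get_rows, mul_mat, the binary ops, rms_norm, rope, glu, scale) no longer touch the queue or the context mutex; each one just encodes its pass and returns a webgpu_command. As a condensed, illustrative sketch of how the reworked graph-compute loop further down in this patch drives them (profiling macros and empty-node handling trimmed):

// Condensed sketch only; the full version is in ggml_backend_webgpu_graph_compute below.
std::vector<webgpu_command>       commands;   // encoded but not yet submitted work
std::vector<wgpu::FutureWaitInfo> wait_infos; // completion futures, added per submission

for (int i = 0; i < cgraph->n_nodes; i++) {
    // Each op builder returns a command (or nothing for view/empty nodes).
    if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) {
        commands.push_back(*cmd);
    }
    // Submit in batches instead of staging into globals guarded by ctx->mutex.
    if (commands.size() >= WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) {
        auto infos = ggml_backend_webgpu_submit(ctx, commands); // one queue.Submit() per batch
        wait_infos.insert(wait_infos.end(), infos.begin(), infos.end());
        commands.clear();
    }
}
if (!commands.empty()) {
    auto infos = ggml_backend_webgpu_submit(ctx, commands);
    wait_infos.insert(wait_infos.end(), infos.begin(), infos.end());
}
// Block once at the end; ggml_backend_webgpu_wait calls WaitAny in batches of 64.
ggml_backend_webgpu_wait(ctx, wait_infos);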
-static void ggml_webgpu_soft_max(webgpu_context & ctx, +static webgpu_command ggml_webgpu_soft_max(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * src2, @@ -1164,14 +1104,14 @@ static void ggml_webgpu_soft_max(webgpu_context & ctx, .size = ggml_webgpu_tensor_binding_size(ctx, dst) }); } - ggml_backend_webgpu_build_and_enqueue(ctx, ctx->soft_max_pipeline[mask_type][has_sink][inplace], params, entries, - ggml_nrows(dst), ggml_op_name(dst->op)); + return ggml_backend_webgpu_build(ctx, ctx->soft_max_pipeline[mask_type][has_sink][inplace], params, entries, + ggml_nrows(dst)); } // Returns true if node has enqueued work into the queue, false otherwise -static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) { +static std::optional ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) { if (ggml_is_empty(node)) { - return false; + return std::nullopt; } WEBGPU_LOG_DEBUG("ggml_webgpu_encode_node(" << node << ", " << ggml_op_name(node->op) << ")"); @@ -1186,63 +1126,49 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) { case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: case GGML_OP_RESHAPE: - return false; + return std::nullopt; case GGML_OP_CPY: case GGML_OP_CONT: - ggml_webgpu_cpy(ctx, src0, node); - break; + return ggml_webgpu_cpy(ctx, src0, node); case GGML_OP_SET_ROWS: - ggml_webgpu_set_rows(ctx, src0, src1, node); - break; + return ggml_webgpu_set_rows(ctx, src0, src1, node); case GGML_OP_GET_ROWS: - ggml_webgpu_get_rows(ctx, src0, src1, node); - break; + return ggml_webgpu_get_rows(ctx, src0, src1, node); case GGML_OP_MUL_MAT: - ggml_webgpu_mul_mat(ctx, src0, src1, node); - break; + return ggml_webgpu_mul_mat(ctx, src0, src1, node); case GGML_OP_ADD: { int inplace = ggml_webgpu_tensor_equal(src0, node); - ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_pipeline[node->type][inplace], inplace); - break; + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_pipeline[node->type][inplace], inplace); } case GGML_OP_SUB: { int inplace = ggml_webgpu_tensor_equal(src0, node); - ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->sub_pipeline[node->type][inplace], inplace); - break; + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->sub_pipeline[node->type][inplace], inplace); } case GGML_OP_MUL: { int inplace = ggml_webgpu_tensor_equal(src0, node); - ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_pipeline[node->type][inplace], inplace); - break; + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_pipeline[node->type][inplace], inplace); } case GGML_OP_DIV: { int inplace = ggml_webgpu_tensor_equal(src0, node); - ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->div_pipeline[node->type][inplace], inplace); - break; + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->div_pipeline[node->type][inplace], inplace); } case GGML_OP_RMS_NORM: - ggml_webgpu_rms_norm(ctx, src0, node); - break; + return ggml_webgpu_rms_norm(ctx, src0, node); case GGML_OP_ROPE: - ggml_webgpu_rope(ctx, src0, src1, src2, node); - break; + return ggml_webgpu_rope(ctx, src0, src1, src2, node); case GGML_OP_GLU: - ggml_webgpu_glu(ctx, src0, src1, node); - break; + return ggml_webgpu_glu(ctx, src0, src1, node); case GGML_OP_SCALE: - ggml_webgpu_scale(ctx, src0, node); - break; + return ggml_webgpu_scale(ctx, src0, node); case GGML_OP_SOFT_MAX: - ggml_webgpu_soft_max(ctx, src0, src1, src2, node); - break; + return ggml_webgpu_soft_max(ctx, src0, src1, src2, node); default: - return false; + return 
std::nullopt; } - return true; } static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { @@ -1254,17 +1180,27 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str WEBGPU_CPU_PROFILE_TOTAL_START(graph_compute); WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_encode); + std::vector commands; + std::vector wait_infos; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_webgpu_encode_node(ctx, cgraph->nodes[i]); + if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) { + commands.push_back(*cmd); + } + if (commands.size() >= WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) { + std::vector new_infos = ggml_backend_webgpu_submit(ctx, commands); + wait_infos.insert(wait_infos.end(), new_infos.begin(), new_infos.end()); + commands.clear(); + } + } + if (!commands.empty()) { + std::vector new_infos = ggml_backend_webgpu_submit(ctx, commands); + wait_infos.insert(wait_infos.end(), new_infos.begin(), new_infos.end()); + commands.clear(); } WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_encode, ctx); - WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_submit); - ggml_backend_webgpu_submit_queue(ctx); - WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_submit, ctx); - WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_wait); - ggml_backend_webgpu_wait_on_submission(ctx); + ggml_backend_webgpu_wait(ctx, wait_infos); WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_wait, ctx); WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx); @@ -1360,7 +1296,15 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, remaining_size); } else { // wait for WriteBuffer to complete - ggml_backend_webgpu_wait_on_submission(webgpu_ctx); + webgpu_ctx->instance.WaitAny( + webgpu_ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::AllowSpontaneous, + [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { + if (status != wgpu::QueueWorkDoneStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", + std::string(message).c_str()); + } + }), + UINT64_MAX); } WEBGPU_CPU_PROFILE_TOTAL_END(set_tensor, webgpu_ctx); } @@ -2116,7 +2060,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t dev_desc.SetUncapturedErrorCallback( [](const wgpu::Device & device, wgpu::ErrorType reason, wgpu::StringView message) { GGML_UNUSED(device); - GGML_LOG_ERROR("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast(reason), + GGML_ABORT("ggml_webgpu: Device error! 
Reason: %d, Message: %s\n", static_cast(reason), std::string(message).c_str()); }); ctx->instance.WaitAny(ctx->adapter.RequestDevice( diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl index 25e2185de84ee..141db9b39d957 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl @@ -870,7 +870,7 @@ struct MulMatParams { @group(0) @binding(3) var params: MulMatParams; -@compute @workgroup_size(64) +@compute @workgroup_size(256) fn main(@builtin(global_invocation_id) global_id: vec3) { let total = params.m * params.n * params.bs02 * params.broadcast2 * params.bs03 * params.broadcast3; if (global_id.x >= total) { From ca43faa1a8023268f3f11c9deeeac5ec765aeb0f Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Sun, 5 Oct 2025 13:53:39 -0700 Subject: [PATCH 04/15] Update wait handling --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 118 +++++++++++++++++---------- 1 file changed, 73 insertions(+), 45 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 14e3d3243792d..764690fb388f5 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -149,12 +149,12 @@ struct webgpu_pipeline { }; struct webgpu_command { - wgpu::CommandBuffer commands; - webgpu_pool_bufs params_bufs; + wgpu::CommandBuffer commands; + webgpu_pool_bufs params_bufs; std::optional set_rows_error_bufs; #ifdef GGML_WEBGPU_GPU_PROFILE webgpu_pool_bufs timestamp_query_bufs; - std::string pipeline_name; + std::string pipeline_name; #endif }; @@ -297,12 +297,29 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device, /** WebGPU Actions */ // Wait for the queue to finish processing all submitted work -static void ggml_backend_webgpu_wait(webgpu_context & ctx, std::vector & wait_infos) { +static void ggml_backend_webgpu_wait(webgpu_context & ctx, + std::vector & futures, + uint64_t timeout_ms = UINT64_MAX) { // WebGPU implementations may limit the number of futures that can be waited on at once, // so wait in batches (64 is what Dawn supports). 
- for (size_t i = 0; i < wait_infos.size(); i += WEBGPU_WAIT_ANY_BATCH_SIZE) { - size_t end = std::min(i + WEBGPU_WAIT_ANY_BATCH_SIZE, wait_infos.size()); - ctx->instance.WaitAny(end - i, wait_infos.data() + i, UINT64_MAX); + size_t i = 0; + while (i < futures.size()) { + size_t end = std::min(i + WEBGPU_WAIT_ANY_BATCH_SIZE, futures.size()); + auto waitStatus = ctx->instance.WaitAny(end - i, futures.data() + i, timeout_ms); + switch (waitStatus) { + case wgpu::WaitStatus::Success: + futures.erase(futures.begin() + i, futures.begin() + end); + break; + case wgpu::WaitStatus::TimedOut: + i += WEBGPU_WAIT_ANY_BATCH_SIZE; + break; + case wgpu::WaitStatus::Error: + GGML_LOG_ERROR("ggml_webgpu: WaitAny returned an error\n"); + break; + default: + GGML_LOG_ERROR("ggml_webgpu: WaitAny returned an unknown status\n"); + break; + } } } @@ -385,7 +402,7 @@ static std::vector ggml_backend_webgpu_submit(webgpu_conte } ctx->queue.Submit(commands.size(), commands.data()); - std::vector wait_infos; + std::vector futures; wgpu::Future p_f = ctx->queue.OnSubmittedWorkDone( wgpu::CallbackMode::AllowSpontaneous, @@ -396,7 +413,7 @@ static std::vector ggml_backend_webgpu_submit(webgpu_conte // Free the staged buffers ctx->param_buf_pool.free_bufs({ params_bufs }); }); - wait_infos.push_back({ p_f }); + futures.push_back({ p_f }); for (const auto & bufs : set_rows_error_bufs) { wgpu::Future f = bufs.host_buf.MapAsync( @@ -413,16 +430,16 @@ static std::vector ggml_backend_webgpu_submit(webgpu_conte ctx->set_rows_error_buf_pool.free_bufs({ bufs }); } }); - wait_infos.push_back({ f }); + futures.push_back({ f }); } #ifdef GGML_WEBGPU_GPU_PROFILE for (const auto & name_and_bufs : pipeline_name_and_ts_bufs) { wgpu::FutureWaitInfo f = ggml_backend_webgpu_process_timestamps(ctx, name_and_bufs.second, name_and_bufs.first); - wait_infos.push_back(f); + futures.push_back(f); } #endif - return wait_infos; + return futures; } static webgpu_command ggml_backend_webgpu_build(webgpu_context & ctx, @@ -503,7 +520,7 @@ static webgpu_command ggml_backend_webgpu_build(webgpu_context & result.set_rows_error_bufs = set_rows_error_bufs; #ifdef GGML_WEBGPU_GPU_PROFILE result.timestamp_query_bufs = ts_bufs; - result.pipeline_name = pipeline.name; + result.pipeline_name = pipeline.name; #endif return result; } @@ -521,7 +538,7 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx, uint32_t wg_x = ((size + 3) + bytes_per_wg - 1) / bytes_per_wg; webgpu_command command = ggml_backend_webgpu_build(ctx, ctx->memset_pipeline, params, entries, wg_x); - auto futures = ggml_backend_webgpu_submit(ctx, { command }); + auto futures = ggml_backend_webgpu_submit(ctx, { command }); ggml_backend_webgpu_wait(ctx, futures); } @@ -644,7 +661,10 @@ static webgpu_command ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, g return ggml_backend_webgpu_build(ctx, ctx->cpy_pipeline[src->type][dst->type], params, entries, wg_x); } -static std::optional ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * idx, ggml_tensor * dst) { +static std::optional ggml_webgpu_set_rows(webgpu_context & ctx, + ggml_tensor * src, + ggml_tensor * idx, + ggml_tensor * dst) { // For set rows specifically, we need to check if src and idx are empty tensors. 
if (ggml_is_empty(src) || ggml_is_empty(idx)) { return std::nullopt; @@ -693,7 +713,10 @@ static std::optional ggml_webgpu_set_rows(webgpu_context & ctx, return ggml_backend_webgpu_build(ctx, ctx->set_rows_pipeline, params, entries, wg_x, error_bufs); } -static webgpu_command ggml_webgpu_get_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * idx, ggml_tensor * dst) { +static webgpu_command ggml_webgpu_get_rows(webgpu_context & ctx, + ggml_tensor * src, + ggml_tensor * idx, + ggml_tensor * dst) { std::vector params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, idx) / ggml_type_size(idx->type)), @@ -735,7 +758,10 @@ static webgpu_command ggml_webgpu_get_rows(webgpu_context & ctx, ggml_tensor * s return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) { +static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * dst) { std::vector params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)), @@ -776,11 +802,11 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * sr } static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * dst, - webgpu_pipeline & pipeline, - bool inplace) { + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * dst, + webgpu_pipeline & pipeline, + bool inplace) { std::vector params = { (uint32_t) ggml_nelements(dst), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)), @@ -857,10 +883,10 @@ static webgpu_command ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * s } static webgpu_command ggml_webgpu_rope(webgpu_context & ctx, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * src2, - ggml_tensor * dst) { + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * src2, + ggml_tensor * dst) { const int inplace = ggml_webgpu_tensor_equal(src0, dst); const int has_freq_factor = (src2 != nullptr); @@ -1036,10 +1062,10 @@ static webgpu_command ggml_webgpu_scale(webgpu_context & ctx, ggml_tensor * src, } static webgpu_command ggml_webgpu_soft_max(webgpu_context & ctx, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * src2, - ggml_tensor * dst) { + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * src2, + ggml_tensor * dst) { const int inplace = ggml_webgpu_tensor_equal(src0, dst); const int mask_type = (src1 != nullptr) ? 
src1->type : 2; // use 2 for no mask here const int has_sink = (src2 != nullptr); @@ -1105,7 +1131,7 @@ static webgpu_command ggml_webgpu_soft_max(webgpu_context & ctx, } return ggml_backend_webgpu_build(ctx, ctx->soft_max_pipeline[mask_type][has_sink][inplace], params, entries, - ggml_nrows(dst)); + ggml_nrows(dst)); } // Returns true if node has enqueued work into the queue, false otherwise @@ -1180,27 +1206,29 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str WEBGPU_CPU_PROFILE_TOTAL_START(graph_compute); WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_encode); - std::vector commands; - std::vector wait_infos; + std::vector commands; + std::vector futures; for (int i = 0; i < cgraph->n_nodes; i++) { if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) { commands.push_back(*cmd); } if (commands.size() >= WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) { - std::vector new_infos = ggml_backend_webgpu_submit(ctx, commands); - wait_infos.insert(wait_infos.end(), new_infos.begin(), new_infos.end()); + std::vector new_futures = ggml_backend_webgpu_submit(ctx, commands); + // check if previous futures have finished + ggml_backend_webgpu_wait(ctx, futures, 1); + futures.insert(futures.end(), new_futures.begin(), new_futures.end()); commands.clear(); } } if (!commands.empty()) { - std::vector new_infos = ggml_backend_webgpu_submit(ctx, commands); - wait_infos.insert(wait_infos.end(), new_infos.begin(), new_infos.end()); + std::vector new_futures = ggml_backend_webgpu_submit(ctx, commands); + futures.insert(futures.end(), new_futures.begin(), new_futures.end()); commands.clear(); } WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_encode, ctx); WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_wait); - ggml_backend_webgpu_wait(ctx, wait_infos); + ggml_backend_webgpu_wait(ctx, futures); WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_wait, ctx); WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx); @@ -1298,12 +1326,12 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, // wait for WriteBuffer to complete webgpu_ctx->instance.WaitAny( webgpu_ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::AllowSpontaneous, - [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { - if (status != wgpu::QueueWorkDoneStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", - std::string(message).c_str()); - } - }), + [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { + if (status != wgpu::QueueWorkDoneStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", + std::string(message).c_str()); + } + }), UINT64_MAX); } WEBGPU_CPU_PROFILE_TOTAL_END(set_tensor, webgpu_ctx); @@ -2061,7 +2089,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t [](const wgpu::Device & device, wgpu::ErrorType reason, wgpu::StringView message) { GGML_UNUSED(device); GGML_ABORT("ggml_webgpu: Device error! 
Reason: %d, Message: %s\n", static_cast(reason), - std::string(message).c_str()); + std::string(message).c_str()); }); ctx->instance.WaitAny(ctx->adapter.RequestDevice( &dev_desc, wgpu::CallbackMode::AllowSpontaneous, From 98d98a2a5fd5d26c005acc7411ca7729fb3fd532 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Sun, 5 Oct 2025 16:53:41 -0700 Subject: [PATCH 05/15] try new method of waiting on futures --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 764690fb388f5..3481301d3db8b 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -298,20 +298,19 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device, // Wait for the queue to finish processing all submitted work static void ggml_backend_webgpu_wait(webgpu_context & ctx, - std::vector & futures, + std::vector> & futures, uint64_t timeout_ms = UINT64_MAX) { // WebGPU implementations may limit the number of futures that can be waited on at once, // so wait in batches (64 is what Dawn supports). size_t i = 0; while (i < futures.size()) { - size_t end = std::min(i + WEBGPU_WAIT_ANY_BATCH_SIZE, futures.size()); - auto waitStatus = ctx->instance.WaitAny(end - i, futures.data() + i, timeout_ms); + auto waitStatus = ctx->instance.WaitAny(futures[i].size(), futures[i].data(), timeout_ms); switch (waitStatus) { case wgpu::WaitStatus::Success: - futures.erase(futures.begin() + i, futures.begin() + end); + futures.erase(futures.begin() + i); break; case wgpu::WaitStatus::TimedOut: - i += WEBGPU_WAIT_ANY_BATCH_SIZE; + i++; break; case wgpu::WaitStatus::Error: GGML_LOG_ERROR("ggml_webgpu: WaitAny returned an error\n"); @@ -538,7 +537,7 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx, uint32_t wg_x = ((size + 3) + bytes_per_wg - 1) / bytes_per_wg; webgpu_command command = ggml_backend_webgpu_build(ctx, ctx->memset_pipeline, params, entries, wg_x); - auto futures = ggml_backend_webgpu_submit(ctx, { command }); + std::vector> futures = { ggml_backend_webgpu_submit(ctx, { command }) }; ggml_backend_webgpu_wait(ctx, futures); } @@ -1207,7 +1206,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_encode); std::vector commands; - std::vector futures; + std::vector> futures; for (int i = 0; i < cgraph->n_nodes; i++) { if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) { commands.push_back(*cmd); @@ -1215,15 +1214,14 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str if (commands.size() >= WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) { std::vector new_futures = ggml_backend_webgpu_submit(ctx, commands); // check if previous futures have finished - ggml_backend_webgpu_wait(ctx, futures, 1); - futures.insert(futures.end(), new_futures.begin(), new_futures.end()); + ggml_backend_webgpu_wait(ctx, futures); + futures.push_back({ new_futures }); commands.clear(); } } if (!commands.empty()) { std::vector new_futures = ggml_backend_webgpu_submit(ctx, commands); - futures.insert(futures.end(), new_futures.begin(), new_futures.end()); - commands.clear(); + futures.push_back({ new_futures }); } WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_encode, ctx); From 26c44f8e4edf5527652db48606df3c54385b8e0f Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Sun, 5 Oct 2025 20:15:04 -0700 Subject: [PATCH 06/15] Add serializing of command 
submission in some cases --- .github/workflows/build.yml | 2 +- ggml/CMakeLists.txt | 3 ++- ggml/src/ggml-webgpu/CMakeLists.txt | 4 +++ ggml/src/ggml-webgpu/ggml-webgpu.cpp | 40 +++++++++++++++------------- 4 files changed, 29 insertions(+), 20 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index db885907cdaed..c24b11d3bc088 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -485,7 +485,7 @@ jobs: id: cmake_build run: | export Dawn_DIR=dawn/lib64/cmake/Dawn - cmake -B build -DGGML_WEBGPU=ON + cmake -B build -DGGML_WEBGPU=ON -DGGML_WEBGPU_SERIALIZE_SUBMIT=ON cmake --build build --config Release -j $(nproc) - name: Test diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 67d02e73f6767..f8bccd5a657b6 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -221,9 +221,10 @@ option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug in option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) option(GGML_WEBGPU "ggml: use WebGPU" OFF) +option(GGML_WEBGPU_SERIALIZE_SUBMIT "ggml: enable WebGPU command serialization" OFF) option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF) option(GGML_WEBGPU_CPU_PROFILE "ggml: enable WebGPU profiling (CPU)" OFF) -option(GGML_WEBGPU_GPU_PROFILE "ggml: enable WebGPU profiling (GPU)" OFF) +option(GGML_WEBGPU_GPU_PROFILE "ggml: enable WebGPU profiling (GPU)" OFF) option(GGML_ZDNN "ggml: use zDNN" OFF) option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index c6a95d5151245..d45ce0acffba7 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -46,6 +46,10 @@ else() set(DawnWebGPU_TARGET dawn::webgpu_dawn) endif() +if (GGML_WEBGPU_SERIALIZE_SUBMIT) + target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_SERIALIZE_SUBMIT=1) +endif() + if (GGML_WEBGPU_DEBUG) target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_DEBUG=1) endif() diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 3481301d3db8b..2487e1b5560a4 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -56,10 +56,16 @@ # define WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES 16 // e.g. enough for two timestamps #endif +#ifdef GGML_WEBGPU_SERIALIZE_SUBMIT +# define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 1 +# define WEBGPU_WAIT_ANY_TIMEOUT_MS UINT64_MAX +#else +# define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8 +# define WEBGPU_WAIT_ANY_TIMEOUT_MS 1 +#endif + /* Constants */ -#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8 -#define WEBGPU_WAIT_ANY_BATCH_SIZE 64 #define WEBGPU_MUL_MAT_WG_SIZE 256 #define WEBGPU_NUM_PARAM_BUFS 100 #define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters @@ -297,14 +303,12 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device, /** WebGPU Actions */ // Wait for the queue to finish processing all submitted work -static void ggml_backend_webgpu_wait(webgpu_context & ctx, +static void ggml_backend_webgpu_wait(webgpu_context & ctx, std::vector> & futures, - uint64_t timeout_ms = UINT64_MAX) { - // WebGPU implementations may limit the number of futures that can be waited on at once, - // so wait in batches (64 is what Dawn supports). 
+ uint64_t timeout_ms = UINT64_MAX) { size_t i = 0; while (i < futures.size()) { - auto waitStatus = ctx->instance.WaitAny(futures[i].size(), futures[i].data(), timeout_ms); + auto waitStatus = ctx->instance.WaitAny(futures[i].size(), futures[i].data(), timeout_ms); switch (waitStatus) { case wgpu::WaitStatus::Success: futures.erase(futures.begin() + i); @@ -381,25 +385,25 @@ static void ggml_backend_webgpu_debug(webgpu_context & ctx) { #endif static std::vector ggml_backend_webgpu_submit(webgpu_context ctx, - std::vector builds) { - std::vector commands; + std::vector commands) { + std::vector command_buffers; std::vector params_bufs; std::vector set_rows_error_bufs; #ifdef GGML_WEBGPU_GPU_PROFILE std::vector> pipeline_name_and_ts_bufs; #endif - for (const auto & build : builds) { - commands.push_back(build.commands); - params_bufs.push_back(build.params_bufs); - if (build.set_rows_error_bufs) { - set_rows_error_bufs.push_back(build.set_rows_error_bufs.value()); + for (const auto & command : commands) { + command_buffers.push_back(command.commands); + params_bufs.push_back(command.params_bufs); + if (command.set_rows_error_bufs) { + set_rows_error_bufs.push_back(command.set_rows_error_bufs.value()); } #ifdef GGML_WEBGPU_GPU_PROFILE - pipeline_name_and_ts_bufs.push_back({ build.pipeline_name, build.timestamp_query_bufs }); + pipeline_name_and_ts_bufs.push_back({ command.pipeline_name, command.timestamp_query_bufs }); #endif } - ctx->queue.Submit(commands.size(), commands.data()); + ctx->queue.Submit(command_buffers.size(), command_buffers.data()); std::vector futures; @@ -1205,7 +1209,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str WEBGPU_CPU_PROFILE_TOTAL_START(graph_compute); WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_encode); - std::vector commands; + std::vector commands; std::vector> futures; for (int i = 0; i < cgraph->n_nodes; i++) { if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) { @@ -1214,7 +1218,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str if (commands.size() >= WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) { std::vector new_futures = ggml_backend_webgpu_submit(ctx, commands); // check if previous futures have finished - ggml_backend_webgpu_wait(ctx, futures); + ggml_backend_webgpu_wait(ctx, futures, WEBGPU_WAIT_ANY_TIMEOUT_MS); futures.push_back({ new_futures }); commands.clear(); } From eabab9e0577304e30429cf2fcf652adea7e1926c Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 6 Oct 2025 10:53:55 -0700 Subject: [PATCH 07/15] Add new pool for timestamp queries and clean up logging --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 149 +++++++++++++++++---------- 1 file changed, 92 insertions(+), 57 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 2487e1b5560a4..76344d341fd93 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -36,7 +36,7 @@ std::chrono::duration(cpu_total_end_##id - cpu_total_start_##id).count(); \ (ctx)->cpu_time_ms[#id] += cpu_total_time_##id; -// fine-grained timing (diagnostics only) +// fine-grained timing (not included in totals) # define WEBGPU_CPU_PROFILE_DETAIL_START(id) auto cpu_detail_start_##id = std::chrono::high_resolution_clock::now(); # define WEBGPU_CPU_PROFILE_DETAIL_END(id, ctx) \ @@ -52,7 +52,7 @@ #endif // GGML_WEBGPU_CPU_PROFILE #ifdef GGML_WEBGPU_GPU_PROFILE -# define WEBGPU_NUM_TIMESTAMP_QUERY_BUFS 100 +# define WEBGPU_NUM_TIMESTAMP_QUERY_BUFS 24 # define 
WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES 16 // e.g. enough for two timestamps #endif @@ -67,7 +67,7 @@ /* Constants */ #define WEBGPU_MUL_MAT_WG_SIZE 256 -#define WEBGPU_NUM_PARAM_BUFS 100 +#define WEBGPU_NUM_PARAM_BUFS 32 #define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters #define WEBGPU_NUM_SET_ROWS_ERROR_BUFS 32 #define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4 @@ -149,6 +149,68 @@ struct webgpu_buf_pool { } }; +#ifdef GGML_WEBGPU_GPU_PROFILE +struct webgpu_gpu_profile_bufs { + wgpu::Buffer host_buf; + wgpu::Buffer dev_buf; + wgpu::QuerySet query_set; +}; + +// Holds a pool of parameter buffers for WebGPU operations +struct webgpu_gpu_profile_buf_pool { + std::vector free; + + std::mutex mutex; + + std::condition_variable cv; + + void init(wgpu::Device device, + int num_bufs, + size_t buf_size, + wgpu::BufferUsage dev_buf_usage, + wgpu::BufferUsage host_buf_usage) { + for (int i = 0; i < num_bufs; i++) { + wgpu::Buffer host_buf; + wgpu::Buffer dev_buf; + ggml_webgpu_create_buffer(device, host_buf, buf_size, host_buf_usage, "ggml_webgpu_host_profile_buf"); + ggml_webgpu_create_buffer(device, dev_buf, buf_size, dev_buf_usage, "ggml_webgpu_dev_profile_buf"); + // Create a query set for 2 timestamps + wgpu::QuerySetDescriptor ts_query_set_desc = {}; + + ts_query_set_desc.type = wgpu::QueryType::Timestamp; + ts_query_set_desc.count = 2; + wgpu::QuerySet ts_query_set = device.CreateQuerySet(&ts_query_set_desc); + + free.push_back({ host_buf, dev_buf, ts_query_set }); + } + } + + webgpu_gpu_profile_bufs alloc_bufs() { + std::unique_lock lock(mutex); + cv.wait(lock, [this] { return !free.empty(); }); + webgpu_gpu_profile_bufs bufs = free.back(); + free.pop_back(); + return bufs; + } + + void free_bufs(std::vector bufs) { + std::lock_guard lock(mutex); + free.insert(free.end(), bufs.begin(), bufs.end()); + cv.notify_all(); + } + + void cleanup() { + std::lock_guard lock(mutex); + for (auto & bufs : free) { + bufs.host_buf.Destroy(); + bufs.dev_buf.Destroy(); + bufs.query_set.Destroy(); + } + free.clear(); + } +}; +#endif + struct webgpu_pipeline { wgpu::ComputePipeline pipeline; std::string name; @@ -159,8 +221,8 @@ struct webgpu_command { webgpu_pool_bufs params_bufs; std::optional set_rows_error_bufs; #ifdef GGML_WEBGPU_GPU_PROFILE - webgpu_pool_bufs timestamp_query_bufs; - std::string pipeline_name; + webgpu_gpu_profile_bufs timestamp_query_bufs; + std::string pipeline_name; #endif }; @@ -218,7 +280,7 @@ struct webgpu_context_struct { // Profiling: per-shader GPU time in ms std::unordered_map shader_gpu_time_ms; // Profiling: pool of timestamp query buffers (one per operation) - webgpu_buf_pool timestamp_query_buf_pool; + webgpu_gpu_profile_buf_pool timestamp_query_buf_pool; #endif }; @@ -259,8 +321,6 @@ static void ggml_webgpu_create_pipeline(wgpu::Device & const char * shader_code, const char * label, const std::vector & constants = {}) { - WEBGPU_LOG_DEBUG("ggml_webgpu_create_pipeline()"); - wgpu::ShaderSourceWGSL shader_source; shader_source.code = shader_code; @@ -286,8 +346,6 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device, size_t size, wgpu::BufferUsage usage, const char * label) { - WEBGPU_LOG_DEBUG("ggml_webgpu_create_buffer()"); - wgpu::BufferDescriptor buffer_desc; buffer_desc.size = size; buffer_desc.usage = usage; @@ -326,28 +384,6 @@ static void ggml_backend_webgpu_wait(webgpu_context & } } -#ifdef GGML_WEBGPU_GPU_PROFILE -static wgpu::FutureWaitInfo ggml_backend_webgpu_process_timestamps(webgpu_context & ctx, - webgpu_pool_bufs ts_bufs, - std::string 
label) { - wgpu::Future f = ts_bufs.host_buf.MapAsync( - wgpu::MapMode::Read, 0, ts_bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous, - [ctx, ts_bufs, label](wgpu::MapAsyncStatus status, wgpu::StringView message) { - if (status != wgpu::MapAsyncStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to map timestamp buffer: %s\n", std::string(message).c_str()); - } else { - const uint64_t * ts_data = (const uint64_t *) ts_bufs.host_buf.GetConstMappedRange(); - // WebGPU timestamps are in ticks; convert to ms using device timestamp period if available - double elapsed_ms = double(ts_data[1] - ts_data[0]) * 1e-6; // TODO: use actual timestamp period - ctx->shader_gpu_time_ms[label] += elapsed_ms; - // We can't unmap in here due to WebGPU reentrancy limitations. - ctx->timestamp_query_buf_pool.free_bufs({ ts_bufs }); - } - }); - return { f }; -} -#endif - static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx, wgpu::Buffer & buffer, wgpu::MapMode mode, @@ -390,7 +426,7 @@ static std::vector ggml_backend_webgpu_submit(webgpu_conte std::vector params_bufs; std::vector set_rows_error_bufs; #ifdef GGML_WEBGPU_GPU_PROFILE - std::vector> pipeline_name_and_ts_bufs; + std::vector> pipeline_name_and_ts_bufs; #endif for (const auto & command : commands) { @@ -399,9 +435,6 @@ static std::vector ggml_backend_webgpu_submit(webgpu_conte if (command.set_rows_error_bufs) { set_rows_error_bufs.push_back(command.set_rows_error_bufs.value()); } -#ifdef GGML_WEBGPU_GPU_PROFILE - pipeline_name_and_ts_bufs.push_back({ command.pipeline_name, command.timestamp_query_bufs }); -#endif } ctx->queue.Submit(command_buffers.size(), command_buffers.data()); @@ -437,9 +470,25 @@ static std::vector ggml_backend_webgpu_submit(webgpu_conte } #ifdef GGML_WEBGPU_GPU_PROFILE - for (const auto & name_and_bufs : pipeline_name_and_ts_bufs) { - wgpu::FutureWaitInfo f = ggml_backend_webgpu_process_timestamps(ctx, name_and_bufs.second, name_and_bufs.first); - futures.push_back(f); + for (const auto & command : commands) { + auto label = command.pipeline_name; + auto ts_bufs = command.timestamp_query_bufs; + + wgpu::Future f = ts_bufs.host_buf.MapAsync( + wgpu::MapMode::Read, 0, ts_bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous, + [ctx, ts_bufs, label](wgpu::MapAsyncStatus status, wgpu::StringView message) { + if (status != wgpu::MapAsyncStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to map timestamp buffer: %s\n", std::string(message).c_str()); + } else { + const uint64_t * ts_data = (const uint64_t *) ts_bufs.host_buf.GetConstMappedRange(); + // WebGPU timestamps are in ns; convert to ms + double elapsed_ms = double(ts_data[1] - ts_data[0]) * 1e-6; + ctx->shader_gpu_time_ms[label] += elapsed_ms; + // We can't unmap in here due to WebGPU reentrancy limitations. 
+ ctx->timestamp_query_buf_pool.free_bufs({ ts_bufs }); + } + }); + futures.push_back({ f }); } #endif return futures; @@ -480,18 +529,12 @@ static webgpu_command ggml_backend_webgpu_build(webgpu_context & #ifdef GGML_WEBGPU_GPU_PROFILE // --- Profiling: GPU timestamp queries --- // Allocate a timestamp query buffer (2 timestamps: start/end) - webgpu_pool_bufs ts_bufs = ctx->timestamp_query_buf_pool.alloc_bufs(); + webgpu_gpu_profile_bufs ts_bufs = ctx->timestamp_query_buf_pool.alloc_bufs(); if (ts_bufs.host_buf.GetMapState() == wgpu::BufferMapState::Mapped) { ts_bufs.host_buf.Unmap(); } - // Create a query set for 2 timestamps - wgpu::QuerySetDescriptor ts_query_set_desc = {}; - ts_query_set_desc.type = wgpu::QueryType::Timestamp; - ts_query_set_desc.count = 2; - wgpu::QuerySet ts_query_set = ctx->device.CreateQuerySet(&ts_query_set_desc); - - wgpu::PassTimestampWrites ts_writes = { .querySet = ts_query_set, + wgpu::PassTimestampWrites ts_writes = { .querySet = ts_bufs.query_set, .beginningOfPassWriteIndex = 0, .endOfPassWriteIndex = 1 }; wgpu::ComputePassDescriptor pass_desc = { .timestampWrites = &ts_writes }; @@ -506,7 +549,7 @@ static webgpu_command ggml_backend_webgpu_build(webgpu_context & #ifdef GGML_WEBGPU_GPU_PROFILE // Resolve the query set into the device buffer - encoder.ResolveQuerySet(ts_query_set, 0, 2, ts_bufs.dev_buf, 0); + encoder.ResolveQuerySet(ts_bufs.query_set, 0, 2, ts_bufs.dev_buf, 0); encoder.CopyBufferToBuffer(ts_bufs.dev_buf, 0, ts_bufs.host_buf, 0, ts_bufs.host_buf.GetSize()); #endif @@ -1137,7 +1180,7 @@ static webgpu_command ggml_webgpu_soft_max(webgpu_context & ctx, ggml_nrows(dst)); } -// Returns true if node has enqueued work into the queue, false otherwise +// Returns the encoded command, or std::nullopt if the operation is a no-op static std::optional ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) { if (ggml_is_empty(node)) { return std::nullopt; @@ -1208,7 +1251,6 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str WEBGPU_CPU_PROFILE_TOTAL_START(graph_compute); - WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_encode); std::vector commands; std::vector> futures; for (int i = 0; i < cgraph->n_nodes; i++) { @@ -1227,14 +1269,8 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str std::vector new_futures = ggml_backend_webgpu_submit(ctx, commands); futures.push_back({ new_futures }); } - WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_encode, ctx); - - WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_wait); ggml_backend_webgpu_wait(ctx, futures); - WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_wait, ctx); - WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx); - return GGML_STATUS_SUCCESS; } @@ -1260,7 +1296,6 @@ static ggml_backend_i ggml_backend_webgpu_i = { /* GGML Backend Buffer Interface */ static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_free_buffer()"); ggml_backend_webgpu_buffer_context * ctx = static_cast(buffer->context); ctx->buffer.Destroy(); } From c30b22e2fd69fd0df55906b1407f3207bbfbdabe Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 6 Oct 2025 19:26:21 -0700 Subject: [PATCH 08/15] Serialize command submission in CI and leave a TODO note --- .github/workflows/build.yml | 2 +- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c24b11d3bc088..b326048c2fb18 100644 --- 
a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -175,7 +175,7 @@ jobs: id: cmake_build run: | export CMAKE_PREFIX_PATH=dawn - cmake -B build -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF + cmake -B build -DGGML_WEBGPU=ON -DGGML_WEBGPU_SERIALIZE_SUBMIT=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - name: Test diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 76344d341fd93..e80af29aa3066 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -56,6 +56,10 @@ # define WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES 16 // e.g. enough for two timestamps #endif +// TODO: The WebGPU backend can deadlock in multi-threaded scenarios if the parameter buffer pool +// is exhausted and the command submit batch size is too high, or in cases where the underlying +// WebGPU implementation has bugs in handling concurrent operations. Serializing command submission +// is a workaround, but we should also investigate better solutions. #ifdef GGML_WEBGPU_SERIALIZE_SUBMIT # define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 1 # define WEBGPU_WAIT_ANY_TIMEOUT_MS UINT64_MAX From b926b0c5f0f8b0ce66f182220c7bba7c30426521 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 6 Oct 2025 19:57:23 -0700 Subject: [PATCH 09/15] Update webgpu CI --- .github/workflows/build.yml | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b77da9f911a5d..5a37f6ebd1e80 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -444,8 +444,8 @@ jobs: # This is using llvmpipe and runs slower than other backends ctest -L main --verbose --timeout 4200 - ubuntu-22-cmake-webgpu: - runs-on: ubuntu-22.04 + ubuntu-24-cmake-webgpu: + runs-on: ubuntu-24.04 steps: - name: Clone @@ -455,16 +455,34 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: ubuntu-22-cmake-webgpu + key: ubuntu-24-cmake-webgpu evict-old-files: 1d - - name: Vulkan SDK Dependencies - id: vulkan-depends + - name: Dependencies + id: depends run: | - wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - - sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list + sudo add-apt-repository -y ppa:kisak/kisak-mesa sudo apt-get update -y - sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev + sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev + + - name: Get latest Vulkan SDK version + id: vulkan_sdk_version + run: | + echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV" + + - name: Use Vulkan SDK Cache + uses: actions/cache@v4 + id: cache-sdk + with: + path: ./vulkan_sdk + key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }} + + - name: Setup Vulkan SDK + if: steps.cache-sdk.outputs.cache-hit != 'true' + uses: ./.github/actions/linux-setup-vulkan + with: + path: ./vulkan_sdk + version: ${{ env.VULKAN_SDK_VERSION }} - name: Dawn Dependency id: dawn-depends From d501ef4e2e5c62b24ed6f8b79c9b9008ffe165f5 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 7 Oct 2025 09:07:03 -0700 Subject: [PATCH 10/15] Add myself as WebGPU codeowner --- CODEOWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/CODEOWNERS 
b/CODEOWNERS index 4ee93fa4babb2..bfffcf3fcdce5 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -74,6 +74,7 @@ /ggml/src/ggml.c @ggerganov @slaren /ggml/src/ggml.cpp @ggerganov @slaren /ggml/src/gguf.cpp @JohannesGaessler @Green-Sky +/ggml/src/ggml-webgpu/ @reeselevine /gguf-py/ @CISC /media/ @ggerganov /scripts/gen* @ggerganov From d56e0613fb8ed6e18af1982b287c6b7448b782b4 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 7 Oct 2025 12:29:50 -0700 Subject: [PATCH 11/15] Deadlock avoidance --- .github/workflows/build.yml | 4 +- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 59 +++++++++++++++++++--------- 2 files changed, 42 insertions(+), 21 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5a37f6ebd1e80..ec2ab5a58d027 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -175,7 +175,7 @@ jobs: id: cmake_build run: | export CMAKE_PREFIX_PATH=dawn - cmake -B build -DGGML_WEBGPU=ON -DGGML_WEBGPU_SERIALIZE_SUBMIT=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF + cmake -B build -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -502,7 +502,7 @@ jobs: id: cmake_build run: | export Dawn_DIR=dawn/lib64/cmake/Dawn - cmake -B build -DGGML_WEBGPU=ON -DGGML_WEBGPU_SERIALIZE_SUBMIT=ON + cmake -B build -DGGML_WEBGPU=ON cmake --build build --config Release -j $(nproc) - name: Test diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index e80af29aa3066..c3961726f19b2 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -11,6 +11,7 @@ #include +#include #include #include #include @@ -65,13 +66,15 @@ # define WEBGPU_WAIT_ANY_TIMEOUT_MS UINT64_MAX #else # define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8 -# define WEBGPU_WAIT_ANY_TIMEOUT_MS 1 +# define WEBGPU_WAIT_ANY_TIMEOUT_MS 0 #endif /* Constants */ #define WEBGPU_MUL_MAT_WG_SIZE 256 #define WEBGPU_NUM_PARAM_BUFS 32 +// Maximum number of in-flight submissions per-thread, to avoid exhausting the parameter buffer pool +#define WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD WEBGPU_NUM_PARAM_BUFS / WEBGPU_COMMAND_SUBMIT_BATCH_SIZE #define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters #define WEBGPU_NUM_SET_ROWS_ERROR_BUFS 32 #define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4 @@ -107,6 +110,11 @@ struct webgpu_pool_bufs { wgpu::Buffer dev_buf; }; +// The futures to wait on for a single queue submission +struct webgpu_submission_futures { + std::vector futures; +}; + // Holds a pool of parameter buffers for WebGPU operations struct webgpu_buf_pool { std::vector free; @@ -243,6 +251,7 @@ struct webgpu_context_struct { uint32_t max_wg_size_x; std::recursive_mutex mutex; + std::atomic_int inflight_threads = 0; webgpu_buf_pool param_buf_pool; webgpu_buf_pool set_rows_error_buf_pool; @@ -365,12 +374,19 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device, /** WebGPU Actions */ // Wait for the queue to finish processing all submitted work -static void ggml_backend_webgpu_wait(webgpu_context & ctx, - std::vector> & futures, - uint64_t timeout_ms = UINT64_MAX) { +static void ggml_backend_webgpu_wait(webgpu_context & ctx, + std::vector & futures, + uint64_t timeout_ms = UINT64_MAX) { + // If we have too many in-flight submissions, wait on the oldest one first. If there are many threads, + // inflight_max may be 0, meaning that we must wait on all futures. 
+ int inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / ctx->inflight_threads; + while (futures.size() >= inflight_max && futures.size() > 0) { + ctx->instance.WaitAny(futures[0].futures.size(), futures[0].futures.data(), UINT64_MAX); + futures.erase(futures.begin()); + } size_t i = 0; while (i < futures.size()) { - auto waitStatus = ctx->instance.WaitAny(futures[i].size(), futures[i].data(), timeout_ms); + auto waitStatus = ctx->instance.WaitAny(futures[i].futures.size(), futures[i].futures.data(), timeout_ms); switch (waitStatus) { case wgpu::WaitStatus::Success: futures.erase(futures.begin() + i); @@ -424,8 +440,7 @@ static void ggml_backend_webgpu_debug(webgpu_context & ctx) { } #endif -static std::vector ggml_backend_webgpu_submit(webgpu_context ctx, - std::vector commands) { +static webgpu_submission_futures ggml_backend_webgpu_submit(webgpu_context ctx, std::vector commands) { std::vector command_buffers; std::vector params_bufs; std::vector set_rows_error_bufs; @@ -484,9 +499,9 @@ static std::vector ggml_backend_webgpu_submit(webgpu_conte if (status != wgpu::MapAsyncStatus::Success) { GGML_LOG_ERROR("ggml_webgpu: Failed to map timestamp buffer: %s\n", std::string(message).c_str()); } else { - const uint64_t * ts_data = (const uint64_t *) ts_bufs.host_buf.GetConstMappedRange(); + const uint64_t * ts_data = (const uint64_t *) ts_bufs.host_buf.GetConstMappedRange(); // WebGPU timestamps are in ns; convert to ms - double elapsed_ms = double(ts_data[1] - ts_data[0]) * 1e-6; + double elapsed_ms = double(ts_data[1] - ts_data[0]) * 1e-6; ctx->shader_gpu_time_ms[label] += elapsed_ms; // We can't unmap in here due to WebGPU reentrancy limitations. ctx->timestamp_query_buf_pool.free_bufs({ ts_bufs }); @@ -495,7 +510,7 @@ static std::vector ggml_backend_webgpu_submit(webgpu_conte futures.push_back({ f }); } #endif - return futures; + return { futures }; } static webgpu_command ggml_backend_webgpu_build(webgpu_context & ctx, @@ -588,7 +603,7 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx, uint32_t wg_x = ((size + 3) + bytes_per_wg - 1) / bytes_per_wg; webgpu_command command = ggml_backend_webgpu_build(ctx, ctx->memset_pipeline, params, entries, wg_x); - std::vector> futures = { ggml_backend_webgpu_submit(ctx, { command }) }; + std::vector futures = { ggml_backend_webgpu_submit(ctx, { command }) }; ggml_backend_webgpu_wait(ctx, futures); } @@ -1255,25 +1270,31 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str WEBGPU_CPU_PROFILE_TOTAL_START(graph_compute); - std::vector commands; - std::vector> futures; + ctx->inflight_threads++; + + std::vector commands; + std::vector futures; for (int i = 0; i < cgraph->n_nodes; i++) { if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) { commands.push_back(*cmd); } - if (commands.size() >= WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) { - std::vector new_futures = ggml_backend_webgpu_submit(ctx, commands); - // check if previous futures have finished + // compute the batch size based on the number of inflight threads + int batch_size = std::min(std::max(1, WEBGPU_NUM_PARAM_BUFS / ctx->inflight_threads), + WEBGPU_COMMAND_SUBMIT_BATCH_SIZE); + if (commands.size() >= batch_size) { + futures.push_back(ggml_backend_webgpu_submit(ctx, commands)); + // Process events and check for completed submissions + ctx->instance.ProcessEvents(); ggml_backend_webgpu_wait(ctx, futures, WEBGPU_WAIT_ANY_TIMEOUT_MS); - futures.push_back({ new_futures }); commands.clear(); } } if (!commands.empty()) { - std::vector 
new_futures = ggml_backend_webgpu_submit(ctx, commands); - futures.push_back({ new_futures }); + webgpu_submission_futures new_futures = ggml_backend_webgpu_submit(ctx, commands); + futures.push_back(new_futures); } ggml_backend_webgpu_wait(ctx, futures); + ctx->inflight_threads--; WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx); return GGML_STATUS_SUCCESS; } From bcf8d9bdfbd23de0c075ea0345ea7556f8251f2d Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 7 Oct 2025 12:46:34 -0700 Subject: [PATCH 12/15] Leave WebGPU/Vulkan CI serialized --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ec2ab5a58d027..22db2192c40fb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -502,7 +502,7 @@ jobs: id: cmake_build run: | export Dawn_DIR=dawn/lib64/cmake/Dawn - cmake -B build -DGGML_WEBGPU=ON + cmake -B build -DGGML_WEBGPU=ON -DGGML_WEBGPU_SERIALIZE_SUBMIT=ON cmake --build build --config Release -j $(nproc) - name: Test From 8a848cb093922fa7b407833dfd145c86b83dcb49 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 7 Oct 2025 12:48:13 -0700 Subject: [PATCH 13/15] Fix divide by 0 --- .github/workflows/build.yml | 2 +- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 22db2192c40fb..ec2ab5a58d027 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -502,7 +502,7 @@ jobs: id: cmake_build run: | export Dawn_DIR=dawn/lib64/cmake/Dawn - cmake -B build -DGGML_WEBGPU=ON -DGGML_WEBGPU_SERIALIZE_SUBMIT=ON + cmake -B build -DGGML_WEBGPU=ON cmake --build build --config Release -j $(nproc) - name: Test diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index c3961726f19b2..99dddd132343a 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -379,7 +379,7 @@ static void ggml_backend_webgpu_wait(webgpu_context & ct uint64_t timeout_ms = UINT64_MAX) { // If we have too many in-flight submissions, wait on the oldest one first. If there are many threads, // inflight_max may be 0, meaning that we must wait on all futures. - int inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / ctx->inflight_threads; + int inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::min(ctx->inflight_threads, 1); while (futures.size() >= inflight_max && futures.size() > 0) { ctx->instance.WaitAny(futures[0].futures.size(), futures[0].futures.data(), UINT64_MAX); futures.erase(futures.begin()); From d3c7ddd4ce5b8bc622a7d07e60e91137e841f5cc Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 7 Oct 2025 13:24:50 -0700 Subject: [PATCH 14/15] Fix logic in division by inflight_threads --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 99dddd132343a..db9bfc3b77431 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -62,17 +62,17 @@ // WebGPU implementation has bugs in handling concurrent operations. Serializing command submission // is a workaround, but we should also investigate better solutions. 
#ifdef GGML_WEBGPU_SERIALIZE_SUBMIT -# define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 1 +# define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 1u # define WEBGPU_WAIT_ANY_TIMEOUT_MS UINT64_MAX #else -# define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8 +# define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8u # define WEBGPU_WAIT_ANY_TIMEOUT_MS 0 #endif /* Constants */ #define WEBGPU_MUL_MAT_WG_SIZE 256 -#define WEBGPU_NUM_PARAM_BUFS 32 +#define WEBGPU_NUM_PARAM_BUFS 32u // Maximum number of in-flight submissions per-thread, to avoid exhausting the parameter buffer pool #define WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD WEBGPU_NUM_PARAM_BUFS / WEBGPU_COMMAND_SUBMIT_BATCH_SIZE #define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters @@ -251,7 +251,7 @@ struct webgpu_context_struct { uint32_t max_wg_size_x; std::recursive_mutex mutex; - std::atomic_int inflight_threads = 0; + std::atomic_uint inflight_threads = 0; webgpu_buf_pool param_buf_pool; webgpu_buf_pool set_rows_error_buf_pool; @@ -379,7 +379,8 @@ static void ggml_backend_webgpu_wait(webgpu_context & ct uint64_t timeout_ms = UINT64_MAX) { // If we have too many in-flight submissions, wait on the oldest one first. If there are many threads, // inflight_max may be 0, meaning that we must wait on all futures. - int inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::min(ctx->inflight_threads, 1); + uint inflight_threads = ctx->inflight_threads; + uint inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u); while (futures.size() >= inflight_max && futures.size() > 0) { ctx->instance.WaitAny(futures[0].futures.size(), futures[0].futures.data(), UINT64_MAX); futures.erase(futures.begin()); @@ -1279,8 +1280,9 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str commands.push_back(*cmd); } // compute the batch size based on the number of inflight threads - int batch_size = std::min(std::max(1, WEBGPU_NUM_PARAM_BUFS / ctx->inflight_threads), - WEBGPU_COMMAND_SUBMIT_BATCH_SIZE); + uint inflight_threads = ctx->inflight_threads; + uint batch_size = std::min(std::max(1u, WEBGPU_NUM_PARAM_BUFS / std::max(inflight_threads, 1u)), + WEBGPU_COMMAND_SUBMIT_BATCH_SIZE); if (commands.size() >= batch_size) { futures.push_back(ggml_backend_webgpu_submit(ctx, commands)); // Process events and check for completed submissions From a5e26c2d3bdd93c065e638838a278d74a213de4d Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 7 Oct 2025 13:39:07 -0700 Subject: [PATCH 15/15] Update CODEOWNERS and remove serialize submit option --- CODEOWNERS | 2 +- ggml/CMakeLists.txt | 1 - ggml/src/ggml-webgpu/CMakeLists.txt | 4 ---- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 23 +++++++---------------- 4 files changed, 8 insertions(+), 22 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index bfffcf3fcdce5..3b696bf94a147 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -70,11 +70,11 @@ /ggml/src/ggml-rpc/ @rgerganov /ggml/src/ggml-threading.* @ggerganov @slaren /ggml/src/ggml-vulkan/ @0cc4m +/ggml/src/ggml-webgpu/ @reeselevine /ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM /ggml/src/ggml.c @ggerganov @slaren /ggml/src/ggml.cpp @ggerganov @slaren /ggml/src/gguf.cpp @JohannesGaessler @Green-Sky -/ggml/src/ggml-webgpu/ @reeselevine /gguf-py/ @CISC /media/ @ggerganov /scripts/gen* @ggerganov diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index f8bccd5a657b6..73032be68e153 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -221,7 +221,6 @@ option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug in 
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) option(GGML_WEBGPU "ggml: use WebGPU" OFF) -option(GGML_WEBGPU_SERIALIZE_SUBMIT "ggml: enable WebGPU command serialization" OFF) option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF) option(GGML_WEBGPU_CPU_PROFILE "ggml: enable WebGPU profiling (CPU)" OFF) option(GGML_WEBGPU_GPU_PROFILE "ggml: enable WebGPU profiling (GPU)" OFF) diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index d45ce0acffba7..c6a95d5151245 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -46,10 +46,6 @@ else() set(DawnWebGPU_TARGET dawn::webgpu_dawn) endif() -if (GGML_WEBGPU_SERIALIZE_SUBMIT) - target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_SERIALIZE_SUBMIT=1) -endif() - if (GGML_WEBGPU_DEBUG) target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_DEBUG=1) endif() diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index db9bfc3b77431..05e16cd432ad3 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -57,22 +57,12 @@ # define WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES 16 // e.g. enough for two timestamps #endif -// TODO: The WebGPU backend can deadlock in multi-threaded scenarios if the parameter buffer pool -// is exhausted and the command submit batch size is too high, or in cases where the underlying -// WebGPU implementation has bugs in handling concurrent operations. Serializing command submission -// is a workaround, but we should also investigate better solutions. -#ifdef GGML_WEBGPU_SERIALIZE_SUBMIT -# define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 1u -# define WEBGPU_WAIT_ANY_TIMEOUT_MS UINT64_MAX -#else -# define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8u -# define WEBGPU_WAIT_ANY_TIMEOUT_MS 0 -#endif - /* Constants */ #define WEBGPU_MUL_MAT_WG_SIZE 256 #define WEBGPU_NUM_PARAM_BUFS 32u +#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8u +#define WEBGPU_WAIT_ANY_TIMEOUT_MS 0 // Maximum number of in-flight submissions per-thread, to avoid exhausting the parameter buffer pool #define WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD WEBGPU_NUM_PARAM_BUFS / WEBGPU_COMMAND_SUBMIT_BATCH_SIZE #define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters @@ -376,11 +366,12 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device, // Wait for the queue to finish processing all submitted work static void ggml_backend_webgpu_wait(webgpu_context & ctx, std::vector & futures, - uint64_t timeout_ms = UINT64_MAX) { + bool block = true) { // If we have too many in-flight submissions, wait on the oldest one first. If there are many threads, // inflight_max may be 0, meaning that we must wait on all futures. - uint inflight_threads = ctx->inflight_threads; - uint inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u); + uint64_t timeout_ms = block ? 
UINT64_MAX : 0; + uint inflight_threads = ctx->inflight_threads; + uint inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u); while (futures.size() >= inflight_max && futures.size() > 0) { ctx->instance.WaitAny(futures[0].futures.size(), futures[0].futures.data(), UINT64_MAX); futures.erase(futures.begin()); @@ -1287,7 +1278,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str futures.push_back(ggml_backend_webgpu_submit(ctx, commands)); // Process events and check for completed submissions ctx->instance.ProcessEvents(); - ggml_backend_webgpu_wait(ctx, futures, WEBGPU_WAIT_ANY_TIMEOUT_MS); + ggml_backend_webgpu_wait(ctx, futures, false); commands.clear(); } }
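
Editorial note on the final throttling scheme (a standalone sketch, not part of the patch series): the last patches bound parameter-buffer usage by combining three quantities visible above -- WEBGPU_NUM_PARAM_BUFS (32), WEBGPU_COMMAND_SUBMIT_BATCH_SIZE (8), and the derived per-thread cap WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD (32 / 8 = 4) -- and by shrinking both the submit batch size and the in-flight cap as ctx->inflight_threads grows. The program below only reproduces that arithmetic; the "approx peak" column assumes each in-flight submission pins exactly one batch of parameter buffers, which is a simplification of the real pool behaviour.

    // Sketch only: mirrors the batch_size / inflight_max formulas from the final
    // ggml-webgpu.cpp to show how parameter-buffer pressure stays near the pool size.
    #include <algorithm>
    #include <cstdio>

    constexpr unsigned NUM_PARAM_BUFS               = 32u;
    constexpr unsigned COMMAND_SUBMIT_BATCH_SIZE    = 8u;
    constexpr unsigned MAX_INFLIGHT_SUBS_PER_THREAD = NUM_PARAM_BUFS / COMMAND_SUBMIT_BATCH_SIZE;  // 4

    int main() {
        for (unsigned threads = 1; threads <= 8; threads++) {
            // Per-thread submit batch shrinks as more threads run graph_compute concurrently.
            unsigned batch_size = std::min(std::max(1u, NUM_PARAM_BUFS / std::max(threads, 1u)),
                                           COMMAND_SUBMIT_BATCH_SIZE);
            // Per-thread cap on un-waited submissions; 0 forces a blocking wait on every future.
            unsigned inflight_max = MAX_INFLIGHT_SUBS_PER_THREAD / std::max(threads, 1u);
            // Rough bound, assuming one batch of param buffers per in-flight submission.
            unsigned approx_peak = std::max(inflight_max, 1u) * batch_size * threads;
            printf("threads=%u batch_size=%u inflight_max=%u approx_peak_param_bufs=%u (pool=%u)\n",
                   threads, batch_size, inflight_max, approx_peak, NUM_PARAM_BUFS);
        }
        return 0;
    }

With one thread this evaluates to batch_size 8 and inflight_max 4 (4 x 8 = 32, the whole pool); with five or more threads inflight_max drops to 0, so every submission is waited on before the next one, which is close to the fully serialized behaviour the removed GGML_WEBGPU_SERIALIZE_SUBMIT option used to force.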
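
Background sketch (editorial, simplified types): the buffer pools above (webgpu_buf_pool, webgpu_gpu_profile_buf_pool) block in alloc_bufs() on a condition variable until free_bufs() runs from a completion callback. The toy program below reproduces that pattern with plain ints to make the failure mode behind the deadlock TODO concrete: if no other thread ever frees an entry, the blocked alloc never returns.

    // Sketch only: blocking pool in the style of webgpu_buf_pool, with ints instead of buffers.
    #include <condition_variable>
    #include <cstdio>
    #include <mutex>
    #include <thread>
    #include <vector>

    struct pool {
        std::vector<int>        free_items;
        std::mutex              mutex;
        std::condition_variable cv;

        void init(int n) {
            for (int i = 0; i < n; i++) {
                free_items.push_back(i);
            }
        }

        int alloc() {
            std::unique_lock<std::mutex> lock(mutex);
            cv.wait(lock, [this] { return !free_items.empty(); });  // blocks when exhausted
            int item = free_items.back();
            free_items.pop_back();
            return item;
        }

        void free_item(int item) {
            std::lock_guard<std::mutex> lock(mutex);
            free_items.push_back(item);
            cv.notify_all();
        }
    };

    int main() {
        pool p;
        p.init(2);  // tiny pool to make exhaustion visible

        int a = p.alloc();
        int b = p.alloc();
        printf("allocated %d and %d, pool now empty\n", a, b);

        // A third alloc() would block forever unless something frees an entry;
        // simulate the "completion callback" with a helper thread.
        std::thread completion([&] { p.free_item(a); });
        int c = p.alloc();  // unblocks once the helper frees an entry
        completion.join();
        printf("allocated %d after a free\n", c);
        return 0;
    }

In the backend, the role of the helper thread is played by the OnSubmittedWorkDone / MapAsync callbacks that return buffers to their pools, and those callbacks only fire once the corresponding futures are waited on or events are processed -- hence the inflight_threads accounting and the ProcessEvents() call added to graph_compute.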