From 5ae936a3bcdb6d3e950fb22b6df523ae9dbcd8ac Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 1 Oct 2025 09:34:25 -0700 Subject: [PATCH 01/15] Add profiling --- ggml/CMakeLists.txt | 2 + ggml/src/ggml-webgpu/CMakeLists.txt | 4 + ggml/src/ggml-webgpu/ggml-webgpu.cpp | 216 ++++++++++++++++++++++----- 3 files changed, 187 insertions(+), 35 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 6ce52ffc6698b..c4f83c6e88d7e 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -222,6 +222,8 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) option(GGML_WEBGPU "ggml: use WebGPU" OFF) option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF) +option(GGML_WEBGPU_PROFILE "ggml: enable WebGPU performance profiling" OFF) + option(GGML_ZDNN "ggml: use zDNN" OFF) option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF) diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index 78a985a4d167a..0f6d65e58035f 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -50,5 +50,9 @@ if (GGML_WEBGPU_DEBUG) target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_DEBUG=1) endif() +if (GGML_WEBGPU_PROFILE) + target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_PROFILE=1) +endif() + target_include_directories(ggml-webgpu PRIVATE ${SHADER_OUTPUT_DIR}) target_link_libraries(ggml-webgpu PRIVATE ${DawnWebGPU_TARGET}) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index e795ca3fd92fd..a8d4e058c7a8f 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -25,6 +25,19 @@ # define WEBGPU_LOG_DEBUG(msg) ((void) 0) #endif // GGML_WEBGPU_DEBUG +#ifdef GGML_WEBGPU_PROFILE +# define WEBGPU_CPU_PROFILE_START(id) auto cpu_start_##id = std::chrono::high_resolution_clock::now(); +# define WEBGPU_CPU_PROFILE_END(id) \ + auto cpu_end_##id = std::chrono::high_resolution_clock::now(); \ + double cpu_time_##id = std::chrono::duration(cpu_end_##id - cpu_start_##id).count(); \ + ctx->cpu_time_ms[#id] += cpu_time_##id; +# define WEBGPU_NUM_TIMESTAMP_QUERY_BUFS 100 +# define WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES 16 // e.g. 
enough for two timestamps +#else +# define WEBGPU_CPU_PROFILE_START(id) ((void) 0) +# define WEBGPU_CPU_PROFILE_END(id) ((void) 0) +#endif // GGML_WEBGPU_PROFILE + /* Constants */ #define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 16 @@ -112,6 +125,11 @@ struct webgpu_buf_pool { } }; +struct webgpu_pipeline { + wgpu::ComputePipeline pipeline; + std::string name; +}; + // All the base objects needed to run operations on a WebGPU device struct webgpu_context_struct { wgpu::Instance instance; @@ -129,21 +147,21 @@ struct webgpu_context_struct { webgpu_buf_pool param_buf_pool; webgpu_buf_pool set_rows_error_buf_pool; - wgpu::ComputePipeline memset_pipeline; - wgpu::ComputePipeline mul_mat_pipeline[30][2]; - wgpu::ComputePipeline set_rows_pipeline; - wgpu::ComputePipeline get_rows_pipeline[30]; - wgpu::ComputePipeline get_rows_f32_no_vec_pipeline; - wgpu::ComputePipeline cpy_pipeline[2][2]; // src type, dst type - wgpu::ComputePipeline add_pipeline[2][2]; // type, inplace - wgpu::ComputePipeline sub_pipeline[2][2]; // type, inplace - wgpu::ComputePipeline mul_pipeline[2][2]; // type, inplace - wgpu::ComputePipeline div_pipeline[2][2]; // type, inplace - wgpu::ComputePipeline rms_norm_pipeline[2]; // inplace - wgpu::ComputePipeline rope_pipeline[2][2][2]; // type, ff, inplace - wgpu::ComputePipeline glu_pipeline[7][2][2]; // glu-op, type, split - wgpu::ComputePipeline scale_pipeline[2]; // inplace - wgpu::ComputePipeline soft_max_pipeline[3][2][2]; // (no_mask, f32_mask, f16_mask), has_sink, inplace + webgpu_pipeline memset_pipeline; + webgpu_pipeline mul_mat_pipeline[30][2]; + webgpu_pipeline set_rows_pipeline; + webgpu_pipeline get_rows_pipeline[30]; + webgpu_pipeline get_rows_f32_no_vec_pipeline; + webgpu_pipeline cpy_pipeline[2][2]; // src type, dst type + webgpu_pipeline add_pipeline[2][2]; // type, inplace + webgpu_pipeline sub_pipeline[2][2]; // type, inplace + webgpu_pipeline mul_pipeline[2][2]; // type, inplace + webgpu_pipeline div_pipeline[2][2]; // type, inplace + webgpu_pipeline rms_norm_pipeline[2]; // inplace + webgpu_pipeline rope_pipeline[2][2][2]; // type, ff, inplace + webgpu_pipeline glu_pipeline[7][2][2]; // glu-op, type, split + webgpu_pipeline scale_pipeline[2]; // inplace + webgpu_pipeline soft_max_pipeline[3][2][2]; // (no_mask, f32_mask, f16_mask), has_sink, inplace size_t memset_bytes_per_thread; @@ -164,6 +182,20 @@ struct webgpu_context_struct { wgpu::Buffer debug_host_buf; wgpu::Buffer debug_dev_buf; #endif + +#ifdef GGML_WEBGPU_PROFILE + // Profiling: per-shader GPU time in ms + std::unordered_map shader_gpu_time_ms; + + // Profiling: labeled CPU time in ms + std::unordered_map cpu_time_ms; + + // Profiling: pool of timestamp query buffers (one per operation) + webgpu_buf_pool timestamp_query_buf_pool; + + // Profiling: staged timestamp buffers and their labels for batch submission + std::vector> staged_timestamp_bufs; +#endif }; typedef std::shared_ptr webgpu_context; @@ -199,7 +231,7 @@ struct ggml_backend_webgpu_buffer_context { /* WebGPU object initializations */ static void ggml_webgpu_create_pipeline(wgpu::Device & device, - wgpu::ComputePipeline & pipeline, + webgpu_pipeline & pipeline, const char * shader_code, const char * label, const std::vector & constants = {}) { @@ -222,7 +254,7 @@ static void ggml_webgpu_create_pipeline(wgpu::Device & pipeline_desc.compute.constants = constants.data(); pipeline_desc.compute.constantCount = constants.size(); } - pipeline = device.CreateComputePipeline(&pipeline_desc); + pipeline = { device.CreateComputePipeline(&pipeline_desc), 
label }; } static void ggml_webgpu_create_buffer(wgpu::Device & device, @@ -271,6 +303,28 @@ static void ggml_backend_webgpu_wait_on_submission(webgpu_context & ctx) { } } +#ifdef GGML_WEBGPU_PROFILE +static wgpu::FutureWaitInfo ggml_backend_webgpu_process_timestamps(webgpu_context & ctx, + webgpu_pool_bufs ts_bufs, + std::string label) { + wgpu::Future f = ts_bufs.host_buf.MapAsync( + wgpu::MapMode::Read, 0, ts_bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous, + [ctx, ts_bufs, label](wgpu::MapAsyncStatus status, wgpu::StringView message) { + if (status != wgpu::MapAsyncStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to map timestamp buffer: %s\n", std::string(message).c_str()); + } else { + const uint64_t * ts_data = (const uint64_t *) ts_bufs.host_buf.GetConstMappedRange(); + // WebGPU timestamps are in ticks; convert to ms using device timestamp period if available + double elapsed_ms = double(ts_data[1] - ts_data[0]) * 1e-6; // TODO: use actual timestamp period + ctx->shader_gpu_time_ms[label] += elapsed_ms; + // We can't unmap in here due to WebGPU reentrancy limitations. + ctx->timestamp_query_buf_pool.free_bufs({ ts_bufs }); + } + }); + return { f }; +} +#endif + static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) { std::lock_guard lock(ctx->mutex); WEBGPU_LOG_DEBUG("ggml_backend_webgpu_submit_queue()"); @@ -295,6 +349,10 @@ static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) { std::vector staged_param_bufs = std::move(ctx->staged_param_bufs); std::vector staged_set_row_error_bufs = std::move(ctx->staged_set_row_error_bufs); +#ifdef GGML_WEBGPU_PROFILE + std::vector> staged_timestamp_bufs = std::move(ctx->staged_timestamp_bufs); +#endif + // Free the staged parameter buffers once the submission completes wgpu::Future p_f = ctx->queue.OnSubmittedWorkDone( wgpu::CallbackMode::AllowSpontaneous, @@ -325,6 +383,14 @@ static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) { }); ctx->callback_futures.push_back({ f }); } + +#ifdef GGML_WEBGPU_PROFILE + for (auto & pair : staged_timestamp_bufs) { + auto & ts_bufs = pair.first; + const std::string & label = pair.second; + ctx->callback_futures.push_back(ggml_backend_webgpu_process_timestamps(ctx, ts_bufs, label)); + } +#endif } static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx, @@ -365,7 +431,7 @@ static void ggml_backend_webgpu_debug(webgpu_context & ctx) { #endif static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & ctx, - wgpu::ComputePipeline & pipeline, + webgpu_pipeline & pipeline, std::vector params, std::vector bind_group_entries, uint32_t wg_x, @@ -388,7 +454,7 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & .size = params_bufs.dev_buf.GetSize() }); wgpu::BindGroupDescriptor bind_group_desc; - bind_group_desc.layout = pipeline.GetBindGroupLayout(0); + bind_group_desc.layout = pipeline.pipeline.GetBindGroupLayout(0); bind_group_desc.entryCount = bind_group_entries.size(); bind_group_desc.entries = bind_group_entries.data(); if (bind_group_label) { @@ -398,11 +464,40 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder(); encoder.CopyBufferToBuffer(params_bufs.host_buf, 0, params_bufs.dev_buf, 0, params_bufs.dev_buf.GetSize()); + +#ifdef GGML_WEBGPU_PROFILE + // --- Profiling: GPU timestamp queries --- + // Allocate a timestamp query buffer (2 timestamps: start/end) + webgpu_pool_bufs ts_bufs = ctx->timestamp_query_buf_pool.alloc_bufs(); + if 
(ts_bufs.host_buf.GetMapState() == wgpu::BufferMapState::Mapped) { + ts_bufs.host_buf.Unmap(); + } + + // Create a query set for 2 timestamps + wgpu::QuerySetDescriptor ts_query_set_desc = {}; + ts_query_set_desc.type = wgpu::QueryType::Timestamp; + ts_query_set_desc.count = 2; + wgpu::QuerySet ts_query_set = ctx->device.CreateQuerySet(&ts_query_set_desc); + + wgpu::PassTimestampWrites ts_writes = { .querySet = ts_query_set, + .beginningOfPassWriteIndex = 0, + .endOfPassWriteIndex = 1 }; + wgpu::ComputePassDescriptor pass_desc = { .timestampWrites = &ts_writes }; + wgpu::ComputePassEncoder pass = encoder.BeginComputePass(&pass_desc); +#else wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); - pass.SetPipeline(pipeline); +#endif + pass.SetPipeline(pipeline.pipeline); pass.SetBindGroup(0, bind_group); pass.DispatchWorkgroups(wg_x, 1, 1); pass.End(); + +#ifdef GGML_WEBGPU_PROFILE + // Resolve the query set into the device buffer + encoder.ResolveQuerySet(ts_query_set, 0, 2, ts_bufs.dev_buf, 0); + encoder.CopyBufferToBuffer(ts_bufs.dev_buf, 0, ts_bufs.host_buf, 0, ts_bufs.host_buf.GetSize()); +#endif + wgpu::CommandBuffer commands = encoder.Finish(); if (submit_and_wait) { // Submit and wait immediately @@ -416,12 +511,20 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & ctx->param_buf_pool.free_bufs({ params_bufs }); }), UINT64_MAX); +#ifdef GGML_WEBGPU_PROFILE + ctx->instance.WaitAny(ggml_backend_webgpu_process_timestamps(ctx, ts_bufs, pipeline.name).future, UINT64_MAX); +#endif } else { // Lock the context mutex when pushing to the staging vectors. std::lock_guard lock(ctx->mutex); // Enqueue commands and only submit if we have enough staged commands ctx->staged_command_bufs.push_back(commands); ctx->staged_param_bufs.push_back(params_bufs); +#ifdef GGML_WEBGPU_PROFILE + // Store timestamp buffer and label for later processing + ctx->staged_timestamp_bufs.push_back(std::make_pair(ts_bufs, pipeline.name)); +#endif + if (ctx->staged_command_bufs.size() == WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) { ggml_backend_webgpu_submit_queue(ctx); ggml_backend_webgpu_wait_on_submission(ctx); @@ -456,8 +559,33 @@ static void ggml_backend_webgpu_free(ggml_backend_t backend) { ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *) backend->context; WEBGPU_LOG_DEBUG("ggml_backend_webgpu_free(" << ctx->name << ")"); - // TODO: cleanup +#ifdef GGML_WEBGPU_PROFILE + // Print detailed profiling information + std::cout << "\n[ggml_webgpu profiling summary]\n"; + double total_gpu = 0.0; + for (const auto & kv : ctx->webgpu_ctx->shader_gpu_time_ms) { + total_gpu += kv.second; + } + double total_cpu = 0.0; + for (const auto & kv : ctx->webgpu_ctx->cpu_time_ms) { + total_cpu += kv.second; + } + std::cout << "ggml_webgpu: total cpu time (all shaders): " << total_cpu << " ms\n"; + std::cout << "ggml_webgpu: total gpu time (all shaders): " << total_gpu << " ms\n"; + std::cout << "ggml_webgpu: gpu/cpu ratio: " << (total_cpu > 0.0 ? total_gpu / total_cpu : 0.0) << "\n"; + std::cout << "ggml_webgpu: cpu breakdown:\n"; + for (const auto & kv : ctx->webgpu_ctx->cpu_time_ms) { + double pct = (total_cpu > 0.0) ? (kv.second / total_cpu * 100.0) : 0.0; + std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << pct << "%)\n"; + } + std::cout << "\nggml_webgpu: gpu breakdown:\n"; + for (const auto & kv : ctx->webgpu_ctx->shader_gpu_time_ms) { + double pct = (total_gpu > 0.0) ? 
(kv.second / total_gpu * 100.0) : 0.0; + std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << pct << "%)\n"; + } +#else GGML_UNUSED(ctx); +#endif } static size_t ggml_webgpu_tensor_offset(const ggml_tensor * tensor) { @@ -610,7 +738,7 @@ static void ggml_webgpu_get_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t size_t max_wg_size = ctx->max_wg_size_x; uint32_t wg_x = (dst->ne[1] * dst->ne[2] * dst->ne[3] + max_wg_size - 1) / max_wg_size; - wgpu::ComputePipeline pipeline = ctx->get_rows_pipeline[src->type]; + webgpu_pipeline pipeline = ctx->get_rows_pipeline[src->type]; if (src->type == GGML_TYPE_F32 && dst->ne[0] % 4 != 0) { pipeline = ctx->get_rows_f32_no_vec_pipeline; } @@ -658,12 +786,12 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t ggml_op_name(dst->op)); } -static void ggml_webgpu_binary_op(webgpu_context & ctx, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * dst, - wgpu::ComputePipeline & pipeline, - bool inplace) { +static void ggml_webgpu_binary_op(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * dst, + webgpu_pipeline & pipeline, + bool inplace) { std::vector params = { (uint32_t) ggml_nelements(dst), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)), @@ -822,9 +950,9 @@ static void ggml_webgpu_rope(webgpu_context & ctx, .size = ggml_webgpu_tensor_binding_size(ctx, dst) }); } - wgpu::ComputePipeline pipeline = ctx->rope_pipeline[dst->type][has_freq_factor][inplace]; - size_t max_wg_size = ctx->max_wg_size_x; - uint32_t wg_x = (ggml_nelements(src0) / 2 + max_wg_size - 1) / max_wg_size; + webgpu_pipeline pipeline = ctx->rope_pipeline[dst->type][has_freq_factor][inplace]; + size_t max_wg_size = ctx->max_wg_size_x; + uint32_t wg_x = (ggml_nelements(src0) / 2 + max_wg_size - 1) / max_wg_size; ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op)); } @@ -875,9 +1003,9 @@ static void ggml_webgpu_glu(webgpu_context & ctx, ggml_tensor * src0, ggml_tenso .offset = ggml_webgpu_tensor_align_offset(ctx, dst), .size = ggml_webgpu_tensor_binding_size(ctx, dst) }); - wgpu::ComputePipeline pipeline = ctx->glu_pipeline[ggml_get_glu_op(dst)][dst->type][split]; - size_t max_wg_size = ctx->max_wg_size_x; - uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size; + webgpu_pipeline pipeline = ctx->glu_pipeline[ggml_get_glu_op(dst)][dst->type][split]; + size_t max_wg_size = ctx->max_wg_size_x; + uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size; ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op)); } @@ -1076,6 +1204,9 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str ggml_backend_webgpu_context * backend_ctx = static_cast(backend->context); webgpu_context ctx = backend_ctx->webgpu_ctx; + // --- Profiling: CPU timing for graph compute --- + WEBGPU_CPU_PROFILE_START(graph_compute); + for (int i = 0; i < cgraph->n_nodes; i++) { ggml_webgpu_encode_node(ctx, cgraph->nodes[i]); } @@ -1083,6 +1214,8 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str ggml_backend_webgpu_submit_queue(ctx); ggml_backend_webgpu_wait_on_submission(ctx); + WEBGPU_CPU_PROFILE_END(graph_compute); + return GGML_STATUS_SUCCESS; } @@ -1902,7 +2035,11 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t // Initialize device std::vector required_features = { 
wgpu::FeatureName::ShaderF16, wgpu::FeatureName::ImplicitDeviceSynchronization }; - wgpu::DeviceDescriptor dev_desc; +#ifdef GGML_WEBGPU_PROFILE + required_features.push_back(wgpu::FeatureName::TimestampQuery); +#endif + + wgpu::DeviceDescriptor dev_desc; dev_desc.requiredLimits = &ctx->limits; dev_desc.requiredFeatures = required_features.data(); dev_desc.requiredFeatureCount = required_features.size(); @@ -1939,6 +2076,15 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t ctx->param_buf_pool.init(ctx->device, WEBGPU_NUM_PARAM_BUFS, WEBGPU_PARAMS_BUF_SIZE_BYTES, wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform, wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite); + +#ifdef GGML_WEBGPU_PROFILE + // Initialize buffer pool for timestamp queries (profiling) + ctx->timestamp_query_buf_pool.init(ctx->device, WEBGPU_NUM_TIMESTAMP_QUERY_BUFS, + WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES, + wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc, + wgpu::BufferUsage::MapRead | wgpu::BufferUsage::CopyDst); +#endif + ctx->set_rows_error_buf_pool.init(ctx->device, WEBGPU_NUM_SET_ROWS_ERROR_BUFS, WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES, wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::Storage, wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead); From bd3d0807dc340621b3dee85b5416a48f84833a01 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 1 Oct 2025 10:53:30 -0700 Subject: [PATCH 02/15] More detailed profiling --- ggml/CMakeLists.txt | 3 +- ggml/src/ggml-webgpu/CMakeLists.txt | 8 +- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 145 +++++++++++++++++++-------- 3 files changed, 113 insertions(+), 43 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index c4f83c6e88d7e..67d02e73f6767 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -222,7 +222,8 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) option(GGML_WEBGPU "ggml: use WebGPU" OFF) option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF) -option(GGML_WEBGPU_PROFILE "ggml: enable WebGPU performance profiling" OFF) +option(GGML_WEBGPU_CPU_PROFILE "ggml: enable WebGPU profiling (CPU)" OFF) +option(GGML_WEBGPU_GPU_PROFILE "ggml: enable WebGPU profiling (GPU)" OFF) option(GGML_ZDNN "ggml: use zDNN" OFF) option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index 0f6d65e58035f..c6a95d5151245 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -50,8 +50,12 @@ if (GGML_WEBGPU_DEBUG) target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_DEBUG=1) endif() -if (GGML_WEBGPU_PROFILE) - target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_PROFILE=1) +if (GGML_WEBGPU_CPU_PROFILE) + target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_CPU_PROFILE=1) +endif() + +if (GGML_WEBGPU_GPU_PROFILE) + target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_GPU_PROFILE=1) endif() target_include_directories(ggml-webgpu PRIVATE ${SHADER_OUTPUT_DIR}) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index a8d4e058c7a8f..d26c185ff02b8 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -25,18 +25,37 @@ # define WEBGPU_LOG_DEBUG(msg) ((void) 0) #endif // GGML_WEBGPU_DEBUG -#ifdef GGML_WEBGPU_PROFILE -# define WEBGPU_CPU_PROFILE_START(id) auto cpu_start_##id = 
std::chrono::high_resolution_clock::now(); -# define WEBGPU_CPU_PROFILE_END(id) \ - auto cpu_end_##id = std::chrono::high_resolution_clock::now(); \ - double cpu_time_##id = std::chrono::duration(cpu_end_##id - cpu_start_##id).count(); \ - ctx->cpu_time_ms[#id] += cpu_time_##id; +#ifdef GGML_WEBGPU_CPU_PROFILE + // total timing (aggregated) +# define WEBGPU_CPU_PROFILE_TOTAL_START(id) \ + auto cpu_total_start_##id = std::chrono::high_resolution_clock::now(); + +# define WEBGPU_CPU_PROFILE_TOTAL_END(id, ctx) \ + auto cpu_total_end_##id = std::chrono::high_resolution_clock::now(); \ + double cpu_total_time_##id = \ + std::chrono::duration(cpu_total_end_##id - cpu_total_start_##id).count(); \ + (ctx)->cpu_time_ms[#id] += cpu_total_time_##id; + + // fine-grained timing (diagnostics only) +# define WEBGPU_CPU_PROFILE_DETAIL_START(id) \ + auto cpu_detail_start_##id = std::chrono::high_resolution_clock::now(); + +# define WEBGPU_CPU_PROFILE_DETAIL_END(id, ctx) \ + auto cpu_detail_end_##id = std::chrono::high_resolution_clock::now(); \ + double cpu_detail_time_##id = \ + std::chrono::duration(cpu_detail_end_##id - cpu_detail_start_##id).count(); \ + (ctx)->cpu_detail_ms[#id] += cpu_detail_time_##id; +#else +# define WEBGPU_CPU_PROFILE_TOTAL_START(id) +# define WEBGPU_CPU_PROFILE_TOTAL_END(id, ctx) +# define WEBGPU_CPU_PROFILE_DETAIL_START(id) +# define WEBGPU_CPU_PROFILE_DETAIL_END(id, ctx) +#endif // GGML_WEBGPU_CPU_PROFILE + +#ifdef GGML_WEBGPU_GPU_PROFILE # define WEBGPU_NUM_TIMESTAMP_QUERY_BUFS 100 # define WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES 16 // e.g. enough for two timestamps -#else -# define WEBGPU_CPU_PROFILE_START(id) ((void) 0) -# define WEBGPU_CPU_PROFILE_END(id) ((void) 0) -#endif // GGML_WEBGPU_PROFILE +#endif /* Constants */ @@ -183,13 +202,16 @@ struct webgpu_context_struct { wgpu::Buffer debug_dev_buf; #endif -#ifdef GGML_WEBGPU_PROFILE - // Profiling: per-shader GPU time in ms - std::unordered_map shader_gpu_time_ms; - - // Profiling: labeled CPU time in ms +#ifdef GGML_WEBGPU_CPU_PROFILE + // Profiling: labeled CPU time in ms (total) std::unordered_map cpu_time_ms; + // Profiling: detailed CPU time in ms + std::unordered_map cpu_detail_ms; +#endif +#ifdef GGML_WEBGPU_GPU_PROFILE + // Profiling: per-shader GPU time in ms + std::unordered_map shader_gpu_time_ms; // Profiling: pool of timestamp query buffers (one per operation) webgpu_buf_pool timestamp_query_buf_pool; @@ -303,7 +325,7 @@ static void ggml_backend_webgpu_wait_on_submission(webgpu_context & ctx) { } } -#ifdef GGML_WEBGPU_PROFILE +#ifdef GGML_WEBGPU_GPU_PROFILE static wgpu::FutureWaitInfo ggml_backend_webgpu_process_timestamps(webgpu_context & ctx, webgpu_pool_bufs ts_bufs, std::string label) { @@ -347,10 +369,13 @@ static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) { ctx->staged_command_bufs.clear(); std::vector staged_param_bufs = std::move(ctx->staged_param_bufs); + ctx->staged_param_bufs.clear(); std::vector staged_set_row_error_bufs = std::move(ctx->staged_set_row_error_bufs); + ctx->staged_set_row_error_bufs.clear(); -#ifdef GGML_WEBGPU_PROFILE +#ifdef GGML_WEBGPU_GPU_PROFILE std::vector> staged_timestamp_bufs = std::move(ctx->staged_timestamp_bufs); + ctx->staged_timestamp_bufs.clear(); #endif // Free the staged parameter buffers once the submission completes @@ -384,7 +409,7 @@ static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) { ctx->callback_futures.push_back({ f }); } -#ifdef GGML_WEBGPU_PROFILE +#ifdef GGML_WEBGPU_GPU_PROFILE for (auto & pair : staged_timestamp_bufs) { 
auto & ts_bufs = pair.first; const std::string & label = pair.second; @@ -465,19 +490,23 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder(); encoder.CopyBufferToBuffer(params_bufs.host_buf, 0, params_bufs.dev_buf, 0, params_bufs.dev_buf.GetSize()); -#ifdef GGML_WEBGPU_PROFILE +#ifdef GGML_WEBGPU_GPU_PROFILE // --- Profiling: GPU timestamp queries --- // Allocate a timestamp query buffer (2 timestamps: start/end) + WEBGPU_CPU_PROFILE_DETAIL_START(ts_bufs); webgpu_pool_bufs ts_bufs = ctx->timestamp_query_buf_pool.alloc_bufs(); if (ts_bufs.host_buf.GetMapState() == wgpu::BufferMapState::Mapped) { ts_bufs.host_buf.Unmap(); } + WEBGPU_CPU_PROFILE_DETAIL_END(ts_bufs, ctx); // Create a query set for 2 timestamps + WEBGPU_CPU_PROFILE_DETAIL_START(ts_query_set); wgpu::QuerySetDescriptor ts_query_set_desc = {}; ts_query_set_desc.type = wgpu::QueryType::Timestamp; ts_query_set_desc.count = 2; wgpu::QuerySet ts_query_set = ctx->device.CreateQuerySet(&ts_query_set_desc); + WEBGPU_CPU_PROFILE_DETAIL_END(ts_query_set, ctx); wgpu::PassTimestampWrites ts_writes = { .querySet = ts_query_set, .beginningOfPassWriteIndex = 0, @@ -492,10 +521,12 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & pass.DispatchWorkgroups(wg_x, 1, 1); pass.End(); -#ifdef GGML_WEBGPU_PROFILE +#ifdef GGML_WEBGPU_GPU_PROFILE // Resolve the query set into the device buffer + WEBGPU_CPU_PROFILE_DETAIL_START(resolve_query_set); encoder.ResolveQuerySet(ts_query_set, 0, 2, ts_bufs.dev_buf, 0); encoder.CopyBufferToBuffer(ts_bufs.dev_buf, 0, ts_bufs.host_buf, 0, ts_bufs.host_buf.GetSize()); + WEBGPU_CPU_PROFILE_DETAIL_END(resolve_query_set, ctx); #endif wgpu::CommandBuffer commands = encoder.Finish(); @@ -511,8 +542,10 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & ctx->param_buf_pool.free_bufs({ params_bufs }); }), UINT64_MAX); -#ifdef GGML_WEBGPU_PROFILE +#ifdef GGML_WEBGPU_GPU_PROFILE + WEBGPU_CPU_PROFILE_DETAIL_START(submit_wait_process_ts); ctx->instance.WaitAny(ggml_backend_webgpu_process_timestamps(ctx, ts_bufs, pipeline.name).future, UINT64_MAX); + WEBGPU_CPU_PROFILE_DETAIL_END(submit_wait_process_ts, ctx); #endif } else { // Lock the context mutex when pushing to the staging vectors. 
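For readers unfamiliar with the mechanism the hunks above build on, the GPU-side timing reduces to the following standalone sketch (illustrative only, not part of the patch): write two timestamps around a compute pass via a query set, resolve them into a QueryResolve buffer, copy that into a MapRead buffer, and convert the tick delta to milliseconds after mapping. The device/queue/instance handles and the profile_empty_pass wrapper are assumptions made for the sketch; the device must have wgpu::FeatureName::TimestampQuery enabled, and the 1 tick = 1 ns conversion mirrors the TODO left in patch 01.

#include <webgpu/webgpu_cpp.h>
#include <cstdint>

// Sketch: time one (empty) compute pass with WebGPU timestamp queries (Dawn C++ API).
static double profile_empty_pass(wgpu::Instance instance, wgpu::Device device, wgpu::Queue queue) {
    // 2-entry timestamp query set: index 0 = pass begin, index 1 = pass end.
    wgpu::QuerySetDescriptor qs_desc = {};
    qs_desc.type  = wgpu::QueryType::Timestamp;
    qs_desc.count = 2;
    wgpu::QuerySet query_set = device.CreateQuerySet(&qs_desc);

    // Buffer the query set is resolved into (ticks land here as uint64 values).
    wgpu::BufferDescriptor resolve_desc = {};
    resolve_desc.size  = 2 * sizeof(uint64_t); // 16 bytes, matches WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES
    resolve_desc.usage = wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc;
    wgpu::Buffer resolve_buf = device.CreateBuffer(&resolve_desc);

    // Host-mappable buffer for reading the ticks back.
    wgpu::BufferDescriptor read_desc = {};
    read_desc.size  = 2 * sizeof(uint64_t);
    read_desc.usage = wgpu::BufferUsage::MapRead | wgpu::BufferUsage::CopyDst;
    wgpu::Buffer read_buf = device.CreateBuffer(&read_desc);

    // Attach the query set to the compute pass so begin/end timestamps are written.
    wgpu::PassTimestampWrites ts_writes = { .querySet                  = query_set,
                                            .beginningOfPassWriteIndex = 0,
                                            .endOfPassWriteIndex       = 1 };
    wgpu::ComputePassDescriptor pass_desc = { .timestampWrites = &ts_writes };

    wgpu::CommandEncoder     encoder = device.CreateCommandEncoder();
    wgpu::ComputePassEncoder pass    = encoder.BeginComputePass(&pass_desc);
    // ... SetPipeline / SetBindGroup / DispatchWorkgroups would go here ...
    pass.End();
    encoder.ResolveQuerySet(query_set, 0, 2, resolve_buf, 0);
    encoder.CopyBufferToBuffer(resolve_buf, 0, read_buf, 0, read_desc.size);
    wgpu::CommandBuffer commands = encoder.Finish();
    queue.Submit(1, &commands);

    // Map the readback buffer and turn the two tick values into milliseconds.
    double elapsed_ms = 0.0;
    wgpu::Future f = read_buf.MapAsync(
        wgpu::MapMode::Read, 0, read_desc.size, wgpu::CallbackMode::AllowSpontaneous,
        [&](wgpu::MapAsyncStatus status, wgpu::StringView message) {
            if (status != wgpu::MapAsyncStatus::Success) {
                return;
            }
            const uint64_t * ts = (const uint64_t *) read_buf.GetConstMappedRange();
            elapsed_ms = double(ts[1] - ts[0]) * 1e-6; // assumes 1 tick == 1 ns
        });
    instance.WaitAny(f, UINT64_MAX);
    read_buf.Unmap(); // cannot unmap inside the callback (WebGPU reentrancy limitation)
    return elapsed_ms;
}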
@@ -520,14 +553,15 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & // Enqueue commands and only submit if we have enough staged commands ctx->staged_command_bufs.push_back(commands); ctx->staged_param_bufs.push_back(params_bufs); -#ifdef GGML_WEBGPU_PROFILE +#ifdef GGML_WEBGPU_GPU_PROFILE // Store timestamp buffer and label for later processing ctx->staged_timestamp_bufs.push_back(std::make_pair(ts_bufs, pipeline.name)); #endif if (ctx->staged_command_bufs.size() == WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) { + WEBGPU_CPU_PROFILE_DETAIL_START(encode_submit_queue); ggml_backend_webgpu_submit_queue(ctx); - ggml_backend_webgpu_wait_on_submission(ctx); + WEBGPU_CPU_PROFILE_DETAIL_END(encode_submit_queue, ctx); } } } @@ -559,31 +593,44 @@ static void ggml_backend_webgpu_free(ggml_backend_t backend) { ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *) backend->context; WEBGPU_LOG_DEBUG("ggml_backend_webgpu_free(" << ctx->name << ")"); -#ifdef GGML_WEBGPU_PROFILE - // Print detailed profiling information - std::cout << "\n[ggml_webgpu profiling summary]\n"; - double total_gpu = 0.0; - for (const auto & kv : ctx->webgpu_ctx->shader_gpu_time_ms) { - total_gpu += kv.second; - } +#ifdef GGML_WEBGPU_CPU_PROFILE + std::cout << "\n[ggml_webgpu cpu profiling summary]\n"; double total_cpu = 0.0; for (const auto & kv : ctx->webgpu_ctx->cpu_time_ms) { total_cpu += kv.second; } - std::cout << "ggml_webgpu: total cpu time (all shaders): " << total_cpu << " ms\n"; - std::cout << "ggml_webgpu: total gpu time (all shaders): " << total_gpu << " ms\n"; - std::cout << "ggml_webgpu: gpu/cpu ratio: " << (total_cpu > 0.0 ? total_gpu / total_cpu : 0.0) << "\n"; + std::cout << "ggml_webgpu: total cpu time: " << total_cpu << " ms\n"; std::cout << "ggml_webgpu: cpu breakdown:\n"; for (const auto & kv : ctx->webgpu_ctx->cpu_time_ms) { double pct = (total_cpu > 0.0) ? (kv.second / total_cpu * 100.0) : 0.0; std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << pct << "%)\n"; } + std::cout << "\nggml_webgpu: cpu detailed breakdown:\n"; + for (const auto & kv : ctx->webgpu_ctx->cpu_detail_ms) { + double pct = (total_cpu > 0.0) ? (kv.second / total_cpu * 100.0) : 0.0; + std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << pct << "%)\n"; + } +#endif + +#ifdef GGML_WEBGPU_GPU_PROFILE + std::cout << "\n[ggml_webgpu gpu profiling summary]\n"; + double total_gpu = 0.0; + for (const auto & kv : ctx->webgpu_ctx->shader_gpu_time_ms) { + total_gpu += kv.second; + } + std::cout << "ggml_webgpu: total gpu time (all shaders): " << total_gpu << " ms\n"; std::cout << "\nggml_webgpu: gpu breakdown:\n"; for (const auto & kv : ctx->webgpu_ctx->shader_gpu_time_ms) { double pct = (total_gpu > 0.0) ? (kv.second / total_gpu * 100.0) : 0.0; std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << pct << "%)\n"; } -#else +#endif + +#if defined(GGML_WEBGPU_CPU_PROFILE) && defined(GGML_WEBGPU_GPU_PROFILE) + std::cout << "ggml_webgpu: gpu/cpu ratio: " << (total_cpu > 0.0 ? 
total_gpu / total_cpu : 0.0) << "\n"; +#endif + +#if !defined(GGML_WEBGPU_CPU_PROFILE) && !defined(GGML_WEBGPU_GPU_PROFILE) GGML_UNUSED(ctx); #endif } @@ -1204,17 +1251,23 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str ggml_backend_webgpu_context * backend_ctx = static_cast(backend->context); webgpu_context ctx = backend_ctx->webgpu_ctx; - // --- Profiling: CPU timing for graph compute --- - WEBGPU_CPU_PROFILE_START(graph_compute); + WEBGPU_CPU_PROFILE_TOTAL_START(graph_compute); + WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_encode); for (int i = 0; i < cgraph->n_nodes; i++) { ggml_webgpu_encode_node(ctx, cgraph->nodes[i]); } + WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_encode, ctx); + WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_submit); ggml_backend_webgpu_submit_queue(ctx); + WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_submit, ctx); + + WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_wait); ggml_backend_webgpu_wait_on_submission(ctx); + WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_wait, ctx); - WEBGPU_CPU_PROFILE_END(graph_compute); + WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx); return GGML_STATUS_SUCCESS; } @@ -1262,6 +1315,8 @@ static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffe return; } + WEBGPU_CPU_PROFILE_TOTAL_START(memset_tensor); + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")"); @@ -1272,6 +1327,7 @@ static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffe // This is a trick to set all bytes of a u32 to the same 1 byte value. uint32_t val32 = (uint32_t) value * 0x01010101; ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, val32, total_offset, size); + WEBGPU_CPU_PROFILE_TOTAL_END(memset_tensor, buf_ctx->webgpu_ctx); } static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, @@ -1281,6 +1337,7 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, size_t size) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); + WEBGPU_CPU_PROFILE_TOTAL_START(set_tensor); ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; @@ -1305,6 +1362,7 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, // wait for WriteBuffer to complete ggml_backend_webgpu_wait_on_submission(webgpu_ctx); } + WEBGPU_CPU_PROFILE_TOTAL_END(set_tensor, webgpu_ctx); } static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, @@ -1314,7 +1372,7 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, size_t size) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); - + WEBGPU_CPU_PROFILE_TOTAL_START(get_tensor); ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; wgpu::Device device = webgpu_ctx->device; @@ -1354,12 +1412,15 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, // Copy the data from the mapped range to the output buffer std::memcpy(data, mapped_range, size); webgpu_ctx->get_tensor_staging_buf.Unmap(); + WEBGPU_CPU_PROFILE_TOTAL_END(get_tensor, webgpu_ctx); 
} static void ggml_backend_webgpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_clear(" << buffer << ", " << (uint32_t) value << ")"); + WEBGPU_CPU_PROFILE_TOTAL_START(clear); ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, value, 0, buffer->size); + WEBGPU_CPU_PROFILE_TOTAL_END(clear, buf_ctx->webgpu_ctx); } static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = { @@ -2009,6 +2070,8 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t GGML_ASSERT(index == 0); WEBGPU_LOG_DEBUG("ggml_backend_reg_get_device()"); + WEBGPU_CPU_PROFILE_TOTAL_START(reg_get_device); + ggml_backend_webgpu_reg_context * reg_ctx = static_cast(reg->context); webgpu_context ctx = reg_ctx->webgpu_ctx; @@ -2035,7 +2098,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t // Initialize device std::vector required_features = { wgpu::FeatureName::ShaderF16, wgpu::FeatureName::ImplicitDeviceSynchronization }; -#ifdef GGML_WEBGPU_PROFILE +#ifdef GGML_WEBGPU_GPU_PROFILE required_features.push_back(wgpu::FeatureName::TimestampQuery); #endif @@ -2077,7 +2140,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform, wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite); -#ifdef GGML_WEBGPU_PROFILE +#ifdef GGML_WEBGPU_GPU_PROFILE // Initialize buffer pool for timestamp queries (profiling) ctx->timestamp_query_buf_pool.init(ctx->device, WEBGPU_NUM_TIMESTAMP_QUERY_BUFS, WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES, @@ -2129,6 +2192,8 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t /* .reg = */ reg, /* .context = */ &device_ctx, }; + + WEBGPU_CPU_PROFILE_TOTAL_END(reg_get_device, ctx); return &device; } From 400a58dfafa6922134f19d1292458fb042804981 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 1 Oct 2025 15:51:24 -0700 Subject: [PATCH 03/15] Rework command submission to avoid global locks --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 422 ++++++++---------- .../wgsl-shaders/mul_mat.tmpl.wgsl | 2 +- 2 files changed, 184 insertions(+), 240 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index d26c185ff02b8..14e3d3243792d 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -26,23 +27,21 @@ #endif // GGML_WEBGPU_DEBUG #ifdef GGML_WEBGPU_CPU_PROFILE - // total timing (aggregated) -# define WEBGPU_CPU_PROFILE_TOTAL_START(id) \ - auto cpu_total_start_##id = std::chrono::high_resolution_clock::now(); +// total timing (aggregated) +# define WEBGPU_CPU_PROFILE_TOTAL_START(id) auto cpu_total_start_##id = std::chrono::high_resolution_clock::now(); -# define WEBGPU_CPU_PROFILE_TOTAL_END(id, ctx) \ - auto cpu_total_end_##id = std::chrono::high_resolution_clock::now(); \ - double cpu_total_time_##id = \ +# define WEBGPU_CPU_PROFILE_TOTAL_END(id, ctx) \ + auto cpu_total_end_##id = std::chrono::high_resolution_clock::now(); \ + double cpu_total_time_##id = \ std::chrono::duration(cpu_total_end_##id - cpu_total_start_##id).count(); \ (ctx)->cpu_time_ms[#id] += cpu_total_time_##id; - // fine-grained timing (diagnostics only) -# define WEBGPU_CPU_PROFILE_DETAIL_START(id) \ - auto cpu_detail_start_##id = 
std::chrono::high_resolution_clock::now(); +// fine-grained timing (diagnostics only) +# define WEBGPU_CPU_PROFILE_DETAIL_START(id) auto cpu_detail_start_##id = std::chrono::high_resolution_clock::now(); -# define WEBGPU_CPU_PROFILE_DETAIL_END(id, ctx) \ - auto cpu_detail_end_##id = std::chrono::high_resolution_clock::now(); \ - double cpu_detail_time_##id = \ +# define WEBGPU_CPU_PROFILE_DETAIL_END(id, ctx) \ + auto cpu_detail_end_##id = std::chrono::high_resolution_clock::now(); \ + double cpu_detail_time_##id = \ std::chrono::duration(cpu_detail_end_##id - cpu_detail_start_##id).count(); \ (ctx)->cpu_detail_ms[#id] += cpu_detail_time_##id; #else @@ -59,9 +58,9 @@ /* Constants */ -#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 16 +#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8 #define WEBGPU_WAIT_ANY_BATCH_SIZE 64 -#define WEBGPU_MUL_MAT_WG_SIZE 64 +#define WEBGPU_MUL_MAT_WG_SIZE 256 #define WEBGPU_NUM_PARAM_BUFS 100 #define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters #define WEBGPU_NUM_SET_ROWS_ERROR_BUFS 32 @@ -149,6 +148,16 @@ struct webgpu_pipeline { std::string name; }; +struct webgpu_command { + wgpu::CommandBuffer commands; + webgpu_pool_bufs params_bufs; + std::optional set_rows_error_bufs; +#ifdef GGML_WEBGPU_GPU_PROFILE + webgpu_pool_bufs timestamp_query_bufs; + std::string pipeline_name; +#endif +}; + // All the base objects needed to run operations on a WebGPU device struct webgpu_context_struct { wgpu::Instance instance; @@ -187,16 +196,6 @@ struct webgpu_context_struct { // Staging buffer for reading data from the GPU wgpu::Buffer get_tensor_staging_buf; - // Command buffers which need to be submitted - std::vector staged_command_bufs; - - // Parameter buffers associated with the staged command buffers - std::vector staged_param_bufs; - // Buffers associated with set_rows operations, used to store potential errors - std::vector staged_set_row_error_bufs; - - std::vector callback_futures; - #ifdef GGML_WEBGPU_DEBUG wgpu::Buffer debug_host_buf; wgpu::Buffer debug_dev_buf; @@ -213,10 +212,7 @@ struct webgpu_context_struct { // Profiling: per-shader GPU time in ms std::unordered_map shader_gpu_time_ms; // Profiling: pool of timestamp query buffers (one per operation) - webgpu_buf_pool timestamp_query_buf_pool; - - // Profiling: staged timestamp buffers and their labels for batch submission - std::vector> staged_timestamp_bufs; + webgpu_buf_pool timestamp_query_buf_pool; #endif }; @@ -301,27 +297,12 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device, /** WebGPU Actions */ // Wait for the queue to finish processing all submitted work -static void ggml_backend_webgpu_wait_on_submission(webgpu_context & ctx) { - std::lock_guard lock(ctx->mutex); - if (ctx->callback_futures.empty()) { - // no existing callbacks, wait on queue submission - ctx->instance.WaitAny( - ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::AllowSpontaneous, - [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { - if (status != wgpu::QueueWorkDoneStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", - std::string(message).c_str()); - } - }), - UINT64_MAX); - } else { - // WebGPU implementations may limit the number of futures that can be waited on at once, - // so wait in batches (64 is what Dawn supports). 
- for (size_t i = 0; i < ctx->callback_futures.size(); i += WEBGPU_WAIT_ANY_BATCH_SIZE) { - size_t end = std::min(i + WEBGPU_WAIT_ANY_BATCH_SIZE, ctx->callback_futures.size()); - ctx->instance.WaitAny(end - i, ctx->callback_futures.data() + i, UINT64_MAX); - } - ctx->callback_futures.clear(); +static void ggml_backend_webgpu_wait(webgpu_context & ctx, std::vector & wait_infos) { + // WebGPU implementations may limit the number of futures that can be waited on at once, + // so wait in batches (64 is what Dawn supports). + for (size_t i = 0; i < wait_infos.size(); i += WEBGPU_WAIT_ANY_BATCH_SIZE) { + size_t end = std::min(i + WEBGPU_WAIT_ANY_BATCH_SIZE, wait_infos.size()); + ctx->instance.WaitAny(end - i, wait_infos.data() + i, UINT64_MAX); } } @@ -347,77 +328,6 @@ static wgpu::FutureWaitInfo ggml_backend_webgpu_process_timestamps(webgpu_contex } #endif -static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) { - std::lock_guard lock(ctx->mutex); - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_submit_queue()"); - if (ctx->staged_command_bufs.empty()) { - // Nothing to submit - return; - } - ctx->queue.Submit(ctx->staged_command_bufs.size(), ctx->staged_command_bufs.data()); - - // If there are SET_ROWS operations in this submission, copy their error buffers to the host. - if (ctx->staged_set_row_error_bufs.size() > 0) { - wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder(); - for (auto & error_bufs : ctx->staged_set_row_error_bufs) { - // Copy the error buffer to the host buffer - encoder.CopyBufferToBuffer(error_bufs.dev_buf, 0, error_bufs.host_buf, 0, error_bufs.host_buf.GetSize()); - } - wgpu::CommandBuffer commands = encoder.Finish(); - ctx->queue.Submit(1, &commands); - } - - ctx->staged_command_bufs.clear(); - std::vector staged_param_bufs = std::move(ctx->staged_param_bufs); - ctx->staged_param_bufs.clear(); - std::vector staged_set_row_error_bufs = std::move(ctx->staged_set_row_error_bufs); - ctx->staged_set_row_error_bufs.clear(); - -#ifdef GGML_WEBGPU_GPU_PROFILE - std::vector> staged_timestamp_bufs = std::move(ctx->staged_timestamp_bufs); - ctx->staged_timestamp_bufs.clear(); -#endif - - // Free the staged parameter buffers once the submission completes - wgpu::Future p_f = ctx->queue.OnSubmittedWorkDone( - wgpu::CallbackMode::AllowSpontaneous, - [ctx, staged_param_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { - if (status != wgpu::QueueWorkDoneStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", std::string(message).c_str()); - } - // Free the staged buffers - ctx->param_buf_pool.free_bufs(staged_param_bufs); - }); - ctx->callback_futures.push_back({ p_f }); - - // Check for errrors in SET_ROWS operations - for (auto & error_bufs : staged_set_row_error_bufs) { - wgpu::Future f = error_bufs.host_buf.MapAsync( - wgpu::MapMode::Read, 0, error_bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous, - [ctx, error_bufs](wgpu::MapAsyncStatus status, wgpu::StringView message) { - if (status != wgpu::MapAsyncStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to map error buffer: %s\n", std::string(message).c_str()); - } else { - const uint32_t * error_data = (const uint32_t *) error_bufs.host_buf.GetConstMappedRange(); - if (*error_data) { - GGML_ABORT("ggml_webgpu: SET_ROWS index > 2^32, unsupported."); - } - // We can't unmap in here due to WebGPU reentrancy limitations. 
- ctx->set_rows_error_buf_pool.free_bufs({ error_bufs }); - } - }); - ctx->callback_futures.push_back({ f }); - } - -#ifdef GGML_WEBGPU_GPU_PROFILE - for (auto & pair : staged_timestamp_bufs) { - auto & ts_bufs = pair.first; - const std::string & label = pair.second; - ctx->callback_futures.push_back(ggml_backend_webgpu_process_timestamps(ctx, ts_bufs, label)); - } -#endif -} - static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx, wgpu::Buffer & buffer, wgpu::MapMode mode, @@ -438,7 +348,6 @@ static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx, // To use, add a bind group entry to the setup for the shader you are debugging, add the buffer and // debug statements in the shader, and then call this function after encoding the commands and submitting them. static void ggml_backend_webgpu_debug(webgpu_context & ctx) { - ggml_backend_webgpu_submit_queue(ctx); wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder(); encoder.CopyBufferToBuffer(ctx->debug_dev_buf, 0, ctx->debug_host_buf, 0, ctx->debug_host_buf.GetSize()); wgpu::CommandBuffer commands = encoder.Finish(); @@ -455,13 +364,73 @@ static void ggml_backend_webgpu_debug(webgpu_context & ctx) { } #endif -static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & ctx, - webgpu_pipeline & pipeline, - std::vector params, - std::vector bind_group_entries, - uint32_t wg_x, - const char * bind_group_label = nullptr, - bool submit_and_wait = false) { +static std::vector ggml_backend_webgpu_submit(webgpu_context ctx, + std::vector builds) { + std::vector commands; + std::vector params_bufs; + std::vector set_rows_error_bufs; +#ifdef GGML_WEBGPU_GPU_PROFILE + std::vector> pipeline_name_and_ts_bufs; +#endif + + for (const auto & build : builds) { + commands.push_back(build.commands); + params_bufs.push_back(build.params_bufs); + if (build.set_rows_error_bufs) { + set_rows_error_bufs.push_back(build.set_rows_error_bufs.value()); + } +#ifdef GGML_WEBGPU_GPU_PROFILE + pipeline_name_and_ts_bufs.push_back({ build.pipeline_name, build.timestamp_query_bufs }); +#endif + } + ctx->queue.Submit(commands.size(), commands.data()); + + std::vector wait_infos; + + wgpu::Future p_f = ctx->queue.OnSubmittedWorkDone( + wgpu::CallbackMode::AllowSpontaneous, + [ctx, params_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { + if (status != wgpu::QueueWorkDoneStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", std::string(message).c_str()); + } + // Free the staged buffers + ctx->param_buf_pool.free_bufs({ params_bufs }); + }); + wait_infos.push_back({ p_f }); + + for (const auto & bufs : set_rows_error_bufs) { + wgpu::Future f = bufs.host_buf.MapAsync( + wgpu::MapMode::Read, 0, bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous, + [ctx, bufs](wgpu::MapAsyncStatus status, wgpu::StringView message) { + if (status != wgpu::MapAsyncStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to map error buffer: %s\n", std::string(message).c_str()); + } else { + const uint32_t * error_data = (const uint32_t *) bufs.host_buf.GetConstMappedRange(); + if (*error_data) { + GGML_ABORT("ggml_webgpu: SET_ROWS index > 2^32, unsupported."); + } + // We can't unmap in here due to WebGPU reentrancy limitations. 
+ ctx->set_rows_error_buf_pool.free_bufs({ bufs }); + } + }); + wait_infos.push_back({ f }); + } + +#ifdef GGML_WEBGPU_GPU_PROFILE + for (const auto & name_and_bufs : pipeline_name_and_ts_bufs) { + wgpu::FutureWaitInfo f = ggml_backend_webgpu_process_timestamps(ctx, name_and_bufs.second, name_and_bufs.first); + wait_infos.push_back(f); + } +#endif + return wait_infos; +} + +static webgpu_command ggml_backend_webgpu_build(webgpu_context & ctx, + webgpu_pipeline & pipeline, + std::vector params, + std::vector bind_group_entries, + uint32_t wg_x, + std::optional set_rows_error_bufs = std::nullopt) { webgpu_pool_bufs params_bufs = ctx->param_buf_pool.alloc_bufs(); ggml_backend_webgpu_map_buffer(ctx, params_bufs.host_buf, wgpu::MapMode::Write, 0, params_bufs.host_buf.GetSize()); @@ -482,9 +451,7 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & bind_group_desc.layout = pipeline.pipeline.GetBindGroupLayout(0); bind_group_desc.entryCount = bind_group_entries.size(); bind_group_desc.entries = bind_group_entries.data(); - if (bind_group_label) { - bind_group_desc.label = bind_group_label; - } + bind_group_desc.label = pipeline.name.c_str(); wgpu::BindGroup bind_group = ctx->device.CreateBindGroup(&bind_group_desc); wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder(); @@ -493,20 +460,16 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & #ifdef GGML_WEBGPU_GPU_PROFILE // --- Profiling: GPU timestamp queries --- // Allocate a timestamp query buffer (2 timestamps: start/end) - WEBGPU_CPU_PROFILE_DETAIL_START(ts_bufs); webgpu_pool_bufs ts_bufs = ctx->timestamp_query_buf_pool.alloc_bufs(); if (ts_bufs.host_buf.GetMapState() == wgpu::BufferMapState::Mapped) { ts_bufs.host_buf.Unmap(); } - WEBGPU_CPU_PROFILE_DETAIL_END(ts_bufs, ctx); // Create a query set for 2 timestamps - WEBGPU_CPU_PROFILE_DETAIL_START(ts_query_set); wgpu::QuerySetDescriptor ts_query_set_desc = {}; ts_query_set_desc.type = wgpu::QueryType::Timestamp; ts_query_set_desc.count = 2; wgpu::QuerySet ts_query_set = ctx->device.CreateQuerySet(&ts_query_set_desc); - WEBGPU_CPU_PROFILE_DETAIL_END(ts_query_set, ctx); wgpu::PassTimestampWrites ts_writes = { .querySet = ts_query_set, .beginningOfPassWriteIndex = 0, @@ -523,47 +486,26 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context & #ifdef GGML_WEBGPU_GPU_PROFILE // Resolve the query set into the device buffer - WEBGPU_CPU_PROFILE_DETAIL_START(resolve_query_set); encoder.ResolveQuerySet(ts_query_set, 0, 2, ts_bufs.dev_buf, 0); encoder.CopyBufferToBuffer(ts_bufs.dev_buf, 0, ts_bufs.host_buf, 0, ts_bufs.host_buf.GetSize()); - WEBGPU_CPU_PROFILE_DETAIL_END(resolve_query_set, ctx); #endif + // If there are SET_ROWS operations in this submission, copy their error buffers to the host. 
+ if (set_rows_error_bufs) { + encoder.CopyBufferToBuffer(set_rows_error_bufs->dev_buf, 0, set_rows_error_bufs->host_buf, 0, + set_rows_error_bufs->host_buf.GetSize()); + } + wgpu::CommandBuffer commands = encoder.Finish(); - if (submit_and_wait) { - // Submit and wait immediately - ctx->queue.Submit(1, &commands); - ctx->instance.WaitAny(ctx->queue.OnSubmittedWorkDone( - wgpu::CallbackMode::AllowSpontaneous, - [ctx, params_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { - if (status != wgpu::QueueWorkDoneStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", message.data); - } - ctx->param_buf_pool.free_bufs({ params_bufs }); - }), - UINT64_MAX); -#ifdef GGML_WEBGPU_GPU_PROFILE - WEBGPU_CPU_PROFILE_DETAIL_START(submit_wait_process_ts); - ctx->instance.WaitAny(ggml_backend_webgpu_process_timestamps(ctx, ts_bufs, pipeline.name).future, UINT64_MAX); - WEBGPU_CPU_PROFILE_DETAIL_END(submit_wait_process_ts, ctx); -#endif - } else { - // Lock the context mutex when pushing to the staging vectors. - std::lock_guard lock(ctx->mutex); - // Enqueue commands and only submit if we have enough staged commands - ctx->staged_command_bufs.push_back(commands); - ctx->staged_param_bufs.push_back(params_bufs); + webgpu_command result = {}; + result.commands = commands; + result.params_bufs = params_bufs; + result.set_rows_error_bufs = set_rows_error_bufs; #ifdef GGML_WEBGPU_GPU_PROFILE - // Store timestamp buffer and label for later processing - ctx->staged_timestamp_bufs.push_back(std::make_pair(ts_bufs, pipeline.name)); + result.timestamp_query_bufs = ts_bufs; + result.pipeline_name = pipeline.name; #endif - - if (ctx->staged_command_bufs.size() == WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) { - WEBGPU_CPU_PROFILE_DETAIL_START(encode_submit_queue); - ggml_backend_webgpu_submit_queue(ctx); - WEBGPU_CPU_PROFILE_DETAIL_END(encode_submit_queue, ctx); - } - } + return result; } static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx, @@ -577,7 +519,10 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx, }; size_t bytes_per_wg = ctx->max_wg_size_x * ctx->memset_bytes_per_thread; uint32_t wg_x = ((size + 3) + bytes_per_wg - 1) / bytes_per_wg; - ggml_backend_webgpu_build_and_enqueue(ctx, ctx->memset_pipeline, params, entries, wg_x, "MEMSET", true); + + webgpu_command command = ggml_backend_webgpu_build(ctx, ctx->memset_pipeline, params, entries, wg_x); + auto futures = ggml_backend_webgpu_submit(ctx, { command }); + ggml_backend_webgpu_wait(ctx, futures); } /** End WebGPU Actions */ @@ -605,7 +550,9 @@ static void ggml_backend_webgpu_free(ggml_backend_t backend) { double pct = (total_cpu > 0.0) ? (kv.second / total_cpu * 100.0) : 0.0; std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << pct << "%)\n"; } - std::cout << "\nggml_webgpu: cpu detailed breakdown:\n"; + if (ctx->webgpu_ctx->cpu_detail_ms.size() > 0) { + std::cout << "ggml_webgpu: cpu detailed breakdown:\n"; + } for (const auto & kv : ctx->webgpu_ctx->cpu_detail_ms) { double pct = (total_cpu > 0.0) ? 
(kv.second / total_cpu * 100.0) : 0.0; std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << pct << "%)\n"; @@ -665,7 +612,7 @@ static bool ggml_webgpu_tensor_equal(ggml_tensor * a, ggml_tensor * b) { (ggml_webgpu_tensor_offset(a) == ggml_webgpu_tensor_offset(b)); } -static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { +static webgpu_command ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { uint32_t ne = (uint32_t) ggml_nelements(dst); std::vector params = { @@ -694,14 +641,13 @@ static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor size_t max_wg_size = ctx->max_wg_size_x; uint32_t wg_x = (ne + max_wg_size - 1) / max_wg_size; - ggml_backend_webgpu_build_and_enqueue(ctx, ctx->cpy_pipeline[src->type][dst->type], params, entries, wg_x, - ggml_op_name(dst->op)); + return ggml_backend_webgpu_build(ctx, ctx->cpy_pipeline[src->type][dst->type], params, entries, wg_x); } -static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * idx, ggml_tensor * dst) { +static std::optional ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * idx, ggml_tensor * dst) { // For set rows specifically, we need to check if src and idx are empty tensors. if (ggml_is_empty(src) || ggml_is_empty(idx)) { - return; + return std::nullopt; } webgpu_pool_bufs error_bufs = ctx->set_rows_error_buf_pool.alloc_bufs(); @@ -744,13 +690,10 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t size_t max_wg_size = ctx->max_wg_size_x; uint32_t wg_x = (src->ne[1] * src->ne[2] * src->ne[3] + max_wg_size - 1) / max_wg_size; - std::lock_guard lock(ctx->mutex); - ctx->staged_set_row_error_bufs.push_back(error_bufs); - - ggml_backend_webgpu_build_and_enqueue(ctx, ctx->set_rows_pipeline, params, entries, wg_x, ggml_op_name(dst->op)); + return ggml_backend_webgpu_build(ctx, ctx->set_rows_pipeline, params, entries, wg_x, error_bufs); } -static void ggml_webgpu_get_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * idx, ggml_tensor * dst) { +static webgpu_command ggml_webgpu_get_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * idx, ggml_tensor * dst) { std::vector params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, idx) / ggml_type_size(idx->type)), @@ -789,10 +732,10 @@ static void ggml_webgpu_get_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t if (src->type == GGML_TYPE_F32 && dst->ne[0] % 4 != 0) { pipeline = ctx->get_rows_f32_no_vec_pipeline; } - ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op)); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) { +static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) { std::vector params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)), @@ -829,11 +772,10 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t uint32_t wg_x = (dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE; - ggml_backend_webgpu_build_and_enqueue(ctx, 
ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x, - ggml_op_name(dst->op)); + return ggml_backend_webgpu_build(ctx, ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x); } -static void ggml_webgpu_binary_op(webgpu_context & ctx, +static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, @@ -876,10 +818,10 @@ static void ggml_webgpu_binary_op(webgpu_context & ctx, size_t max_wg_size = ctx->max_wg_size_x; uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size; - ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op)); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static void ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { +static webgpu_command ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { int inplace = ggml_webgpu_tensor_equal(src, dst); std::vector params = { @@ -911,11 +853,10 @@ static void ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * src, ggml_t .size = ggml_webgpu_tensor_binding_size(ctx, dst) }); } - ggml_backend_webgpu_build_and_enqueue(ctx, ctx->rms_norm_pipeline[inplace], params, entries, ggml_nrows(src), - ggml_op_name(dst->op)); + return ggml_backend_webgpu_build(ctx, ctx->rms_norm_pipeline[inplace], params, entries, ggml_nrows(src)); } -static void ggml_webgpu_rope(webgpu_context & ctx, +static webgpu_command ggml_webgpu_rope(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * src2, @@ -1000,10 +941,10 @@ static void ggml_webgpu_rope(webgpu_context & ctx, webgpu_pipeline pipeline = ctx->rope_pipeline[dst->type][has_freq_factor][inplace]; size_t max_wg_size = ctx->max_wg_size_x; uint32_t wg_x = (ggml_nelements(src0) / 2 + max_wg_size - 1) / max_wg_size; - ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op)); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static void ggml_webgpu_glu(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) { +static webgpu_command ggml_webgpu_glu(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) { const int split = (src1 != nullptr); std::vector params = { @@ -1053,10 +994,10 @@ static void ggml_webgpu_glu(webgpu_context & ctx, ggml_tensor * src0, ggml_tenso webgpu_pipeline pipeline = ctx->glu_pipeline[ggml_get_glu_op(dst)][dst->type][split]; size_t max_wg_size = ctx->max_wg_size_x; uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size; - ggml_backend_webgpu_build_and_enqueue(ctx, pipeline, params, entries, wg_x, ggml_op_name(dst->op)); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static void ggml_webgpu_scale(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { +static webgpu_command ggml_webgpu_scale(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { int inplace = ggml_webgpu_tensor_equal(src, dst); std::vector params = { @@ -1091,11 +1032,10 @@ static void ggml_webgpu_scale(webgpu_context & ctx, ggml_tensor * src, ggml_tens size_t max_wg_size = ctx->max_wg_size_x; uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size; - ggml_backend_webgpu_build_and_enqueue(ctx, ctx->scale_pipeline[inplace], params, entries, wg_x, - ggml_op_name(dst->op)); + return ggml_backend_webgpu_build(ctx, ctx->scale_pipeline[inplace], params, entries, wg_x); } 
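Taken together, the builders above (cpy, set_rows, get_rows, mul_mat, the binary ops, rms_norm, rope, glu, scale) no longer touch the queue or the context mutex; each one just encodes its pass and returns a webgpu_command. As a condensed, illustrative sketch of how the reworked graph-compute loop further down in this patch drives them (profiling macros and empty-node handling trimmed):

// Condensed sketch only; the full version is in ggml_backend_webgpu_graph_compute below.
std::vector<webgpu_command>       commands;   // encoded but not yet submitted work
std::vector<wgpu::FutureWaitInfo> wait_infos; // completion futures, added per submission

for (int i = 0; i < cgraph->n_nodes; i++) {
    // Each op builder returns a command (or nothing for view/empty nodes).
    if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) {
        commands.push_back(*cmd);
    }
    // Submit in batches instead of staging into globals guarded by ctx->mutex.
    if (commands.size() >= WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) {
        auto infos = ggml_backend_webgpu_submit(ctx, commands); // one queue.Submit() per batch
        wait_infos.insert(wait_infos.end(), infos.begin(), infos.end());
        commands.clear();
    }
}
if (!commands.empty()) {
    auto infos = ggml_backend_webgpu_submit(ctx, commands);
    wait_infos.insert(wait_infos.end(), infos.begin(), infos.end());
}
// Block once at the end; ggml_backend_webgpu_wait calls WaitAny in batches of 64.
ggml_backend_webgpu_wait(ctx, wait_infos);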
-static void ggml_webgpu_soft_max(webgpu_context & ctx, +static webgpu_command ggml_webgpu_soft_max(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * src2, @@ -1164,14 +1104,14 @@ static void ggml_webgpu_soft_max(webgpu_context & ctx, .size = ggml_webgpu_tensor_binding_size(ctx, dst) }); } - ggml_backend_webgpu_build_and_enqueue(ctx, ctx->soft_max_pipeline[mask_type][has_sink][inplace], params, entries, - ggml_nrows(dst), ggml_op_name(dst->op)); + return ggml_backend_webgpu_build(ctx, ctx->soft_max_pipeline[mask_type][has_sink][inplace], params, entries, + ggml_nrows(dst)); } // Returns true if node has enqueued work into the queue, false otherwise -static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) { +static std::optional ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) { if (ggml_is_empty(node)) { - return false; + return std::nullopt; } WEBGPU_LOG_DEBUG("ggml_webgpu_encode_node(" << node << ", " << ggml_op_name(node->op) << ")"); @@ -1186,63 +1126,49 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) { case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: case GGML_OP_RESHAPE: - return false; + return std::nullopt; case GGML_OP_CPY: case GGML_OP_CONT: - ggml_webgpu_cpy(ctx, src0, node); - break; + return ggml_webgpu_cpy(ctx, src0, node); case GGML_OP_SET_ROWS: - ggml_webgpu_set_rows(ctx, src0, src1, node); - break; + return ggml_webgpu_set_rows(ctx, src0, src1, node); case GGML_OP_GET_ROWS: - ggml_webgpu_get_rows(ctx, src0, src1, node); - break; + return ggml_webgpu_get_rows(ctx, src0, src1, node); case GGML_OP_MUL_MAT: - ggml_webgpu_mul_mat(ctx, src0, src1, node); - break; + return ggml_webgpu_mul_mat(ctx, src0, src1, node); case GGML_OP_ADD: { int inplace = ggml_webgpu_tensor_equal(src0, node); - ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_pipeline[node->type][inplace], inplace); - break; + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_pipeline[node->type][inplace], inplace); } case GGML_OP_SUB: { int inplace = ggml_webgpu_tensor_equal(src0, node); - ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->sub_pipeline[node->type][inplace], inplace); - break; + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->sub_pipeline[node->type][inplace], inplace); } case GGML_OP_MUL: { int inplace = ggml_webgpu_tensor_equal(src0, node); - ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_pipeline[node->type][inplace], inplace); - break; + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_pipeline[node->type][inplace], inplace); } case GGML_OP_DIV: { int inplace = ggml_webgpu_tensor_equal(src0, node); - ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->div_pipeline[node->type][inplace], inplace); - break; + return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->div_pipeline[node->type][inplace], inplace); } case GGML_OP_RMS_NORM: - ggml_webgpu_rms_norm(ctx, src0, node); - break; + return ggml_webgpu_rms_norm(ctx, src0, node); case GGML_OP_ROPE: - ggml_webgpu_rope(ctx, src0, src1, src2, node); - break; + return ggml_webgpu_rope(ctx, src0, src1, src2, node); case GGML_OP_GLU: - ggml_webgpu_glu(ctx, src0, src1, node); - break; + return ggml_webgpu_glu(ctx, src0, src1, node); case GGML_OP_SCALE: - ggml_webgpu_scale(ctx, src0, node); - break; + return ggml_webgpu_scale(ctx, src0, node); case GGML_OP_SOFT_MAX: - ggml_webgpu_soft_max(ctx, src0, src1, src2, node); - break; + return ggml_webgpu_soft_max(ctx, src0, src1, src2, node); default: - return false; + return 
std::nullopt; } - return true; } static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { @@ -1254,17 +1180,27 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str WEBGPU_CPU_PROFILE_TOTAL_START(graph_compute); WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_encode); + std::vector commands; + std::vector wait_infos; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_webgpu_encode_node(ctx, cgraph->nodes[i]); + if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) { + commands.push_back(*cmd); + } + if (commands.size() >= WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) { + std::vector new_infos = ggml_backend_webgpu_submit(ctx, commands); + wait_infos.insert(wait_infos.end(), new_infos.begin(), new_infos.end()); + commands.clear(); + } + } + if (!commands.empty()) { + std::vector new_infos = ggml_backend_webgpu_submit(ctx, commands); + wait_infos.insert(wait_infos.end(), new_infos.begin(), new_infos.end()); + commands.clear(); } WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_encode, ctx); - WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_submit); - ggml_backend_webgpu_submit_queue(ctx); - WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_submit, ctx); - WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_wait); - ggml_backend_webgpu_wait_on_submission(ctx); + ggml_backend_webgpu_wait(ctx, wait_infos); WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_wait, ctx); WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx); @@ -1360,7 +1296,15 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, remaining_size); } else { // wait for WriteBuffer to complete - ggml_backend_webgpu_wait_on_submission(webgpu_ctx); + webgpu_ctx->instance.WaitAny( + webgpu_ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::AllowSpontaneous, + [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { + if (status != wgpu::QueueWorkDoneStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", + std::string(message).c_str()); + } + }), + UINT64_MAX); } WEBGPU_CPU_PROFILE_TOTAL_END(set_tensor, webgpu_ctx); } @@ -2116,7 +2060,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t dev_desc.SetUncapturedErrorCallback( [](const wgpu::Device & device, wgpu::ErrorType reason, wgpu::StringView message) { GGML_UNUSED(device); - GGML_LOG_ERROR("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast(reason), + GGML_ABORT("ggml_webgpu: Device error! 
Reason: %d, Message: %s\n", static_cast(reason), std::string(message).c_str()); }); ctx->instance.WaitAny(ctx->adapter.RequestDevice( diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl index 25e2185de84ee..141db9b39d957 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl @@ -870,7 +870,7 @@ struct MulMatParams { @group(0) @binding(3) var params: MulMatParams; -@compute @workgroup_size(64) +@compute @workgroup_size(256) fn main(@builtin(global_invocation_id) global_id: vec3) { let total = params.m * params.n * params.bs02 * params.broadcast2 * params.bs03 * params.broadcast3; if (global_id.x >= total) { From ca43faa1a8023268f3f11c9deeeac5ec765aeb0f Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Sun, 5 Oct 2025 13:53:39 -0700 Subject: [PATCH 04/15] Update wait handling --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 118 +++++++++++++++++---------- 1 file changed, 73 insertions(+), 45 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 14e3d3243792d..764690fb388f5 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -149,12 +149,12 @@ struct webgpu_pipeline { }; struct webgpu_command { - wgpu::CommandBuffer commands; - webgpu_pool_bufs params_bufs; + wgpu::CommandBuffer commands; + webgpu_pool_bufs params_bufs; std::optional set_rows_error_bufs; #ifdef GGML_WEBGPU_GPU_PROFILE webgpu_pool_bufs timestamp_query_bufs; - std::string pipeline_name; + std::string pipeline_name; #endif }; @@ -297,12 +297,29 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device, /** WebGPU Actions */ // Wait for the queue to finish processing all submitted work -static void ggml_backend_webgpu_wait(webgpu_context & ctx, std::vector & wait_infos) { +static void ggml_backend_webgpu_wait(webgpu_context & ctx, + std::vector & futures, + uint64_t timeout_ms = UINT64_MAX) { // WebGPU implementations may limit the number of futures that can be waited on at once, // so wait in batches (64 is what Dawn supports). 
- for (size_t i = 0; i < wait_infos.size(); i += WEBGPU_WAIT_ANY_BATCH_SIZE) { - size_t end = std::min(i + WEBGPU_WAIT_ANY_BATCH_SIZE, wait_infos.size()); - ctx->instance.WaitAny(end - i, wait_infos.data() + i, UINT64_MAX); + size_t i = 0; + while (i < futures.size()) { + size_t end = std::min(i + WEBGPU_WAIT_ANY_BATCH_SIZE, futures.size()); + auto waitStatus = ctx->instance.WaitAny(end - i, futures.data() + i, timeout_ms); + switch (waitStatus) { + case wgpu::WaitStatus::Success: + futures.erase(futures.begin() + i, futures.begin() + end); + break; + case wgpu::WaitStatus::TimedOut: + i += WEBGPU_WAIT_ANY_BATCH_SIZE; + break; + case wgpu::WaitStatus::Error: + GGML_LOG_ERROR("ggml_webgpu: WaitAny returned an error\n"); + break; + default: + GGML_LOG_ERROR("ggml_webgpu: WaitAny returned an unknown status\n"); + break; + } } } @@ -385,7 +402,7 @@ static std::vector ggml_backend_webgpu_submit(webgpu_conte } ctx->queue.Submit(commands.size(), commands.data()); - std::vector wait_infos; + std::vector futures; wgpu::Future p_f = ctx->queue.OnSubmittedWorkDone( wgpu::CallbackMode::AllowSpontaneous, @@ -396,7 +413,7 @@ static std::vector ggml_backend_webgpu_submit(webgpu_conte // Free the staged buffers ctx->param_buf_pool.free_bufs({ params_bufs }); }); - wait_infos.push_back({ p_f }); + futures.push_back({ p_f }); for (const auto & bufs : set_rows_error_bufs) { wgpu::Future f = bufs.host_buf.MapAsync( @@ -413,16 +430,16 @@ static std::vector ggml_backend_webgpu_submit(webgpu_conte ctx->set_rows_error_buf_pool.free_bufs({ bufs }); } }); - wait_infos.push_back({ f }); + futures.push_back({ f }); } #ifdef GGML_WEBGPU_GPU_PROFILE for (const auto & name_and_bufs : pipeline_name_and_ts_bufs) { wgpu::FutureWaitInfo f = ggml_backend_webgpu_process_timestamps(ctx, name_and_bufs.second, name_and_bufs.first); - wait_infos.push_back(f); + futures.push_back(f); } #endif - return wait_infos; + return futures; } static webgpu_command ggml_backend_webgpu_build(webgpu_context & ctx, @@ -503,7 +520,7 @@ static webgpu_command ggml_backend_webgpu_build(webgpu_context & result.set_rows_error_bufs = set_rows_error_bufs; #ifdef GGML_WEBGPU_GPU_PROFILE result.timestamp_query_bufs = ts_bufs; - result.pipeline_name = pipeline.name; + result.pipeline_name = pipeline.name; #endif return result; } @@ -521,7 +538,7 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx, uint32_t wg_x = ((size + 3) + bytes_per_wg - 1) / bytes_per_wg; webgpu_command command = ggml_backend_webgpu_build(ctx, ctx->memset_pipeline, params, entries, wg_x); - auto futures = ggml_backend_webgpu_submit(ctx, { command }); + auto futures = ggml_backend_webgpu_submit(ctx, { command }); ggml_backend_webgpu_wait(ctx, futures); } @@ -644,7 +661,10 @@ static webgpu_command ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, g return ggml_backend_webgpu_build(ctx, ctx->cpy_pipeline[src->type][dst->type], params, entries, wg_x); } -static std::optional ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * idx, ggml_tensor * dst) { +static std::optional ggml_webgpu_set_rows(webgpu_context & ctx, + ggml_tensor * src, + ggml_tensor * idx, + ggml_tensor * dst) { // For set rows specifically, we need to check if src and idx are empty tensors. 
if (ggml_is_empty(src) || ggml_is_empty(idx)) { return std::nullopt; @@ -693,7 +713,10 @@ static std::optional ggml_webgpu_set_rows(webgpu_context & ctx, return ggml_backend_webgpu_build(ctx, ctx->set_rows_pipeline, params, entries, wg_x, error_bufs); } -static webgpu_command ggml_webgpu_get_rows(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * idx, ggml_tensor * dst) { +static webgpu_command ggml_webgpu_get_rows(webgpu_context & ctx, + ggml_tensor * src, + ggml_tensor * idx, + ggml_tensor * dst) { std::vector params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, idx) / ggml_type_size(idx->type)), @@ -735,7 +758,10 @@ static webgpu_command ggml_webgpu_get_rows(webgpu_context & ctx, ggml_tensor * s return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } -static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) { +static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * dst) { std::vector params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)), @@ -776,11 +802,11 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * sr } static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * dst, - webgpu_pipeline & pipeline, - bool inplace) { + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * dst, + webgpu_pipeline & pipeline, + bool inplace) { std::vector params = { (uint32_t) ggml_nelements(dst), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)), @@ -857,10 +883,10 @@ static webgpu_command ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * s } static webgpu_command ggml_webgpu_rope(webgpu_context & ctx, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * src2, - ggml_tensor * dst) { + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * src2, + ggml_tensor * dst) { const int inplace = ggml_webgpu_tensor_equal(src0, dst); const int has_freq_factor = (src2 != nullptr); @@ -1036,10 +1062,10 @@ static webgpu_command ggml_webgpu_scale(webgpu_context & ctx, ggml_tensor * src, } static webgpu_command ggml_webgpu_soft_max(webgpu_context & ctx, - ggml_tensor * src0, - ggml_tensor * src1, - ggml_tensor * src2, - ggml_tensor * dst) { + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * src2, + ggml_tensor * dst) { const int inplace = ggml_webgpu_tensor_equal(src0, dst); const int mask_type = (src1 != nullptr) ? 
src1->type : 2; // use 2 for no mask here const int has_sink = (src2 != nullptr); @@ -1105,7 +1131,7 @@ static webgpu_command ggml_webgpu_soft_max(webgpu_context & ctx, } return ggml_backend_webgpu_build(ctx, ctx->soft_max_pipeline[mask_type][has_sink][inplace], params, entries, - ggml_nrows(dst)); + ggml_nrows(dst)); } // Returns true if node has enqueued work into the queue, false otherwise @@ -1180,27 +1206,29 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str WEBGPU_CPU_PROFILE_TOTAL_START(graph_compute); WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_encode); - std::vector commands; - std::vector wait_infos; + std::vector commands; + std::vector futures; for (int i = 0; i < cgraph->n_nodes; i++) { if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) { commands.push_back(*cmd); } if (commands.size() >= WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) { - std::vector new_infos = ggml_backend_webgpu_submit(ctx, commands); - wait_infos.insert(wait_infos.end(), new_infos.begin(), new_infos.end()); + std::vector new_futures = ggml_backend_webgpu_submit(ctx, commands); + // check if previous futures have finished + ggml_backend_webgpu_wait(ctx, futures, 1); + futures.insert(futures.end(), new_futures.begin(), new_futures.end()); commands.clear(); } } if (!commands.empty()) { - std::vector new_infos = ggml_backend_webgpu_submit(ctx, commands); - wait_infos.insert(wait_infos.end(), new_infos.begin(), new_infos.end()); + std::vector new_futures = ggml_backend_webgpu_submit(ctx, commands); + futures.insert(futures.end(), new_futures.begin(), new_futures.end()); commands.clear(); } WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_encode, ctx); WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_wait); - ggml_backend_webgpu_wait(ctx, wait_infos); + ggml_backend_webgpu_wait(ctx, futures); WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_wait, ctx); WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx); @@ -1298,12 +1326,12 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, // wait for WriteBuffer to complete webgpu_ctx->instance.WaitAny( webgpu_ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::AllowSpontaneous, - [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { - if (status != wgpu::QueueWorkDoneStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", - std::string(message).c_str()); - } - }), + [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { + if (status != wgpu::QueueWorkDoneStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", + std::string(message).c_str()); + } + }), UINT64_MAX); } WEBGPU_CPU_PROFILE_TOTAL_END(set_tensor, webgpu_ctx); @@ -2061,7 +2089,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t [](const wgpu::Device & device, wgpu::ErrorType reason, wgpu::StringView message) { GGML_UNUSED(device); GGML_ABORT("ggml_webgpu: Device error! 
Reason: %d, Message: %s\n", static_cast(reason), - std::string(message).c_str()); + std::string(message).c_str()); }); ctx->instance.WaitAny(ctx->adapter.RequestDevice( &dev_desc, wgpu::CallbackMode::AllowSpontaneous, From 98d98a2a5fd5d26c005acc7411ca7729fb3fd532 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Sun, 5 Oct 2025 16:53:41 -0700 Subject: [PATCH 05/15] try new method of waiting on futures --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 764690fb388f5..3481301d3db8b 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -298,20 +298,19 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device, // Wait for the queue to finish processing all submitted work static void ggml_backend_webgpu_wait(webgpu_context & ctx, - std::vector & futures, + std::vector> & futures, uint64_t timeout_ms = UINT64_MAX) { // WebGPU implementations may limit the number of futures that can be waited on at once, // so wait in batches (64 is what Dawn supports). size_t i = 0; while (i < futures.size()) { - size_t end = std::min(i + WEBGPU_WAIT_ANY_BATCH_SIZE, futures.size()); - auto waitStatus = ctx->instance.WaitAny(end - i, futures.data() + i, timeout_ms); + auto waitStatus = ctx->instance.WaitAny(futures[i].size(), futures[i].data(), timeout_ms); switch (waitStatus) { case wgpu::WaitStatus::Success: - futures.erase(futures.begin() + i, futures.begin() + end); + futures.erase(futures.begin() + i); break; case wgpu::WaitStatus::TimedOut: - i += WEBGPU_WAIT_ANY_BATCH_SIZE; + i++; break; case wgpu::WaitStatus::Error: GGML_LOG_ERROR("ggml_webgpu: WaitAny returned an error\n"); @@ -538,7 +537,7 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx, uint32_t wg_x = ((size + 3) + bytes_per_wg - 1) / bytes_per_wg; webgpu_command command = ggml_backend_webgpu_build(ctx, ctx->memset_pipeline, params, entries, wg_x); - auto futures = ggml_backend_webgpu_submit(ctx, { command }); + std::vector> futures = { ggml_backend_webgpu_submit(ctx, { command }) }; ggml_backend_webgpu_wait(ctx, futures); } @@ -1207,7 +1206,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_encode); std::vector commands; - std::vector futures; + std::vector> futures; for (int i = 0; i < cgraph->n_nodes; i++) { if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) { commands.push_back(*cmd); @@ -1215,15 +1214,14 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str if (commands.size() >= WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) { std::vector new_futures = ggml_backend_webgpu_submit(ctx, commands); // check if previous futures have finished - ggml_backend_webgpu_wait(ctx, futures, 1); - futures.insert(futures.end(), new_futures.begin(), new_futures.end()); + ggml_backend_webgpu_wait(ctx, futures); + futures.push_back({ new_futures }); commands.clear(); } } if (!commands.empty()) { std::vector new_futures = ggml_backend_webgpu_submit(ctx, commands); - futures.insert(futures.end(), new_futures.begin(), new_futures.end()); - commands.clear(); + futures.push_back({ new_futures }); } WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_encode, ctx); From 26c44f8e4edf5527652db48606df3c54385b8e0f Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Sun, 5 Oct 2025 20:15:04 -0700 Subject: [PATCH 06/15] Add serializing of command 
submission in some cases --- .github/workflows/build.yml | 2 +- ggml/CMakeLists.txt | 3 ++- ggml/src/ggml-webgpu/CMakeLists.txt | 4 +++ ggml/src/ggml-webgpu/ggml-webgpu.cpp | 40 +++++++++++++++------------- 4 files changed, 29 insertions(+), 20 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index db885907cdaed..c24b11d3bc088 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -485,7 +485,7 @@ jobs: id: cmake_build run: | export Dawn_DIR=dawn/lib64/cmake/Dawn - cmake -B build -DGGML_WEBGPU=ON + cmake -B build -DGGML_WEBGPU=ON -DGGML_WEBGPU_SERIALIZE_SUBMIT=ON cmake --build build --config Release -j $(nproc) - name: Test diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 67d02e73f6767..f8bccd5a657b6 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -221,9 +221,10 @@ option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug in option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) option(GGML_WEBGPU "ggml: use WebGPU" OFF) +option(GGML_WEBGPU_SERIALIZE_SUBMIT "ggml: enable WebGPU command serialization" OFF) option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF) option(GGML_WEBGPU_CPU_PROFILE "ggml: enable WebGPU profiling (CPU)" OFF) -option(GGML_WEBGPU_GPU_PROFILE "ggml: enable WebGPU profiling (GPU)" OFF) +option(GGML_WEBGPU_GPU_PROFILE "ggml: enable WebGPU profiling (GPU)" OFF) option(GGML_ZDNN "ggml: use zDNN" OFF) option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index c6a95d5151245..d45ce0acffba7 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -46,6 +46,10 @@ else() set(DawnWebGPU_TARGET dawn::webgpu_dawn) endif() +if (GGML_WEBGPU_SERIALIZE_SUBMIT) + target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_SERIALIZE_SUBMIT=1) +endif() + if (GGML_WEBGPU_DEBUG) target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_DEBUG=1) endif() diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 3481301d3db8b..2487e1b5560a4 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -56,10 +56,16 @@ # define WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES 16 // e.g. enough for two timestamps #endif +#ifdef GGML_WEBGPU_SERIALIZE_SUBMIT +# define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 1 +# define WEBGPU_WAIT_ANY_TIMEOUT_MS UINT64_MAX +#else +# define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8 +# define WEBGPU_WAIT_ANY_TIMEOUT_MS 1 +#endif + /* Constants */ -#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8 -#define WEBGPU_WAIT_ANY_BATCH_SIZE 64 #define WEBGPU_MUL_MAT_WG_SIZE 256 #define WEBGPU_NUM_PARAM_BUFS 100 #define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters @@ -297,14 +303,12 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device, /** WebGPU Actions */ // Wait for the queue to finish processing all submitted work -static void ggml_backend_webgpu_wait(webgpu_context & ctx, +static void ggml_backend_webgpu_wait(webgpu_context & ctx, std::vector> & futures, - uint64_t timeout_ms = UINT64_MAX) { - // WebGPU implementations may limit the number of futures that can be waited on at once, - // so wait in batches (64 is what Dawn supports). 
+ uint64_t timeout_ms = UINT64_MAX) { size_t i = 0; while (i < futures.size()) { - auto waitStatus = ctx->instance.WaitAny(futures[i].size(), futures[i].data(), timeout_ms); + auto waitStatus = ctx->instance.WaitAny(futures[i].size(), futures[i].data(), timeout_ms); switch (waitStatus) { case wgpu::WaitStatus::Success: futures.erase(futures.begin() + i); @@ -381,25 +385,25 @@ static void ggml_backend_webgpu_debug(webgpu_context & ctx) { #endif static std::vector ggml_backend_webgpu_submit(webgpu_context ctx, - std::vector builds) { - std::vector commands; + std::vector commands) { + std::vector command_buffers; std::vector params_bufs; std::vector set_rows_error_bufs; #ifdef GGML_WEBGPU_GPU_PROFILE std::vector> pipeline_name_and_ts_bufs; #endif - for (const auto & build : builds) { - commands.push_back(build.commands); - params_bufs.push_back(build.params_bufs); - if (build.set_rows_error_bufs) { - set_rows_error_bufs.push_back(build.set_rows_error_bufs.value()); + for (const auto & command : commands) { + command_buffers.push_back(command.commands); + params_bufs.push_back(command.params_bufs); + if (command.set_rows_error_bufs) { + set_rows_error_bufs.push_back(command.set_rows_error_bufs.value()); } #ifdef GGML_WEBGPU_GPU_PROFILE - pipeline_name_and_ts_bufs.push_back({ build.pipeline_name, build.timestamp_query_bufs }); + pipeline_name_and_ts_bufs.push_back({ command.pipeline_name, command.timestamp_query_bufs }); #endif } - ctx->queue.Submit(commands.size(), commands.data()); + ctx->queue.Submit(command_buffers.size(), command_buffers.data()); std::vector futures; @@ -1205,7 +1209,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str WEBGPU_CPU_PROFILE_TOTAL_START(graph_compute); WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_encode); - std::vector commands; + std::vector commands; std::vector> futures; for (int i = 0; i < cgraph->n_nodes; i++) { if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) { @@ -1214,7 +1218,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str if (commands.size() >= WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) { std::vector new_futures = ggml_backend_webgpu_submit(ctx, commands); // check if previous futures have finished - ggml_backend_webgpu_wait(ctx, futures); + ggml_backend_webgpu_wait(ctx, futures, WEBGPU_WAIT_ANY_TIMEOUT_MS); futures.push_back({ new_futures }); commands.clear(); } From eabab9e0577304e30429cf2fcf652adea7e1926c Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 6 Oct 2025 10:53:55 -0700 Subject: [PATCH 07/15] Add new pool for timestamp queries and clean up logging --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 149 +++++++++++++++++---------- 1 file changed, 92 insertions(+), 57 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 2487e1b5560a4..76344d341fd93 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -36,7 +36,7 @@ std::chrono::duration(cpu_total_end_##id - cpu_total_start_##id).count(); \ (ctx)->cpu_time_ms[#id] += cpu_total_time_##id; -// fine-grained timing (diagnostics only) +// fine-grained timing (not included in totals) # define WEBGPU_CPU_PROFILE_DETAIL_START(id) auto cpu_detail_start_##id = std::chrono::high_resolution_clock::now(); # define WEBGPU_CPU_PROFILE_DETAIL_END(id, ctx) \ @@ -52,7 +52,7 @@ #endif // GGML_WEBGPU_CPU_PROFILE #ifdef GGML_WEBGPU_GPU_PROFILE -# define WEBGPU_NUM_TIMESTAMP_QUERY_BUFS 100 +# define WEBGPU_NUM_TIMESTAMP_QUERY_BUFS 24 # define 
WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES 16 // e.g. enough for two timestamps #endif @@ -67,7 +67,7 @@ /* Constants */ #define WEBGPU_MUL_MAT_WG_SIZE 256 -#define WEBGPU_NUM_PARAM_BUFS 100 +#define WEBGPU_NUM_PARAM_BUFS 32 #define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters #define WEBGPU_NUM_SET_ROWS_ERROR_BUFS 32 #define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4 @@ -149,6 +149,68 @@ struct webgpu_buf_pool { } }; +#ifdef GGML_WEBGPU_GPU_PROFILE +struct webgpu_gpu_profile_bufs { + wgpu::Buffer host_buf; + wgpu::Buffer dev_buf; + wgpu::QuerySet query_set; +}; + +// Holds a pool of parameter buffers for WebGPU operations +struct webgpu_gpu_profile_buf_pool { + std::vector free; + + std::mutex mutex; + + std::condition_variable cv; + + void init(wgpu::Device device, + int num_bufs, + size_t buf_size, + wgpu::BufferUsage dev_buf_usage, + wgpu::BufferUsage host_buf_usage) { + for (int i = 0; i < num_bufs; i++) { + wgpu::Buffer host_buf; + wgpu::Buffer dev_buf; + ggml_webgpu_create_buffer(device, host_buf, buf_size, host_buf_usage, "ggml_webgpu_host_profile_buf"); + ggml_webgpu_create_buffer(device, dev_buf, buf_size, dev_buf_usage, "ggml_webgpu_dev_profile_buf"); + // Create a query set for 2 timestamps + wgpu::QuerySetDescriptor ts_query_set_desc = {}; + + ts_query_set_desc.type = wgpu::QueryType::Timestamp; + ts_query_set_desc.count = 2; + wgpu::QuerySet ts_query_set = device.CreateQuerySet(&ts_query_set_desc); + + free.push_back({ host_buf, dev_buf, ts_query_set }); + } + } + + webgpu_gpu_profile_bufs alloc_bufs() { + std::unique_lock lock(mutex); + cv.wait(lock, [this] { return !free.empty(); }); + webgpu_gpu_profile_bufs bufs = free.back(); + free.pop_back(); + return bufs; + } + + void free_bufs(std::vector bufs) { + std::lock_guard lock(mutex); + free.insert(free.end(), bufs.begin(), bufs.end()); + cv.notify_all(); + } + + void cleanup() { + std::lock_guard lock(mutex); + for (auto & bufs : free) { + bufs.host_buf.Destroy(); + bufs.dev_buf.Destroy(); + bufs.query_set.Destroy(); + } + free.clear(); + } +}; +#endif + struct webgpu_pipeline { wgpu::ComputePipeline pipeline; std::string name; @@ -159,8 +221,8 @@ struct webgpu_command { webgpu_pool_bufs params_bufs; std::optional set_rows_error_bufs; #ifdef GGML_WEBGPU_GPU_PROFILE - webgpu_pool_bufs timestamp_query_bufs; - std::string pipeline_name; + webgpu_gpu_profile_bufs timestamp_query_bufs; + std::string pipeline_name; #endif }; @@ -218,7 +280,7 @@ struct webgpu_context_struct { // Profiling: per-shader GPU time in ms std::unordered_map shader_gpu_time_ms; // Profiling: pool of timestamp query buffers (one per operation) - webgpu_buf_pool timestamp_query_buf_pool; + webgpu_gpu_profile_buf_pool timestamp_query_buf_pool; #endif }; @@ -259,8 +321,6 @@ static void ggml_webgpu_create_pipeline(wgpu::Device & const char * shader_code, const char * label, const std::vector & constants = {}) { - WEBGPU_LOG_DEBUG("ggml_webgpu_create_pipeline()"); - wgpu::ShaderSourceWGSL shader_source; shader_source.code = shader_code; @@ -286,8 +346,6 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device, size_t size, wgpu::BufferUsage usage, const char * label) { - WEBGPU_LOG_DEBUG("ggml_webgpu_create_buffer()"); - wgpu::BufferDescriptor buffer_desc; buffer_desc.size = size; buffer_desc.usage = usage; @@ -326,28 +384,6 @@ static void ggml_backend_webgpu_wait(webgpu_context & } } -#ifdef GGML_WEBGPU_GPU_PROFILE -static wgpu::FutureWaitInfo ggml_backend_webgpu_process_timestamps(webgpu_context & ctx, - webgpu_pool_bufs ts_bufs, - std::string 
label) { - wgpu::Future f = ts_bufs.host_buf.MapAsync( - wgpu::MapMode::Read, 0, ts_bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous, - [ctx, ts_bufs, label](wgpu::MapAsyncStatus status, wgpu::StringView message) { - if (status != wgpu::MapAsyncStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to map timestamp buffer: %s\n", std::string(message).c_str()); - } else { - const uint64_t * ts_data = (const uint64_t *) ts_bufs.host_buf.GetConstMappedRange(); - // WebGPU timestamps are in ticks; convert to ms using device timestamp period if available - double elapsed_ms = double(ts_data[1] - ts_data[0]) * 1e-6; // TODO: use actual timestamp period - ctx->shader_gpu_time_ms[label] += elapsed_ms; - // We can't unmap in here due to WebGPU reentrancy limitations. - ctx->timestamp_query_buf_pool.free_bufs({ ts_bufs }); - } - }); - return { f }; -} -#endif - static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx, wgpu::Buffer & buffer, wgpu::MapMode mode, @@ -390,7 +426,7 @@ static std::vector ggml_backend_webgpu_submit(webgpu_conte std::vector params_bufs; std::vector set_rows_error_bufs; #ifdef GGML_WEBGPU_GPU_PROFILE - std::vector> pipeline_name_and_ts_bufs; + std::vector> pipeline_name_and_ts_bufs; #endif for (const auto & command : commands) { @@ -399,9 +435,6 @@ static std::vector ggml_backend_webgpu_submit(webgpu_conte if (command.set_rows_error_bufs) { set_rows_error_bufs.push_back(command.set_rows_error_bufs.value()); } -#ifdef GGML_WEBGPU_GPU_PROFILE - pipeline_name_and_ts_bufs.push_back({ command.pipeline_name, command.timestamp_query_bufs }); -#endif } ctx->queue.Submit(command_buffers.size(), command_buffers.data()); @@ -437,9 +470,25 @@ static std::vector ggml_backend_webgpu_submit(webgpu_conte } #ifdef GGML_WEBGPU_GPU_PROFILE - for (const auto & name_and_bufs : pipeline_name_and_ts_bufs) { - wgpu::FutureWaitInfo f = ggml_backend_webgpu_process_timestamps(ctx, name_and_bufs.second, name_and_bufs.first); - futures.push_back(f); + for (const auto & command : commands) { + auto label = command.pipeline_name; + auto ts_bufs = command.timestamp_query_bufs; + + wgpu::Future f = ts_bufs.host_buf.MapAsync( + wgpu::MapMode::Read, 0, ts_bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous, + [ctx, ts_bufs, label](wgpu::MapAsyncStatus status, wgpu::StringView message) { + if (status != wgpu::MapAsyncStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to map timestamp buffer: %s\n", std::string(message).c_str()); + } else { + const uint64_t * ts_data = (const uint64_t *) ts_bufs.host_buf.GetConstMappedRange(); + // WebGPU timestamps are in ns; convert to ms + double elapsed_ms = double(ts_data[1] - ts_data[0]) * 1e-6; + ctx->shader_gpu_time_ms[label] += elapsed_ms; + // We can't unmap in here due to WebGPU reentrancy limitations. 
+ ctx->timestamp_query_buf_pool.free_bufs({ ts_bufs }); + } + }); + futures.push_back({ f }); } #endif return futures; @@ -480,18 +529,12 @@ static webgpu_command ggml_backend_webgpu_build(webgpu_context & #ifdef GGML_WEBGPU_GPU_PROFILE // --- Profiling: GPU timestamp queries --- // Allocate a timestamp query buffer (2 timestamps: start/end) - webgpu_pool_bufs ts_bufs = ctx->timestamp_query_buf_pool.alloc_bufs(); + webgpu_gpu_profile_bufs ts_bufs = ctx->timestamp_query_buf_pool.alloc_bufs(); if (ts_bufs.host_buf.GetMapState() == wgpu::BufferMapState::Mapped) { ts_bufs.host_buf.Unmap(); } - // Create a query set for 2 timestamps - wgpu::QuerySetDescriptor ts_query_set_desc = {}; - ts_query_set_desc.type = wgpu::QueryType::Timestamp; - ts_query_set_desc.count = 2; - wgpu::QuerySet ts_query_set = ctx->device.CreateQuerySet(&ts_query_set_desc); - - wgpu::PassTimestampWrites ts_writes = { .querySet = ts_query_set, + wgpu::PassTimestampWrites ts_writes = { .querySet = ts_bufs.query_set, .beginningOfPassWriteIndex = 0, .endOfPassWriteIndex = 1 }; wgpu::ComputePassDescriptor pass_desc = { .timestampWrites = &ts_writes }; @@ -506,7 +549,7 @@ static webgpu_command ggml_backend_webgpu_build(webgpu_context & #ifdef GGML_WEBGPU_GPU_PROFILE // Resolve the query set into the device buffer - encoder.ResolveQuerySet(ts_query_set, 0, 2, ts_bufs.dev_buf, 0); + encoder.ResolveQuerySet(ts_bufs.query_set, 0, 2, ts_bufs.dev_buf, 0); encoder.CopyBufferToBuffer(ts_bufs.dev_buf, 0, ts_bufs.host_buf, 0, ts_bufs.host_buf.GetSize()); #endif @@ -1137,7 +1180,7 @@ static webgpu_command ggml_webgpu_soft_max(webgpu_context & ctx, ggml_nrows(dst)); } -// Returns true if node has enqueued work into the queue, false otherwise +// Returns the encoded command, or std::nullopt if the operation is a no-op static std::optional ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) { if (ggml_is_empty(node)) { return std::nullopt; @@ -1208,7 +1251,6 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str WEBGPU_CPU_PROFILE_TOTAL_START(graph_compute); - WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_encode); std::vector commands; std::vector> futures; for (int i = 0; i < cgraph->n_nodes; i++) { @@ -1227,14 +1269,8 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str std::vector new_futures = ggml_backend_webgpu_submit(ctx, commands); futures.push_back({ new_futures }); } - WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_encode, ctx); - - WEBGPU_CPU_PROFILE_DETAIL_START(graph_compute_wait); ggml_backend_webgpu_wait(ctx, futures); - WEBGPU_CPU_PROFILE_DETAIL_END(graph_compute_wait, ctx); - WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx); - return GGML_STATUS_SUCCESS; } @@ -1260,7 +1296,6 @@ static ggml_backend_i ggml_backend_webgpu_i = { /* GGML Backend Buffer Interface */ static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_free_buffer()"); ggml_backend_webgpu_buffer_context * ctx = static_cast(buffer->context); ctx->buffer.Destroy(); } From c30b22e2fd69fd0df55906b1407f3207bbfbdabe Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 6 Oct 2025 19:26:21 -0700 Subject: [PATCH 08/15] Serialize command submission in CI and leave a TODO note --- .github/workflows/build.yml | 2 +- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c24b11d3bc088..b326048c2fb18 100644 --- 
a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -175,7 +175,7 @@ jobs: id: cmake_build run: | export CMAKE_PREFIX_PATH=dawn - cmake -B build -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF + cmake -B build -DGGML_WEBGPU=ON -DGGML_WEBGPU_SERIALIZE_SUBMIT=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - name: Test diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 76344d341fd93..e80af29aa3066 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -56,6 +56,10 @@ # define WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES 16 // e.g. enough for two timestamps #endif +// TODO: The WebGPU backend can deadlock in multi-threaded scenarios if the parameter buffer pool +// is exhausted and the command submit batch size is too high, or in cases where the underlying +// WebGPU implementation has bugs in handling concurrent operations. Serializing command submission +// is a workaround, but we should also investigate better solutions. #ifdef GGML_WEBGPU_SERIALIZE_SUBMIT # define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 1 # define WEBGPU_WAIT_ANY_TIMEOUT_MS UINT64_MAX From b926b0c5f0f8b0ce66f182220c7bba7c30426521 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 6 Oct 2025 19:57:23 -0700 Subject: [PATCH 09/15] Update webgpu CI --- .github/workflows/build.yml | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b77da9f911a5d..5a37f6ebd1e80 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -444,8 +444,8 @@ jobs: # This is using llvmpipe and runs slower than other backends ctest -L main --verbose --timeout 4200 - ubuntu-22-cmake-webgpu: - runs-on: ubuntu-22.04 + ubuntu-24-cmake-webgpu: + runs-on: ubuntu-24.04 steps: - name: Clone @@ -455,16 +455,34 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: ubuntu-22-cmake-webgpu + key: ubuntu-24-cmake-webgpu evict-old-files: 1d - - name: Vulkan SDK Dependencies - id: vulkan-depends + - name: Dependencies + id: depends run: | - wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - - sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list + sudo add-apt-repository -y ppa:kisak/kisak-mesa sudo apt-get update -y - sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev + sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev + + - name: Get latest Vulkan SDK version + id: vulkan_sdk_version + run: | + echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV" + + - name: Use Vulkan SDK Cache + uses: actions/cache@v4 + id: cache-sdk + with: + path: ./vulkan_sdk + key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }} + + - name: Setup Vulkan SDK + if: steps.cache-sdk.outputs.cache-hit != 'true' + uses: ./.github/actions/linux-setup-vulkan + with: + path: ./vulkan_sdk + version: ${{ env.VULKAN_SDK_VERSION }} - name: Dawn Dependency id: dawn-depends From d501ef4e2e5c62b24ed6f8b79c9b9008ffe165f5 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 7 Oct 2025 09:07:03 -0700 Subject: [PATCH 10/15] Add myself as WebGPU codeowner --- CODEOWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/CODEOWNERS 
b/CODEOWNERS index 4ee93fa4babb2..bfffcf3fcdce5 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -74,6 +74,7 @@ /ggml/src/ggml.c @ggerganov @slaren /ggml/src/ggml.cpp @ggerganov @slaren /ggml/src/gguf.cpp @JohannesGaessler @Green-Sky +/ggml/src/ggml-webgpu/ @reeselevine /gguf-py/ @CISC /media/ @ggerganov /scripts/gen* @ggerganov From d56e0613fb8ed6e18af1982b287c6b7448b782b4 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 7 Oct 2025 12:29:50 -0700 Subject: [PATCH 11/15] Deadlock avoidance --- .github/workflows/build.yml | 4 +- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 59 +++++++++++++++++++--------- 2 files changed, 42 insertions(+), 21 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5a37f6ebd1e80..ec2ab5a58d027 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -175,7 +175,7 @@ jobs: id: cmake_build run: | export CMAKE_PREFIX_PATH=dawn - cmake -B build -DGGML_WEBGPU=ON -DGGML_WEBGPU_SERIALIZE_SUBMIT=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF + cmake -B build -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -502,7 +502,7 @@ jobs: id: cmake_build run: | export Dawn_DIR=dawn/lib64/cmake/Dawn - cmake -B build -DGGML_WEBGPU=ON -DGGML_WEBGPU_SERIALIZE_SUBMIT=ON + cmake -B build -DGGML_WEBGPU=ON cmake --build build --config Release -j $(nproc) - name: Test diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index e80af29aa3066..c3961726f19b2 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -11,6 +11,7 @@ #include +#include #include #include #include @@ -65,13 +66,15 @@ # define WEBGPU_WAIT_ANY_TIMEOUT_MS UINT64_MAX #else # define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8 -# define WEBGPU_WAIT_ANY_TIMEOUT_MS 1 +# define WEBGPU_WAIT_ANY_TIMEOUT_MS 0 #endif /* Constants */ #define WEBGPU_MUL_MAT_WG_SIZE 256 #define WEBGPU_NUM_PARAM_BUFS 32 +// Maximum number of in-flight submissions per-thread, to avoid exhausting the parameter buffer pool +#define WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD WEBGPU_NUM_PARAM_BUFS / WEBGPU_COMMAND_SUBMIT_BATCH_SIZE #define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters #define WEBGPU_NUM_SET_ROWS_ERROR_BUFS 32 #define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4 @@ -107,6 +110,11 @@ struct webgpu_pool_bufs { wgpu::Buffer dev_buf; }; +// The futures to wait on for a single queue submission +struct webgpu_submission_futures { + std::vector futures; +}; + // Holds a pool of parameter buffers for WebGPU operations struct webgpu_buf_pool { std::vector free; @@ -243,6 +251,7 @@ struct webgpu_context_struct { uint32_t max_wg_size_x; std::recursive_mutex mutex; + std::atomic_int inflight_threads = 0; webgpu_buf_pool param_buf_pool; webgpu_buf_pool set_rows_error_buf_pool; @@ -365,12 +374,19 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device, /** WebGPU Actions */ // Wait for the queue to finish processing all submitted work -static void ggml_backend_webgpu_wait(webgpu_context & ctx, - std::vector> & futures, - uint64_t timeout_ms = UINT64_MAX) { +static void ggml_backend_webgpu_wait(webgpu_context & ctx, + std::vector & futures, + uint64_t timeout_ms = UINT64_MAX) { + // If we have too many in-flight submissions, wait on the oldest one first. If there are many threads, + // inflight_max may be 0, meaning that we must wait on all futures. 
+ int inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / ctx->inflight_threads; + while (futures.size() >= inflight_max && futures.size() > 0) { + ctx->instance.WaitAny(futures[0].futures.size(), futures[0].futures.data(), UINT64_MAX); + futures.erase(futures.begin()); + } size_t i = 0; while (i < futures.size()) { - auto waitStatus = ctx->instance.WaitAny(futures[i].size(), futures[i].data(), timeout_ms); + auto waitStatus = ctx->instance.WaitAny(futures[i].futures.size(), futures[i].futures.data(), timeout_ms); switch (waitStatus) { case wgpu::WaitStatus::Success: futures.erase(futures.begin() + i); @@ -424,8 +440,7 @@ static void ggml_backend_webgpu_debug(webgpu_context & ctx) { } #endif -static std::vector ggml_backend_webgpu_submit(webgpu_context ctx, - std::vector commands) { +static webgpu_submission_futures ggml_backend_webgpu_submit(webgpu_context ctx, std::vector commands) { std::vector command_buffers; std::vector params_bufs; std::vector set_rows_error_bufs; @@ -484,9 +499,9 @@ static std::vector ggml_backend_webgpu_submit(webgpu_conte if (status != wgpu::MapAsyncStatus::Success) { GGML_LOG_ERROR("ggml_webgpu: Failed to map timestamp buffer: %s\n", std::string(message).c_str()); } else { - const uint64_t * ts_data = (const uint64_t *) ts_bufs.host_buf.GetConstMappedRange(); + const uint64_t * ts_data = (const uint64_t *) ts_bufs.host_buf.GetConstMappedRange(); // WebGPU timestamps are in ns; convert to ms - double elapsed_ms = double(ts_data[1] - ts_data[0]) * 1e-6; + double elapsed_ms = double(ts_data[1] - ts_data[0]) * 1e-6; ctx->shader_gpu_time_ms[label] += elapsed_ms; // We can't unmap in here due to WebGPU reentrancy limitations. ctx->timestamp_query_buf_pool.free_bufs({ ts_bufs }); @@ -495,7 +510,7 @@ static std::vector ggml_backend_webgpu_submit(webgpu_conte futures.push_back({ f }); } #endif - return futures; + return { futures }; } static webgpu_command ggml_backend_webgpu_build(webgpu_context & ctx, @@ -588,7 +603,7 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx, uint32_t wg_x = ((size + 3) + bytes_per_wg - 1) / bytes_per_wg; webgpu_command command = ggml_backend_webgpu_build(ctx, ctx->memset_pipeline, params, entries, wg_x); - std::vector> futures = { ggml_backend_webgpu_submit(ctx, { command }) }; + std::vector futures = { ggml_backend_webgpu_submit(ctx, { command }) }; ggml_backend_webgpu_wait(ctx, futures); } @@ -1255,25 +1270,31 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str WEBGPU_CPU_PROFILE_TOTAL_START(graph_compute); - std::vector commands; - std::vector> futures; + ctx->inflight_threads++; + + std::vector commands; + std::vector futures; for (int i = 0; i < cgraph->n_nodes; i++) { if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) { commands.push_back(*cmd); } - if (commands.size() >= WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) { - std::vector new_futures = ggml_backend_webgpu_submit(ctx, commands); - // check if previous futures have finished + // compute the batch size based on the number of inflight threads + int batch_size = std::min(std::max(1, WEBGPU_NUM_PARAM_BUFS / ctx->inflight_threads), + WEBGPU_COMMAND_SUBMIT_BATCH_SIZE); + if (commands.size() >= batch_size) { + futures.push_back(ggml_backend_webgpu_submit(ctx, commands)); + // Process events and check for completed submissions + ctx->instance.ProcessEvents(); ggml_backend_webgpu_wait(ctx, futures, WEBGPU_WAIT_ANY_TIMEOUT_MS); - futures.push_back({ new_futures }); commands.clear(); } } if (!commands.empty()) { - std::vector 
new_futures = ggml_backend_webgpu_submit(ctx, commands); - futures.push_back({ new_futures }); + webgpu_submission_futures new_futures = ggml_backend_webgpu_submit(ctx, commands); + futures.push_back(new_futures); } ggml_backend_webgpu_wait(ctx, futures); + ctx->inflight_threads--; WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx); return GGML_STATUS_SUCCESS; } From bcf8d9bdfbd23de0c075ea0345ea7556f8251f2d Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 7 Oct 2025 12:46:34 -0700 Subject: [PATCH 12/15] Leave WebGPU/Vulkan CI serialized --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ec2ab5a58d027..22db2192c40fb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -502,7 +502,7 @@ jobs: id: cmake_build run: | export Dawn_DIR=dawn/lib64/cmake/Dawn - cmake -B build -DGGML_WEBGPU=ON + cmake -B build -DGGML_WEBGPU=ON -DGGML_WEBGPU_SERIALIZE_SUBMIT=ON cmake --build build --config Release -j $(nproc) - name: Test From 8a848cb093922fa7b407833dfd145c86b83dcb49 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 7 Oct 2025 12:48:13 -0700 Subject: [PATCH 13/15] Fix divide by 0 --- .github/workflows/build.yml | 2 +- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 22db2192c40fb..ec2ab5a58d027 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -502,7 +502,7 @@ jobs: id: cmake_build run: | export Dawn_DIR=dawn/lib64/cmake/Dawn - cmake -B build -DGGML_WEBGPU=ON -DGGML_WEBGPU_SERIALIZE_SUBMIT=ON + cmake -B build -DGGML_WEBGPU=ON cmake --build build --config Release -j $(nproc) - name: Test diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index c3961726f19b2..99dddd132343a 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -379,7 +379,7 @@ static void ggml_backend_webgpu_wait(webgpu_context & ct uint64_t timeout_ms = UINT64_MAX) { // If we have too many in-flight submissions, wait on the oldest one first. If there are many threads, // inflight_max may be 0, meaning that we must wait on all futures. - int inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / ctx->inflight_threads; + int inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::min(ctx->inflight_threads, 1); while (futures.size() >= inflight_max && futures.size() > 0) { ctx->instance.WaitAny(futures[0].futures.size(), futures[0].futures.data(), UINT64_MAX); futures.erase(futures.begin()); From d3c7ddd4ce5b8bc622a7d07e60e91137e841f5cc Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 7 Oct 2025 13:24:50 -0700 Subject: [PATCH 14/15] Fix logic in division by inflight_threads --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 99dddd132343a..db9bfc3b77431 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -62,17 +62,17 @@ // WebGPU implementation has bugs in handling concurrent operations. Serializing command submission // is a workaround, but we should also investigate better solutions. 
#ifdef GGML_WEBGPU_SERIALIZE_SUBMIT -# define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 1 +# define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 1u # define WEBGPU_WAIT_ANY_TIMEOUT_MS UINT64_MAX #else -# define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8 +# define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8u # define WEBGPU_WAIT_ANY_TIMEOUT_MS 0 #endif /* Constants */ #define WEBGPU_MUL_MAT_WG_SIZE 256 -#define WEBGPU_NUM_PARAM_BUFS 32 +#define WEBGPU_NUM_PARAM_BUFS 32u // Maximum number of in-flight submissions per-thread, to avoid exhausting the parameter buffer pool #define WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD WEBGPU_NUM_PARAM_BUFS / WEBGPU_COMMAND_SUBMIT_BATCH_SIZE #define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters @@ -251,7 +251,7 @@ struct webgpu_context_struct { uint32_t max_wg_size_x; std::recursive_mutex mutex; - std::atomic_int inflight_threads = 0; + std::atomic_uint inflight_threads = 0; webgpu_buf_pool param_buf_pool; webgpu_buf_pool set_rows_error_buf_pool; @@ -379,7 +379,8 @@ static void ggml_backend_webgpu_wait(webgpu_context & ct uint64_t timeout_ms = UINT64_MAX) { // If we have too many in-flight submissions, wait on the oldest one first. If there are many threads, // inflight_max may be 0, meaning that we must wait on all futures. - int inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::min(ctx->inflight_threads, 1); + uint inflight_threads = ctx->inflight_threads; + uint inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u); while (futures.size() >= inflight_max && futures.size() > 0) { ctx->instance.WaitAny(futures[0].futures.size(), futures[0].futures.data(), UINT64_MAX); futures.erase(futures.begin()); @@ -1279,8 +1280,9 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str commands.push_back(*cmd); } // compute the batch size based on the number of inflight threads - int batch_size = std::min(std::max(1, WEBGPU_NUM_PARAM_BUFS / ctx->inflight_threads), - WEBGPU_COMMAND_SUBMIT_BATCH_SIZE); + uint inflight_threads = ctx->inflight_threads; + uint batch_size = std::min(std::max(1u, WEBGPU_NUM_PARAM_BUFS / std::max(inflight_threads, 1u)), + WEBGPU_COMMAND_SUBMIT_BATCH_SIZE); if (commands.size() >= batch_size) { futures.push_back(ggml_backend_webgpu_submit(ctx, commands)); // Process events and check for completed submissions From a5e26c2d3bdd93c065e638838a278d74a213de4d Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 7 Oct 2025 13:39:07 -0700 Subject: [PATCH 15/15] Update CODEOWNERS and remove serialize submit option --- CODEOWNERS | 2 +- ggml/CMakeLists.txt | 1 - ggml/src/ggml-webgpu/CMakeLists.txt | 4 ---- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 23 +++++++---------------- 4 files changed, 8 insertions(+), 22 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index bfffcf3fcdce5..3b696bf94a147 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -70,11 +70,11 @@ /ggml/src/ggml-rpc/ @rgerganov /ggml/src/ggml-threading.* @ggerganov @slaren /ggml/src/ggml-vulkan/ @0cc4m +/ggml/src/ggml-webgpu/ @reeselevine /ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM /ggml/src/ggml.c @ggerganov @slaren /ggml/src/ggml.cpp @ggerganov @slaren /ggml/src/gguf.cpp @JohannesGaessler @Green-Sky -/ggml/src/ggml-webgpu/ @reeselevine /gguf-py/ @CISC /media/ @ggerganov /scripts/gen* @ggerganov diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index f8bccd5a657b6..73032be68e153 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -221,7 +221,6 @@ option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug in 
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) option(GGML_WEBGPU "ggml: use WebGPU" OFF) -option(GGML_WEBGPU_SERIALIZE_SUBMIT "ggml: enable WebGPU command serialization" OFF) option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF) option(GGML_WEBGPU_CPU_PROFILE "ggml: enable WebGPU profiling (CPU)" OFF) option(GGML_WEBGPU_GPU_PROFILE "ggml: enable WebGPU profiling (GPU)" OFF) diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index d45ce0acffba7..c6a95d5151245 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -46,10 +46,6 @@ else() set(DawnWebGPU_TARGET dawn::webgpu_dawn) endif() -if (GGML_WEBGPU_SERIALIZE_SUBMIT) - target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_SERIALIZE_SUBMIT=1) -endif() - if (GGML_WEBGPU_DEBUG) target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_DEBUG=1) endif() diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index db9bfc3b77431..05e16cd432ad3 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -57,22 +57,12 @@ # define WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES 16 // e.g. enough for two timestamps #endif -// TODO: The WebGPU backend can deadlock in multi-threaded scenarios if the parameter buffer pool -// is exhausted and the command submit batch size is too high, or in cases where the underlying -// WebGPU implementation has bugs in handling concurrent operations. Serializing command submission -// is a workaround, but we should also investigate better solutions. -#ifdef GGML_WEBGPU_SERIALIZE_SUBMIT -# define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 1u -# define WEBGPU_WAIT_ANY_TIMEOUT_MS UINT64_MAX -#else -# define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8u -# define WEBGPU_WAIT_ANY_TIMEOUT_MS 0 -#endif - /* Constants */ #define WEBGPU_MUL_MAT_WG_SIZE 256 #define WEBGPU_NUM_PARAM_BUFS 32u +#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8u +#define WEBGPU_WAIT_ANY_TIMEOUT_MS 0 // Maximum number of in-flight submissions per-thread, to avoid exhausting the parameter buffer pool #define WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD WEBGPU_NUM_PARAM_BUFS / WEBGPU_COMMAND_SUBMIT_BATCH_SIZE #define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters @@ -376,11 +366,12 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device, // Wait for the queue to finish processing all submitted work static void ggml_backend_webgpu_wait(webgpu_context & ctx, std::vector & futures, - uint64_t timeout_ms = UINT64_MAX) { + bool block = true) { // If we have too many in-flight submissions, wait on the oldest one first. If there are many threads, // inflight_max may be 0, meaning that we must wait on all futures. - uint inflight_threads = ctx->inflight_threads; - uint inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u); + uint64_t timeout_ms = block ? 
UINT64_MAX : 0; + uint inflight_threads = ctx->inflight_threads; + uint inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u); while (futures.size() >= inflight_max && futures.size() > 0) { ctx->instance.WaitAny(futures[0].futures.size(), futures[0].futures.data(), UINT64_MAX); futures.erase(futures.begin()); @@ -1287,7 +1278,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str futures.push_back(ggml_backend_webgpu_submit(ctx, commands)); // Process events and check for completed submissions ctx->instance.ProcessEvents(); - ggml_backend_webgpu_wait(ctx, futures, WEBGPU_WAIT_ANY_TIMEOUT_MS); + ggml_backend_webgpu_wait(ctx, futures, false); commands.clear(); } }
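
Editorial note on the final throttling scheme (a standalone sketch, not part of the patch series): the last patches bound parameter-buffer usage by combining three quantities visible above -- WEBGPU_NUM_PARAM_BUFS (32), WEBGPU_COMMAND_SUBMIT_BATCH_SIZE (8), and the derived per-thread cap WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD (32 / 8 = 4) -- and by shrinking both the submit batch size and the in-flight cap as ctx->inflight_threads grows. The program below only reproduces that arithmetic; the "approx peak" column assumes each in-flight submission pins exactly one batch of parameter buffers, which is a simplification of the real pool behaviour.

    // Sketch only: mirrors the batch_size / inflight_max formulas from the final
    // ggml-webgpu.cpp to show how parameter-buffer pressure stays near the pool size.
    #include <algorithm>
    #include <cstdio>

    constexpr unsigned NUM_PARAM_BUFS               = 32u;
    constexpr unsigned COMMAND_SUBMIT_BATCH_SIZE    = 8u;
    constexpr unsigned MAX_INFLIGHT_SUBS_PER_THREAD = NUM_PARAM_BUFS / COMMAND_SUBMIT_BATCH_SIZE;  // 4

    int main() {
        for (unsigned threads = 1; threads <= 8; threads++) {
            // Per-thread submit batch shrinks as more threads run graph_compute concurrently.
            unsigned batch_size = std::min(std::max(1u, NUM_PARAM_BUFS / std::max(threads, 1u)),
                                           COMMAND_SUBMIT_BATCH_SIZE);
            // Per-thread cap on un-waited submissions; 0 forces a blocking wait on every future.
            unsigned inflight_max = MAX_INFLIGHT_SUBS_PER_THREAD / std::max(threads, 1u);
            // Rough bound, assuming one batch of param buffers per in-flight submission.
            unsigned approx_peak = std::max(inflight_max, 1u) * batch_size * threads;
            printf("threads=%u batch_size=%u inflight_max=%u approx_peak_param_bufs=%u (pool=%u)\n",
                   threads, batch_size, inflight_max, approx_peak, NUM_PARAM_BUFS);
        }
        return 0;
    }

With one thread this evaluates to batch_size 8 and inflight_max 4 (4 x 8 = 32, the whole pool); with five or more threads inflight_max drops to 0, so every submission is waited on before the next one, which is close to the fully serialized behaviour the removed GGML_WEBGPU_SERIALIZE_SUBMIT option used to force.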
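
Background sketch (editorial, simplified types): the buffer pools above (webgpu_buf_pool, webgpu_gpu_profile_buf_pool) block in alloc_bufs() on a condition variable until free_bufs() runs from a completion callback. The toy program below reproduces that pattern with plain ints to make the failure mode behind the deadlock TODO concrete: if no other thread ever frees an entry, the blocked alloc never returns.

    // Sketch only: blocking pool in the style of webgpu_buf_pool, with ints instead of buffers.
    #include <condition_variable>
    #include <cstdio>
    #include <mutex>
    #include <thread>
    #include <vector>

    struct pool {
        std::vector<int>        free_items;
        std::mutex              mutex;
        std::condition_variable cv;

        void init(int n) {
            for (int i = 0; i < n; i++) {
                free_items.push_back(i);
            }
        }

        int alloc() {
            std::unique_lock<std::mutex> lock(mutex);
            cv.wait(lock, [this] { return !free_items.empty(); });  // blocks when exhausted
            int item = free_items.back();
            free_items.pop_back();
            return item;
        }

        void free_item(int item) {
            std::lock_guard<std::mutex> lock(mutex);
            free_items.push_back(item);
            cv.notify_all();
        }
    };

    int main() {
        pool p;
        p.init(2);  // tiny pool to make exhaustion visible

        int a = p.alloc();
        int b = p.alloc();
        printf("allocated %d and %d, pool now empty\n", a, b);

        // A third alloc() would block forever unless something frees an entry;
        // simulate the "completion callback" with a helper thread.
        std::thread completion([&] { p.free_item(a); });
        int c = p.alloc();  // unblocks once the helper frees an entry
        completion.join();
        printf("allocated %d after a free\n", c);
        return 0;
    }

In the backend, the role of the helper thread is played by the OnSubmittedWorkDone / MapAsync callbacks that return buffers to their pools, and those callbacks only fire once the corresponding futures are waited on or events are processed -- hence the inflight_threads accounting and the ProcessEvents() call added to graph_compute.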