From bb490153a84f33e98f1c86e712ffdc6ef0d59edf Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sat, 25 Oct 2025 16:16:05 -0500 Subject: [PATCH 1/2] vulkan: remove the need for the dryrun Allocate pipelines and descriptor sets when requested. Reallocate the prealloc buffers when needed, and flush any pending work before reallocating. For rms_partials and total_mul_mat_bytes, use the sizes computed the last time the graph was executed. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 195 +++++++++++---------------- 1 file changed, 76 insertions(+), 119 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 8d1a85c96939b..f4b0a2cefc0ea 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -539,9 +539,6 @@ struct vk_device_struct { bool mul_mat_id_m[GGML_TYPE_COUNT]; bool mul_mat_id_s[GGML_TYPE_COUNT]; - // set to true to indicate that some shaders need to be compiled after the dryrun - bool need_compiles {}; - vk::DescriptorSetLayout dsl; vk_matmul_pipeline pipeline_matmul_f32 {}; @@ -1408,6 +1405,10 @@ struct ggml_vk_garbage_collector { std::vector contexts; }; +static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx); +static void ggml_vk_load_shaders(vk_device& device); +static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx); + #if defined(GGML_VULKAN_MEMORY_DEBUG) || defined(GGML_VULKAN_DEBUG) #define VK_LOG_MEMORY(msg) std::cerr << "ggml_vulkan memory: " << msg << std::endl @@ -1561,8 +1562,11 @@ struct ggml_backend_vk_context { bool almost_ready_fence_pending {}; // Set before op_add and unset after op_rms_norm to indicate that the add should // write partial sums to accumulate the square of the vector components + bool do_add_rms_partials_offset_calculation; bool do_add_rms_partials; + uint64_t last_total_mul_mat_bytes {}; + // Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert. 
vk_pipeline_struct * prealloc_y_last_pipeline_used {}; const ggml_tensor * prealloc_y_last_tensor_used {}; @@ -1865,8 +1869,9 @@ static void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx, ctx->pipeline_descriptor_set_requirements += n; if (!pipeline->compiled) { pipeline->needed = true; - ctx->device->need_compiles = true; + ggml_vk_load_shaders(ctx->device); } + ggml_pipeline_allocate_descriptor_sets(ctx); } static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx) { @@ -1878,7 +1883,9 @@ static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx vk_device& device = ctx->device; - uint32_t to_alloc = ctx->pipeline_descriptor_set_requirements - ctx->descriptor_sets.size(); + // Grow by 50% to avoid frequent allocations + uint32_t needed = std::max(3 * ctx->descriptor_sets.size() / 2, size_t{ctx->pipeline_descriptor_set_requirements}); + uint32_t to_alloc = needed - ctx->descriptor_sets.size(); uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - ctx->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE; uint32_t pool_idx = ctx->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE; @@ -3916,7 +3923,6 @@ static void ggml_vk_load_shaders(vk_device& device) { for (auto &c : compiles) { c.wait(); } - device->need_compiles = false; } static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch); @@ -5020,6 +5026,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { ctx->prealloc_size_x = 0; ctx->prealloc_size_y = 0; ctx->prealloc_size_split_k = 0; + ctx->prealloc_size_add_rms_partials = 0; ctx->fence = ctx->device->device.createFence({}); ctx->almost_ready_fence = ctx->device->device.createFence({}); @@ -6205,6 +6212,7 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub } static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool disable_split_k, bool dryrun = false) { + GGML_UNUSED(dryrun); VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << ggml_type_name(src0->type) << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << ggml_type_name(src1->type) << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << ggml_type_name(dst->type) << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; @@ -6322,7 +6330,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1, true); } - if (dryrun) { + { const uint64_t x_sz_upd = x_sz * ne02 * ne03; uint64_t y_sz_upd = y_sz * ne12 * ne13; if (quantize_y) { @@ -6337,12 +6345,15 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub } if (qx_needs_dequant && 
ctx->prealloc_size_x < x_sz_upd) { ctx->prealloc_size_x = x_sz_upd; + ggml_vk_preallocate_buffers(ctx, subctx); } if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz_upd) { ctx->prealloc_size_y = y_sz_upd; + ggml_vk_preallocate_buffers(ctx, subctx); } if (split_k > 1 && ctx->prealloc_size_split_k < split_k_size) { ctx->prealloc_size_split_k = split_k_size; + ggml_vk_preallocate_buffers(ctx, subctx); } // Request descriptor sets @@ -6359,7 +6370,6 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub if (split_k > 1) { ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1); } - return; } vk_buffer d_D = dst_buf_ctx->dev_buffer; @@ -6516,6 +6526,7 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_ } static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { + GGML_UNUSED(dryrun); ggml_tensor * dst = cgraph->nodes[node_idx]; const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; @@ -6619,7 +6630,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& const uint64_t y_sz = quantize_y ? (y_ne * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne); const uint64_t d_sz = sizeof(float) * d_ne; - if (dryrun) { + { const uint64_t x_sz_upd = x_sz * ne02 * ne03; uint64_t y_sz_upd = y_sz * ne12 * ne13; if (quantize_y) { @@ -6632,9 +6643,11 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& } if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) { ctx->prealloc_size_x = x_sz_upd; + ggml_vk_preallocate_buffers(ctx, subctx); } if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz_upd) { ctx->prealloc_size_y = y_sz_upd; + ggml_vk_preallocate_buffers(ctx, subctx); } // Request descriptor sets @@ -6648,7 +6661,6 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1); } ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1); - return; } vk_buffer d_D; @@ -6807,6 +6819,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& } static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { + GGML_UNUSED(dryrun); ggml_tensor * dst = cgraph->nodes[node_idx]; const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; @@ -6859,10 +6872,9 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c gqa_ratio = 1; } - if (dryrun) { + { // Request descriptor sets ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1); - return; } vk_buffer d_D; @@ -6937,6 +6949,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c } static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { + GGML_UNUSED(dryrun); ggml_tensor * dst = cgraph->nodes[node_idx]; const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; @@ -6995,10 +7008,9 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con const uint64_t qy_sz = ggml_nbytes(src1); const uint64_t 
d_sz = sizeof(float) * d_ne; - if (dryrun) { + { // Request descriptor sets ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1); - return; } vk_buffer d_D; @@ -7123,6 +7135,7 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, c } static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, bool dryrun = false) { + GGML_UNUSED(dryrun); VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; @@ -7251,7 +7264,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1, true); } - if (dryrun) { + { const uint64_t x_sz_upd = x_sz * ne02 * ne03; uint64_t y_sz_upd = y_sz * ne12 * ne13; if (quantize_y) { @@ -7264,9 +7277,11 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& } if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) { ctx->prealloc_size_x = x_sz_upd; + ggml_vk_preallocate_buffers(ctx, subctx); } if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz_upd) { ctx->prealloc_size_y = y_sz_upd; + ggml_vk_preallocate_buffers(ctx, subctx); } // Request descriptor sets @@ -7280,7 +7295,6 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& if (quantize_y) { ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1); } - return; } vk_buffer d_D = dst_buf_ctx->dev_buffer; @@ -7397,6 +7411,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& } static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { + GGML_UNUSED(dryrun); ggml_tensor * dst = cgraph->nodes[node_idx]; ggml_tensor * src0 = dst->src[0]; ggml_tensor * src1 = dst->src[1]; @@ -7493,7 +7508,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT GGML_ASSERT(dmmv != nullptr); - if (dryrun) { + { const uint64_t x_sz_upd = x_sz * ne02 * ne03; const uint64_t y_sz_upd = y_sz * ne12 * ne13; if ( @@ -7503,9 +7518,11 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte } if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) { ctx->prealloc_size_x = x_sz_upd; + ggml_vk_preallocate_buffers(ctx, subctx); } if (qy_needs_dequant && ctx->prealloc_size_y < y_sz_upd) { ctx->prealloc_size_y = y_sz_upd; + ggml_vk_preallocate_buffers(ctx, subctx); } // Request descriptor sets @@ -7516,7 +7533,6 @@ static 
void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1); } ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1); - return; } vk_buffer d_D; @@ -7734,6 +7750,7 @@ static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, co } static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, const ggml_tensor * sinks, ggml_tensor * dst, bool dryrun = false) { + GGML_UNUSED(dryrun); VK_LOG_DEBUG("ggml_vk_flash_attn((" << q << ", name=" << q->name << ", type=" << q->type << ", ne0=" << q->ne[0] << ", ne1=" << q->ne[1] << ", ne2=" << q->ne[2] << ", ne3=" << q->ne[3] << ", nb0=" << q->nb[0] << ", nb1=" << q->nb[1] << ", nb2=" << q->nb[2] << ", nb3=" << q->nb[3]; std::cerr << "), (" << k << ", name=" << k->name << ", type=" << k->type << ", ne0=" << k->ne[0] << ", ne1=" << k->ne[1] << ", ne2=" << k->ne[2] << ", ne3=" << k->ne[3] << ", nb0=" << k->nb[0] << ", nb1=" << k->nb[1] << ", nb2=" << k->nb[2] << ", nb3=" << k->nb[3]; std::cerr << "), (" << v << ", name=" << v->name << ", type=" << v->type << ", ne0=" << v->ne[0] << ", ne1=" << v->ne[1] << ", ne2=" << v->ne[2] << ", ne3=" << v->ne[3] << ", nb0=" << v->nb[0] << ", nb1=" << v->nb[1] << ", nb2=" << v->nb[2] << ", nb3=" << v->nb[3]; @@ -7915,15 +7932,15 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx } if (ctx->prealloc_size_split_k < split_k_size) { ctx->prealloc_size_split_k = split_k_size; + ggml_vk_preallocate_buffers(ctx, subctx); } - if (dryrun) { + { // Request descriptor sets ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); if (split_k > 1) { ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1); } - return; } float scale = 1.0f; @@ -8728,6 +8745,7 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk template static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) { + GGML_UNUSED(dryrun); VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; if (src1 != nullptr) { std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; @@ -8790,10 +8808,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co GGML_ABORT("fatal error"); } - if (dryrun) { - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - return; - } + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); const bool op_supports_incontiguous = ggml_vk_op_supports_incontiguous(op); @@ -9210,6 +9225,7 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const } static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { + GGML_UNUSED(dryrun); const 
ggml_tensor *first_node = cgraph->nodes[node_idx]; const ggml_tensor *dst = cgraph->nodes[node_idx + ctx->num_additional_fused_ops]; @@ -9254,10 +9270,7 @@ static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, GGML_ABORT("fatal error"); } - if (dryrun) { - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - return; - } + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); ggml_backend_vk_buffer_context * buf_ctx[MAX_PARAMETER_COUNT]; vk_buffer buf[MAX_PARAMETER_COUNT]; @@ -9395,6 +9408,7 @@ static void ggml_vk_add_id(ggml_backend_vk_context * ctx, vk_context& subctx, co } static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_rwkv_wkv6_push_constants&& pc, int version, bool dryrun = false) { + GGML_UNUSED(dryrun); GGML_ASSERT(version == 6 || version == 7); int num_srcs = version == 6 ? 6 : 7; @@ -9407,10 +9421,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, dst->src[0], dst->src[1], dst->src[2], dst, dst->op); GGML_ASSERT(pipeline != nullptr); - if (dryrun) { - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - return; - } + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; ggml_backend_vk_buffer_context * src_buf_ctxs[7] = { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr }; @@ -9519,6 +9530,7 @@ static void ggml_vk_rwkv_wkv7(ggml_backend_vk_context * ctx, vk_context& subctx, } static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) { + GGML_UNUSED(dryrun); const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src2 = dst->src[2]; @@ -9540,10 +9552,7 @@ static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, dst->op); GGML_ASSERT(pipeline != nullptr); - if (dryrun) { - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - return; - } + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); const int64_t s_off = ggml_nelements(src1) * sizeof(float); @@ -9630,6 +9639,7 @@ static void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, } static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_push_constants&& pc, bool dryrun = false) { + GGML_UNUSED(dryrun); const ggml_tensor * x = dst->src[0]; const ggml_tensor * g = dst->src[1]; const ggml_tensor * gm = dst->src[2]; @@ -9655,10 +9665,7 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, g, gm, gv, dst, GGML_OP_OPT_STEP_ADAMW); GGML_ASSERT(pipeline != nullptr); - if (dryrun) { - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - return; - } + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); ggml_backend_vk_buffer_context * x_buf_ctx = (ggml_backend_vk_buffer_context *)x->buffer->context; ggml_backend_vk_buffer_context * g_buf_ctx = (ggml_backend_vk_buffer_context *)g->buffer->context; @@ -9932,9 +9939,10 @@ static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, op_params[0], 0.0f, (int32_t)param3, }, dryrun); - if (ctx->do_add_rms_partials) { + if (ctx->do_add_rms_partials_offset_calculation) { 
ctx->prealloc_size_add_rms_partials_offset += ggml_vk_rms_partials_size(ctx, src0); ctx->do_add_rms_partials = false; + ctx->do_add_rms_partials_offset_calculation = false; } } @@ -10030,7 +10038,7 @@ static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& sub } static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { - + GGML_UNUSED(dryrun); topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops); ggml_tensor * logits = cgraph->nodes[node_idx + 0]->src[0]; ggml_tensor * weights = (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) ? cgraph->nodes[node_idx + 9] : @@ -10050,10 +10058,7 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, nullptr, nullptr, nullptr, cgraph->nodes[node_idx], GGML_OP_SOFT_MAX); - if (dryrun) { - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - return; - } + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); ggml_backend_vk_buffer_context * logits_buf_ctx = (ggml_backend_vk_buffer_context *)logits->buffer->context; ggml_backend_vk_buffer_context * weights_buf_ctx = (ggml_backend_vk_buffer_context *)weights->buffer->context; @@ -10660,10 +10665,6 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t } } - if (ctx->device->need_compiles) { - ggml_vk_load_shaders(ctx->device); - } - ggml_pipeline_allocate_descriptor_sets(ctx); vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, {vk::MemoryPropertyFlagBits::eDeviceLocal}); @@ -10910,10 +10911,6 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_ ggml_pipeline_request_descriptor_sets(ctx, p, 1); - if (ctx->device->need_compiles) { - ggml_vk_load_shaders(ctx->device); - } - ggml_pipeline_allocate_descriptor_sets(ctx); ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); @@ -11011,10 +11008,6 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_ // // ggml_pipeline_request_descriptor_sets(ctx, p, 1); // -// if (ctx->device->need_compiles) { -// ggml_vk_load_shaders(ctx->device); -// } -// // ggml_pipeline_allocate_descriptor_sets(ctx); // // ggml_vk_buffer_write(x_buf, 0, x, x_sz); @@ -11185,10 +11178,6 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_quantize_q8_1, num_it); } - if (ctx->device->need_compiles) { - ggml_vk_load_shaders(ctx->device); - } - ggml_pipeline_allocate_descriptor_sets(ctx); ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz); @@ -11326,7 +11315,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, } #endif -static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { +static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx) { #if defined(GGML_VULKAN_RUN_TESTS) const std::vector vals { 512, 512, 128, @@ -11416,6 +11405,14 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { GGML_ABORT("fatal error"); #endif + if (subctx) { + // Submit and wait for any pending work before reallocating the buffers + ggml_vk_ctx_end(subctx); + ggml_vk_submit(subctx, ctx->fence); + ggml_vk_wait_for_fence(ctx); + ggml_vk_ctx_begin(ctx->device, subctx); + } + if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) { 
VK_LOG_MEMORY("ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << ")"); // Resize buffer @@ -11514,10 +11511,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr cgraph->nodes[next_node_idx]->src[0] == cgraph->nodes[next_node_idx - 1] && ggml_nrows(cgraph->nodes[next_node_idx]) == 1 && ctx->device->add_rms_fusion) { - if (dryrun) { - ctx->prealloc_size_add_rms_partials += ggml_vk_rms_partials_size(ctx, cgraph->nodes[node_idx]); + uint32_t size = ggml_vk_rms_partials_size(ctx, cgraph->nodes[node_idx]); + ctx->do_add_rms_partials_offset_calculation = true; + if (ctx->prealloc_size_add_rms_partials_offset + size <= ctx->prealloc_size_add_rms_partials) { + ctx->do_add_rms_partials = true; } - ctx->do_add_rms_partials = true; } } break; case GGML_OP_REPEAT: @@ -11651,6 +11649,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); if (node->op == GGML_OP_RMS_NORM) { ctx->do_add_rms_partials = false; + ctx->do_add_rms_partials_offset_calculation = false; } return false; } @@ -12919,58 +12918,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT(ctx->device->compute_queue.queue, reinterpret_cast(&dul)); } - ctx->prealloc_size_add_rms_partials = 0; ctx->prealloc_size_add_rms_partials_offset = 0; ctx->do_add_rms_partials = false; - - uint64_t total_mat_mul_bytes = 0; - for (int i = 0; i < cgraph->n_nodes; i++) { - if (!ctx->device->disable_fusion) { - uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i); - if (num_adds) { - ctx->num_additional_fused_ops = num_adds - 1; - } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { - ctx->num_additional_fused_ops = 1; - } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD })) { - ctx->num_additional_fused_ops = 1; - } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID })) { - ctx->num_additional_fused_ops = 1; - } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) && - ggml_check_edges(cgraph, i, rope_view_set_rows_edges) && - ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i)) { - ctx->num_additional_fused_ops = 2; - } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) && - ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) && - ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) { - ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1; - } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) && - ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) && - ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) { - ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1; - } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) && - ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) && - ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) { - ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1; - } - } - ggml_vk_build_graph(ctx, cgraph, i, nullptr, 0, true, false, false, false); - if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) { - total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]); - } else if (cgraph->nodes[i]->op == GGML_OP_CONV_2D || cgraph->nodes[i]->op 
== GGML_OP_CONV_TRANSPOSE_2D) { - // Return CRSxNPQxsizeof(*) to account as many bytes as mul_mat has in im2col->mul_mat mode. - auto CRS_size = - cgraph->nodes[i]->src[0]->ne[0] * cgraph->nodes[i]->src[0]->ne[1] * cgraph->nodes[i]->src[1]->ne[2]; - auto NPQ_size = cgraph->nodes[i]->ne[0] * cgraph->nodes[i]->ne[1] * cgraph->nodes[i]->ne[3]; - total_mat_mul_bytes += NPQ_size * CRS_size * ggml_type_size(cgraph->nodes[i]->type); - } - i += ctx->num_additional_fused_ops; - ctx->num_additional_fused_ops = 0; - } - if (ctx->device->need_compiles) { - ggml_vk_load_shaders(ctx->device); - } - ggml_vk_preallocate_buffers(ctx); - ggml_pipeline_allocate_descriptor_sets(ctx); + ctx->do_add_rms_partials_offset_calculation = false; int last_node = cgraph->n_nodes - 1; @@ -13012,6 +12962,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ctx->prealloc_y_last_tensor_used = nullptr; if (ctx->prealloc_size_add_rms_partials) { + ggml_vk_preallocate_buffers(ctx, nullptr); if (ctx->compute_ctx.expired()) { compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); ctx->compute_ctx = compute_ctx; @@ -13032,14 +12983,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg int submitted_nodes = 0; int submit_count = 0; uint64_t mul_mat_bytes = 0; - uint64_t mul_mat_bytes_per_submit = std::min(uint64_t(100*1000*1000), total_mat_mul_bytes / 40u); + uint64_t total_mul_mat_bytes = 0; + uint64_t mul_mat_bytes_per_submit = std::min(uint64_t(100*1000*1000), ctx->last_total_mul_mat_bytes / 40u); for (int i = 0; i < cgraph->n_nodes; i++) { if (first_node_in_batch) { submit_node_idx = i; } if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) { - mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]); + auto bytes = ggml_nbytes(cgraph->nodes[i]->src[0]); + mul_mat_bytes += bytes; + total_mul_mat_bytes += bytes; } if (!ctx->device->disable_fusion) { @@ -13081,7 +13035,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining) bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5; bool submit = (submitted_nodes >= nodes_per_submit) || - (mul_mat_bytes >= mul_mat_bytes_per_submit) || + (mul_mat_bytes_per_submit != 0 && mul_mat_bytes >= mul_mat_bytes_per_submit) || (i + ctx->num_additional_fused_ops >= last_node) || (almost_ready && !ctx->almost_ready_fence_pending); @@ -13125,6 +13079,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ctx->fused_ops_write_mask = 0; } + ctx->prealloc_size_add_rms_partials = std::max(ctx->prealloc_size_add_rms_partials, ctx->prealloc_size_add_rms_partials_offset); + ctx->last_total_mul_mat_bytes = total_mul_mat_bytes; + if (vk_perf_logger_enabled) { // End the command buffer and submit/wait GGML_ASSERT(!ctx->compute_ctx.expired()); From a7255d1619edb0b0e4e9df6293e56df908f593c9 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sun, 2 Nov 2025 15:30:49 -0600 Subject: [PATCH 2/2] remove dryrun parameters --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 513 +++++++++++---------------- 1 file changed, 212 insertions(+), 301 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index f4b0a2cefc0ea..7fc46bc46bc39 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -129,7 +129,7 @@ struct vk_pipeline_struct { uint32_t align; // true if fields have been set by 
ggml_vk_create_pipeline bool initialized {}; - // set to true to request the pipeline is compiled after the dryrun + // set to true to request the pipeline is compiled bool needed {}; // set to true when the shader has been compiled bool compiled {}; @@ -6211,12 +6211,11 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub ggml_vk_sync_buffers(ctx, subctx); } -static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool disable_split_k, bool dryrun = false) { - GGML_UNUSED(dryrun); +static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool disable_split_k) { VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << ggml_type_name(src0->type) << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << ggml_type_name(src1->type) << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << ggml_type_name(dst->type) << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; - std::cerr << "), " << (dryrun ? 
"dryrun" : "") << ")"); + std::cerr << "))"); GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT @@ -6525,8 +6524,7 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_ GGML_UNUSED(k); } -static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { - GGML_UNUSED(dryrun); +static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { ggml_tensor * dst = cgraph->nodes[node_idx]; const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; @@ -6534,7 +6532,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; - std::cerr << "), " << (dryrun ? 
"dryrun" : "") << "),)"); + std::cerr << ")),)"); GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT @@ -6818,15 +6816,14 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& } } -static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { - GGML_UNUSED(dryrun); +static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { ggml_tensor * dst = cgraph->nodes[node_idx]; const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32(" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; - std::cerr << "), " << (dryrun ? 
"dryrun" : "") << ")"); + std::cerr << "))"); GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT @@ -6948,15 +6945,14 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c }, pc, { 1, (uint32_t)ne01, workgroups_z }); } -static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { - GGML_UNUSED(dryrun); +static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { ggml_tensor * dst = cgraph->nodes[node_idx]; const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; - std::cerr << "), " << (dryrun ? 
"dryrun" : "") << ")"); + std::cerr << "))"); GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); @@ -7078,7 +7074,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con }, pc, { (uint32_t)ne03, (uint32_t)ne01, (uint32_t)ne12 }); } -static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { +static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { ggml_tensor * dst = cgraph->nodes[node_idx]; ggml_tensor * src0 = dst->src[0]; ggml_tensor * src1 = dst->src[1]; @@ -7106,7 +7102,7 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, c dst2.ne[0] = cur_M_size; src02.ne[1] = cur_M_size; - ggml_vk_mul_mat_q_f16(ctx, subctx, &src02, src1, &dst2, true, dryrun); + ggml_vk_mul_mat_q_f16(ctx, subctx, &src02, src1, &dst2, true); m_offset += cur_M_size; } @@ -7120,22 +7116,21 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, c src1->nb[1] <= src1->nb[3] && src0->ne[3] == 1 && src1->ne[3] == 1) { - ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, cgraph, node_idx, dryrun); + ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, cgraph, node_idx); } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1 && !ggml_is_permuted(src0) && !ggml_is_permuted(src1)) { - ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, cgraph, node_idx, dryrun); + ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, cgraph, node_idx); // mul_mat_vec supports batching ne12*ne13 when ne11==1, or treating ne11 as the batch size (up to four) // when ne12 and ne13 are one. 
} else if ((dst->ne[1] == 1 || (dst->ne[1] <= mul_mat_vec_max_cols && src1->ne[2] * src1->ne[3] == 1)) && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16 || ggml_is_quantized(src0->type))) { - ggml_vk_mul_mat_vec_q_f16(ctx, subctx, cgraph, node_idx, dryrun); + ggml_vk_mul_mat_vec_q_f16(ctx, subctx, cgraph, node_idx); } else { - ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst, false, dryrun); + ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst, false); } } -static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, bool dryrun = false) { - GGML_UNUSED(dryrun); +static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; @@ -7410,8 +7405,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& } } -static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { - GGML_UNUSED(dryrun); +static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { ggml_tensor * dst = cgraph->nodes[node_idx]; ggml_tensor * src0 = dst->src[0]; ggml_tensor * src1 = dst->src[1]; @@ -7420,7 +7414,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; - std::cerr << "), " << (dryrun ? 
"dryrun" : "") << ")"); + std::cerr << "))"); GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT GGML_ASSERT(ids->type == GGML_TYPE_I32); @@ -7680,16 +7674,16 @@ static bool ggml_vk_use_mul_mat_vec_id(const struct ggml_cgraph * cgraph, int no return src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)); } -static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { +static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) { ggml_tensor * dst = cgraph->nodes[node_idx]; ggml_tensor * src0 = dst->src[0]; ggml_tensor * src1 = dst->src[1]; ggml_tensor * src2 = dst->src[2]; VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")"); if (ggml_vk_use_mul_mat_vec_id(cgraph, node_idx)) { - ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, cgraph, node_idx, dryrun); + ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, cgraph, node_idx); } else { - ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun); + ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst); } } @@ -7749,8 +7743,7 @@ static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, co return supported; } -static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, const ggml_tensor * sinks, ggml_tensor * dst, bool dryrun = false) { - GGML_UNUSED(dryrun); +static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, const ggml_tensor * sinks, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_vk_flash_attn((" << q << ", name=" << q->name << ", type=" << q->type << ", ne0=" << q->ne[0] << ", ne1=" << q->ne[1] << ", ne2=" << q->ne[2] << ", ne3=" << q->ne[3] << ", nb0=" << q->nb[0] << ", nb1=" << q->nb[1] << ", nb2=" << q->nb[2] << ", nb3=" << q->nb[3]; std::cerr << "), (" << k << ", name=" << k->name << ", type=" << k->type << ", ne0=" << k->ne[0] << ", ne1=" << k->ne[1] << ", ne2=" << k->ne[2] << ", ne3=" << k->ne[3] << ", nb0=" << k->nb[0] << ", nb1=" << k->nb[1] << ", nb2=" << k->nb[2] << ", nb3=" << k->nb[3]; std::cerr << "), (" << v << ", name=" << v->name << ", type=" << v->type << ", ne0=" << v->ne[0] << ", ne1=" << v->ne[1] << ", ne2=" << v->ne[2] << ", ne3=" << v->ne[3] << ", nb0=" << v->nb[0] << ", nb1=" << v->nb[1] << ", nb2=" << v->nb[2] << ", nb3=" << v->nb[3]; @@ -7758,7 +7751,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx if (sinks) { std::cerr << "), (" << sinks << ", name=" << sinks->name << ", type=" << sinks->type << ", ne0=" << sinks->ne[0] << ", ne1=" << sinks->ne[1] << ", ne2=" << sinks->ne[2] << ", ne3=" << sinks->ne[3] << ", nb0=" << sinks->nb[0] << ", nb1=" << sinks->nb[1] << ", nb2=" << sinks->nb[2] << ", nb3=" << sinks->nb[3]; } - std::cerr << "), " << (dryrun ? 
"dryrun" : "") << ")"); + std::cerr << "))"); GGML_TENSOR_LOCALS(int64_t, neq, q, ne) GGML_TENSOR_LOCALS(size_t, nbq, q, nb) @@ -8744,8 +8737,7 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk } template -static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) { - GGML_UNUSED(dryrun); +static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst, ggml_op op, PC&& pc) { VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; if (src1 != nullptr) { std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; @@ -8757,7 +8749,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co std::cerr << "), (" << src3 << ", name=" << src3->name << ", type=" << src3->type << ", ne0=" << src3->ne[0] << ", ne1=" << src3->ne[1] << ", ne2=" << src3->ne[2] << ", ne3=" << src3->ne[3] << ", nb0=" << src3->nb[0] << ", nb1=" << src3->nb[1] << ", nb2=" << src3->nb[2] << ", nb3=" << src3->nb[3]; } std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; - std::cerr << "), " << ggml_op_name(op) << ", " << (dryrun ? 
"dryrun" : "") << ")"); + std::cerr << "), " << ggml_op_name(op) << ")"); GGML_ASSERT(op == GGML_OP_GET_ROWS || op == GGML_OP_CPY || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT GGML_ASSERT(dst->buffer != nullptr); @@ -9189,7 +9181,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co } } -static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -9201,10 +9193,10 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, 0, - }, dryrun); + }); } -static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -9221,11 +9213,10 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, offset, - }, dryrun); + }); } -static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { - GGML_UNUSED(dryrun); +static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) { const ggml_tensor *first_node = cgraph->nodes[node_idx]; const ggml_tensor *dst = cgraph->nodes[node_idx + ctx->num_additional_fused_ops]; @@ -9332,7 +9323,7 @@ static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, }, pc, elements); } -static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -9344,10 +9335,10 @@ static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] 
/ dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, ctx->do_add_rms_partials, - }, dryrun); + }); } -static void ggml_vk_sub(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_sub(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -9359,10 +9350,10 @@ static void ggml_vk_sub(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, 0, - }, dryrun); + }); } -static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -9374,10 +9365,10 @@ static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, 0, - }, dryrun); + }); } -static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -9389,10 +9380,10 @@ static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, 0, - }, dryrun); + }); } -static void ggml_vk_add_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_add_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t src2_type_size = ggml_type_size(src2->type); @@ -9404,11 +9395,10 @@ static void ggml_vk_add_id(ggml_backend_vk_context * ctx, vk_context& subctx, co (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src2->nb[1] / 
src2_type_size, - }, dryrun); + }); } -static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_rwkv_wkv6_push_constants&& pc, int version, bool dryrun = false) { - GGML_UNUSED(dryrun); +static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_rwkv_wkv6_push_constants&& pc, int version) { GGML_ASSERT(version == 6 || version == 7); int num_srcs = version == 6 ? 6 : 7; @@ -9491,7 +9481,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx } } -static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { const size_t seq_length = dst->src[0]->ne[2]; const size_t n_embed = dst->ne[0]; const size_t n_heads = dst->src[0]->ne[1]; @@ -9505,12 +9495,11 @@ static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, (uint32_t)n_embed, (uint32_t)n_heads, }, - 6, - dryrun + 6 ); } -static void ggml_vk_rwkv_wkv7(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_rwkv_wkv7(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { const size_t seq_length = dst->src[0]->ne[2]; const size_t n_embed = dst->ne[0]; const size_t n_heads = dst->src[0]->ne[1]; @@ -9524,13 +9513,11 @@ static void ggml_vk_rwkv_wkv7(ggml_backend_vk_context * ctx, vk_context& subctx, (uint32_t)n_embed, (uint32_t)n_heads, }, - 7, - dryrun + 7 ); } -static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) { - GGML_UNUSED(dryrun); +static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src2 = dst->src[2]; @@ -9622,7 +9609,7 @@ static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx, }, pc, elements); } -static void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; @@ -9635,11 +9622,10 @@ static void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, (uint32_t)src0->ne[1], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2], - }, dryrun); + }); } -static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_push_constants&& pc, bool dryrun = false) { - GGML_UNUSED(dryrun); +static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_push_constants&& pc) { const ggml_tensor * x = dst->src[0]; const ggml_tensor * g = dst->src[1]; const ggml_tensor * gm = dst->src[2]; @@ -9729,23 +9715,22 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont }, pc, elements); } -static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { const size_t n = ggml_nelements(dst->src[0]); ggml_vk_op_f32_opt_step_adamw( ctx, subctx, dst, - { (uint32_t)n, 0, 0.0f, 
0.0f }, - dryrun + { (uint32_t)n, 0, 0.0f, 0.0f } ); } -static void ggml_vk_opt_step_sgd(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_opt_step_sgd(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { const size_t n = ggml_nelements(dst->src[0]); - ggml_vk_op_f32(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_OPT_STEP_SGD, { (uint32_t)n, 0, 0.0f, 0.0f }, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_OPT_STEP_SGD, { (uint32_t)n, 0, 0.0f, 0.0f }); } -static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { int * op_params = (int *)dst->op_params; const uint32_t src0_type_size = ggml_type_size(src0->type); @@ -9759,10 +9744,10 @@ static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, co (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, op_params[0], - }, dryrun); + }); } -static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t mode = (uint32_t)ggml_get_op_params_i32(dst, 0); @@ -9786,47 +9771,47 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c (uint32_t)nb00 / src0_type_size, (uint32_t)nb01 / src0_type_size, (uint32_t)nb02 / src0_type_size, (uint32_t)nb03 / src0_type_size, (uint32_t)ne0, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, sf0, sf1, sf2, sf3, pixel_offset - }, dryrun); + }); } -static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); p.param1 = ggml_get_op_params_f32(dst, 0); p.param2 = ggml_get_op_params_f32(dst, 1); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SCALE, std::move(p), dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SCALE, std::move(p)); } -static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst), dryrun); +static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst)); } -static void ggml_vk_sqrt(ggml_backend_vk_context * ctx, vk_context& 
subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQRT, vk_op_unary_push_constants_init(src0, dst), dryrun); +static void ggml_vk_sqrt(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQRT, vk_op_unary_push_constants_init(src0, dst)); } -static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst), dryrun); +static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst)); } -static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst), dryrun); +static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst)); } -static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); p.param1 = ggml_get_op_params_f32(dst, 0); p.param2 = ggml_get_op_params_f32(dst, 1); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CLAMP, std::move(p), dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CLAMP, std::move(p)); } -static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_pad_push_constants p = vk_op_pad_push_constants_init(src0, dst); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p), dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p)); } -static void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { const int32_t s0 = ggml_get_op_params_i32(dst, 0); const int32_t s1 = ggml_get_op_params_i32(dst, 1); const int32_t s2 = ggml_get_op_params_i32(dst, 2); @@ -9838,20 +9823,20 @@ static void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, cons memcpy(&p.param1, &s01_packed, sizeof(float)); memcpy(&p.param2, &s23_packed, sizeof(float)); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ROLL, std::move(p), dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ROLL, std::move(p)); } -static void 
ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT, std::move(p), dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT, std::move(p)); } -static void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, std::move(p), dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, std::move(p)); } -static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { uint32_t ne = (uint32_t)ggml_nelements(src0); if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) { // Convert from number of logical elements to 2- or 4-byte units. @@ -9864,10 +9849,10 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const } vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ne); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CPY, std::move(p), dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CPY, std::move(p)); } -static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -9886,20 +9871,20 @@ static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, 0, - }, dryrun); + }); } -static void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SILU_BACK, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun); +static void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SILU_BACK, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }); } -static void 
ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }); } -static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { const int * int_op_params = (const int *)dst->op_params; const float * float_op_params = (const float *)dst->op_params; @@ -9907,7 +9892,7 @@ static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx const float eps = float_op_params[1]; const uint32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f }, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f }); } static uint32_t ggml_vk_rms_num_partials(ggml_backend_vk_context * ctx, const ggml_tensor *node) { @@ -9923,7 +9908,7 @@ static uint32_t ggml_vk_rms_partials_size(ggml_backend_vk_context * ctx, const g return num_bytes; } -static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, float * op_params, bool dryrun = false) { +static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, float * op_params) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -9937,7 +9922,7 @@ static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, op_params[0], 0.0f, (int32_t)param3, - }, dryrun); + }); if (ctx->do_add_rms_partials_offset_calculation) { ctx->prealloc_size_add_rms_partials_offset += ggml_vk_rms_partials_size(ctx, src0); @@ -9946,21 +9931,21 @@ static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, } } -static void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_RMS_NORM_BACK, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, 
GGML_OP_RMS_NORM_BACK, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }); } -static void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_L2_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_L2_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }); } -static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun); +static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }); } -static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const float * op_params_f = (const float *)dst->op_params; const bool swapped = (bool)dst->op_params[1]; @@ -9988,15 +9973,15 @@ static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const mode, alpha, limit - }, dryrun); + }); } -static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { int32_t * op_params = (int32_t *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }); } -static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; float scale = op_params[0]; @@ -10029,16 +10014,15 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, n_head_log2, nrows_x, src2 != nullptr - }, dryrun); + }); } -static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, 
src0, src1, nullptr, nullptr, dst, GGML_OP_SOFT_MAX_BACK, { (uint32_t)src0->ne[0], (uint32_t)ggml_nrows(src0), op_params[0], op_params[1] }, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SOFT_MAX_BACK, { (uint32_t)src0->ne[0], (uint32_t)ggml_nrows(src0), op_params[0], op_params[1] }); } -static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx, bool dryrun = false) { - GGML_UNUSED(dryrun); +static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) { topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops); ggml_tensor * logits = cgraph->nodes[node_idx + 0]->src[0]; ggml_tensor * weights = (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) ? cgraph->nodes[node_idx + 9] : @@ -10122,7 +10106,7 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, }, pc, elements); } -static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_cgraph * cgraph, int node_idx, bool backprop, bool dryrun = false) { +static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_cgraph * cgraph, int node_idx, bool backprop) { ggml_tensor * dst = cgraph->nodes[node_idx]; const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; @@ -10167,10 +10151,10 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, cons freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale, src2 != nullptr, (uint32_t)src0->ne[2], s1, s2, { sections[0], sections[1], sections[2], sections[3] }, is_imrope, backprop, set_rows_stride, - }, dryrun); + }); } -static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { int32_t * op_params = (int32_t *)dst->op_params; uint32_t ncols = src0->ne[0]; @@ -10180,34 +10164,34 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, c ncols, nrows, op_params[0], - }, dryrun); + }); } -static void ggml_vk_sum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_sum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, ggml_nelements(src0)); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM, p, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM, p); } -static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, p, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, p); } -static void ggml_vk_mean(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { +static void 
ggml_vk_mean(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]); p.weight = 1.0f / (float)src0->ne[0]; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_MEAN, p, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_MEAN, p); } -static void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ARGMAX, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f }, dryrun); +static void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ARGMAX, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f }); } -static void ggml_vk_count_equal(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_COUNT_EQUAL, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun); +static void ggml_vk_count_equal(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_COUNT_EQUAL, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }); } -static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const int32_t s0 = dst->op_params[0]; const int32_t s1 = dst->op_params[1]; const int32_t p0 = dst->op_params[2]; @@ -10244,10 +10228,10 @@ static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, co pelements, IC * KH * KW, s0, s1, p0, p1, d0, d1, - }, dryrun); + }); } -static void ggml_vk_im2col_3d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_im2col_3d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_TENSOR_BINARY_OP_LOCALS const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; @@ -10310,20 +10294,20 @@ static void ggml_vk_im2col_3d(ggml_backend_vk_context * ctx, vk_context& subctx, pc.OH_OW_IC_KD_KH_KW = OH*OW*IC*KD*KH*KW; pc.OW_IC_KD_KH_KW = OW*IC*KD*KH*KW; - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_IM2COL_3D, std::move(pc), dryrun); + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_IM2COL_3D, std::move(pc)); } -static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { const uint32_t dim = dst->op_params[0]; const uint32_t max_period = dst->op_params[1]; const uint32_t nb1 = dst->nb[1] / ggml_type_size(dst->type); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, 
nullptr, dst, GGML_OP_TIMESTEP_EMBEDDING, {
         nb1, dim, max_period,
-    }, dryrun);
+    });
 }
 
-static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     // src0: (K, Cout, Cin, 1) -- kernel
     // src1: (L, Cin, 1, 1) -- input
     // dst: (*, Cout, 1, 1)
@@ -10351,10 +10335,10 @@ static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context&
     p.nb1 = static_cast<uint32_t>(nb1 / nb0);
     p.s0 = static_cast<uint32_t>(s0);
 
-    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p), dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p));
 }
 
-static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
     uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
     const int32_t k1 = dst->op_params[1];
     const int32_t k0 = dst->op_params[2];
@@ -10379,11 +10363,11 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
         parallel_elements,
         op,
         k0, k1, s0, s1, p0, p1,
-    }, dryrun);
+    });
 }
 
 static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0,
-                            const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+                            const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
@@ -10428,11 +10412,11 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx,
     GGML_ASSERT(ne03 == ne2);
     GGML_ASSERT(ne02 == ne12);
 
-    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_2D, std::move(p), dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_2D, std::move(p));
 }
 
 static void ggml_vk_conv_transpose_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0,
-                                      const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+                                      const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
@@ -10477,10 +10461,10 @@ static void ggml_vk_conv_transpose_2d(ggml_backend_vk_context * ctx, vk_context
     GGML_ASSERT(ne02 == ne2);
     GGML_ASSERT(ne03 == ne12);
 
-    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_2D, std::move(p), dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_2D, std::move(p));
 }
 
-static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     vk_op_conv2d_dw_push_constants p{};
     p.ne = ggml_nelements(dst);
     p.channels = dst->ne[2];
@@ -10501,12 +10485,12 @@
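// --- Illustrative aside, not part of the patch -------------------------------
// Every hunk in this region applies the same mechanical change: the trailing
// `bool dryrun = false` parameter disappears from the ggml_vk_* wrappers and
// each call forwards straight to ggml_vk_op_f32() in a single pass, instead of
// first being visited by a dryrun that only requested pipelines and descriptor
// sets. As a hedged sketch (ggml_vk_example_op and its trimmed signature are
// stand-ins, not code from the patch):
//
//   static void ggml_vk_example_op(ggml_backend_vk_context * ctx, vk_context& subctx,
//                                  const ggml_tensor * src0, ggml_tensor * dst) {
//       // single pass: record the dispatch now, no separate dryrun bookkeeping
//       ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst,
//                      GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst));
//   }
// ------------------------------------------------------------------------------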
GGML_ASSERT(src0->ne[3] == p.channels); GGML_ASSERT(src1->ne[3] == p.batches); - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_2D_DW, std::move(p), dryrun); + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_2D_DW, std::move(p)); } -static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { const float * op_params = (const float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }); } #ifdef GGML_VULKAN_RUN_TESTS @@ -11451,7 +11435,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_cgraph * // Returns true if node has enqueued work into the queue, false otherwise // If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution. -static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool almost_ready, bool submit){ +static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool last_node, bool almost_ready, bool submit){ ggml_tensor * node = cgraph->nodes[node_idx]; if (ggml_is_empty(node) || !node->buffer) { return false; @@ -11583,82 +11567,15 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr vk_context compute_ctx; - if (!dryrun) { - if (ctx->compute_ctx.expired()) { - compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); - ctx->compute_ctx = compute_ctx; - ggml_vk_ctx_begin(ctx->device, compute_ctx); - } else { - compute_ctx = ctx->compute_ctx.lock(); - } + if (ctx->compute_ctx.expired()) { + compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); + ctx->compute_ctx = compute_ctx; + ggml_vk_ctx_begin(ctx->device, compute_ctx); } else { - switch (node->op) { - case GGML_OP_REPEAT: - case GGML_OP_REPEAT_BACK: - case GGML_OP_ACC: - case GGML_OP_GET_ROWS: - case GGML_OP_ADD: - case GGML_OP_SUB: - case GGML_OP_MUL: - case GGML_OP_DIV: - case GGML_OP_CONCAT: - case GGML_OP_UPSCALE: - case GGML_OP_SCALE: - case GGML_OP_SQR: - case GGML_OP_SQRT: - case GGML_OP_SIN: - case GGML_OP_COS: - case GGML_OP_CLAMP: - case GGML_OP_PAD: - case GGML_OP_CPY: - case GGML_OP_SET_ROWS: - case GGML_OP_CONT: - case GGML_OP_DUP: - case GGML_OP_SILU_BACK: - case GGML_OP_NORM: - case GGML_OP_GROUP_NORM: - case GGML_OP_RMS_NORM: - case GGML_OP_RMS_NORM_BACK: - case GGML_OP_L2_NORM: - case GGML_OP_UNARY: - case GGML_OP_GLU: - case GGML_OP_DIAG_MASK_INF: - case GGML_OP_SOFT_MAX: - case GGML_OP_SOFT_MAX_BACK: - case GGML_OP_ROPE_BACK: - case GGML_OP_ARGSORT: - case GGML_OP_SUM: - case GGML_OP_SUM_ROWS: - case GGML_OP_MEAN: - case GGML_OP_ARGMAX: - case GGML_OP_COUNT_EQUAL: - case GGML_OP_IM2COL: - case GGML_OP_IM2COL_3D: - case GGML_OP_TIMESTEP_EMBEDDING: - case GGML_OP_CONV_TRANSPOSE_1D: - case GGML_OP_POOL_2D: - case GGML_OP_CONV_2D: - case GGML_OP_CONV_TRANSPOSE_2D: - case GGML_OP_CONV_2D_DW: - case GGML_OP_LEAKY_RELU: - case 
GGML_OP_OPT_STEP_SGD: - { - // These operations all go through ggml_vk_op_f32, so short-circuit and - // do the only thing needed for the dryrun. - vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op); - ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - if (node->op == GGML_OP_RMS_NORM) { - ctx->do_add_rms_partials = false; - ctx->do_add_rms_partials_offset_calculation = false; - } - return false; - } - default: - break; - } + compute_ctx = ctx->compute_ctx.lock(); } - if (!dryrun) { + { // This logic detects dependencies between modes in the graph and calls ggml_vk_sync_buffers // to synchronize them. This handles most "normal" synchronization when computing the graph, and when // there is no auxiliary memory use, it shouldn't be necessary to call ggml_vk_sync_buffers @@ -11743,118 +11660,116 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr } } #if ENABLE_SYNC_LOGGING - if (!dryrun) { - for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) { - auto *n = cgraph->nodes[node_idx + i]; - std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name; - if (n->op == GGML_OP_GLU) { - std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " "; - } - std::cerr << std::endl; + for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) { + auto *n = cgraph->nodes[node_idx + i]; + std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name; + if (n->op == GGML_OP_GLU) { + std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " "; } + std::cerr << std::endl; } #endif switch (node->op) { case GGML_OP_REPEAT: - ggml_vk_repeat(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_repeat(ctx, compute_ctx, src0, node); break; case GGML_OP_REPEAT_BACK: - ggml_vk_repeat_back(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_repeat_back(ctx, compute_ctx, src0, node); break; case GGML_OP_ACC: - ggml_vk_acc(ctx, compute_ctx, src0, src1, node, dryrun); + ggml_vk_acc(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_GET_ROWS: - ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node, dryrun); + ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_ADD: if (ctx->num_additional_fused_ops) { - ggml_vk_multi_add(ctx, compute_ctx, cgraph, node_idx, dryrun); + ggml_vk_multi_add(ctx, compute_ctx, cgraph, node_idx); } else { - ggml_vk_add(ctx, compute_ctx, src0, src1, node, dryrun); + ggml_vk_add(ctx, compute_ctx, src0, src1, node); } break; case GGML_OP_SUB: - ggml_vk_sub(ctx, compute_ctx, src0, src1, node, dryrun); + ggml_vk_sub(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_MUL: - ggml_vk_mul(ctx, compute_ctx, src0, src1, node, dryrun); + ggml_vk_mul(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_DIV: - ggml_vk_div(ctx, compute_ctx, src0, src1, node, dryrun); + ggml_vk_div(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_ADD_ID: - ggml_vk_add_id(ctx, compute_ctx, src0, src1, src2, node, dryrun); + ggml_vk_add_id(ctx, compute_ctx, src0, src1, src2, node); break; case GGML_OP_CONCAT: - ggml_vk_concat(ctx, compute_ctx, src0, src1, node, dryrun); + ggml_vk_concat(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_UPSCALE: - ggml_vk_upscale(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_upscale(ctx, compute_ctx, src0, node); break; case GGML_OP_SCALE: - ggml_vk_scale(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_scale(ctx, compute_ctx, src0, node); break; case 
GGML_OP_SQR: - ggml_vk_sqr(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_sqr(ctx, compute_ctx, src0, node); break; case GGML_OP_SQRT: - ggml_vk_sqrt(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_sqrt(ctx, compute_ctx, src0, node); break; case GGML_OP_SIN: - ggml_vk_sin(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_sin(ctx, compute_ctx, src0, node); break; case GGML_OP_COS: - ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_cos(ctx, compute_ctx, src0, node); break; case GGML_OP_CLAMP: - ggml_vk_clamp(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_clamp(ctx, compute_ctx, src0, node); break; case GGML_OP_PAD: - ggml_vk_pad(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_pad(ctx, compute_ctx, src0, node); break; case GGML_OP_ROLL: - ggml_vk_roll(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_roll(ctx, compute_ctx, src0, node); break; case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: - ggml_vk_cpy(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_cpy(ctx, compute_ctx, src0, node); break; case GGML_OP_SET_ROWS: - ggml_vk_set_rows(ctx, compute_ctx, src0, src1, node, dryrun); + ggml_vk_set_rows(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_SILU_BACK: - ggml_vk_silu_back(ctx, compute_ctx, src0, src1, node, dryrun); + ggml_vk_silu_back(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_NORM: - ggml_vk_norm(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_norm(ctx, compute_ctx, src0, node); break; case GGML_OP_GROUP_NORM: - ggml_vk_group_norm(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_group_norm(ctx, compute_ctx, src0, node); break; case GGML_OP_RMS_NORM: @@ -11862,17 +11777,17 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr // fused rms_norm + mul ggml_tensor *mul = cgraph->nodes[node_idx + 1]; ggml_tensor *other_src = mul->src[0] == node ? 
mul->src[1] : mul->src[0]; - ggml_vk_rms_norm(ctx, compute_ctx, src0, other_src, mul, (float *)node->op_params, dryrun); + ggml_vk_rms_norm(ctx, compute_ctx, src0, other_src, mul, (float *)node->op_params); } else { - ggml_vk_rms_norm(ctx, compute_ctx, src0, src0, node, (float *)node->op_params, dryrun); + ggml_vk_rms_norm(ctx, compute_ctx, src0, src0, node, (float *)node->op_params); } break; case GGML_OP_RMS_NORM_BACK: - ggml_vk_rms_norm_back(ctx, compute_ctx, src0, src1, node, dryrun); + ggml_vk_rms_norm_back(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_L2_NORM: - ggml_vk_l2_norm(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_l2_norm(ctx, compute_ctx, src0, node); break; case GGML_OP_UNARY: @@ -11887,7 +11802,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_UNARY_OP_SIGMOID: case GGML_UNARY_OP_HARDSIGMOID: case GGML_UNARY_OP_HARDSWISH: - ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_unary(ctx, compute_ctx, src0, node); break; default: return false; @@ -11901,151 +11816,147 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_GLU_OP_SWIGLU_OAI: case GGML_GLU_OP_GEGLU_ERF: case GGML_GLU_OP_GEGLU_QUICK: - ggml_vk_glu(ctx, compute_ctx, src0, src1, node, dryrun); + ggml_vk_glu(ctx, compute_ctx, src0, src1, node); break; default: return false; } break; case GGML_OP_DIAG_MASK_INF: - ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node); break; case GGML_OP_SOFT_MAX: if (ctx->num_additional_fused_ops) { - ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx, dryrun); + ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx); } else { - ggml_vk_soft_max(ctx, compute_ctx, src0, src1, src2, node, dryrun); + ggml_vk_soft_max(ctx, compute_ctx, src0, src1, src2, node); } break; case GGML_OP_SOFT_MAX_BACK: - ggml_vk_soft_max_back(ctx, compute_ctx, src0, src1, node, dryrun); + ggml_vk_soft_max_back(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_ROPE: - ggml_vk_rope(ctx, compute_ctx, cgraph, node_idx, false, dryrun); + ggml_vk_rope(ctx, compute_ctx, cgraph, node_idx, false); break; case GGML_OP_ROPE_BACK: - ggml_vk_rope(ctx, compute_ctx, cgraph, node_idx, true, dryrun); + ggml_vk_rope(ctx, compute_ctx, cgraph, node_idx, true); break; case GGML_OP_ARGSORT: if (ctx->num_additional_fused_ops) { - ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx, dryrun); + ggml_vk_topk_moe(ctx, compute_ctx, cgraph, node_idx); } else { - ggml_vk_argsort(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_argsort(ctx, compute_ctx, src0, node); } break; case GGML_OP_SUM: - ggml_vk_sum(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_sum(ctx, compute_ctx, src0, node); break; case GGML_OP_SUM_ROWS: - ggml_vk_sum_rows(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_sum_rows(ctx, compute_ctx, src0, node); break; case GGML_OP_MEAN: - ggml_vk_mean(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_mean(ctx, compute_ctx, src0, node); break; case GGML_OP_ARGMAX: - ggml_vk_argmax(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_argmax(ctx, compute_ctx, src0, node); break; case GGML_OP_COUNT_EQUAL: - ggml_vk_count_equal(ctx, compute_ctx, src0, src1, node, dryrun); + ggml_vk_count_equal(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_IM2COL: - ggml_vk_im2col(ctx, compute_ctx, src0, src1, node, dryrun); + ggml_vk_im2col(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_IM2COL_3D: - ggml_vk_im2col_3d(ctx, compute_ctx, src0, src1, node, 
dryrun); + ggml_vk_im2col_3d(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_TIMESTEP_EMBEDDING: - ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node); break; case GGML_OP_CONV_TRANSPOSE_1D: - ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node, dryrun); + ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_POOL_2D: - ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_pool_2d(ctx, compute_ctx, src0, node); break; case GGML_OP_CONV_2D: - ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node, dryrun); + ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_CONV_TRANSPOSE_2D: - ggml_vk_conv_transpose_2d(ctx, compute_ctx, src0, src1, node, dryrun); + ggml_vk_conv_transpose_2d(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_CONV_2D_DW: - ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node, dryrun); + ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node); break; case GGML_OP_LEAKY_RELU: - ggml_vk_leaky_relu(ctx, compute_ctx, src0, node, dryrun); + ggml_vk_leaky_relu(ctx, compute_ctx, src0, node); break; case GGML_OP_MUL_MAT: - ggml_vk_mul_mat(ctx, compute_ctx, cgraph, node_idx, dryrun); + ggml_vk_mul_mat(ctx, compute_ctx, cgraph, node_idx); break; case GGML_OP_MUL_MAT_ID: - ggml_vk_mul_mat_id(ctx, compute_ctx, cgraph, node_idx, dryrun); + ggml_vk_mul_mat_id(ctx, compute_ctx, cgraph, node_idx); break; case GGML_OP_FLASH_ATTN_EXT: - ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node->src[4], node, dryrun); + ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node->src[4], node); break; case GGML_OP_RWKV_WKV6: - ggml_vk_rwkv_wkv6(ctx, compute_ctx, node, dryrun); + ggml_vk_rwkv_wkv6(ctx, compute_ctx, node); break; case GGML_OP_RWKV_WKV7: - ggml_vk_rwkv_wkv7(ctx, compute_ctx, node, dryrun); + ggml_vk_rwkv_wkv7(ctx, compute_ctx, node); break; case GGML_OP_SSM_SCAN: - ggml_vk_ssm_scan(ctx, compute_ctx, node, dryrun); + ggml_vk_ssm_scan(ctx, compute_ctx, node); break; case GGML_OP_SSM_CONV: - ggml_vk_ssm_conv(ctx, compute_ctx, node, dryrun); + ggml_vk_ssm_conv(ctx, compute_ctx, node); break; case GGML_OP_OPT_STEP_ADAMW: - ggml_vk_opt_step_adamw(ctx, compute_ctx, node, dryrun); + ggml_vk_opt_step_adamw(ctx, compute_ctx, node); break; case GGML_OP_OPT_STEP_SGD: - ggml_vk_opt_step_sgd(ctx, compute_ctx, src0, src1, src2, node, dryrun); + ggml_vk_opt_step_sgd(ctx, compute_ctx, src0, src1, src2, node); break; default: return false; } - if (dryrun) { - return false; - } - ctx->tensor_ctxs[node_idx] = compute_ctx; #if defined(GGML_VULKAN_CHECK_RESULTS) @@ -13039,7 +12950,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg (i + ctx->num_additional_fused_ops >= last_node) || (almost_ready && !ctx->almost_ready_fence_pending); - bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit); + bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit); if (vk_perf_logger_enabled) { if (ctx->compute_ctx.expired()) {
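// --- Illustrative aside, not part of the patch -------------------------------
// The hunks above remove the dryrun branch from ggml_vk_build_graph() and the
// `false` dryrun argument from the call in ggml_backend_vk_graph_compute(), so
// the graph is now walked once and resources are created the first time an op
// needs them. The self-contained toy below sketches that pattern under stated
// assumptions; Pipeline, Context, get_pipeline and record are invented names,
// not ggml-vulkan API.
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

struct Pipeline {
    std::string name;
    bool compiled = false;
};

struct Context {
    std::unordered_map<std::string, Pipeline> pipelines;
    std::vector<std::string> recorded;

    Pipeline & get_pipeline(const std::string & name) {
        Pipeline & p = pipelines[name];
        if (!p.compiled) {          // compile on first use instead of during a planning pass
            p.name     = name;
            p.compiled = true;
        }
        return p;
    }

    void record(const std::string & op) {
        get_pipeline(op);           // create whatever the op needs, right now
        recorded.push_back(op);     // then record the dispatch in the same pass
    }
};

int main() {
    Context ctx;
    for (const char * op : { "ADD", "MUL", "RMS_NORM", "MUL_MAT" }) {
        ctx.record(op);             // single pass over the graph, no dryrun pass
    }
    std::printf("recorded %zu ops, %zu pipelines\n", ctx.recorded.size(), ctx.pipelines.size());
    return 0;
}
// ------------------------------------------------------------------------------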