From 7403c05c06bbef5009d0649abaf44cffb82ee02a Mon Sep 17 00:00:00 2001 From: Paul Tsochantaris Date: Sat, 5 Oct 2024 02:00:59 +0100 Subject: [PATCH 1/4] Single allocation of encode_async block with non-ARC capture in ggml-metal.m --- ggml/src/ggml-metal.m | 85 +++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 44 deletions(-) diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index c6a7014fc69f6..40835074bcb0b 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -438,6 +438,7 @@ @implementation GGMLMetalClass ctx->capture_scope = nil; ctx->gf = nil; + Block_release(ctx->encode_async); ctx->encode_async = nil; for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { ctx->command_buffers[i] = nil; @@ -3000,46 +3001,6 @@ static enum ggml_status ggml_metal_graph_compute( } } - // TODO: how to avoid this allocation? I tried initializing it in ggml_backend_metal_set_n_cb but it crashes. - ctx->encode_async = ^(size_t iter) { - const int cb_idx = iter; - const int n_cb_l = ctx->n_cb; - - const int n_nodes_0 = ctx->n_nodes_0; - const int n_nodes_1 = ctx->n_nodes_1; - - const int n_nodes_per_cb = ctx->n_nodes_per_cb; - - id command_buffer = ctx->command_buffers[cb_idx]; - id encoder = [command_buffer computeCommandEncoder]; - - int node_start = 0; - int node_end = n_nodes_0; - - if (cb_idx < n_cb_l) { - node_start = n_nodes_0 + ( (cb_idx + 0) * n_nodes_per_cb); - node_end = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1)); - } - - for (int idx = node_start; idx < node_end; ++idx) { - if (should_capture) { - [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(gf, idx)) encoding:NSUTF8StringEncoding]]; - } - - ggml_metal_encode_node(ctx, idx, encoder); - - if (should_capture) { - [encoder popDebugGroup]; - } - } - - [encoder endEncoding]; - - if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer commit]; - } - }; - // the main thread commits the first few commands immediately // command_buffer[n_cb] { @@ -3468,10 +3429,46 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { } } - // TODO: setting encode_async here causes crash during the next ggml_metal_graph_compute call. why? - //ctx->encode_async = ^(size_t iter) { - // ... - //}; + ctx->encode_async = Block_copy(^(size_t iter) { + const int cb_idx = iter; + const int n_cb_l = ctx->n_cb; + + const int n_nodes_0 = ctx->n_nodes_0; + const int n_nodes_1 = ctx->n_nodes_1; + + const int n_nodes_per_cb = ctx->n_nodes_per_cb; + + id command_buffer = ctx->command_buffers[cb_idx]; + id encoder = [command_buffer computeCommandEncoder]; + + int node_start = 0; + int node_end = n_nodes_0; + + if (cb_idx < n_cb_l) { + node_start = n_nodes_0 + ( (cb_idx + 0) * n_nodes_per_cb); + node_end = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1)); + } + + const bool should_capture = ctx->capture_next_compute; + + for (int idx = node_start; idx < node_end; ++idx) { + if (should_capture) { + [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; + } + + ggml_metal_encode_node(ctx, idx, encoder); + + if (should_capture) { + [encoder popDebugGroup]; + } + } + + [encoder endEncoding]; + + if (cb_idx < 2 || ctx->abort_callback == NULL) { + [command_buffer commit]; + } + }); } static struct ggml_backend_i ggml_backend_metal_i = { From 5e6358398c073970d5684f7574aba77a271f59d8 Mon Sep 17 00:00:00 2001 From: Paul Tsochantaris Date: Mon, 7 Oct 2024 12:09:16 +0100 Subject: [PATCH 2/4] Moving Block_release to the deallocation code --- ggml/src/ggml-metal.m | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 40835074bcb0b..f1b4b8e872ce7 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -239,8 +239,6 @@ struct ggml_cgraph * gf; // the callback given to the thread pool - // TODO: ideally, this should be created once, utilizing the command buffer state above - // for some reason, doing it like this leads to a crash void (^encode_async)(size_t ith); // n_cb command buffers + 1 used by the main thread @@ -438,7 +436,6 @@ @implementation GGMLMetalClass ctx->capture_scope = nil; ctx->gf = nil; - Block_release(ctx->encode_async); ctx->encode_async = nil; for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { ctx->command_buffers[i] = nil; @@ -684,6 +681,8 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { [ctx->kernels[i].pipeline release]; } + Block_release(ctx->encode_async); + [ctx->queue release]; [ctx->device release]; From 4af03de2a6e962aaa9984a156ebf44cde60b0f86 Mon Sep 17 00:00:00 2001 From: Paul Tsochantaris Date: Mon, 7 Oct 2024 12:21:28 +0100 Subject: [PATCH 3/4] Release encode block when re-setting encoding buffer count if needed --- ggml/src/ggml-metal.m | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index f1b4b8e872ce7..923bcf0b4dfda 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -3428,6 +3428,10 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { } } + if(ctx->encode_async) { + Block_release(ctx->encode_async); + } + ctx->encode_async = Block_copy(^(size_t iter) { const int cb_idx = iter; const int n_cb_l = ctx->n_cb; From 594a07a515e0ca0b440a332f8fbb2225373623e4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 7 Oct 2024 14:40:28 +0300 Subject: [PATCH 4/4] Update ggml/src/ggml-metal.m --- ggml/src/ggml-metal.m | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 923bcf0b4dfda..db3fbbe48aa77 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -3428,10 +3428,10 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { } } - if(ctx->encode_async) { + if (ctx->encode_async) { Block_release(ctx->encode_async); } - + ctx->encode_async = Block_copy(^(size_t iter) { const int cb_idx = iter; const int n_cb_l = ctx->n_cb;