From 20ffb58c12bb7e39ca1fb57654ea777e923e2db1 Mon Sep 17 00:00:00 2001
From: Acly <aclysia@gmail.com>
Date: Thu, 4 Sep 2025 12:05:27 +0200
Subject: [PATCH 01/11] ggml : make gallocr respect the backend's max buffer
 size

* if the graph requires more memory than can fit into a single allocation, split it into multiple backend buffers
* vulkan: report the actual max allocation size in buffer type interface
---
 ggml/src/ggml-alloc.c                | 208 +++++++++---
 ggml/src/ggml-impl.h                 |   4 +
 ggml/src/ggml-vulkan/ggml-vulkan.cpp |   2 +-
 tests/CMakeLists.txt                 |   3 +
 tests/test-alloc.cpp                 | 483 +++++++++++++++++++++++++++
 5 files changed, 656 insertions(+), 44 deletions(-)
 create mode 100644 tests/test-alloc.cpp

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 8b6e6028361d0..5536d3c03293d 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -23,7 +23,7 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
 }
 
 // ops that return true for this function must not use restrict pointers for their backend implementations
-static bool ggml_op_can_inplace(enum ggml_op op) {
+bool ggml_op_can_inplace(enum ggml_op op) {
     switch (op) {
         case GGML_OP_SCALE:
         case GGML_OP_DIAG_MASK_ZERO:
@@ -105,6 +105,7 @@ struct ggml_dyn_tallocr {
     int n_free_blocks;
     struct free_block free_blocks[MAX_FREE_BLOCKS];
     size_t max_size;
+    size_t max_chunk_size;
 
 #ifdef GGML_ALLOCATOR_DEBUG
     struct {
@@ -114,6 +115,14 @@ struct ggml_dyn_tallocr {
 #endif
 };
 
+// the memory range [0, max_size) is divided into n chunks of size max_chunk_size (with the last chunk possibly being smaller).
+// tensor allocations may not cross chunk boundaries. sets up `block` as a whole free chunk placed after the chunks in use.
+static void ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, struct free_block * block) {
+    size_t n_chunks = alloc->max_size / alloc->max_chunk_size + (alloc->max_size % alloc->max_chunk_size != 0); // round up without the wrap-around of max_size + max_chunk_size - 1
+    block->offset = n_chunks * alloc->max_chunk_size;
+    block->size = alloc->max_chunk_size;
+}
+
 #ifdef GGML_ALLOCATOR_DEBUG
 static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
@@ -140,6 +149,10 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
+    if (size > alloc->max_chunk_size) {
+        GGML_ABORT("allocation failed: tensor %s (%zu bytes) exceeds maximum backend buffer size (%zu bytes)\n",
+            tensor->name, size, alloc->max_chunk_size);
+    }
 
     size_t max_avail = 0;
 
@@ -156,16 +169,17 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
     }
 
     if (best_fit_block == -1) {
-        // the last block is our last resort
+        // the last block represents memory still available in an existing chunk
         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
         max_avail = MAX(max_avail, block->size);
         if (block->size >= size) {
             best_fit_block = alloc->n_free_blocks - 1;
         } else {
-            // this should never happen
-            GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
-                    __func__, size, max_avail);
-            GGML_ABORT("not enough space in the buffer");
+            // not enough space in existing chunk, create a new one at the end
+            best_fit_block = alloc->n_free_blocks;
+            alloc->n_free_blocks += 1;
+            GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
+            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[alloc->n_free_blocks - 1]);
         }
     }
 
@@ -179,9 +193,14 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
         for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
             alloc->free_blocks[j] = alloc->free_blocks[j+1];
         }
+        // if there are no remaining blocks all memory in current chunk was used up -> start the next one
+        if (alloc->n_free_blocks == 0) {
+            alloc->n_free_blocks = 1;
+            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[0]);
+        }
     }
 
-    AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
+    AT_PRINTF("block %d, offset %zu, chunk %zu\n", best_fit_block, offset, offset / alloc->max_chunk_size);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     add_allocated_tensor(alloc, offset, tensor);
@@ -229,19 +248,28 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, offset, tensor);
 #endif
+    size_t chunk = offset / alloc->max_chunk_size;
 
     // see if we can merge with an existing block
     for (int i = 0; i < alloc->n_free_blocks; i++) {
         struct free_block * block = &alloc->free_blocks[i];
+        // can only merge with blocks within the same chunk
+        size_t block_chunk = block->offset / alloc->max_chunk_size;
+        if (chunk != block_chunk) {
+            continue;
+        }
         // check if ptr is at the end of the block
         if (block->offset + block->size == offset) {
             block->size += size;
-            // check if we can merge with the next block
-            if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
-                block->size += alloc->free_blocks[i+1].size;
-                alloc->n_free_blocks--;
-                for (int j = i+1; j < alloc->n_free_blocks; j++) {
-                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
+            // check if we can merge with the next block (within the same chunk)
+            if (i < alloc->n_free_blocks - 1) {
+                struct free_block * next = &alloc->free_blocks[i+1];
+                if (block->offset + block->size == next->offset && block_chunk == (next->offset / alloc->max_chunk_size)) {
+                    block->size += next->size;
+                    alloc->n_free_blocks--;
+                    for (int j = i+1; j < alloc->n_free_blocks; j++) {
+                        alloc->free_blocks[j] = alloc->free_blocks[j+1];
+                    }
                 }
             }
             return;
@@ -250,12 +278,15 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t
         if (offset + size == block->offset) {
             block->offset = offset;
             block->size += size;
-            // check if we can merge with the previous block
-            if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
-                alloc->free_blocks[i-1].size += block->size;
-                alloc->n_free_blocks--;
-                for (int j = i; j < alloc->n_free_blocks; j++) {
-                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
+            // check if we can merge with the previous block (within the same chunk)
+            if (i > 0) {
+                struct free_block * prev = &alloc->free_blocks[i-1];
+                if (prev->offset + prev->size == block->offset && block_chunk == (prev->offset / alloc->max_chunk_size)) {
+                    prev->size += block->size;
+                    alloc->n_free_blocks--;
+                    for (int j = i; j < alloc->n_free_blocks; j++) {
+                        alloc->free_blocks[j] = alloc->free_blocks[j+1];
+                    }
                 }
             }
             return;
@@ -283,9 +314,13 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t
 static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
     alloc->n_free_blocks = 1;
     alloc->free_blocks[0].offset = 0;
-    alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
+    alloc->free_blocks[0].size = alloc->max_chunk_size;
     alloc->max_size = 0;
 
+    if (alloc->free_blocks[0].size == SIZE_MAX) {
+        alloc->free_blocks[0].size = SIZE_MAX/2; // avoid overflows
+    }
+
 #ifdef GGML_ALLOCATOR_DEBUG
     for (int i = 0; i < 1024; i++) {
         alloc->allocated_tensors[i].tensor = NULL;
@@ -293,14 +328,15 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
 #endif
 }
 
-static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
+static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t max_buffer_size) {
     struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
 
     *alloc = (struct ggml_dyn_tallocr) {
-        /*.alignment     = */ alignment,
-        /*.n_free_blocks = */ 0,
-        /*.free_blocks   = */ {{0}},
-        /*.max_size      = */ 0,
+        /*.alignment       = */ alignment,
+        /*.n_free_blocks   = */ 0,
+        /*.free_blocks     = */ {{0}},
+        /*.max_size        = */ 0,
+        /*.max_chunk_size  = */ max_buffer_size,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ {{0}},
 #endif
@@ -320,6 +356,95 @@ static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
 }
 
 
+// virtual buffer with contiguous memory range, split into multiple backend buffers (chunks)
+
+#define GGML_VBUFFER_MAX_CHUNKS 8
+
+struct vbuffer {
+    ggml_backend_buffer_type_t buft; // buffer type that every chunk is allocated from
+    ggml_backend_buffer_t chunks[GGML_VBUFFER_MAX_CHUNKS]; // backend buffers, filled from index 0; unused slots are NULL
+};
+
+static struct vbuffer * ggml_vbuffer_new(ggml_backend_buffer_type_t buft) {
+    struct vbuffer * buf = calloc(1, sizeof(struct vbuffer)); // zero-filled: all chunk slots start out NULL
+    GGML_ASSERT(buf != NULL); // abort on out-of-memory instead of dereferencing NULL below
+    buf->buft = buft;
+    return buf;
+}
+
+static void ggml_vbuffer_free_chunks(struct vbuffer * buf) {
+    for (ggml_backend_buffer_t * slot = buf->chunks; slot < buf->chunks + GGML_VBUFFER_MAX_CHUNKS; slot++) {
+        ggml_backend_buffer_free(*slot); // every slot is passed on, including slots that are already NULL
+        *slot = NULL;
+    }
+}
+
+static void ggml_vbuffer_free(struct vbuffer * buf) {
+    // a NULL vbuffer is accepted and ignored; otherwise the chunks go first, then the container
+    if (buf != NULL) {
+        ggml_vbuffer_free_chunks(buf);
+        free(buf);
+    }
+}
+
+static int ggml_vbuffer_n_chunks(struct vbuffer * buf) {
+    int count = 0; // length of the leading run of non-NULL chunk slots
+    for (; count < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[count] != NULL; ++count) {}
+    return count;
+}
+
+static size_t ggml_vbuffer_size(struct vbuffer * buf) {
+    size_t total = 0; // combined size of the leading non-NULL chunks
+    for (int c = 0, n = ggml_vbuffer_n_chunks(buf); c < n; ++c) {
+        total += ggml_backend_buffer_get_size(buf->chunks[c]);
+    }
+    return total;
+}
+
+static int ggml_vbuffer_alloc(struct vbuffer * buf, size_t size, enum ggml_backend_buffer_usage usage) {
+    // allocates chunks covering `size` bytes; returns the number of chunks, or 0 on failure (no chunks are kept then)
+    size_t max_chunk_size = ggml_backend_buft_get_max_size(buf->buft);
+    // only form the product when it cannot wrap around; beyond that limit the total capacity exceeds any size_t request
+    if (max_chunk_size <= SIZE_MAX / GGML_VBUFFER_MAX_CHUNKS && size > GGML_VBUFFER_MAX_CHUNKS * max_chunk_size) {
+        return 0;
+    }
+    int n = 0;
+    // always allocate at least 1 chunk even if requested size is 0
+    while (size > 0 || n == 0) {
+        GGML_ASSERT(n < GGML_VBUFFER_MAX_CHUNKS);
+        size_t chunk_size = MIN(size, max_chunk_size);
+        buf->chunks[n] = ggml_backend_buft_alloc_buffer(buf->buft, chunk_size);
+        if (buf->chunks[n] == NULL) {
+            ggml_vbuffer_free_chunks(buf); // release the chunks allocated so far
+            return 0;
+        }
+        ggml_backend_buffer_set_usage(buf->chunks[n], usage);
+        GGML_ASSERT(size >= chunk_size);
+        size -= chunk_size;
+        n += 1;
+    }
+    return n;
+}
+
+static void ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct ggml_tensor * tensor, size_t offset) {
+    // translate the virtual offset into a chunk and a position inside that chunk
+    const size_t chunk_capacity = ggml_backend_buft_get_max_size(buf->buft);
+    const size_t chunk_index = offset / chunk_capacity;
+    GGML_ASSERT(chunk_index < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[chunk_index] != NULL);
+
+    ggml_backend_buffer_t chunk = buf->chunks[chunk_index];
+    char * chunk_start = (char *) ggml_backend_buffer_get_base(chunk);
+    ggml_backend_tensor_alloc(chunk, tensor, chunk_start + offset % chunk_capacity);
+}
+
+static void ggml_vbuffer_reset(struct vbuffer * buf) {
+    for (ggml_backend_buffer_t * slot = buf->chunks; slot < buf->chunks + GGML_VBUFFER_MAX_CHUNKS && *slot != NULL; ++slot) {
+        ggml_backend_buffer_reset(*slot); // stops at the first NULL slot
+    }
+}
+
+
+
 /////////////////////////////////////
 
 // graph allocator
@@ -349,7 +474,7 @@ struct node_alloc {
 
 struct ggml_gallocr {
     ggml_backend_buffer_type_t * bufts; // [n_buffers]
-    ggml_backend_buffer_t * buffers; // [n_buffers]
+    struct vbuffer ** buffers; // [n_buffers]
     struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
     int n_buffers;
 
@@ -370,7 +495,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
     galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
     GGML_ASSERT(galloc->bufts != NULL);
 
-    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
+    galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
     GGML_ASSERT(galloc->buffers != NULL);
 
     galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
@@ -378,7 +503,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
 
     for (int i = 0; i < n_bufs; i++) {
         galloc->bufts[i] = bufts[i];
-        galloc->buffers[i] = NULL;
+        galloc->buffers[i] = ggml_vbuffer_new(bufts[i]);
 
         // check if the same buffer type is used multiple times and reuse the same allocator
         for (int j = 0; j < i; j++) {
@@ -390,7 +515,8 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
 
         if (galloc->buf_tallocs[i] == NULL) {
             size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
-            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+            size_t max_size = ggml_backend_buft_get_max_size(bufts[i]);
+            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment, max_size);
         }
     }
     galloc->n_buffers = n_bufs;
@@ -418,7 +544,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
                 }
             }
             if (!freed) {
-                ggml_backend_buffer_free(galloc->buffers[i]);
+                ggml_vbuffer_free(galloc->buffers[i]);
             }
         }
         if (galloc->buf_tallocs != NULL) {
@@ -744,22 +870,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             }
         }
 
-        size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
+        size_t cur_size = ggml_vbuffer_size(galloc->buffers[i]);
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
 
         // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
-        if (new_size > cur_size || galloc->buffers[i] == NULL) {
+        if (new_size > cur_size || ggml_vbuffer_n_chunks(galloc->buffers[i]) == 0) {
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
 
-            ggml_backend_buffer_free(galloc->buffers[i]);
-            galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
-            if (galloc->buffers[i] == NULL) {
+            ggml_vbuffer_free_chunks(galloc->buffers[i]);
+            if (!ggml_vbuffer_alloc(galloc->buffers[i], new_size, GGML_BACKEND_BUFFER_USAGE_COMPUTE)) {
                 GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                 return false;
             }
-            ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
         }
     }
 
@@ -772,7 +896,7 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
 
 static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
     int buffer_id = tensor_alloc->buffer_id;
-    assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
+    assert(tensor->data || tensor->view_src || ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
 
     if (tensor->view_src != NULL) {
         if (tensor->buffer == NULL) {
@@ -786,10 +910,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
     } else {
         if (tensor->data == NULL) {
             assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
-            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
-            void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
+            assert(ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
+            ggml_vbuffer_tensor_alloc(galloc->buffers[buffer_id], tensor, tensor_alloc->offset);
         } else {
             if (tensor->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
@@ -874,7 +996,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     // reset buffers
     for (int i = 0; i < galloc->n_buffers; i++) {
         if (galloc->buffers[i] != NULL) {
-            ggml_backend_buffer_reset(galloc->buffers[i]);
+            ggml_vbuffer_reset(galloc->buffers[i]);
         }
     }
 
@@ -917,7 +1039,7 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
         }
     }
 
-    return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
+    return ggml_vbuffer_size(galloc->buffers[buffer_id]);
 }
 
 // utils
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 19a7adb2d101b..0fc42846f0a77 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -329,6 +329,10 @@ struct ggml_cgraph {
 // if you need the gradients, get them from the original graph
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 
+// ggml-alloc.c: true if the operation can reuse memory from its sources
+GGML_API bool ggml_op_can_inplace(enum ggml_op op);
+
+
 // Memory allocation
 
 GGML_API void * ggml_aligned_malloc(size_t size);
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index cd1c66ba7b476..67b183754542a 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -11129,7 +11129,7 @@ static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type
 
 static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
     ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-    return ctx->device->suballocation_block_size;
+    return ctx->device->max_memory_allocation_size;
 }
 
 static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 91719577564a9..cec1f1641825a 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -219,3 +219,6 @@ target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd)
 get_filename_component(TEST_TARGET test-c.c NAME_WE)
 add_executable(${TEST_TARGET} test-c.c)
 target_link_libraries(${TEST_TARGET} PRIVATE llama)
+
+llama_build_and_test(test-alloc.cpp)
+target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
\ No newline at end of file
diff --git a/tests/test-alloc.cpp b/tests/test-alloc.cpp
new file mode 100644
index 0000000000000..2f5db4d0c0d6f
--- /dev/null
+++ b/tests/test-alloc.cpp
@@ -0,0 +1,483 @@
+#include <ggml-alloc.h>
+#include <ggml-backend-impl.h>
+#include <ggml-cpp.h>
+#include <ggml-impl.h>
+#include <ggml.h>
+
+#include <algorithm>
+#include <exception>
+#include <memory>
+#include <vector>
+
+//
+// dummy backend with configurable max_buffer_size, tracks allocations
+
+uint8_t * const alloc_base = (uint8_t *) 16;
+
+struct dummy_backend_context {
+    size_t max_buffer_size = 64;  // limit reported through the buffer type's get_max_size callback
+
+    ggml_backend_buffer_i              buffer_interface;  // callbacks passed to every buffer created for this context
+    std::vector<ggml_backend_buffer_t> buffers;           // buffers created and not yet freed
+
+    size_t allocated_total() const {  // combined size of all buffers that are currently alive
+        size_t total = 0;
+        for (auto it = buffers.begin(); it != buffers.end(); ++it) {
+            total += ggml_backend_buffer_get_size(*it);
+        }
+        return total;
+    }
+};
+
+// ggml_backend_buffer_type interface
+
+static const char * dummy_backend_buffer_type_get_name(ggml_backend_buffer_type_t) {
+    return "dummy_buffer_type";  // same fixed name regardless of the buffer type passed in
+}
+
+static ggml_backend_buffer_t dummy_backend_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    dummy_backend_context * backend_ctx = (dummy_backend_context *) buft->context;
+    ggml_backend_buffer_t   created     = ggml_backend_buffer_init(buft, backend_ctx->buffer_interface, backend_ctx, size);
+    backend_ctx->buffers.push_back(created);  // record the new buffer in the context's list of live buffers
+    return created;
+}
+
+static size_t dummy_backend_buffer_type_get_alignment(ggml_backend_buffer_type_t) {
+    return 8;  // the dummy buffer type always reports an alignment of 8
+}
+
+static size_t dummy_backend_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
+    // the limit is stored in the context attached to the buffer type
+    return ((dummy_backend_context *) buft->context)->max_buffer_size;
+}
+
+static bool dummy_backend_buffer_type_is_host(ggml_backend_buffer_type_t) {
+    return true;  // always reported as a host buffer type
+}
+
+// ggml_backend_buffer interface
+
+static void dummy_backend_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    dummy_backend_context * ctx = (dummy_backend_context *) buffer->context;
+    // remove the buffer from the context's list of live buffers; it must be present in that list
+    auto i = std::find(ctx->buffers.begin(), ctx->buffers.end(), buffer);
+    GGML_ASSERT(i != ctx->buffers.end());
+    ctx->buffers.erase(i);
+}
+
+static void * dummy_backend_buffer_get_base(ggml_backend_buffer_t) {
+    return alloc_base;  // every buffer reports the same constant base address
+}
+
+static ggml_status dummy_backend_buffer_init_tensor(ggml_backend_buffer_t, ggml_tensor *) {
+    return GGML_STATUS_SUCCESS;  // performs no work and always reports success
+}
+
+static void dummy_backend_buffer_memset_tensor(ggml_backend_buffer_t, ggml_tensor *, uint8_t, size_t, size_t) {}  // no-op
+
+static void dummy_backend_buffer_set_tensor(ggml_backend_buffer_t, ggml_tensor *, const void *, size_t, size_t) {}  // no-op
+
+static void dummy_backend_buffer_get_tensor(ggml_backend_buffer_t, const ggml_tensor *, void *, size_t, size_t) {}  // no-op
+
+static void dummy_backend_buffer_clear(ggml_backend_buffer_t, uint8_t) {}  // no-op
+
+// dummy_backend (not really a full backend, just provides what gallocr needs)
+
+struct dummy_backend {
+    std::unique_ptr<dummy_backend_context> context;      // owns the context object that buffer_type.context refers to
+    ggml_backend_buffer_type               buffer_type;  // buffer type wired to the dummy_backend_buffer_type_* callbacks
+};
+
+static dummy_backend dummy_backend_init(size_t max_buffer_size) {
+    dummy_backend backend{};
+    backend.context                  = std::make_unique<dummy_backend_context>();
+    backend.context->max_buffer_size = max_buffer_size;
+    ggml_backend_buffer_i & buffer_iface = backend.context->buffer_interface;  // callbacks for created buffers
+    buffer_iface.free_buffer             = dummy_backend_buffer_free_buffer;
+    buffer_iface.get_base                = dummy_backend_buffer_get_base;
+    buffer_iface.init_tensor             = dummy_backend_buffer_init_tensor;
+    buffer_iface.memset_tensor           = dummy_backend_buffer_memset_tensor;
+    buffer_iface.set_tensor              = dummy_backend_buffer_set_tensor;
+    buffer_iface.get_tensor              = dummy_backend_buffer_get_tensor;
+    buffer_iface.clear                   = dummy_backend_buffer_clear;
+    auto & type_iface           = backend.buffer_type.iface;  // callbacks of the buffer type itself
+    backend.buffer_type.context = backend.context.get();
+    type_iface.get_name         = dummy_backend_buffer_type_get_name;
+    type_iface.alloc_buffer     = dummy_backend_buffer_type_alloc_buffer;
+    type_iface.get_alignment    = dummy_backend_buffer_type_get_alignment;
+    type_iface.get_max_size     = dummy_backend_buffer_type_get_max_size;
+    type_iface.is_host          = dummy_backend_buffer_type_is_host;
+    return backend;
+}
+
+//
+// test utilities
+
+struct test_context_with_graph {
+    ggml_context *   ctx;      // raw pointer to the context; make_context stores the same pointer in ctx_ptr
+    ggml_cgraph *    graph;    // graph created inside ctx by make_context
+    ggml_context_ptr ctx_ptr;  // smart-pointer wrapper around ctx (presumably frees it on destruction - confirm in ggml-cpp.h)
+};
+
+static test_context_with_graph make_context() {
+    // no_alloc context sized for 32 tensor headers plus one graph
+    ggml_init_params init_params{};
+    init_params.no_alloc = true;
+    init_params.mem_size = ggml_graph_overhead() + 32 * ggml_tensor_overhead();
+
+    ggml_context *   raw = ggml_init(init_params);
+    ggml_context_ptr owner(raw);
+    return { raw, ggml_new_graph(raw), std::move(owner) };
+}
+
+static ggml_tensor * make_input_1d(ggml_context * ctx, int64_t n_elements) {
+    ggml_tensor * input = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);  // 1-D F32 tensor
+    ggml_set_input(input);                                                     // flagged as a graph input
+    return input;
+}
+
+static ggml_tensor * make_input_with_size(ggml_context * ctx, size_t size_bytes) {
+    GGML_ASSERT(size_bytes % 4 == 0);  // inputs are F32 (see make_input_1d), so the size must be a multiple of 4 bytes
+    return make_input_1d(ctx, size_bytes / 4);  // convert the byte size into an element count
+}
+
+static void assign_names(ggml_context * ctx, const char * prefix = "x") {
+    ggml_tensor * cur = ggml_get_first_tensor(ctx);  // tensors are named <prefix><index> in enumeration order
+    for (int index = 0; cur != nullptr; cur = ggml_get_next_tensor(ctx, cur), ++index) {
+        ggml_format_name(cur, "%s%d", prefix, index);
+    }
+}
+
+static int get_leaf_id(ggml_cgraph * graph, const char * tensor_name) {
+    int leaf = 0;  // advances to the first leaf whose name equals tensor_name
+    while (leaf < graph->n_leafs && strncmp(graph->leafs[leaf]->name, tensor_name, GGML_MAX_NAME) != 0) {
+        ++leaf;
+    }
+    if (leaf < graph->n_leafs) { return leaf; }
+    GGML_ABORT("leaf not found: %s", tensor_name);
+    return -1;
+}
+
+static int get_node_id(ggml_cgraph * graph, const char * tensor_name) {
+    int node = 0;  // advances to the first node whose name equals tensor_name
+    while (node < graph->n_nodes && strncmp(graph->nodes[node]->name, tensor_name, GGML_MAX_NAME) != 0) {
+        ++node;
+    }
+    if (node < graph->n_nodes) { return node; }
+    GGML_ABORT("node not found: %s", tensor_name);
+    return -1;
+}
+
+static ggml_gallocr_ptr allocate_graph(ggml_cgraph * graph, ggml_tensor * out, ggml_backend_buffer_type_t buft) {
+    // flag `out` as output, expand the forward graph for it, then allocate the graph with a new allocator for `buft`
+    ggml_set_output(out);
+    ggml_build_forward_expand(graph, out);
+    ggml_gallocr_ptr allocator(ggml_gallocr_new(buft));
+    const bool       result = ggml_gallocr_alloc_graph(allocator.get(), graph);
+    GGML_ASSERT(result);
+    return allocator;
+}
+
+//
+// correctness checks for result allocations
+
+static void check_all_allocated(ggml_cgraph * graph) {
+    for (int node = 0, n_nodes = ggml_graph_n_nodes(graph); node != n_nodes; ++node) {  // every node needs buffer + data
+        ggml_tensor * t = ggml_graph_node(graph, node);
+        GGML_ASSERT(t->buffer != nullptr);
+        GGML_ASSERT(t->data != nullptr);
+    }
+}
+
+static void check_max_size(ggml_context * ctx) {
+    for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) {  // visits every tensor in ctx
+        auto   buft     = ggml_backend_buffer_get_type(t->buffer);
+        size_t max_size = ggml_backend_buft_get_max_size(buft);  // size limit of the buffer type that holds t
+        size_t offset   = (char *) t->data - (char *) ggml_backend_buffer_get_base(t->buffer);  // position of t relative to the buffer base
+        GGML_ASSERT(t->data >= ggml_backend_buffer_get_base(t->buffer));
+        GGML_ASSERT((size_t) offset + ggml_nbytes(t) <= max_size);  // t must end within the buffer size limit
+    }
+}
+
+static bool can_reuse_memory(ggml_cgraph * graph, int current_i, ggml_tensor * current, ggml_tensor * other) {
+    // tensors flagged as outputs are never considered reusable
+    if ((other->flags & GGML_TENSOR_FLAG_OUTPUT) != 0) {
+        return false;
+    }
+    // `other` is still alive if any node from `current_i` onwards reads it, directly or through a view
+    const int n_nodes = ggml_graph_n_nodes(graph);
+    for (int i = current_i; i < n_nodes; ++i) {
+        ggml_tensor * node = ggml_graph_node(graph, i);
+        if (node == current && ggml_op_can_inplace(node->op)) {
+            continue;  // sources of `current` itself are not counted when its op can run in place
+        }
+        for (int s = 0; s < GGML_MAX_SRC; s++) {
+            ggml_tensor * src = node->src[s];
+            if (src == other || (src != nullptr && src->view_src == other)) {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+static bool memory_overlap(ggml_tensor * a, ggml_tensor * b) {
+    if (a->buffer != b->buffer) {  // tensors in different buffers are never reported as overlapping
+        return false;
+    }
+    const int64_t a_begin = (int64_t) a->data;
+    const int64_t b_begin = (int64_t) b->data;
+    const int64_t a_end   = a_begin + ggml_nbytes(a);
+    const int64_t b_end   = b_begin + ggml_nbytes(b);
+    return b_begin < a_end && a_begin < b_end;  // the half-open ranges [begin, end) intersect
+}
+
+static ggml_tensor * get_view_source(ggml_tensor * t) {
+    // follow the view_src chain until reaching a tensor that has no view_src
+    ggml_tensor * root = t;
+    for (; root->view_src != nullptr; root = root->view_src) {}
+    return root;
+}
+
+static void check_no_overlap(ggml_cgraph * graph) {
+    // compare every node with all nodes that precede it in the graph
+    for (int i = 0; i < ggml_graph_n_nodes(graph); ++i) {
+        ggml_tensor * t = ggml_graph_node(graph, i);
+        for (int j = 0; j < i; ++j) {
+            ggml_tensor * o = ggml_graph_node(graph, j);
+            GGML_ASSERT(t != o);
+
+            // tensors sharing a view source are skipped; any other overlap requires that `o` may be reused
+            const bool same_storage = get_view_source(t) == get_view_source(o);
+            if (!same_storage && memory_overlap(t, o)) {
+                GGML_ASSERT(can_reuse_memory(graph, i, t, o));
+            }
+        }
+    }
+}
+
+//
+// test cases
+
+// scenario where the first backend buffer is completely exhausted and there are further
+// tensors which require a second buffer
+static void test_max_size_too_many_tensors() {
+    dummy_backend backend      = dummy_backend_init(16);  // backend buffers are limited to 16 bytes
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[7];
+    x[0] = make_input_with_size(ctx, 8);  // three 8-byte inputs
+    x[1] = make_input_with_size(ctx, 8);
+    x[2] = make_input_with_size(ctx, 8);
+    x[3] = ggml_mul(ctx, x[0], x[1]);
+    x[4] = ggml_add(ctx, x[1], x[2]);
+    x[5] = ggml_add(ctx, x[3], x[0]);
+    x[6] = ggml_add(ctx, x[4], x[5]);  // graph output
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[6], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend.context->allocated_total() <= 16 + 16);  // at most two full 16-byte buffers in total
+}
+
+// scenario where there is some space left in the first buffer, but not enough to accommodate
+// a larger tensor, so a second buffer is required
+static void test_max_size_tensor_too_large() {
+    dummy_backend backend      = dummy_backend_init(32);  // backend buffers are limited to 32 bytes
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[3];
+    x[0] = make_input_with_size(ctx, 16);    // chunk 0, [0 , 16)
+    x[1] = make_input_with_size(ctx, 8);     // chunk 0, [16, 24)
+    x[2] = ggml_concat(ctx, x[0], x[1], 0);  // chunk 1, [0 , 24)
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[2], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend.context->allocated_total() <= 32 + 24);  // one full 32-byte buffer plus 24 bytes for x[2]
+}
+
+// check that views don't require any extra memory
+static void test_view_inplace() {
+    dummy_backend backend      = dummy_backend_init(32);  // backend buffers are limited to 32 bytes
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[6];
+    x[0] = make_input_1d(ctx, 4);                // chunk 0, [0, 16)
+    x[1] = ggml_reshape_2d(ctx, x[0], 2, 2);     // view of x0
+    x[2] = ggml_permute(ctx, x[1], 1, 0, 2, 3);  // view of x0
+    x[3] = ggml_view_1d(ctx, x[2], 2, 4);        // view of x0
+    x[4] = make_input_1d(ctx, 2);                // chunk 0, [16, 24)
+    x[5] = ggml_add(ctx, x[3], x[4]);            // reuse (inplace add)
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[5], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend.context->allocated_total() <= 24);  // only the two inputs (16 + 8 bytes) need memory
+}
+
+static void test_reuse_and_free() {
+    dummy_backend backend      = dummy_backend_init(32);
+    auto [ctx, graph, ctx_ptr] = make_context();
+    // graph mixing in-place reuse, freed blocks and a view, allocated with a 32-byte buffer limit
+    ggml_tensor * x[9];
+    x[0] = make_input_with_size(ctx, 24);
+    x[1] = make_input_with_size(ctx, 8);
+    x[2] = make_input_with_size(ctx, 8);
+    x[3] = ggml_add(ctx, x[1], x[2]);        // reuse, free x2
+    x[4] = ggml_pad(ctx, x[0], 2, 0, 0, 0);  // alloc new buffer, free x0
+    x[5] = ggml_scale(ctx, x[4], 2.0f);      // alloc from free block
+    x[6] = ggml_add(ctx, x[4], x[5]);        // reuse, free x5
+    x[7] = ggml_view_1d(ctx, x[6], 2, 8);    // view
+    x[8] = ggml_add(ctx, x[3], x[7]);        // reuse
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[8], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend.context->allocated_total() <= 32 + 32 + 32);  // at most three full 32-byte buffers
+}
+
+static void test_merge_free_block(size_t max_buffer_size) {
+    dummy_backend backend      = dummy_backend_init(max_buffer_size);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[9];
+    x[0] = make_input_with_size(ctx, 16);
+    x[1] = make_input_with_size(ctx, 16);
+    x[2] = make_input_with_size(ctx, 16);
+    x[3] = ggml_mean(ctx, x[0]);
+    x[4] = ggml_mean(ctx, x[1]);
+    x[5] = ggml_pad(ctx, x[2], 2, 0, 0, 0);
+    x[6] = ggml_add(ctx, x[3], x[4]);
+    x[7] = ggml_pad(ctx, x[6], 5, 0, 0, 0);
+    x[8] = ggml_add(ctx, x[5], x[7]);
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[8], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend.context->allocated_total() <= 32 + 32 + 24);
+}
+
+// test for allocating on multiple devices with some tensors in the graph
+// allocated externally (not by gallocr).
+static void test_multiple_buffer_types() {
+    dummy_backend backend_a = dummy_backend_init(32);
+    dummy_backend backend_b = dummy_backend_init(SIZE_MAX);
+
+    auto [ctx_a, _a, ctx_a_ptr] = make_context();
+    auto [ctx_b, _b, ctx_b_ptr] = make_context();
+    auto [ctx, graph, ctx_ptr]  = make_context();
+
+    ggml_tensor * a[2];
+    a[0] = make_input_with_size(ctx_a, 16);
+    a[1] = make_input_with_size(ctx_a, 16);
+    assign_names(ctx_a, "a");
+
+    ggml_tensor * b[2];
+    b[0] = make_input_with_size(ctx_b, 24);
+    b[1] = make_input_with_size(ctx_b, 4);
+    assign_names(ctx_b, "b");
+
+    ggml_tensor * x[9];
+    x[0] = make_input_with_size(ctx, 16);
+    x[1] = ggml_mul(ctx, x[0], a[0]);
+    x[2] = ggml_pad(ctx, x[1], 2, 0, 0, 0);
+    x[3] = ggml_mul(ctx, x[2], b[0]);
+    x[4] = ggml_mean(ctx, x[3]);
+    x[5] = ggml_add(ctx, x[4], b[1]);
+    x[6] = ggml_pad(ctx, x[5], 3, 0, 0, 0);
+    x[7] = ggml_add(ctx, x[6], a[1]);
+    x[8] = ggml_scale(ctx, x[7], 2.0f);
+    assign_names(ctx, "x");
+
+    ggml_backend_buffer_ptr    buf_a(ggml_backend_alloc_ctx_tensors_from_buft(ctx_a, &backend_a.buffer_type));
+    ggml_backend_buffer_ptr    buf_b(ggml_backend_alloc_ctx_tensors_from_buft(ctx_b, &backend_b.buffer_type));
+    ggml_backend_buffer_type_t bufts[2] = { &backend_a.buffer_type, &backend_b.buffer_type };
+
+    // assign buffer types manually to avoid extra complexity from backend scheduler
+    ggml_set_output(x[8]);
+    ggml_build_forward_expand(graph, x[8]);
+
+    GGML_ASSERT(graph->n_leafs == 5);
+    int leaf_buffer_ids[5];
+    leaf_buffer_ids[get_leaf_id(graph, "a0")] = 0;
+    leaf_buffer_ids[get_leaf_id(graph, "a1")] = 0;
+    leaf_buffer_ids[get_leaf_id(graph, "b0")] = 1;
+    leaf_buffer_ids[get_leaf_id(graph, "b1")] = 1;
+    leaf_buffer_ids[get_leaf_id(graph, "x0")] = 0;
+
+    GGML_ASSERT(graph->n_nodes == 8);
+    int node_buffer_ids[8];
+    node_buffer_ids[get_node_id(graph, "x1")] = 0;
+    node_buffer_ids[get_node_id(graph, "x2")] = 0;
+    node_buffer_ids[get_node_id(graph, "x3")] = 1;
+    node_buffer_ids[get_node_id(graph, "x4")] = 1;
+    node_buffer_ids[get_node_id(graph, "x5")] = 1;
+    node_buffer_ids[get_node_id(graph, "x6")] = 1;
+    node_buffer_ids[get_node_id(graph, "x7")] = 0;
+    node_buffer_ids[get_node_id(graph, "x8")] = 0;
+
+    ggml_gallocr_ptr galloc(ggml_gallocr_new_n(bufts, 2));
+    ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
+    ggml_gallocr_alloc_graph(galloc.get(), graph);
+
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend_a.context->allocated_total() <= 32 + 32 + 24);
+    GGML_ASSERT(backend_b.context->allocated_total() <= 32 + 24);
+}
+
+static void test_buffer_size_zero() {
+    dummy_backend backend_a    = dummy_backend_init(SIZE_MAX);
+    dummy_backend backend_b    = dummy_backend_init(SIZE_MAX);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[2];
+    x[0] = make_input_with_size(ctx, 16);
+    x[1] = ggml_scale(ctx, x[0], 2.0f);
+
+    ggml_set_output(x[1]);
+    ggml_build_forward_expand(graph, x[1]);
+
+    int leaf_buffer_ids[1] = { 0 };
+    int node_buffer_ids[1] = { 0 };
+
+    ggml_backend_buffer_type_t bufts[2] = { &backend_a.buffer_type, &backend_b.buffer_type };
+    ggml_gallocr_ptr           galloc   = ggml_gallocr_ptr(ggml_gallocr_new_n(bufts, 2));
+    ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
+    ggml_gallocr_alloc_graph(galloc.get(), graph);
+
+    check_all_allocated(graph);
+    GGML_ASSERT(backend_a.context->allocated_total() == 16);
+    GGML_ASSERT(backend_b.context->allocated_total() == 0);
+}
+
+static void run(const char * name, void (*f)()) {
+    printf("%s ", name);
+    fflush(stdout);
+    f();
+    printf("PASSED\n");
+}
+
+int main() {
+    run("test_max_size_too_many_tensors", test_max_size_too_many_tensors);
+    run("test_max_size_tensor_too_large", test_max_size_tensor_too_large);
+    run("test_view_inplace", test_view_inplace);
+    run("test_reuse_and_free", test_reuse_and_free);
+    run("test_merge_free_block(32)", []() { test_merge_free_block(32); });
+    run("test_merge_free_block(SIZE_MAX)", []() { test_merge_free_block(SIZE_MAX); });
+    run("test_multiple_buffer_types", test_multiple_buffer_types);
+    run("test_buffer_size_zero", test_buffer_size_zero);
+    return 0;
+}

From 8d3c5d90fbf373d142b0373c840fe4b7272c1c05 Mon Sep 17 00:00:00 2001
From: Acly <aclysia@gmail.com>
Date: Fri, 5 Sep 2025 16:37:06 +0200
Subject: [PATCH 02/11] fix missing newline, apple-clang warning

---
 tests/CMakeLists.txt | 2 +-
 tests/test-alloc.cpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index cec1f1641825a..3e9e082d93c18 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -221,4 +221,4 @@ add_executable(${TEST_TARGET} test-c.c)
 target_link_libraries(${TEST_TARGET} PRIVATE llama)
 
 llama_build_and_test(test-alloc.cpp)
-target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
\ No newline at end of file
+target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
diff --git a/tests/test-alloc.cpp b/tests/test-alloc.cpp
index 2f5db4d0c0d6f..031a44e811037 100644
--- a/tests/test-alloc.cpp
+++ b/tests/test-alloc.cpp
@@ -154,7 +154,7 @@ static int get_leaf_id(ggml_cgraph * graph, const char * tensor_name) {
             return i;
         }
     }
-    GGML_ABORT("leaf not found: %s", tensor_name);
+    fprintf(stderr, "leaf not found: %s\n", tensor_name);
     return -1;
 }
 
@@ -164,7 +164,7 @@ static int get_node_id(ggml_cgraph * graph, const char * tensor_name) {
             return i;
         }
     }
-    GGML_ABORT("node not found: %s", tensor_name);
+    fprintf(stderr, "node not found: %s\n", tensor_name);
     return -1;
 }
 

From 44d3ee4a9c7d55b90d6e9d82d9c44c33eecb1852 Mon Sep 17 00:00:00 2001
From: Acly <aclysia@gmail.com>
Date: Fri, 12 Sep 2025 18:06:39 +0200
Subject: [PATCH 03/11] track size of individual chunks in ggml_dyn_tallocr and
 raise max chunks. revert to use suballocation_block_size as max chunk size
 for vulkan.

---
 ggml/src/ggml-alloc.c                | 102 ++++++++++++++-------------
 ggml/src/ggml-vulkan/ggml-vulkan.cpp |   2 +-
 tests/test-alloc.cpp                 |  34 +++++++--
 3 files changed, 80 insertions(+), 58 deletions(-)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 5536d3c03293d..6a3d0a58b3aff 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -95,6 +95,8 @@ enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_te
 
 // dynamic tensor allocator
 
+#define GGML_VBUFFER_MAX_CHUNKS 16
+
 struct free_block {
     size_t offset;
     size_t size;
@@ -103,8 +105,9 @@ struct free_block {
 struct ggml_dyn_tallocr {
     size_t alignment;
     int n_free_blocks;
+    int n_chunks;
     struct free_block free_blocks[MAX_FREE_BLOCKS];
-    size_t max_size;
+    size_t max_size[GGML_VBUFFER_MAX_CHUNKS];
     size_t max_chunk_size;
 
 #ifdef GGML_ALLOCATOR_DEBUG
@@ -117,10 +120,21 @@ struct ggml_dyn_tallocr {
 
 // the memory range [0, max_size) is divided into n chunks of size max_chunk_size (with the last chunk possibly being smaller).
 // tensor allocations may not cross chunk boundaries.
-static void ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, struct free_block * block) {
-    size_t n_chunks = (alloc->max_size + alloc->max_chunk_size - 1) / alloc->max_chunk_size;
-    block->offset = n_chunks * alloc->max_chunk_size;
-    block->size = alloc->max_chunk_size;
+static size_t ggml_dyn_tallocr_chunk_index(struct ggml_dyn_tallocr * alloc, size_t offset) {
+    for (int i = 0; i < alloc->n_chunks; i++) {
+        if (offset < alloc->max_size[i]) {
+            return i;
+        }
+    }
+    return alloc->n_chunks - 1;
+}
+
+static void ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, struct free_block * block, size_t min_size) {
+    GGML_ASSERT(alloc->n_chunks >= 1);
+    block->offset = alloc->max_size[alloc->n_chunks - 1];
+    block->size = MAX(min_size, alloc->max_chunk_size);
+    alloc->n_chunks++;
+    GGML_ASSERT(alloc->n_chunks <= GGML_VBUFFER_MAX_CHUNKS);
 }
 
 #ifdef GGML_ALLOCATOR_DEBUG
@@ -149,10 +163,6 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
-    if (size > alloc->max_chunk_size) {
-        GGML_ABORT("allocation failed: tensor %s (%zu bytes) exceeds maximum backend buffer size (%zu bytes)\n",
-            tensor->name, size, alloc->max_chunk_size);
-    }
 
     size_t max_avail = 0;
 
@@ -172,14 +182,10 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
         // the last block represents memory still available in an existing chunk
         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
         max_avail = MAX(max_avail, block->size);
-        if (block->size >= size) {
-            best_fit_block = alloc->n_free_blocks - 1;
-        } else {
-            // not enough space in existing chunk, create a new one at the end
-            best_fit_block = alloc->n_free_blocks;
-            alloc->n_free_blocks += 1;
-            GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
-            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[alloc->n_free_blocks - 1]);
+        best_fit_block = alloc->n_free_blocks - 1;
+        if (block->size < size) {
+            // not enough space in existing chunk, start the next one
+            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[best_fit_block], size);
         }
     }
 
@@ -196,7 +202,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
         // if there are no remaining blocks all memory in current chunk was used up -> start the next one
         if (alloc->n_free_blocks == 0) {
             alloc->n_free_blocks = 1;
-            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[0]);
+            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[0], 0);
         }
     }
 
@@ -232,7 +238,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
     }
 #endif
 
-    alloc->max_size = MAX(alloc->max_size, offset + size);
+    alloc->max_size[alloc->n_chunks-1] = MAX(alloc->max_size[alloc->n_chunks-1], offset + size);
 
     return offset;
 
@@ -248,13 +254,13 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, offset, tensor);
 #endif
-    size_t chunk = offset / alloc->max_chunk_size;
+    size_t chunk = ggml_dyn_tallocr_chunk_index(alloc, offset);
 
     // see if we can merge with an existing block
     for (int i = 0; i < alloc->n_free_blocks; i++) {
         struct free_block * block = &alloc->free_blocks[i];
         // can only merge with blocks within the same chunk
-        size_t block_chunk = block->offset / alloc->max_chunk_size;
+        size_t block_chunk = ggml_dyn_tallocr_chunk_index(alloc, block->offset);
         if (chunk != block_chunk) {
             continue;
         }
@@ -264,7 +270,7 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t
             // check if we can merge with the next block (within the same chunk)
             if (i < alloc->n_free_blocks - 1) {
                 struct free_block * next = &alloc->free_blocks[i+1];
-                if (block->offset + block->size == next->offset && block_chunk == (next->offset / alloc->max_chunk_size)) {
+                if (block->offset + block->size == next->offset && block_chunk == ggml_dyn_tallocr_chunk_index(alloc, next->offset)) {
                     block->size += next->size;
                     alloc->n_free_blocks--;
                     for (int j = i+1; j < alloc->n_free_blocks; j++) {
@@ -281,7 +287,7 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t
             // check if we can merge with the previous block (within the same chunk)
             if (i > 0) {
                 struct free_block * prev = &alloc->free_blocks[i-1];
-                if (prev->offset + prev->size == block->offset && block_chunk == (prev->offset / alloc->max_chunk_size)) {
+                if (prev->offset + prev->size == block->offset && block_chunk == ggml_dyn_tallocr_chunk_index(alloc, prev->offset)) {
                     prev->size += block->size;
                     alloc->n_free_blocks--;
                     for (int j = i; j < alloc->n_free_blocks; j++) {
@@ -313,9 +319,10 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t
 
 static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
     alloc->n_free_blocks = 1;
+    alloc->n_chunks = 1;
     alloc->free_blocks[0].offset = 0;
     alloc->free_blocks[0].size = alloc->max_chunk_size;
-    alloc->max_size = 0;
+    memset(alloc->max_size, 0, sizeof(alloc->max_size));
 
     if (alloc->free_blocks[0].size == SIZE_MAX) {
         alloc->free_blocks[0].size = SIZE_MAX/2; // avoid overflows
@@ -334,8 +341,9 @@ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t m
     *alloc = (struct ggml_dyn_tallocr) {
         /*.alignment       = */ alignment,
         /*.n_free_blocks   = */ 0,
+        /*.n_chunks        = */ 0,
         /*.free_blocks     = */ {{0}},
-        /*.max_size        = */ 0,
+        /*.max_size        = */ {0},
         /*.max_chunk_size  = */ max_buffer_size,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ {{0}},
@@ -352,14 +360,12 @@ static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
 }
 
 static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
-    return alloc->max_size;
+    return alloc->max_size[alloc->n_chunks - 1];
 }
 
 
 // virtual buffer with contiguous memory range, split into multiple backend buffers (chunks)
 
-#define GGML_VBUFFER_MAX_CHUNKS 8
-
 struct vbuffer {
     ggml_backend_buffer_type_t buft;
     ggml_backend_buffer_t chunks[GGML_VBUFFER_MAX_CHUNKS];
@@ -401,36 +407,32 @@ static size_t ggml_vbuffer_size(struct vbuffer * buf) {
     return size;
 }
 
-static int ggml_vbuffer_alloc(struct vbuffer * buf, size_t size, enum ggml_backend_buffer_usage usage) {
-    size_t max_chunk_size = ggml_backend_buft_get_max_size(buf->buft);
-    if (size > GGML_VBUFFER_MAX_CHUNKS * max_chunk_size) {
-        return 0;
-    }
-
-    int n = 0;
-    // always allocate at least 1 chunk even if requested size is 0
-    while (size > 0 || n == 0) {
-        GGML_ASSERT(n < GGML_VBUFFER_MAX_CHUNKS);
-        size_t chunk_size = MIN(size, max_chunk_size);
+static bool ggml_vbuffer_alloc(struct vbuffer * buf, const struct ggml_dyn_tallocr * talloc, enum ggml_backend_buffer_usage usage) {
+    for (int n = 0; n < talloc->n_chunks; n++) {
+        size_t chunk_size = talloc->max_size[n];
+        if (n > 0) {
+            chunk_size -= talloc->max_size[n - 1];
+        }
         buf->chunks[n] = ggml_backend_buft_alloc_buffer(buf->buft, chunk_size);
         if (buf->chunks[n] == NULL) {
             ggml_vbuffer_free_chunks(buf);
-            return 0;
+            return false;
         }
         ggml_backend_buffer_set_usage(buf->chunks[n], usage);
-
-        GGML_ASSERT(size >= chunk_size);
-        size -= chunk_size;
-        n += 1;
     }
-    return n;
+    return true;
 }
 
 static void ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct ggml_tensor * tensor, size_t offset) {
-    size_t max_chunk_size = ggml_backend_buft_get_max_size(buf->buft);
-    size_t chunk_index = offset / max_chunk_size;
-    size_t chunk_offset = offset % max_chunk_size;
-    GGML_ASSERT(chunk_index < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[chunk_index] != NULL);
+    size_t chunk_index = 0, chunk_offset = offset;
+    while (chunk_index < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[chunk_index]) {
+        size_t chunk_size = ggml_backend_buffer_get_size(buf->chunks[chunk_index]);
+        if (chunk_offset < chunk_size) {
+            break;
+        }
+        chunk_offset -= chunk_size;
+        chunk_index++;
+    }
 
     void * base = ggml_backend_buffer_get_base(buf->chunks[chunk_index]);
     void * addr = (char *)base + chunk_offset;
@@ -880,7 +882,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 #endif
 
             ggml_vbuffer_free_chunks(galloc->buffers[i]);
-            if (!ggml_vbuffer_alloc(galloc->buffers[i], new_size, GGML_BACKEND_BUFFER_USAGE_COMPUTE)) {
+            if (!ggml_vbuffer_alloc(galloc->buffers[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE)) {
                 GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                 return false;
             }
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 67b183754542a..cd1c66ba7b476 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -11129,7 +11129,7 @@ static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type
 
 static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
     ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-    return ctx->device->max_memory_allocation_size;
+    return ctx->device->suballocation_block_size;
 }
 
 static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
diff --git a/tests/test-alloc.cpp b/tests/test-alloc.cpp
index 031a44e811037..48a93eea13ca7 100644
--- a/tests/test-alloc.cpp
+++ b/tests/test-alloc.cpp
@@ -259,7 +259,7 @@ static void check_no_overlap(ggml_cgraph * graph) {
 //
 // test cases
 
-// scenario where the first backend buffer is completely exhausted and there are further
+// Scenario where the first backend buffer is completely exhausted and there are further
 // tensors which require a second buffer
 static void test_max_size_too_many_tensors() {
     dummy_backend backend      = dummy_backend_init(16);
@@ -282,7 +282,7 @@ static void test_max_size_too_many_tensors() {
     GGML_ASSERT(backend.context->allocated_total() <= 16 + 16);
 }
 
-// scenario where there is some space left in the first buffer, but not enough to accomodate
+// Scenario where there is some space left in the first buffer, but not enough to accommodate
 // a larger tensor, so a second buffer is required
 static void test_max_size_tensor_too_large() {
     dummy_backend backend      = dummy_backend_init(32);
@@ -301,7 +301,25 @@ static void test_max_size_tensor_too_large() {
     GGML_ASSERT(backend.context->allocated_total() <= 32 + 24);
 }
 
-// check that views don't require any extra memory
+// Scenario where a single tensor exceeds the max buffer size - in this case the allocator
+// should try to create a bigger buffer anyway, and wait for the backend to throw an error.
+// Backends may report an artificially lower max size in some cases for compatibility reasons.
+static void test_tensor_larger_than_max_size() {
+    dummy_backend backend      = dummy_backend_init(16);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[2];
+    x[0] = make_input_with_size(ctx, 24);
+    x[1] = ggml_scale(ctx, x[0], 2.0f);
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[1], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    GGML_ASSERT(backend.context->allocated_total() == 24);
+}
+
+// Check that views don't require any extra memory
 static void test_view_inplace() {
     dummy_backend backend      = dummy_backend_init(32);
     auto [ctx, graph, ctx_ptr] = make_context();
@@ -323,7 +341,7 @@ static void test_view_inplace() {
 }
 
 static void test_reuse_and_free() {
-    dummy_backend backend      = dummy_backend_init(32);
+    dummy_backend backend      = dummy_backend_init(40);
     auto [ctx, graph, ctx_ptr] = make_context();
 
     ggml_tensor * x[9];
@@ -342,7 +360,7 @@ static void test_reuse_and_free() {
     check_all_allocated(graph);
     check_no_overlap(graph);
     check_max_size(ctx);
-    GGML_ASSERT(backend.context->allocated_total() <= 32 + 32 + 32);
+    GGML_ASSERT(backend.context->allocated_total() <= 40 + 32 + 32);
 }
 
 static void test_merge_free_block(size_t max_buffer_size) {
@@ -455,8 +473,9 @@ static void test_buffer_size_zero() {
 
     ggml_backend_buffer_type_t bufts[2] = { &backend_a.buffer_type, &backend_b.buffer_type };
     ggml_gallocr_ptr           galloc   = ggml_gallocr_ptr(ggml_gallocr_new_n(bufts, 2));
-    ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
-    ggml_gallocr_alloc_graph(galloc.get(), graph);
+    bool res1 = ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
+    bool res2 = ggml_gallocr_alloc_graph(galloc.get(), graph);
+    GGML_ASSERT(res1 && res2);
 
     check_all_allocated(graph);
     GGML_ASSERT(backend_a.context->allocated_total() == 16);
@@ -473,6 +492,7 @@ static void run(const char * name, void (*f)()) {
 int main() {
     run("test_max_size_too_many_tensors", test_max_size_too_many_tensors);
     run("test_max_size_tensor_too_large", test_max_size_tensor_too_large);
+    run("test_tensor_larger_than_max_size", test_tensor_larger_than_max_size);
     run("test_view_inplace", test_view_inplace);
     run("test_reuse_and_free", test_reuse_and_free);
     run("test_merge_free_block(32)", []() { test_merge_free_block(32); });

From 973d55be399671a2c7cd9a476e49093cb086993f Mon Sep 17 00:00:00 2001
From: Acly <aclysia@gmail.com>
Date: Wed, 17 Sep 2025 15:08:09 +0200
Subject: [PATCH 04/11] track (chunk, offset) pairs instead of "global" offsets
 through gallocr.

* simpler, don't need loops to map between local/global offsets
* touches more code
---
 ggml/src/ggml-alloc.c | 162 +++++++++++++++++++-----------------------
 1 file changed, 74 insertions(+), 88 deletions(-)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 6a3d0a58b3aff..bc835790ae86b 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -97,8 +97,20 @@ enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_te
 
 #define GGML_VBUFFER_MAX_CHUNKS 16
 
+// relative memory address within an allocation that can be split into multiple buffers (chunks)
+struct buffer_address {
+    int chunk;     // index of a backend buffer
+    size_t offset; // local memory offset within the buffer
+};
+
+static const struct buffer_address GGML_BUFFER_ADDRESS_INVALID = { -1, SIZE_MAX };
+
+static bool ggml_buffer_address_less(struct buffer_address a, struct buffer_address b) {
+    return a.chunk != b.chunk ? a.chunk < b.chunk : a.offset < b.offset;
+}
+
 struct free_block {
-    size_t offset;
+    struct buffer_address addr;
     size_t size;
 };
 
@@ -113,44 +125,35 @@ struct ggml_dyn_tallocr {
 #ifdef GGML_ALLOCATOR_DEBUG
     struct {
         const struct ggml_tensor * tensor;
-        size_t offset;
+        struct buffer_address addr;
     } allocated_tensors[1024];
 #endif
 };
 
-// the memory range [0, max_size) is divided into n chunks of size max_chunk_size (with the last chunk possibly being smaller).
-// tensor allocations may not cross chunk boundaries.
-static size_t ggml_dyn_tallocr_chunk_index(struct ggml_dyn_tallocr * alloc, size_t offset) {
-    for (int i = 0; i < alloc->n_chunks; i++) {
-        if (offset < alloc->max_size[i]) {
-            return i;
-        }
-    }
-    return alloc->n_chunks - 1;
-}
-
+// allocations are split into n chunks of size max_size[i]. tensor allocations may not cross chunk boundaries.
 static void ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, struct free_block * block, size_t min_size) {
     GGML_ASSERT(alloc->n_chunks >= 1);
-    block->offset = alloc->max_size[alloc->n_chunks - 1];
+    block->addr.chunk = alloc->n_chunks;
+    block->addr.offset = 0;
     block->size = MAX(min_size, alloc->max_chunk_size);
     alloc->n_chunks++;
     GGML_ASSERT(alloc->n_chunks <= GGML_VBUFFER_MAX_CHUNKS);
 }
 
 #ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
+static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i].tensor == NULL) {
             alloc->allocated_tensors[i].tensor = tensor;
-            alloc->allocated_tensors[i].offset = offset;
+            alloc->allocated_tensors[i].addr = addr;
             return;
         }
     }
     GGML_ABORT("out of allocated_tensors");
 }
-static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
+static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
-        if (alloc->allocated_tensors[i].offset == offset) {
+        if (alloc->allocated_tensors[i].addr.chunk == addr.chunk && alloc->allocated_tensors[i].addr.offset == addr.offset) {
             alloc->allocated_tensors[i].tensor = NULL;
             return;
         }
@@ -159,7 +162,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
 }
 #endif
 
-static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
+static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -190,8 +193,8 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
     }
 
     struct free_block * block = &alloc->free_blocks[best_fit_block];
-    size_t offset = block->offset;
-    block->offset = offset + size;
+    struct buffer_address addr = block->addr;
+    block->addr.offset += size;
     block->size -= size;
     if (block->size == 0) {
         // remove block if empty
@@ -206,31 +209,32 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
         }
     }
 
-    AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, offset, offset / alloc->max_chunk_size);
+    AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk);
 
 #ifdef GGML_ALLOCATOR_DEBUG
-    add_allocated_tensor(alloc, offset, tensor);
-    size_t cur_max = offset + size;
-    if (cur_max > alloc->max_size) {
-        // sort allocated_tensors by offset
+    add_allocated_tensor(alloc, addr, tensor);
+    size_t cur_max = addr.offset + size;
+    if (cur_max > alloc->max_size[addr.chunk]) {
+        // sort allocated_tensors by chunk/offset
         for (int i = 0; i < 1024; i++) {
             for (int j = i + 1; j < 1024; j++) {
-                if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
+                if (ggml_buffer_address_less(alloc->allocated_tensors[j].addr, alloc->allocated_tensors[i].addr)) {
                     const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
-                    size_t tmp_offset = alloc->allocated_tensors[i].offset;
+                    struct buffer_address tmp_addr = alloc->allocated_tensors[i].addr;
                     alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
-                    alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
+                    alloc->allocated_tensors[i].addr = alloc->allocated_tensors[j].addr;
                     alloc->allocated_tensors[j].tensor = tmp_tensor;
-                    alloc->allocated_tensors[j].offset = tmp_offset;
+                    alloc->allocated_tensors[j].addr = tmp_addr;
                 }
             }
         }
-        GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+        GGML_LOG_DEBUG("max_size[%d] = %.2f MB: tensors: ", addr.chunk, cur_max / 1024.0 / 1024.0);
         for (int i = 0; i < 1024; i++) {
             if (alloc->allocated_tensors[i].tensor) {
-                GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
-                    alloc->allocated_tensors[i].offset,
-                    alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
+                GGML_LOG_DEBUG("%s [%d: %zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+                    alloc->allocated_tensors[i].addr.chunk,
+                    alloc->allocated_tensors[i].addr.offset,
+                    alloc->allocated_tensors[i].addr.offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
                     ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
             }
         }
@@ -238,39 +242,37 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
     }
 #endif
 
-    alloc->max_size[alloc->n_chunks-1] = MAX(alloc->max_size[alloc->n_chunks-1], offset + size);
+    alloc->max_size[addr.chunk] = MAX(alloc->max_size[addr.chunk], addr.offset + size);
 
-    return offset;
+    return addr;
 
     GGML_UNUSED(tensor);
 }
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
+static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
     size = aligned_offset(NULL, size, alloc->alignment);
 
-    AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
+    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, addr.chunk, addr.offset, size, alloc->n_free_blocks);
 
 #ifdef GGML_ALLOCATOR_DEBUG
-    remove_allocated_tensor(alloc, offset, tensor);
+    remove_allocated_tensor(alloc, addr, tensor);
 #endif
-    size_t chunk = ggml_dyn_tallocr_chunk_index(alloc, offset);
 
     // see if we can merge with an existing block
     for (int i = 0; i < alloc->n_free_blocks; i++) {
         struct free_block * block = &alloc->free_blocks[i];
         // can only merge with blocks within the same chunk
-        size_t block_chunk = ggml_dyn_tallocr_chunk_index(alloc, block->offset);
-        if (chunk != block_chunk) {
+        if (addr.chunk != block->addr.chunk) {
             continue;
         }
         // check if ptr is at the end of the block
-        if (block->offset + block->size == offset) {
+        if (block->addr.offset + block->size == addr.offset) {
             block->size += size;
             // check if we can merge with the next block (within the same chunk)
             if (i < alloc->n_free_blocks - 1) {
                 struct free_block * next = &alloc->free_blocks[i+1];
-                if (block->offset + block->size == next->offset && block_chunk == ggml_dyn_tallocr_chunk_index(alloc, next->offset)) {
+                if (block->addr.offset + block->size == next->addr.offset && block->addr.chunk == next->addr.chunk) {
                     block->size += next->size;
                     alloc->n_free_blocks--;
                     for (int j = i+1; j < alloc->n_free_blocks; j++) {
@@ -281,13 +283,13 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t
             return;
         }
         // check if ptr is at the beginning of the block
-        if (offset + size == block->offset) {
-            block->offset = offset;
+        if (addr.offset + size == block->addr.offset) {
+            block->addr.offset = addr.offset;
             block->size += size;
             // check if we can merge with the previous block (within the same chunk)
             if (i > 0) {
                 struct free_block * prev = &alloc->free_blocks[i-1];
-                if (prev->offset + prev->size == block->offset && block_chunk == ggml_dyn_tallocr_chunk_index(alloc, prev->offset)) {
+                if (prev->addr.offset + prev->size == block->addr.offset && prev->addr.chunk == block->addr.chunk) {
                     prev->size += block->size;
                     alloc->n_free_blocks--;
                     for (int j = i; j < alloc->n_free_blocks; j++) {
@@ -302,7 +304,7 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t
     GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
     // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
     int insert_pos = 0;
-    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
+    while (insert_pos < alloc->n_free_blocks && ggml_buffer_address_less(alloc->free_blocks[insert_pos].addr, addr)) {
         insert_pos++;
     }
     // shift all blocks from insert_pos onward to make room for the new block
@@ -310,7 +312,7 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t
         alloc->free_blocks[i] = alloc->free_blocks[i-1];
     }
     // insert the new block
-    alloc->free_blocks[insert_pos].offset = offset;
+    alloc->free_blocks[insert_pos].addr = addr;
     alloc->free_blocks[insert_pos].size = size;
     alloc->n_free_blocks++;
 
@@ -320,7 +322,8 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t
 static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
     alloc->n_free_blocks = 1;
     alloc->n_chunks = 1;
-    alloc->free_blocks[0].offset = 0;
+    alloc->free_blocks[0].addr.chunk = 0;
+    alloc->free_blocks[0].addr.offset = 0;
     alloc->free_blocks[0].size = alloc->max_chunk_size;
     memset(alloc->max_size, 0, sizeof(alloc->max_size));
 
@@ -374,7 +377,6 @@ struct vbuffer {
 static struct vbuffer * ggml_vbuffer_new(ggml_backend_buffer_type_t buft) {
     struct vbuffer * buf = calloc(1, sizeof(struct vbuffer));
     buf->buft = buft;
-    memset(buf->chunks, 0, sizeof(buf->chunks));
     return buf;
 }
 
@@ -410,9 +412,6 @@ static size_t ggml_vbuffer_size(struct vbuffer * buf) {
 static bool ggml_vbuffer_alloc(struct vbuffer * buf, const struct ggml_dyn_tallocr * talloc, enum ggml_backend_buffer_usage usage) {
     for (int n = 0; n < talloc->n_chunks; n++) {
         size_t chunk_size = talloc->max_size[n];
-        if (n > 0) {
-            chunk_size -= talloc->max_size[n - 1];
-        }
         buf->chunks[n] = ggml_backend_buft_alloc_buffer(buf->buft, chunk_size);
         if (buf->chunks[n] == NULL) {
             ggml_vbuffer_free_chunks(buf);
@@ -423,20 +422,10 @@ static bool ggml_vbuffer_alloc(struct vbuffer * buf, const struct ggml_dyn_tallo
     return true;
 }
 
-static void ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct ggml_tensor * tensor, size_t offset) {
-    size_t chunk_index = 0, chunk_offset = offset;
-    while (chunk_index < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[chunk_index]) {
-        size_t chunk_size = ggml_backend_buffer_get_size(buf->chunks[chunk_index]);
-        if (chunk_offset < chunk_size) {
-            break;
-        }
-        chunk_offset -= chunk_size;
-        chunk_index++;
-    }
-
-    void * base = ggml_backend_buffer_get_base(buf->chunks[chunk_index]);
-    void * addr = (char *)base + chunk_offset;
-    ggml_backend_tensor_alloc(buf->chunks[chunk_index], tensor, addr);
+static void ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct ggml_tensor * tensor, struct buffer_address buf_addr) {
+    void * base = ggml_backend_buffer_get_base(buf->chunks[buf_addr.chunk]);
+    void * addr = (char *)base + buf_addr.offset;
+    ggml_backend_tensor_alloc(buf->chunks[buf_addr.chunk], tensor, addr);
 }
 
 static void ggml_vbuffer_reset(struct vbuffer * buf) {
@@ -446,7 +435,6 @@ static void ggml_vbuffer_reset(struct vbuffer * buf) {
 }
 
 
-
 /////////////////////////////////////
 
 // graph allocator
@@ -455,13 +443,13 @@ struct hash_node {
     int n_children;
     int n_views;
     int buffer_id;
-    size_t offset; // offset within the buffer
+    struct buffer_address addr;
     bool allocated;
 };
 
 struct tensor_alloc {
     int buffer_id;
-    size_t offset;
+    struct buffer_address addr;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };
 
@@ -595,7 +583,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
 
     if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
         hn->allocated = true;
-        assert(hn->offset == 0);
+        assert(hn->addr.offset == 0);
 
         // try to reuse a parent's buffer (inplace)
         if (ggml_op_can_inplace(node->op)) {
@@ -629,9 +617,9 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
                         struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
                         if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                             AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                            assert(view_src_hn->offset == p_hn->offset);
+                            assert(view_src_hn->addr.chunk == p_hn->addr.chunk && view_src_hn->addr.offset == p_hn->addr.offset);
                             hn->buffer_id = p_hn->buffer_id;
-                            hn->offset = p_hn->offset;
+                            hn->addr = p_hn->addr;
                             p_hn->allocated = false; // avoid freeing the parent
                             view_src_hn->allocated = false;
                             return;
@@ -639,7 +627,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
                     } else {
                         AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                         hn->buffer_id = p_hn->buffer_id;
-                        hn->offset = p_hn->offset;
+                        hn->addr = p_hn->addr;
                         p_hn->allocated = false; // avoid freeing the parent
                         return;
                     }
@@ -650,9 +638,8 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
         struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
         ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
         size_t size = ggml_backend_buft_get_alloc_size(buft, node);
-        size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
         hn->buffer_id = buffer_id;
-        hn->offset = offset;
+        hn->addr = ggml_dyn_tallocr_alloc(alloc, size, node);
     }
 }
 
@@ -664,12 +651,11 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
     }
 
     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-    size_t offset = hn->offset;
     int buffer_id = hn->buffer_id;
     struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
     ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     size_t size = ggml_backend_buft_get_alloc_size(buft, node);
-    ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
+    ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
     hn->allocated = false;
 }
 
@@ -820,24 +806,24 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
         if (node->view_src || node->data) {
             node_alloc->dst.buffer_id = -1;
-            node_alloc->dst.offset = SIZE_MAX;
+            node_alloc->dst.addr = GGML_BUFFER_ADDRESS_INVALID;
             node_alloc->dst.size_max = 0;
         } else {
             struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
             node_alloc->dst.buffer_id = hn->buffer_id;
-            node_alloc->dst.offset    = hn->offset;
+            node_alloc->dst.addr = hn->addr;
             node_alloc->dst.size_max  = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (!src || src->view_src || src->data) {
                 node_alloc->src[j].buffer_id = -1;
-                node_alloc->src[j].offset = SIZE_MAX;
+                node_alloc->src[j].addr = GGML_BUFFER_ADDRESS_INVALID;
                 node_alloc->src[j].size_max = 0;
             } else {
                 struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
                 node_alloc->src[j].buffer_id = hn->buffer_id;
-                node_alloc->src[j].offset   = hn->offset;
+                node_alloc->src[j].addr = hn->addr;
                 node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
             }
         }
@@ -853,11 +839,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
         if (leaf->view_src || leaf->data) {
             galloc->leaf_allocs[i].leaf.buffer_id = -1;
-            galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
+            galloc->leaf_allocs[i].leaf.addr = GGML_BUFFER_ADDRESS_INVALID;
             galloc->leaf_allocs[i].leaf.size_max = 0;
         } else {
             galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
-            galloc->leaf_allocs[i].leaf.offset = hn->offset;
+            galloc->leaf_allocs[i].leaf.addr = hn->addr;
             galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
         }
     }
@@ -902,7 +888,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
 
     if (tensor->view_src != NULL) {
         if (tensor->buffer == NULL) {
-            assert(tensor_alloc->offset == SIZE_MAX);
+            assert(tensor_alloc->addr.offset == SIZE_MAX);
             if (tensor->view_src->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
@@ -911,9 +897,9 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
         }
     } else {
         if (tensor->data == NULL) {
-            assert(tensor_alloc->offset != SIZE_MAX);
+            assert(tensor_alloc->addr.offset != SIZE_MAX);
             assert(ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
-            ggml_vbuffer_tensor_alloc(galloc->buffers[buffer_id], tensor, tensor_alloc->offset);
+            ggml_vbuffer_tensor_alloc(galloc->buffers[buffer_id], tensor, tensor_alloc->addr);
         } else {
             if (tensor->buffer == NULL) {
                 // this tensor was allocated without ggml-backend

From 059afdb6d44f2c0e2f2300614bec84e7c0e44562 Mon Sep 17 00:00:00 2001
From: Acly <aclysia@gmail.com>
Date: Wed, 17 Sep 2025 15:25:30 +0200
Subject: [PATCH 05/11] fix dyn_tallocr_max_size and initialization

---
 ggml/src/ggml-alloc.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index bc835790ae86b..28c750ea5a589 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -345,7 +345,7 @@ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t m
         /*.alignment       = */ alignment,
         /*.n_free_blocks   = */ 0,
         /*.n_chunks        = */ 0,
-        /*.free_blocks     = */ {{0}},
+        /*.free_blocks     = */ {{{0}, 0}},
         /*.max_size        = */ {0},
         /*.max_chunk_size  = */ max_buffer_size,
 #ifdef GGML_ALLOCATOR_DEBUG
@@ -363,7 +363,11 @@ static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
 }
 
 static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
-    return alloc->max_size[alloc->n_chunks - 1];
+    size_t max_size = 0;
+    for (int i = 0; i < alloc->n_chunks; i++) {
+        max_size += alloc->max_size[i];
+    }
+    return max_size;
 }
 
 

From 7b0d76bfaf4038ead6ec4194261c13e39bb40560 Mon Sep 17 00:00:00 2001
From: Acly <aclysia@gmail.com>
Date: Sat, 20 Sep 2025 12:04:03 +0200
Subject: [PATCH 06/11] fix memory leak when buffers are reused due to same
 buffer type appearing multiple times

* make vbuffer allocation follow the same logic as backend_buffer did before
---
 ggml/src/ggml-alloc.c | 44 +++++++++++++++++++------------------------
 1 file changed, 19 insertions(+), 25 deletions(-)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 28c750ea5a589..e630be6164daf 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -374,28 +374,16 @@ static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
 // virtual buffer with contiguous memory range, split into multiple backend buffers (chunks)
 
 struct vbuffer {
-    ggml_backend_buffer_type_t buft;
     ggml_backend_buffer_t chunks[GGML_VBUFFER_MAX_CHUNKS];
 };
 
-static struct vbuffer * ggml_vbuffer_new(ggml_backend_buffer_type_t buft) {
-    struct vbuffer * buf = calloc(1, sizeof(struct vbuffer));
-    buf->buft = buft;
-    return buf;
-}
-
-static void ggml_vbuffer_free_chunks(struct vbuffer * buf) {
-    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; ++i) {
-        ggml_backend_buffer_free(buf->chunks[i]);
-        buf->chunks[i] = NULL;
-    }
-}
-
 static void ggml_vbuffer_free(struct vbuffer * buf) {
     if (buf == NULL) {
         return;
     }
-    ggml_vbuffer_free_chunks(buf);
+    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; ++i) {
+        ggml_backend_buffer_free(buf->chunks[i]);
+    }
     free(buf);
 }
 
@@ -413,17 +401,22 @@ static size_t ggml_vbuffer_size(struct vbuffer * buf) {
     return size;
 }
 
-static bool ggml_vbuffer_alloc(struct vbuffer * buf, const struct ggml_dyn_tallocr * talloc, enum ggml_backend_buffer_usage usage) {
+static struct vbuffer * ggml_vbuffer_alloc(ggml_backend_buffer_type_t buft, const struct ggml_dyn_tallocr * talloc, enum ggml_backend_buffer_usage usage) {
+    struct vbuffer * buf = (struct vbuffer *)calloc(1, sizeof(struct vbuffer));
+    if (buf == NULL) {
+        return NULL;
+    }
+
     for (int n = 0; n < talloc->n_chunks; n++) {
         size_t chunk_size = talloc->max_size[n];
-        buf->chunks[n] = ggml_backend_buft_alloc_buffer(buf->buft, chunk_size);
+        buf->chunks[n] = ggml_backend_buft_alloc_buffer(buft, chunk_size);
         if (buf->chunks[n] == NULL) {
-            ggml_vbuffer_free_chunks(buf);
-            return false;
+            ggml_vbuffer_free(buf);
+            return NULL;
         }
         ggml_backend_buffer_set_usage(buf->chunks[n], usage);
     }
-    return true;
+    return buf;
 }
 
 static void ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct ggml_tensor * tensor, struct buffer_address buf_addr) {
@@ -497,7 +490,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
 
     for (int i = 0; i < n_bufs; i++) {
         galloc->bufts[i] = bufts[i];
-        galloc->buffers[i] = ggml_vbuffer_new(bufts[i]);
+        galloc->buffers[i] = NULL;
 
         // check if the same buffer type is used multiple times and reuse the same allocator
         for (int j = 0; j < i; j++) {
@@ -862,17 +855,18 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             }
         }
 
-        size_t cur_size = ggml_vbuffer_size(galloc->buffers[i]);
+        size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
 
         // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
-        if (new_size > cur_size || ggml_vbuffer_n_chunks(galloc->buffers[i]) == 0) {
+        if (new_size > cur_size || galloc->buffers[i] == NULL) {
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
 
-            ggml_vbuffer_free_chunks(galloc->buffers[i]);
-            if (!ggml_vbuffer_alloc(galloc->buffers[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE)) {
+            ggml_vbuffer_free(galloc->buffers[i]);
+            galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+            if (galloc->buffers[i] == NULL) {
                 GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                 return false;
             }

From 57381c5110d5c19a71010074f81e63bb93a4410e Mon Sep 17 00:00:00 2001
From: Acly <aclysia@gmail.com>
Date: Sat, 20 Sep 2025 12:42:08 +0200
Subject: [PATCH 07/11] continue to use leftover unallocated space of previous
 chunks after a new one has been created

---
 ggml/src/ggml-alloc.c |  6 ++++--
 tests/test-alloc.cpp  | 24 ++++++++++++++++++++++--
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index e630be6164daf..002b6a147639b 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -185,11 +185,13 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
         // the last block represents memory still available in an existing chunk
         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
         max_avail = MAX(max_avail, block->size);
-        best_fit_block = alloc->n_free_blocks - 1;
         if (block->size < size) {
             // not enough space in existing chunk, start the next one
-            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[best_fit_block], size);
+            GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
+            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[alloc->n_free_blocks], size);
+            alloc->n_free_blocks++;
         }
+        best_fit_block = alloc->n_free_blocks - 1;
     }
 
     struct free_block * block = &alloc->free_blocks[best_fit_block];
diff --git a/tests/test-alloc.cpp b/tests/test-alloc.cpp
index 48a93eea13ca7..96c0ecf179cf1 100644
--- a/tests/test-alloc.cpp
+++ b/tests/test-alloc.cpp
@@ -319,6 +319,25 @@ static void test_tensor_larger_than_max_size() {
     GGML_ASSERT(backend.context->allocated_total() == 24);
 }
 
+// Fill up leftover unallocated space of a chunk after allocating a large tensor that
+// requires a new chunk.
+static void test_fill_leftover_space() {
+    dummy_backend backend      = dummy_backend_init(16);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[4];
+    x[0] = make_input_with_size(ctx, 8);
+    x[1] = ggml_pad(ctx, x[0], 2, 0, 0, 0);
+    x[3] = ggml_mean(ctx, x[1]);
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[3], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    check_max_size(ctx);
+    GGML_ASSERT(backend.context->allocated_total() <= 12 + 16);
+}
+
 // Check that views don't require any extra memory
 static void test_view_inplace() {
     dummy_backend backend      = dummy_backend_init(32);
@@ -473,8 +492,8 @@ static void test_buffer_size_zero() {
 
     ggml_backend_buffer_type_t bufts[2] = { &backend_a.buffer_type, &backend_b.buffer_type };
     ggml_gallocr_ptr           galloc   = ggml_gallocr_ptr(ggml_gallocr_new_n(bufts, 2));
-    bool res1 = ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
-    bool res2 = ggml_gallocr_alloc_graph(galloc.get(), graph);
+    bool                       res1     = ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
+    bool                       res2     = ggml_gallocr_alloc_graph(galloc.get(), graph);
     GGML_ASSERT(res1 && res2);
 
     check_all_allocated(graph);
@@ -493,6 +512,7 @@ int main() {
     run("test_max_size_too_many_tensors", test_max_size_too_many_tensors);
     run("test_max_size_tensor_too_large", test_max_size_tensor_too_large);
     run("test_tensor_larger_than_max_size", test_tensor_larger_than_max_size);
+    run("test_fill_leftover_space", test_fill_leftover_space);
     run("test_view_inplace", test_view_inplace);
     run("test_reuse_and_free", test_reuse_and_free);
     run("test_merge_free_block(32)", []() { test_merge_free_block(32); });

From 29087f09083c662f98bef92685295d7c7c6c4be0 Mon Sep 17 00:00:00 2001
From: Acly <aclysia@gmail.com>
Date: Wed, 24 Sep 2025 10:22:04 +0200
Subject: [PATCH 08/11] treat free blocks of each chunk as separate list

* they're still allocated together, but start/end of each chunk is tracked,
  and allocate/free iterate over sub-ranges
* exhaust freed blocks of all chunks before considering their last blocks
  with unallocated space
* start with 0 chunks/blocks and create chunks as needed
* allow the last chunk to grow beyond max size

---
 ggml/src/ggml-alloc.c | 182 +++++++++++++++++++++++++-----------------
 tests/test-alloc.cpp  |  57 ++++++++++++-
 2 files changed, 160 insertions(+), 79 deletions(-)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 002b6a147639b..8bb33c218fe57 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -116,8 +116,8 @@ struct free_block {
 
 struct ggml_dyn_tallocr {
     size_t alignment;
-    int n_free_blocks;
     int n_chunks;
+    int free_blocks_begin[GGML_VBUFFER_MAX_CHUNKS + 1]; // end[chunk] == begin[chunk+1]
     struct free_block free_blocks[MAX_FREE_BLOCKS];
     size_t max_size[GGML_VBUFFER_MAX_CHUNKS];
     size_t max_chunk_size;
@@ -130,14 +130,31 @@ struct ggml_dyn_tallocr {
 #endif
 };
 
-// allocations are split into n chunks of size max_size[i]. tensor allocations may not cross chunk boundaries.
-static void ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, struct free_block * block, size_t min_size) {
-    GGML_ASSERT(alloc->n_chunks >= 1);
-    block->addr.chunk = alloc->n_chunks;
-    block->addr.offset = 0;
-    block->size = MAX(min_size, alloc->max_chunk_size);
-    alloc->n_chunks++;
-    GGML_ASSERT(alloc->n_chunks <= GGML_VBUFFER_MAX_CHUNKS);
+struct free_block_range {
+    int begin;
+    int end;
+    int size;
+};
+
+static struct free_block_range ggml_dyn_tallocr_free_block_range(const struct ggml_dyn_tallocr * alloc, int chunk) {
+    struct free_block_range range;
+    range.begin = alloc->free_blocks_begin[chunk];
+    range.end   = alloc->free_blocks_begin[chunk + 1];
+    range.size  = range.end - range.begin;
+    return range;
+}
+
+void ggml_dyn_tallocr_remove_block(struct ggml_dyn_tallocr * alloc, int idx) {
+    int chunk = alloc->free_blocks[idx].addr.chunk;
+    // shift all elements after idx by 1 to the left, overwriting the element at idx
+    int n_free_blocks = alloc->free_blocks_begin[alloc->n_chunks];
+    for (int i = idx; i < n_free_blocks; i++) {
+        alloc->free_blocks[i] = alloc->free_blocks[i + 1];
+    }
+    // adjust first element index of all chunks after the current one
+    for (int c = chunk + 1; c < alloc->n_chunks + 1; c++) {
+        alloc->free_blocks_begin[c]--;
+    }
 }
 
 #ifdef GGML_ALLOCATOR_DEBUG
@@ -167,31 +184,62 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
 
+    int best_fit_block = -1;
     size_t max_avail = 0;
 
     // find the best fitting free block besides the last block
-    int best_fit_block = -1;
-    size_t best_fit_size = SIZE_MAX;
-    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
-        struct free_block * block = &alloc->free_blocks[i];
-        max_avail = MAX(max_avail, block->size);
-        if (block->size >= size && block->size <= best_fit_size) {
-            best_fit_block = i;
-            best_fit_size = block->size;
+    for (int c = 0; c < alloc->n_chunks; ++c) {
+        struct free_block_range blocks = ggml_dyn_tallocr_free_block_range(alloc, c);
+        size_t best_fit_size = SIZE_MAX;
+        for (int i = blocks.begin; i < blocks.end - 1; i++) {
+            struct free_block * block = &alloc->free_blocks[i];
+            max_avail = MAX(max_avail, block->size);
+            if (block->size >= size && block->size <= best_fit_size) {
+                best_fit_block = i;
+                best_fit_size = block->size;
+            }
+        }
+    }
+
+    if (best_fit_block == -1) {
+        // no suitable block found, try the last block (ie. growing a chunks size)
+        for (int c = 0; c < alloc->n_chunks; ++c) {
+            struct free_block_range blocks = ggml_dyn_tallocr_free_block_range(alloc, c);
+            if (blocks.size > 0) {
+                struct free_block * block = &alloc->free_blocks[blocks.end - 1];
+                max_avail = MAX(max_avail, block->size);
+                if (block->size >= size) {
+                    best_fit_block = blocks.end - 1;
+                    break;
+                }
+            }
         }
     }
 
     if (best_fit_block == -1) {
-        // the last block represents memory still available in an existing chunk
-        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
-        max_avail = MAX(max_avail, block->size);
-        if (block->size < size) {
-            // not enough space in existing chunk, start the next one
-            GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
-            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[alloc->n_free_blocks], size);
-            alloc->n_free_blocks++;
+        // none of the existing chunks have enough space left
+        if (alloc->n_chunks < GGML_VBUFFER_MAX_CHUNKS) {
+            // add a new chunk by creating a block of unclaimed space after the last chunk
+            int i = alloc->free_blocks_begin[alloc->n_chunks];
+            alloc->free_blocks[i].addr.chunk = alloc->n_chunks;
+            alloc->free_blocks[i].addr.offset = 0;
+            // available space in a chunk is limited to max_chunk_size, but can be higher if:
+            // 1. a single tensor exceeds the maximum, and cannot fit any other way
+            // 2. we are running out of chunks
+            // backends will either manage to allocate the larger size, or report an error.
+            alloc->free_blocks[i].size = MAX(size, alloc->max_chunk_size);
+            if (alloc->n_chunks == GGML_VBUFFER_MAX_CHUNKS - 1) {
+                alloc->free_blocks[i].size = SIZE_MAX/2;
+            }
+            alloc->free_blocks_begin[alloc->n_chunks + 1] = i + 1;
+            alloc->n_chunks++;
+            best_fit_block = i;
+        } else {
+            // since the last chunk always has virtually endless memory, this should never happen
+            GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+                __func__, size, max_avail);
+            GGML_ABORT("graph allocation: failed to reserve memory");
         }
-        best_fit_block = alloc->n_free_blocks - 1;
     }
 
     struct free_block * block = &alloc->free_blocks[best_fit_block];
@@ -200,15 +248,7 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
     block->size -= size;
     if (block->size == 0) {
         // remove block if empty
-        alloc->n_free_blocks--;
-        for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
-            alloc->free_blocks[j] = alloc->free_blocks[j+1];
-        }
-        // if there are no remaining blocks all memory in current chunk was used up -> start the next one
-        if (alloc->n_free_blocks == 0) {
-            alloc->n_free_blocks = 1;
-            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[0], 0);
-        }
+        ggml_dyn_tallocr_remove_block(alloc, best_fit_block);
     }
 
     AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk);
@@ -255,31 +295,27 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
 static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
     size = aligned_offset(NULL, size, alloc->alignment);
 
-    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, addr.chunk, addr.offset, size, alloc->n_free_blocks);
+    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
+        __func__, tensor->name, addr.chunk, addr.offset, size, alloc->free_blocks_begin[alloc->n_chunks]);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, addr, tensor);
 #endif
 
+    struct free_block_range blocks = ggml_dyn_tallocr_free_block_range(alloc, addr.chunk);
+
     // see if we can merge with an existing block
-    for (int i = 0; i < alloc->n_free_blocks; i++) {
+    for (int i = blocks.begin; i < blocks.end; i++) {
         struct free_block * block = &alloc->free_blocks[i];
-        // can only merge with blocks within the same chunk
-        if (addr.chunk != block->addr.chunk) {
-            continue;
-        }
         // check if ptr is at the end of the block
         if (block->addr.offset + block->size == addr.offset) {
             block->size += size;
-            // check if we can merge with the next block (within the same chunk)
-            if (i < alloc->n_free_blocks - 1) {
+            // check if we can merge with the next block
+            if (i < blocks.end - 1) {
                 struct free_block * next = &alloc->free_blocks[i+1];
-                if (block->addr.offset + block->size == next->addr.offset && block->addr.chunk == next->addr.chunk) {
+                if (block->addr.offset + block->size == next->addr.offset) {
                     block->size += next->size;
-                    alloc->n_free_blocks--;
-                    for (int j = i+1; j < alloc->n_free_blocks; j++) {
-                        alloc->free_blocks[j] = alloc->free_blocks[j+1];
-                    }
+                    ggml_dyn_tallocr_remove_block(alloc, i+1);
                 }
             }
             return;
@@ -288,50 +324,46 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct
         if (addr.offset + size == block->addr.offset) {
             block->addr.offset = addr.offset;
             block->size += size;
-            // check if we can merge with the previous block (within the same chunk)
-            if (i > 0) {
+            // check if we can merge with the previous block
+            if (i > blocks.begin) {
                 struct free_block * prev = &alloc->free_blocks[i-1];
-                if (prev->addr.offset + prev->size == block->addr.offset && prev->addr.chunk == block->addr.chunk) {
+                if (prev->addr.offset + prev->size == block->addr.offset) {
                     prev->size += block->size;
-                    alloc->n_free_blocks--;
-                    for (int j = i; j < alloc->n_free_blocks; j++) {
-                        alloc->free_blocks[j] = alloc->free_blocks[j+1];
-                    }
+                    ggml_dyn_tallocr_remove_block(alloc, i);
                 }
             }
             return;
         }
     }
     // otherwise, add a new block
-    GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
+    int n_free_blocks = alloc->free_blocks_begin[alloc->n_chunks];
+    GGML_ASSERT(n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
     // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
-    int insert_pos = 0;
-    while (insert_pos < alloc->n_free_blocks && ggml_buffer_address_less(alloc->free_blocks[insert_pos].addr, addr)) {
+    int insert_pos = blocks.begin;
+    while (insert_pos < blocks.end && alloc->free_blocks[insert_pos].addr.offset < addr.offset) {
         insert_pos++;
     }
     // shift all blocks from insert_pos onward to make room for the new block
-    for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
+    for (int i = n_free_blocks; i > insert_pos; i--) {
         alloc->free_blocks[i] = alloc->free_blocks[i-1];
     }
     // insert the new block
     alloc->free_blocks[insert_pos].addr = addr;
     alloc->free_blocks[insert_pos].size = size;
-    alloc->n_free_blocks++;
+    for (int c = addr.chunk + 1; c < alloc->n_chunks + 1; c++) {
+        alloc->free_blocks_begin[c]++;
+    }
 
     GGML_UNUSED(tensor);
 }
 
 static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
-    alloc->n_free_blocks = 1;
-    alloc->n_chunks = 1;
-    alloc->free_blocks[0].addr.chunk = 0;
-    alloc->free_blocks[0].addr.offset = 0;
-    alloc->free_blocks[0].size = alloc->max_chunk_size;
-    memset(alloc->max_size, 0, sizeof(alloc->max_size));
-
-    if (alloc->free_blocks[0].size == SIZE_MAX) {
-        alloc->free_blocks[0].size = SIZE_MAX/2; // avoid overflows
+    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; i++) {
+        alloc->free_blocks_begin[i] = 0;
+        alloc->max_size[i] = 0;
     }
+    alloc->free_blocks_begin[GGML_VBUFFER_MAX_CHUNKS] = 0;
+    alloc->n_chunks = 0;
 
 #ifdef GGML_ALLOCATOR_DEBUG
     for (int i = 0; i < 1024; i++) {
@@ -344,12 +376,12 @@ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t m
     struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
 
     *alloc = (struct ggml_dyn_tallocr) {
-        /*.alignment       = */ alignment,
-        /*.n_free_blocks   = */ 0,
-        /*.n_chunks        = */ 0,
-        /*.free_blocks     = */ {{{0}, 0}},
-        /*.max_size        = */ {0},
-        /*.max_chunk_size  = */ max_buffer_size,
+        /*.alignment         = */ alignment,
+        /*.n_chunks          = */ 0,
+        /*.free_blocks_begin = */ {0},
+        /*.free_blocks       = */ {{{0}, 0}},
+        /*.max_size          = */ {0},
+        /*.max_chunk_size    = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ {{0}},
 #endif
diff --git a/tests/test-alloc.cpp b/tests/test-alloc.cpp
index 96c0ecf179cf1..2eb7724731acc 100644
--- a/tests/test-alloc.cpp
+++ b/tests/test-alloc.cpp
@@ -16,6 +16,7 @@ uint8_t * const alloc_base = (uint8_t *) 16;
 
 struct dummy_backend_context {
     size_t max_buffer_size = 64;
+    size_t alignment       = 8;
 
     ggml_backend_buffer_i              buffer_interface;
     std::vector<ggml_backend_buffer_t> buffers;
@@ -42,8 +43,9 @@ static ggml_backend_buffer_t dummy_backend_buffer_type_alloc_buffer(ggml_backend
     return buffer;
 }
 
-static size_t dummy_backend_buffer_type_get_alignment(ggml_backend_buffer_type_t) {
-    return 8;
+static size_t dummy_backend_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    dummy_backend_context * ctx = (dummy_backend_context *) buft->context;
+    return ctx->alignment;
 }
 
 static size_t dummy_backend_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
@@ -88,9 +90,10 @@ struct dummy_backend {
     ggml_backend_buffer_type               buffer_type;
 };
 
-static dummy_backend dummy_backend_init(size_t max_buffer_size) {
+static dummy_backend dummy_backend_init(size_t max_buffer_size, size_t alignment = 8) {
     dummy_backend b{};
     b.context                  = std::make_unique<dummy_backend_context>();
+    b.context->alignment       = alignment;
     b.context->max_buffer_size = max_buffer_size;
 
     b.context->buffer_interface.free_buffer   = dummy_backend_buffer_free_buffer;
@@ -121,7 +124,7 @@ struct test_context_with_graph {
 
 static test_context_with_graph make_context() {
     ggml_init_params params{};
-    params.mem_size = 32 * ggml_tensor_overhead() + ggml_graph_overhead();
+    params.mem_size = 48 * ggml_tensor_overhead() + ggml_graph_overhead();
     params.no_alloc = true;
 
     ggml_context *   ctx     = ggml_init(params);
@@ -319,6 +322,32 @@ static void test_tensor_larger_than_max_size() {
     GGML_ASSERT(backend.context->allocated_total() == 24);
 }
 
+// This test assumes the allocator is limited to 16 chunks (GGML_VBUFFER_MAX_CHUNKS) and tries
+// to allocate tensors that would require more. The expectation is that the last buffer grows
+// to fit everything, leaving it to the backend to error out if it can't allocate that much.
+static void test_not_enough_chunks() {
+    const int max_chunks = 16;
+    const int max_size   = 8;
+
+    dummy_backend backend      = dummy_backend_init(max_size);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[max_chunks + 1];
+    for (int i = 0; i < max_chunks + 1; ++i) {
+        x[i] = make_input_with_size(ctx, max_size);
+    }
+    ggml_tensor * acc = x[0];
+    for (int i = 0; i < max_chunks; ++i) {
+        acc = ggml_add(ctx, acc, x[i + 1]);
+    }
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, acc, &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    GGML_ASSERT(backend.context->allocated_total() > max_chunks * max_size);
+}
+
 // Fill up leftover unallocated space of a chunk after allocating a large tensor that
 // requires a new chunk.
 static void test_fill_leftover_space() {
@@ -405,6 +434,24 @@ static void test_merge_free_block(size_t max_buffer_size) {
     GGML_ASSERT(backend.context->allocated_total() <= 32 + 32 + 24);
 }
 
+// Check that previously allocated but since-freed memory is preferred over allocating
+// additional memory, even if the remaining space in a chunk would match the tensor size better.
+static void test_prefer_already_allocated_memory() {
+    dummy_backend backend      = dummy_backend_init(32, /*align*/ 4);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[3];
+    x[0] = make_input_with_size(ctx, 24);  // [24b][8b unused]
+    x[1] = ggml_mean(ctx, x[0]);           // [24b free][4b][4b unused]
+    x[2] = ggml_mean(ctx, x[1]);           // should be allocated in the 24b block
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[2], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    GGML_ASSERT(backend.context->allocated_total() <= 28);
+}
+
 // test for allocating on multiple devices with some tensors in the graph
 // allocated externally (not by gallocr).
 static void test_multiple_buffer_types() {
@@ -512,11 +559,13 @@ int main() {
     run("test_max_size_too_many_tensors", test_max_size_too_many_tensors);
     run("test_max_size_tensor_too_large", test_max_size_tensor_too_large);
     run("test_tensor_larger_than_max_size", test_tensor_larger_than_max_size);
+    run("test_not_enough_chunks", test_not_enough_chunks);
     run("test_fill_leftover_space", test_fill_leftover_space);
     run("test_view_inplace", test_view_inplace);
     run("test_reuse_and_free", test_reuse_and_free);
     run("test_merge_free_block(32)", []() { test_merge_free_block(32); });
     run("test_merge_free_block(SIZE_MAX)", []() { test_merge_free_block(SIZE_MAX); });
+    run("test_prefer_already_allocated_memory", test_prefer_already_allocated_memory);
     run("test_multiple_buffer_types", test_multiple_buffer_types);
     run("test_buffer_size_zero", test_buffer_size_zero);
     return 0;

From 5a916c72e4c9032705cceb9f645691f415f5fdd5 Mon Sep 17 00:00:00 2001
From: Acly <aclysia@gmail.com>
Date: Wed, 24 Sep 2025 10:48:29 +0200
Subject: [PATCH 09/11] refactor: move adding new free block and new chunk into
 separate functions

---
 ggml/src/ggml-alloc.c | 93 ++++++++++++++++++++++++-------------------
 1 file changed, 52 insertions(+), 41 deletions(-)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 8bb33c218fe57..36ad8b5c07662 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -144,6 +144,27 @@ static struct free_block_range ggml_dyn_tallocr_free_block_range(const struct gg
     return range;
 }
 
+void ggml_dyn_tallocr_insert_block(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size) {
+    int total_blocks = alloc->free_blocks_begin[alloc->n_chunks];
+    GGML_ASSERT(total_blocks < MAX_FREE_BLOCKS && "out of free blocks");
+    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
+    int insert_pos = alloc->free_blocks_begin[addr.chunk];
+    int blocks_end = alloc->free_blocks_begin[addr.chunk + 1];
+    while (insert_pos < blocks_end && alloc->free_blocks[insert_pos].addr.offset < addr.offset) {
+        insert_pos++;
+    }
+    // shift all blocks from insert_pos onward to make room for the new block
+    for (int i = total_blocks; i > insert_pos; i--) {
+        alloc->free_blocks[i] = alloc->free_blocks[i-1];
+    }
+    // insert the new block
+    alloc->free_blocks[insert_pos].addr = addr;
+    alloc->free_blocks[insert_pos].size = size;
+    for (int c = addr.chunk + 1; c < alloc->n_chunks + 1; ++c) {
+        alloc->free_blocks_begin[c]++;
+    }
+}
+
 void ggml_dyn_tallocr_remove_block(struct ggml_dyn_tallocr * alloc, int idx) {
     int chunk = alloc->free_blocks[idx].addr.chunk;
     // shift all elements after idx by 1 to the left, overwriting the element at idx
@@ -157,6 +178,27 @@ void ggml_dyn_tallocr_remove_block(struct ggml_dyn_tallocr * alloc, int idx) {
     }
 }
 
+// add a new chunk by creating a block of unclaimed space after the last chunk
+int ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, size_t min_size) {
+    if (alloc->n_chunks >= GGML_VBUFFER_MAX_CHUNKS) {
+        return -1;
+    }
+    int i = alloc->free_blocks_begin[alloc->n_chunks];
+    alloc->free_blocks[i].addr.chunk = alloc->n_chunks;
+    alloc->free_blocks[i].addr.offset = 0;
+    // available space in a chunk is limited to max_chunk_size, but can be higher if:
+    // 1. a single tensor exceeds the maximum, and cannot fit any other way
+    // 2. we are running out of chunks
+    // backends will either manage to allocate the larger size, or report an error.
+    alloc->free_blocks[i].size = MAX(min_size, alloc->max_chunk_size);
+    if (alloc->n_chunks == GGML_VBUFFER_MAX_CHUNKS - 1) {
+        alloc->free_blocks[i].size = SIZE_MAX/2;
+    }
+    alloc->free_blocks_begin[alloc->n_chunks + 1] = i + 1;
+    alloc->n_chunks++;
+    return i;
+}
+
 #ifdef GGML_ALLOCATOR_DEBUG
 static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
@@ -187,7 +229,7 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
     int best_fit_block = -1;
     size_t max_avail = 0;
 
-    // find the best fitting free block besides the last block
+    // find the best fitting free block in any chunk besides the last block
     for (int c = 0; c < alloc->n_chunks; ++c) {
         struct free_block_range blocks = ggml_dyn_tallocr_free_block_range(alloc, c);
         size_t best_fit_size = SIZE_MAX;
@@ -202,7 +244,7 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
     }
 
     if (best_fit_block == -1) {
-        // no suitable block found, try the last block (ie. growing a chunks size)
+        // no suitable block found, try the last block (this will grow a chunks size)
         for (int c = 0; c < alloc->n_chunks; ++c) {
             struct free_block_range blocks = ggml_dyn_tallocr_free_block_range(alloc, c);
             if (blocks.size > 0) {
@@ -218,28 +260,13 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
 
     if (best_fit_block == -1) {
         // none of the existing chunks have enough space left
-        if (alloc->n_chunks < GGML_VBUFFER_MAX_CHUNKS) {
-            // add a new chunk by creating a block of unclaimed space after the last chunk
-            int i = alloc->free_blocks_begin[alloc->n_chunks];
-            alloc->free_blocks[i].addr.chunk = alloc->n_chunks;
-            alloc->free_blocks[i].addr.offset = 0;
-            // available space in a chunk is limited to max_chunk_size, but can be higher if:
-            // 1. a single tensor exceeds the maximum, and cannot fit any other way
-            // 2. we are running out of chunks
-            // backends will either manage to allocate the larger size, or report an error.
-            alloc->free_blocks[i].size = MAX(size, alloc->max_chunk_size);
-            if (alloc->n_chunks == GGML_VBUFFER_MAX_CHUNKS - 1) {
-                alloc->free_blocks[i].size = SIZE_MAX/2;
-            }
-            alloc->free_blocks_begin[alloc->n_chunks + 1] = i + 1;
-            alloc->n_chunks++;
-            best_fit_block = i;
-        } else {
-            // since the last chunk always has virtually endless memory, this should never happen
-            GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
-                __func__, size, max_avail);
-            GGML_ABORT("graph allocation: failed to reserve memory");
-        }
+        best_fit_block = ggml_dyn_tallocr_new_chunk(alloc, size);
+    }
+    if (best_fit_block == -1) {
+        // since the last chunk always has virtually endless memory, this should never happen
+        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+            __func__, size, max_avail);
+        GGML_ABORT("graph allocation: failed to reserve memory");
     }
 
     struct free_block * block = &alloc->free_blocks[best_fit_block];
@@ -336,23 +363,7 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct
         }
     }
     // otherwise, add a new block
-    int n_free_blocks = alloc->free_blocks_begin[alloc->n_chunks];
-    GGML_ASSERT(n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
-    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
-    int insert_pos = blocks.begin;
-    while (insert_pos < blocks.end && alloc->free_blocks[insert_pos].addr.offset < addr.offset) {
-        insert_pos++;
-    }
-    // shift all blocks from insert_pos onward to make room for the new block
-    for (int i = n_free_blocks; i > insert_pos; i--) {
-        alloc->free_blocks[i] = alloc->free_blocks[i-1];
-    }
-    // insert the new block
-    alloc->free_blocks[insert_pos].addr = addr;
-    alloc->free_blocks[insert_pos].size = size;
-    for (int c = addr.chunk + 1; c < alloc->n_chunks + 1; c++) {
-        alloc->free_blocks_begin[c]++;
-    }
+    ggml_dyn_tallocr_insert_block(alloc, addr, size);
 
     GGML_UNUSED(tensor);
 }

From 69964e02124fc323ae01ce7e05e8e5582236488b Mon Sep 17 00:00:00 2001
From: Acly <aclysia@gmail.com>
Date: Wed, 24 Sep 2025 12:27:19 +0200
Subject: [PATCH 10/11] allocate chunks individually with a separate
 free-blocks list for each one

* needs a bit more memory, allocations and indirections, but the code is simpler
---
 ggml/src/ggml-alloc.c | 161 +++++++++++++++++++-----------------------
 1 file changed, 73 insertions(+), 88 deletions(-)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 36ad8b5c07662..2f61f7b483d37 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -110,17 +110,21 @@ static bool ggml_buffer_address_less(struct buffer_address a, struct buffer_addr
 }
 
 struct free_block {
-    struct buffer_address addr;
+    size_t offset;
     size_t size;
 };
 
+struct tallocr_chunk {
+    struct free_block free_blocks[MAX_FREE_BLOCKS];
+    int n_free_blocks;
+    size_t max_size;
+};
+
 struct ggml_dyn_tallocr {
     size_t alignment;
-    int n_chunks;
-    int free_blocks_begin[GGML_VBUFFER_MAX_CHUNKS + 1]; // end[chunk] == begin[chunk+1]
-    struct free_block free_blocks[MAX_FREE_BLOCKS];
-    size_t max_size[GGML_VBUFFER_MAX_CHUNKS];
     size_t max_chunk_size;
+    struct tallocr_chunk * chunks[GGML_VBUFFER_MAX_CHUNKS];
+    int n_chunks;
 
 #ifdef GGML_ALLOCATOR_DEBUG
     struct {
@@ -130,73 +134,49 @@ struct ggml_dyn_tallocr {
 #endif
 };
 
-struct free_block_range {
-    int begin;
-    int end;
-    int size;
-};
-
-static struct free_block_range ggml_dyn_tallocr_free_block_range(const struct ggml_dyn_tallocr * alloc, int chunk) {
-    struct free_block_range range;
-    range.begin = alloc->free_blocks_begin[chunk];
-    range.end   = alloc->free_blocks_begin[chunk + 1];
-    range.size  = range.end - range.begin;
-    return range;
-}
-
-void ggml_dyn_tallocr_insert_block(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size) {
-    int total_blocks = alloc->free_blocks_begin[alloc->n_chunks];
-    GGML_ASSERT(total_blocks < MAX_FREE_BLOCKS && "out of free blocks");
+void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t offset, size_t size) {
+    GGML_ASSERT(chunk->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
     // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
-    int insert_pos = alloc->free_blocks_begin[addr.chunk];
-    int blocks_end = alloc->free_blocks_begin[addr.chunk + 1];
-    while (insert_pos < blocks_end && alloc->free_blocks[insert_pos].addr.offset < addr.offset) {
+    int insert_pos = 0;
+    while (insert_pos < chunk->n_free_blocks && chunk->free_blocks[insert_pos].offset < offset) {
         insert_pos++;
     }
     // shift all blocks from insert_pos onward to make room for the new block
-    for (int i = total_blocks; i > insert_pos; i--) {
-        alloc->free_blocks[i] = alloc->free_blocks[i-1];
+    for (int i = chunk->n_free_blocks; i > insert_pos; i--) {
+        chunk->free_blocks[i] = chunk->free_blocks[i-1];
     }
     // insert the new block
-    alloc->free_blocks[insert_pos].addr = addr;
-    alloc->free_blocks[insert_pos].size = size;
-    for (int c = addr.chunk + 1; c < alloc->n_chunks + 1; ++c) {
-        alloc->free_blocks_begin[c]++;
-    }
+    chunk->free_blocks[insert_pos].offset = offset;
+    chunk->free_blocks[insert_pos].size = size;
+    chunk->n_free_blocks++;
 }
 
-void ggml_dyn_tallocr_remove_block(struct ggml_dyn_tallocr * alloc, int idx) {
-    int chunk = alloc->free_blocks[idx].addr.chunk;
+void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
     // shift all elements after idx by 1 to the left, overwriting the element at idx
-    int n_free_blocks = alloc->free_blocks_begin[alloc->n_chunks];
-    for (int i = idx; i < n_free_blocks; i++) {
-        alloc->free_blocks[i] = alloc->free_blocks[i + 1];
-    }
-    // adjust first element index of all chunks after the current one
-    for (int c = chunk + 1; c < alloc->n_chunks + 1; c++) {
-        alloc->free_blocks_begin[c]--;
+    for (int i = idx; i < chunk->n_free_blocks; i++) {
+        chunk->free_blocks[i] = chunk->free_blocks[i+1];
     }
+    chunk->n_free_blocks--;
 }
 
-// add a new chunk by creating a block of unclaimed space after the last chunk
 int ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, size_t min_size) {
     if (alloc->n_chunks >= GGML_VBUFFER_MAX_CHUNKS) {
         return -1;
     }
-    int i = alloc->free_blocks_begin[alloc->n_chunks];
-    alloc->free_blocks[i].addr.chunk = alloc->n_chunks;
-    alloc->free_blocks[i].addr.offset = 0;
+    struct tallocr_chunk * chunk = calloc(1, sizeof(struct tallocr_chunk));
+    chunk->n_free_blocks = 1;
+    chunk->free_blocks[0].offset = 0;
     // available space in a chunk is limited to max_chunk_size, but can be higher if:
     // 1. a single tensor exceeds the maximum, and cannot fit any other way
     // 2. we are running out of chunks
     // backends will either manage to allocate the larger size, or report an error.
-    alloc->free_blocks[i].size = MAX(min_size, alloc->max_chunk_size);
+    chunk->free_blocks[0].size = MAX(min_size, alloc->max_chunk_size);
     if (alloc->n_chunks == GGML_VBUFFER_MAX_CHUNKS - 1) {
-        alloc->free_blocks[i].size = SIZE_MAX/2;
+        chunk->free_blocks[0].size = SIZE_MAX/2;
     }
-    alloc->free_blocks_begin[alloc->n_chunks + 1] = i + 1;
+    alloc->chunks[alloc->n_chunks] = chunk;
     alloc->n_chunks++;
-    return i;
+    return alloc->n_chunks - 1;
 }
 
 #ifdef GGML_ALLOCATOR_DEBUG
@@ -226,17 +206,19 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
 
+    int best_fit_chunk = -1;
     int best_fit_block = -1;
     size_t max_avail = 0;
 
     // find the best fitting free block in any chunk besides the last block
     for (int c = 0; c < alloc->n_chunks; ++c) {
-        struct free_block_range blocks = ggml_dyn_tallocr_free_block_range(alloc, c);
+        struct tallocr_chunk * chunk = alloc->chunks[c];
         size_t best_fit_size = SIZE_MAX;
-        for (int i = blocks.begin; i < blocks.end - 1; i++) {
-            struct free_block * block = &alloc->free_blocks[i];
+        for (int i = 0; i < chunk->n_free_blocks - 1; i++) {
+            struct free_block * block = &chunk->free_blocks[i];
             max_avail = MAX(max_avail, block->size);
             if (block->size >= size && block->size <= best_fit_size) {
+                best_fit_chunk = c;
                 best_fit_block = i;
                 best_fit_size = block->size;
             }
@@ -246,12 +228,13 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
     if (best_fit_block == -1) {
         // no suitable block found, try the last block (this will grow a chunks size)
         for (int c = 0; c < alloc->n_chunks; ++c) {
-            struct free_block_range blocks = ggml_dyn_tallocr_free_block_range(alloc, c);
-            if (blocks.size > 0) {
-                struct free_block * block = &alloc->free_blocks[blocks.end - 1];
+            struct tallocr_chunk * chunk = alloc->chunks[c];
+            if (chunk->n_free_blocks > 0) {
+                struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
                 max_avail = MAX(max_avail, block->size);
                 if (block->size >= size) {
-                    best_fit_block = blocks.end - 1;
+                    best_fit_chunk = c;
+                    best_fit_block = chunk->n_free_blocks - 1;
                     break;
                 }
             }
@@ -260,7 +243,8 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
 
     if (best_fit_block == -1) {
         // none of the existing chunks have enough space left
-        best_fit_block = ggml_dyn_tallocr_new_chunk(alloc, size);
+        best_fit_chunk = ggml_dyn_tallocr_new_chunk(alloc, size);
+        best_fit_block = 0;
     }
     if (best_fit_block == -1) {
         // since the last chunk always has virtually endless memory, this should never happen
@@ -269,13 +253,14 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
         GGML_ABORT("graph allocation: failed to reserve memory");
     }
 
-    struct free_block * block = &alloc->free_blocks[best_fit_block];
-    struct buffer_address addr = block->addr;
-    block->addr.offset += size;
+    struct tallocr_chunk * chunk = alloc->chunks[best_fit_chunk];
+    struct free_block    * block = &chunk->free_blocks[best_fit_block];
+    struct buffer_address  addr  = {.chunk = best_fit_chunk, .offset = block->offset };
+    block->offset += size;
     block->size -= size;
     if (block->size == 0) {
         // remove block if empty
-        ggml_dyn_tallocr_remove_block(alloc, best_fit_block);
+        ggml_dyn_tallocr_remove_block(chunk, best_fit_block);
     }
 
     AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk);
@@ -311,7 +296,7 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
     }
 #endif
 
-    alloc->max_size[addr.chunk] = MAX(alloc->max_size[addr.chunk], addr.offset + size);
+    chunk->max_size = MAX(chunk->max_size, addr.offset + size);
 
     return addr;
 
@@ -329,51 +314,50 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct
     remove_allocated_tensor(alloc, addr, tensor);
 #endif
 
-    struct free_block_range blocks = ggml_dyn_tallocr_free_block_range(alloc, addr.chunk);
+    struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
 
     // see if we can merge with an existing block
-    for (int i = blocks.begin; i < blocks.end; i++) {
-        struct free_block * block = &alloc->free_blocks[i];
+    for (int i = 0; i < chunk->n_free_blocks; i++) {
+        struct free_block * block = &chunk->free_blocks[i];
         // check if ptr is at the end of the block
-        if (block->addr.offset + block->size == addr.offset) {
+        if (block->offset + block->size == addr.offset) {
             block->size += size;
             // check if we can merge with the next block
-            if (i < blocks.end - 1) {
-                struct free_block * next = &alloc->free_blocks[i+1];
-                if (block->addr.offset + block->size == next->addr.offset) {
+            if (i < chunk->n_free_blocks - 1) {
+                struct free_block * next = &chunk->free_blocks[i+1];
+                if (block->offset + block->size == next->offset) {
                     block->size += next->size;
-                    ggml_dyn_tallocr_remove_block(alloc, i+1);
+                    ggml_dyn_tallocr_remove_block(chunk, i+1);
                 }
             }
             return;
         }
         // check if ptr is at the beginning of the block
-        if (addr.offset + size == block->addr.offset) {
-            block->addr.offset = addr.offset;
+        if (addr.offset + size == block->offset) {
+            block->offset = addr.offset;
             block->size += size;
             // check if we can merge with the previous block
-            if (i > blocks.begin) {
-                struct free_block * prev = &alloc->free_blocks[i-1];
-                if (prev->addr.offset + prev->size == block->addr.offset) {
+            if (i > 0) {
+                struct free_block * prev = &chunk->free_blocks[i-1];
+                if (prev->offset + prev->size == block->offset) {
                     prev->size += block->size;
-                    ggml_dyn_tallocr_remove_block(alloc, i);
+                    ggml_dyn_tallocr_remove_block(chunk, i);
                 }
             }
             return;
         }
     }
     // otherwise, add a new block
-    ggml_dyn_tallocr_insert_block(alloc, addr, size);
+    ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
 
     GGML_UNUSED(tensor);
 }
 
 static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
     for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; i++) {
-        alloc->free_blocks_begin[i] = 0;
-        alloc->max_size[i] = 0;
+        free(alloc->chunks[i]);
+        alloc->chunks[i] = NULL;
     }
-    alloc->free_blocks_begin[GGML_VBUFFER_MAX_CHUNKS] = 0;
     alloc->n_chunks = 0;
 
 #ifdef GGML_ALLOCATOR_DEBUG
@@ -387,12 +371,10 @@ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t m
     struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
 
     *alloc = (struct ggml_dyn_tallocr) {
-        /*.alignment         = */ alignment,
-        /*.n_chunks          = */ 0,
-        /*.free_blocks_begin = */ {0},
-        /*.free_blocks       = */ {{{0}, 0}},
-        /*.max_size          = */ {0},
-        /*.max_chunk_size    = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows
+        /*.alignment      = */ alignment,
+        /*.max_chunk_size = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows
+        /*.chunks         = */ {NULL},
+        /*.n_chunks       = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ {{0}},
 #endif
@@ -404,13 +386,16 @@ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t m
 }
 
 static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
+    for (int i = 0; i < alloc->n_chunks; ++i) {
+        free(alloc->chunks[i]);
+    }
     free(alloc);
 }
 
 static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
     size_t max_size = 0;
     for (int i = 0; i < alloc->n_chunks; i++) {
-        max_size += alloc->max_size[i];
+        max_size += alloc->chunks[i]->max_size;
     }
     return max_size;
 }
@@ -453,7 +438,7 @@ static struct vbuffer * ggml_vbuffer_alloc(ggml_backend_buffer_type_t buft, cons
     }
 
     for (int n = 0; n < talloc->n_chunks; n++) {
-        size_t chunk_size = talloc->max_size[n];
+        size_t chunk_size = talloc->chunks[n]->max_size;
         buf->chunks[n] = ggml_backend_buft_alloc_buffer(buft, chunk_size);
         if (buf->chunks[n] == NULL) {
             ggml_vbuffer_free(buf);

From ed6928069ffcd35f2ff7fc406ce1a7debc5b6cf1 Mon Sep 17 00:00:00 2001
From: Acly <aclysia@gmail.com>
Date: Wed, 24 Sep 2025 12:44:24 +0200
Subject: [PATCH 11/11] fix warnings (missing static) & debug checks

---
 ggml/src/ggml-alloc.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 2f61f7b483d37..fa46f3b491aa5 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -134,7 +134,7 @@ struct ggml_dyn_tallocr {
 #endif
 };
 
-void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t offset, size_t size) {
+static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t offset, size_t size) {
     GGML_ASSERT(chunk->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
     // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
     int insert_pos = 0;
@@ -151,7 +151,7 @@ void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t offset,
     chunk->n_free_blocks++;
 }
 
-void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
+static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
     // shift all elements after idx by 1 to the left, overwriting the element at idx
     for (int i = idx; i < chunk->n_free_blocks; i++) {
         chunk->free_blocks[i] = chunk->free_blocks[i+1];
@@ -159,7 +159,7 @@ void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
     chunk->n_free_blocks--;
 }
 
-int ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, size_t min_size) {
+static int ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, size_t min_size) {
     if (alloc->n_chunks >= GGML_VBUFFER_MAX_CHUNKS) {
         return -1;
     }
@@ -210,7 +210,7 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
     int best_fit_block = -1;
     size_t max_avail = 0;
 
-    // find the best fitting free block in any chunk besides the last block
+    // find the best fitting free block besides the last block, within any chunk
     for (int c = 0; c < alloc->n_chunks; ++c) {
         struct tallocr_chunk * chunk = alloc->chunks[c];
         size_t best_fit_size = SIZE_MAX;
@@ -246,7 +246,7 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
         best_fit_chunk = ggml_dyn_tallocr_new_chunk(alloc, size);
         best_fit_block = 0;
     }
-    if (best_fit_block == -1) {
+    if (best_fit_chunk == -1) {
         // since the last chunk always has virtually endless memory, this should never happen
         GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
             __func__, size, max_avail);
@@ -308,7 +308,7 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
-        __func__, tensor->name, addr.chunk, addr.offset, size, alloc->free_blocks_begin[alloc->n_chunks]);
+        __func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, addr, tensor);