From 092e2e293018ef711f1ab51924bcdaca537a256e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Tue, 14 Oct 2025 20:41:08 +0200
Subject: [PATCH] llama-model: fix inconsistent ctxs <-> bufs order

---
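Background for reviewers: previously pimpl->ctxs was filled inside the ctx_for_buft
lambda while pimpl->bufs was filled later when iterating ctx_map, so the two vectors
were not guaranteed to line up. Pairing each context with its buffer in ctxs_bufs and
ordering ctx_map by buffer type name makes the iteration order, and thus the pairing,
well-defined. The standalone sketch below only illustrates the comparator idea;
fake_buft, fake_buft_t, and fake_buft_name() are stand-ins invented for the example,
not the actual ggml API.

    #include <cstdio>
    #include <cstring>
    #include <map>

    // Stand-in for an opaque buffer type handle that only exposes a name.
    struct fake_buft { const char * name; };
    using fake_buft_t = fake_buft *;

    static const char * fake_buft_name(fake_buft_t buft) { return buft->name; }

    // Compare keys by name (strcmp), not by pointer value, so the map's
    // iteration order does not depend on where the handles live in memory.
    struct fake_buft_comparator {
        bool operator()(const fake_buft_t & lhs, const fake_buft_t & rhs) const {
            return strcmp(fake_buft_name(lhs), fake_buft_name(rhs)) < 0;
        }
    };

    int main() {
        fake_buft cuda0{"CUDA0"}, cuda1{"CUDA1"}, cpu{"CPU"};

        std::map<fake_buft_t, int, fake_buft_comparator> ctx_map;
        ctx_map.emplace(&cuda1, 2);
        ctx_map.emplace(&cpu,   0);
        ctx_map.emplace(&cuda0, 1);

        // Prints CPU, CUDA0, CUDA1 regardless of insertion order.
        for (const auto & [buft, id] : ctx_map) {
            printf("%s -> %d\n", fake_buft_name(buft), id);
        }
        return 0;
    }
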
 src/llama-model.cpp | 60 +++++++++++++++++++++------------------------
 1 file changed, 28 insertions(+), 32 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 0cdad9babd9b2..e26c0b5f50969 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -421,11 +421,8 @@ struct llama_model::impl {
     llama_mlocks mlock_bufs;
     llama_mlocks mlock_mmaps;
 
-    // contexts where the model tensors metadata is stored
-    std::vector<ggml_context_ptr> ctxs;
-
-    // the model memory buffers for the tensor data
-    std::vector<ggml_backend_buffer_ptr> bufs;
+    // contexts where the model tensors metadata is stored as well as the corresponding buffers:
+    std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
 
     buft_list_t cpu_buft_list;
     std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
@@ -2181,7 +2178,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     max_n_tensors += n_layer*2; // duplicated rope freq tensors
     const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
 
-    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
+    struct ggml_backend_buft_comparator {
+        bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+            return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
+        }
+    };
+    std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
+
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
@@ -2196,12 +2200,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 throw std::runtime_error(format("failed to create ggml context"));
             }
 
-            ctx_map[buft] = ctx;
-            pimpl->ctxs.emplace_back(ctx);
+            ctx_map.emplace(buft, ctx);
 
             return ctx;
         }
-        return it->second;
+        return it->second.get();
     };
 
     const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
@@ -6036,16 +6039,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     pimpl->mappings.reserve(ml.mappings.size());
 
     // create the backend buffers
-    std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
-    ctx_bufs.reserve(ctx_map.size());
+    std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
+    ctx_buf_maps.reserve(ctx_map.size());
 
     // Ensure we have enough capacity for the maximum backend buffer we will potentially create
    const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
-    pimpl->bufs.reserve(n_max_backend_buffer);
+    pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
 
-    for (auto & it : ctx_map) {
-        ggml_backend_buffer_type_t buft = it.first;
-        ggml_context * ctx = it.second;
+    for (auto & [buft, ctx_ptr] : ctx_map) {
+        ggml_context * ctx = ctx_ptr.get();
 
         // skip contexts without tensors
         if (ggml_get_first_tensor(ctx) == nullptr) {
@@ -6069,6 +6071,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
         bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
 
+        ggml_backend_buffer_t buf = nullptr;
         if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 // only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -6081,20 +6084,18 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     continue;
                 }
                 const size_t max_size = ggml_get_max_tensor_size(ctx);
-                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
+                buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
                 if (buf == nullptr) {
                     throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                 }
-                pimpl->bufs.emplace_back(buf);
                 buf_map.emplace(idx, buf);
             }
         }
         else {
-            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
             if (buf == nullptr) {
                 throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
             }
-            pimpl->bufs.emplace_back(buf);
             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                 pimpl->mlock_bufs.emplace_back(new llama_mlock);
                 auto & mlock_buf = pimpl->mlock_bufs.back();
@@ -6105,10 +6106,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 buf_map.emplace(idx, buf);
             }
         }
-
-        if (pimpl->bufs.empty()) {
-            throw std::runtime_error("failed to allocate buffer");
-        }
+        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), buf);
 
         for (auto & buf : buf_map) {
             // indicate that this buffer contains weights
@@ -6116,7 +6114,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
         }
 
-        ctx_bufs.emplace_back(ctx, buf_map);
+        ctx_buf_maps.emplace_back(ctx, buf_map);
     }
 
     if (llama_supports_gpu_offload()) {
@@ -6134,22 +6132,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     // print memory requirements per buffer type
-    for (auto & buf : pimpl->bufs) {
+    for (auto & [_, buf] : pimpl->ctxs_bufs) {
         LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
     }
 
     // populate tensors_by_name
-    for (auto & ctx : pimpl->ctxs) {
+    for (auto & [ctx, _] : pimpl->ctxs_bufs) {
         for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
             tensors_by_name.emplace_back(ggml_get_name(cur), cur);
         }
     }
 
     // load tensor data
-    for (auto & it : ctx_bufs) {
-        ggml_context * ctx = it.first;
-        auto & bufs = it.second;
-        if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+    for (auto & [ctx, buf_map] : ctx_buf_maps) {
+        if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
             return false;
         }
     }
@@ -6189,8 +6185,8 @@ size_t llama_model::n_devices() const {
 
 std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const ggml_backend_buffer_ptr & buf_ptr : pimpl->bufs) {
-        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    for (const auto & [_, buf] : pimpl->ctxs_bufs) {
+        ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
     }
     return ret;
 }