src/llama-model.cpp: 26 changes (16 additions, 10 deletions)
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -15,7 +15,6 @@
 
 #include <algorithm>
 #include <cassert>
-#include <cmath>
 #include <cfloat>
 #include <cstring>
 #include <cmath>
@@ -438,7 +437,7 @@ struct llama_model::impl {
     llama_mlocks mlock_mmaps;
 
     // contexts where the model tensors metadata is stored as well ass the corresponding buffers:
-    std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
+    std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;
 
     buft_list_t cpu_buft_list;
     std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
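
For orientation, a sketch that is not part of the diff: the member change above means each model context now owns a list of backend buffers instead of exactly one, because with mmap enabled a multi-part GGUF yields one host-pointer buffer per file. The minimal C++ example below mirrors that shape with placeholder types (FakeCtx, FakeBuf and the two aliases are invented for illustration and merely stand in for ggml_context_ptr and ggml_backend_buffer_ptr):

// Sketch only: placeholder types instead of the real ggml smart pointers.
#include <cstddef>
#include <memory>
#include <utility>
#include <vector>

struct FakeCtx {};                    // stands in for ggml_context
struct FakeBuf { std::size_t size; }; // stands in for ggml_backend_buffer

using ctx_ptr = std::unique_ptr<FakeCtx>; // role of ggml_context_ptr
using buf_ptr = std::unique_ptr<FakeBuf>; // role of ggml_backend_buffer_ptr

int main() {
    // before: std::vector<std::pair<ctx_ptr, buf_ptr>>              -- one buffer per context
    // after:  std::vector<std::pair<ctx_ptr, std::vector<buf_ptr>>> -- one buffer per mmap'd file
    std::vector<std::pair<ctx_ptr, std::vector<buf_ptr>>> ctxs_bufs;

    // hypothetical 2-part model: the same context is backed by two per-file buffers
    std::vector<buf_ptr> bufs;
    bufs.emplace_back(new FakeBuf{512});
    bufs.emplace_back(new FakeBuf{256});
    ctxs_bufs.emplace_back(ctx_ptr(new FakeCtx{}), std::move(bufs));
    return 0;
}
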
@@ -6186,7 +6185,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
         bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
 
-        ggml_backend_buffer_t buf = nullptr;
+        std::vector<ggml_backend_buffer_ptr> bufs;
         if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 // only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -6199,15 +6198,16 @@
                     continue;
                 }
                 const size_t max_size = ggml_get_max_tensor_size(ctx);
-                buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
+                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
                 if (buf == nullptr) {
                     throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                 }
+                bufs.emplace_back(buf);
                 buf_map.emplace(idx, buf);
             }
         }
         else {
-            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
             if (buf == nullptr) {
                 throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
             }
@@ -6217,11 +6217,12 @@
                 mlock_buf->init (ggml_backend_buffer_get_base(buf));
                 mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
             }
+            bufs.emplace_back(buf);
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 buf_map.emplace(idx, buf);
             }
         }
-        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), buf);
+        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));
 
         for (auto & buf : buf_map) {
             // indicate that this buffer contains weights
@@ -6247,8 +6248,11 @@
     }
 
     // print memory requirements per buffer type
-    for (auto & [_, buf] : pimpl->ctxs_bufs) {
-        LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+    for (auto & [_, bufs] : pimpl->ctxs_bufs) {
+        for (auto & buf: bufs) {
+            LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
+                __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+        }
     }
 
     // populate tensors_by_name
@@ -6300,8 +6304,10 @@ size_t llama_model::n_devices() const {
 
 std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const auto & [_, buf] : pimpl->ctxs_bufs) {
-        ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+    for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
+        for (const auto & buf : bufs) {
+            ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+        }
     }
     return ret;
 }
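
The last two hunks replace the single-buffer access with a nested loop over the per-file buffers. As a standalone illustration of that aggregation pattern (not part of the PR; the names and sizes below are made up, and plain standard-library types are used instead of the ggml API):

#include <cstddef>
#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
    // each entry: (buffer type name, sizes of the per-file buffers using it)
    std::vector<std::pair<std::string, std::vector<std::size_t>>> ctxs_bufs = {
        {"CPU_Mapped", {512u << 20, 256u << 20}}, // hypothetical 2-file mmap'd model
        {"CUDA0",      {1024u << 20}},
    };

    // analogous to llama_model::memory_breakdown(): sum buffer sizes per type
    std::map<std::string, std::size_t> breakdown;
    for (const auto & [name, sizes] : ctxs_bufs) {
        for (std::size_t sz : sizes) {
            breakdown[name] += sz;
        }
    }
    for (const auto & [name, total] : breakdown) {
        std::printf("%12s model buffer size = %8.2f MiB\n", name.c_str(), total / 1024.0 / 1024.0);
    }
    return 0;
}
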