60 changes: 28 additions & 32 deletions src/llama-model.cpp
@@ -421,11 +421,8 @@ struct llama_model::impl {
llama_mlocks mlock_bufs;
llama_mlocks mlock_mmaps;

// contexts where the model tensors metadata is stored
std::vector<ggml_context_ptr> ctxs;

// the model memory buffers for the tensor data
std::vector<ggml_backend_buffer_ptr> bufs;
// contexts where the model tensors metadata is stored as well as the corresponding buffers:
std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;

buft_list_t cpu_buft_list;
std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
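
For context, the hunk above replaces the two parallel vectors ctxs and bufs with a single vector of pairs, so a context and the buffer holding its tensor data can never go out of sync by index. A minimal sketch of that ownership change, using placeholder structs instead of the real ggml smart-pointer typedefs:

#include <memory>
#include <utility>
#include <vector>

struct ctx_t {}; // stand-in for the context holding tensor metadata
struct buf_t {}; // stand-in for the backend buffer holding tensor data

// before: two vectors that had to be kept in sync by index
std::vector<std::unique_ptr<ctx_t>> ctxs;
std::vector<std::unique_ptr<buf_t>> bufs;

// after: one vector, each context paired with its corresponding buffer
std::vector<std::pair<std::unique_ptr<ctx_t>, std::unique_ptr<buf_t>>> ctxs_bufs;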
@@ -2181,7 +2178,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
max_n_tensors += n_layer*2; // duplicated rope freq tensors
const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;

std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
struct ggml_backend_buft_comparator {
bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
}
};
std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;

auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
auto it = ctx_map.find(buft);
if (it == ctx_map.end()) {
@@ -2196,12 +2200,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
throw std::runtime_error(format("failed to create ggml context"));
}

ctx_map[buft] = ctx;
pimpl->ctxs.emplace_back(ctx);
ctx_map.emplace(buft, ctx);

return ctx;
}
return it->second;
return it->second.get();
};

const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
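
The name-based comparator and the ctx_for_buft lambda above form a get-or-create map whose iteration order does not depend on pointer values. A self-contained sketch of the same pattern, with a hypothetical backend_t type and std::string names standing in for ggml_backend_buffer_type_t and ggml_backend_buft_name():

#include <map>
#include <memory>
#include <string>

struct backend_t { std::string name; }; // stand-in for ggml_backend_buffer_type_t
struct context_t {};                    // stand-in for ggml_context

// order map entries by backend name so iteration order is deterministic
struct backend_name_comparator {
    bool operator()(const backend_t * lhs, const backend_t * rhs) const {
        return lhs->name < rhs->name;
    }
};

std::map<const backend_t *, std::unique_ptr<context_t>, backend_name_comparator> ctx_map;

// get-or-create: the map owns the context, callers get a non-owning pointer
context_t * ctx_for(const backend_t * backend) {
    auto it = ctx_map.find(backend);
    if (it == ctx_map.end()) {
        it = ctx_map.emplace(backend, std::make_unique<context_t>()).first;
    }
    return it->second.get();
}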
@@ -6036,16 +6039,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
pimpl->mappings.reserve(ml.mappings.size());

// create the backend buffers
std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
ctx_bufs.reserve(ctx_map.size());
std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
ctx_buf_maps.reserve(ctx_map.size());

// Ensure we have enough capacity for the maximum number of backend buffers we may potentially create
const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
pimpl->bufs.reserve(n_max_backend_buffer);
pimpl->ctxs_bufs.reserve(n_max_backend_buffer);

for (auto & it : ctx_map) {
ggml_backend_buffer_type_t buft = it.first;
ggml_context * ctx = it.second;
for (auto & [buft, ctx_ptr] : ctx_map) {
ggml_context * ctx = ctx_ptr.get();

// skip contexts without tensors
if (ggml_get_first_tensor(ctx) == nullptr) {
@@ -6069,6 +6071,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);

ggml_backend_buffer_t buf = nullptr;
if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
// only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -6081,20 +6084,18 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
continue;
}
const size_t max_size = ggml_get_max_tensor_size(ctx);
ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
if (buf == nullptr) {
throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
}
pimpl->bufs.emplace_back(buf);
buf_map.emplace(idx, buf);
}
}
else {
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (buf == nullptr) {
throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
}
pimpl->bufs.emplace_back(buf);
if (use_mlock && ggml_backend_buffer_is_host(buf)) {
pimpl->mlock_bufs.emplace_back(new llama_mlock);
auto & mlock_buf = pimpl->mlock_bufs.back();
@@ -6105,18 +6106,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
buf_map.emplace(idx, buf);
}
}

if (pimpl->bufs.empty()) {
throw std::runtime_error("failed to allocate buffer");
}
pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), buf);

for (auto & buf : buf_map) {
// indicate that this buffer contains weights
// this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
}

ctx_bufs.emplace_back(ctx, buf_map);
ctx_buf_maps.emplace_back(ctx, buf_map);
}

if (llama_supports_gpu_offload()) {
@@ -6134,22 +6132,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}

// print memory requirements per buffer type
for (auto & buf : pimpl->bufs) {
for (auto & [_, buf] : pimpl->ctxs_bufs) {
LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
}

// populate tensors_by_name
for (auto & ctx : pimpl->ctxs) {
for (auto & [ctx, _] : pimpl->ctxs_bufs) {
for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
tensors_by_name.emplace_back(ggml_get_name(cur), cur);
}
}

// load tensor data
for (auto & it : ctx_bufs) {
ggml_context * ctx = it.first;
auto & bufs = it.second;
if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
for (auto & [ctx, buf_map] : ctx_buf_maps) {
if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
return false;
}
}
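
The loops above use C++17 structured bindings over the consolidated pair vector, binding only the half they need. A small standalone illustration of that iteration style, again with placeholder types rather than the real ggml handles:

#include <cstddef>
#include <cstdio>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct ctx_t { std::string name; };
struct buf_t { size_t size; };

int main() {
    std::vector<std::pair<std::unique_ptr<ctx_t>, std::unique_ptr<buf_t>>> ctxs_bufs;
    ctxs_bufs.emplace_back(std::make_unique<ctx_t>(ctx_t{"cpu"}),
                           std::make_unique<buf_t>(buf_t{1024}));

    // "_" is an ordinary identifier here, used by convention for the unused half
    for (const auto & [_, buf] : ctxs_bufs) {
        std::printf("buffer size: %zu bytes\n", buf->size);
    }
    for (const auto & [ctx, _] : ctxs_bufs) {
        std::printf("context: %s\n", ctx->name.c_str());
    }
    return 0;
}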
Expand Down Expand Up @@ -6189,8 +6185,8 @@ size_t llama_model::n_devices() const {

std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const ggml_backend_buffer_ptr & buf_ptr : pimpl->bufs) {
ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
for (const auto & [_, buf] : pimpl->ctxs_bufs) {
ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
}
return ret;
}
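
memory_breakdown() above is a straightforward group-by-and-sum over that same pair vector. A minimal standalone version of the aggregation, with a string key standing in for the buffer-type handle returned by ggml_backend_buffer_get_type():

#include <cstddef>
#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
    // (buffer type name, buffer size) stand-ins for the (context, buffer) pairs
    std::vector<std::pair<std::string, size_t>> ctxs_bufs = {
        {"CPU", 512}, {"CUDA0", 2048}, {"CUDA0", 1024},
    };

    // sum buffer sizes per type; operator[] value-initializes missing keys to 0
    std::map<std::string, size_t> ret;
    for (const auto & [type_name, size] : ctxs_bufs) {
        ret[type_name] += size;
    }

    for (const auto & [type_name, total] : ret) {
        std::printf("%s: %zu bytes\n", type_name.c_str(), total);
    }
    return 0;
}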