From 2e747874d0422eae1ad294fdd27f3c5df27cd0fc Mon Sep 17 00:00:00 2001 From: David Huang <1969802+hjc4869@users.noreply.github.com> Date: Thu, 8 May 2025 20:45:36 +0800 Subject: [PATCH 1/4] Add --disable-op-offload --- common/arg.cpp | 7 +++++ common/common.cpp | 1 + common/common.h | 1 + ggml/include/ggml-backend.h | 4 +-- ggml/src/ggml-backend.cpp | 8 +++-- include/llama.h | 1 + src/llama-context.cpp | 4 ++- src/llama-cparams.h | 1 + tests/test-opt.cpp | 2 +- tools/llama-bench/llama-bench.cpp | 51 +++++++++++++++++++++++++------ tools/mtmd/clip.cpp | 2 +- 11 files changed, 65 insertions(+), 17 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 9f87e9910b540..779ac0ac48abd 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2436,6 +2436,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } )); + add_opt(common_arg( + {"--disable-op-offload"}, + string_format("disable offloading host tensor operations to device (default: %s)", params.disable_op_offload ? "true" : "false"), + [](common_params & params) { + params.disable_op_offload = true; + } + )); add_opt(common_arg( {"--lora"}, "FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)", diff --git a/common/common.cpp b/common/common.cpp index bd20af233695c..a758a06548bb8 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1113,6 +1113,7 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.offload_kqv = !params.no_kv_offload; cparams.flash_attn = params.flash_attn; cparams.no_perf = params.no_perf; + cparams.disable_op_offload= params.disable_op_offload; if (params.reranking) { cparams.embeddings = true; diff --git a/common/common.h b/common/common.h index 90702245463cb..b8c38b4ae8118 100644 --- a/common/common.h +++ b/common/common.h @@ -332,6 +332,7 @@ struct common_params { bool no_kv_offload = false; // disable KV offloading bool warmup = true; // warmup run bool check_tensors = false; // validate tensor data + bool disable_op_offload= false; // globally disable offload host tensor operations to device bool single_turn = false; // single turn chat conversation diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index ea2c1a402cca1..7966ce1324168 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -248,7 +248,7 @@ extern "C" { // preferrably to run on the same backend as the buffer ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false); + sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, false); // initialize buffers from a max size graph (optional) reserve_graph = build_graph(sched, max_batch_size); @@ -289,7 +289,7 @@ extern "C" { typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); // Initialize a backend scheduler, backends with low index are given priority over backends with high index - GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel); + GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool disable_op_offload); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t 
sched); // Initialize backend buffers from a measure graph diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index c36b5abfb7422..0da80393c8480 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -674,6 +674,8 @@ struct ggml_backend_sched { char * context_buffer; size_t context_buffer_size; + bool disable_op_offload; + int debug; }; @@ -766,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); // check if a backend with higher prio wants to offload the op - if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { + if (!sched->disable_op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { for (int b = 0; b < src_backend_id; b++) { if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) { SET_CAUSE(tensor, "1.off"); @@ -1452,7 +1454,8 @@ ggml_backend_sched_t ggml_backend_sched_new( ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, - bool parallel) { + bool parallel, + bool disable_op_offload) { GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU); @@ -1497,6 +1500,7 @@ ggml_backend_sched_t ggml_backend_sched_new( } sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); + sched->disable_op_offload = disable_op_offload; ggml_backend_sched_reset(sched); diff --git a/include/llama.h b/include/llama.h index a18f365bff6f2..b26e62fb0e843 100644 --- a/include/llama.h +++ b/include/llama.h @@ -362,6 +362,7 @@ extern "C" { bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool flash_attn; // whether to use flash attention [EXPERIMENTAL] bool no_perf; // whether to measure performance timings + bool disable_op_offload; // whether to disable offload host tensor operations to device globally }; // model quantization parameters diff --git a/src/llama-context.cpp b/src/llama-context.cpp index fd64622b8e02d..8c45d7e06635e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -93,6 +93,7 @@ llama_context::llama_context( } cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? 
params.n_batch : params.n_ubatch); + cparams.disable_op_offload = params.disable_op_offload; const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; @@ -243,7 +244,7 @@ llama_context::llama_context( } } - sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); + sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel, cparams.disable_op_offload)); if (pipeline_parallel) { LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get())); @@ -1871,6 +1872,7 @@ llama_context_params llama_context_default_params() { /*.offload_kqv =*/ true, /*.flash_attn =*/ false, /*.no_perf =*/ true, + /*.disable_op_offload =*/ false, }; return result; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 30e550f023a9e..da3c46c8db5cc 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -30,6 +30,7 @@ struct llama_cparams { bool flash_attn; bool no_perf; bool warmup; + bool disable_op_offload; enum llama_pooling_type pooling_type; diff --git a/tests/test-opt.cpp b/tests/test-opt.cpp index f90c92b4b8ecf..8e55af81f624a 100644 --- a/tests/test-opt.cpp +++ b/tests/test-opt.cpp @@ -853,7 +853,7 @@ int main(void) { backends_modded.insert(backends_modded.end(), backends.begin(), backends.end()); ggml_backend_sched_t backend_sched = ggml_backend_sched_new( - backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false); + backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false, false); printf("Backend %zu/%zu: %s\n", i + 1, dev_count, ggml_backend_dev_name(devs[i])); printf(" Device description: %s\n", ggml_backend_dev_description(devs[i])); diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 0786594296e94..f166e71e4cdb1 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -219,6 +219,7 @@ struct cmd_params { std::vector> tensor_buft_overrides; std::vector use_mmap; std::vector embeddings; + std::vector disable_op_offload; ggml_numa_strategy numa; int reps; ggml_sched_priority prio; @@ -253,6 +254,7 @@ static const cmd_params cmd_params_defaults = { /* tensor_buft_overrides*/ { std::vector{{nullptr,nullptr}} }, /* use_mmap */ { true }, /* embeddings */ { false }, + /* disable_op_offload */ { false }, /* numa */ GGML_NUMA_STRATEGY_DISABLED, /* reps */ 5, /* prio */ GGML_SCHED_PRIO_NORMAL, @@ -311,6 +313,7 @@ static void print_usage(int /* argc */, char ** argv) { join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -ts, --tensor-split (default: 0)\n"); printf(" -ot --override-tensors =;... 
(default: disabled)\n"); + printf(" -dopo, --disable-op-offload (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio); printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay); @@ -588,6 +591,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = string_split(argv[i], split_delim); params.embeddings.insert(params.embeddings.end(), p.begin(), p.end()); + } else if (arg == "-dopo" || arg == "--disable-op-offload") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.disable_op_offload.insert(params.disable_op_offload.end(), p.begin(), p.end()); } else if (arg == "-ts" || arg == "--tensor-split") { if (++i >= argc) { invalid_param = true; @@ -794,6 +804,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; } + if (params.disable_op_offload.empty()) { + params.disable_op_offload = cmd_params_defaults.disable_op_offload; + } if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } @@ -833,6 +846,7 @@ struct cmd_params_instance { std::vector tensor_buft_overrides; bool use_mmap; bool embeddings; + bool disable_op_offload; llama_model_params to_llama_mparams() const { llama_model_params mparams = llama_model_default_params(); @@ -894,14 +908,15 @@ struct cmd_params_instance { llama_context_params to_llama_cparams() const { llama_context_params cparams = llama_context_default_params(); - cparams.n_ctx = n_prompt + n_gen + n_depth; - cparams.n_batch = n_batch; - cparams.n_ubatch = n_ubatch; - cparams.type_k = type_k; - cparams.type_v = type_v; - cparams.offload_kqv = !no_kv_offload; - cparams.flash_attn = flash_attn; - cparams.embeddings = embeddings; + cparams.n_ctx = n_prompt + n_gen + n_depth; + cparams.n_batch = n_batch; + cparams.n_ubatch = n_ubatch; + cparams.type_k = type_k; + cparams.type_v = type_v; + cparams.offload_kqv = !no_kv_offload; + cparams.flash_attn = flash_attn; + cparams.embeddings = embeddings; + cparams.disable_op_offload = disable_op_offload; return cparams; } @@ -921,6 +936,7 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & ot : params.tensor_buft_overrides) for (const auto & mmp : params.use_mmap) for (const auto & embd : params.embeddings) + for (const auto & dopo : params.disable_op_offload) for (const auto & nb : params.n_batch) for (const auto & nub : params.n_ubatch) for (const auto & tk : params.type_k) @@ -959,6 +975,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .tensor_buft_overrides = */ ot, /* .use_mmap = */ mmp, /* .embeddings = */ embd, + /* .disable_op_offload= */ dopo, }; instances.push_back(instance); } @@ -990,6 +1007,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .tensor_buft_overrides = */ ot, /* .use_mmap = */ mmp, /* .embeddings = */ embd, + /* .disable_op_offload= */ dopo, }; instances.push_back(instance); } @@ -1021,6 +1039,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .tensor_buft_overrides = */ ot, /* .use_mmap = */ mmp, /* .embeddings = */ embd, + /* .disable_op_offload= */ dopo, }; instances.push_back(instance); } @@ -1056,6 +1075,7 @@ struct test { std::vector tensor_buft_overrides; bool use_mmap; bool embeddings; + bool disable_op_offload; int n_prompt; int n_gen; int n_depth; @@ -1089,6 +1109,7 @@ struct test { 
tensor_buft_overrides = inst.tensor_buft_overrides; use_mmap = inst.use_mmap; embeddings = inst.embeddings; + disable_op_offload = inst.disable_op_offload; n_prompt = inst.n_prompt; n_gen = inst.n_gen; n_depth = inst.n_depth; @@ -1134,7 +1155,7 @@ struct test { "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", - "use_mmap", "embeddings", "n_prompt", "n_gen", "n_depth", "test_time", + "use_mmap", "embeddings", "disable_op_offload", "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", }; return fields; @@ -1146,7 +1167,7 @@ struct test { if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || - field == "avg_ns" || field == "stddev_ns") { + field == "avg_ns" || field == "stddev_ns" || field == "disable_op_offload") { return INT; } if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || @@ -1222,6 +1243,7 @@ struct test { tensor_buft_overrides_str, std::to_string(use_mmap), std::to_string(embeddings), + std::to_string(disable_op_offload), std::to_string(n_prompt), std::to_string(n_gen), std::to_string(n_depth), @@ -1404,6 +1426,9 @@ struct markdown_printer : public printer { if (field == "test") { return 15; } + if (field == "disable_op_offload") { + return 4; + } int width = std::max((int) field.length(), 10); @@ -1435,6 +1460,9 @@ struct markdown_printer : public printer { if (field == "embeddings") { return "embd"; } + if (field == "disable_op_offload") { + return "dopo"; + } if (field == "tensor_split") { return "ts"; } @@ -1503,6 +1531,9 @@ struct markdown_printer : public printer { if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) { fields.emplace_back("embeddings"); } + if (params.disable_op_offload.size() > 1 || params.disable_op_offload != cmd_params_defaults.disable_op_offload) { + fields.emplace_back("disable_op_offload"); + } fields.emplace_back("test"); fields.emplace_back("t/s"); diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 4e1a7328725ea..f344e53edab29 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -369,7 +369,7 @@ struct clip_ctx { backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu)); sched.reset( - ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false) + ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, false) ); } From 31e19202a77b5720d2bfa9a3520d7c30c1fd2431 Mon Sep 17 00:00:00 2001 From: David Huang <1969802+hjc4869@users.noreply.github.com> Date: Fri, 9 May 2025 19:07:46 +0800 Subject: [PATCH 2/4] Avoid negative bools in library. 
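A minimal caller-side sketch of the scheduler API after this change, for reference. The wrapper and backend handles below are illustrative only and not part of the patch; as before, the CPU backend must be the last entry, and op_offload = true preserves the previous behaviour while false corresponds to --disable-op-offload:

    #include "ggml-backend.h"

    // Illustrative helper: build a scheduler over one GPU backend plus the
    // mandatory trailing CPU backend, with op offloading made explicit.
    static ggml_backend_sched_t make_sched(ggml_backend_t backend_gpu,
                                           ggml_backend_t backend_cpu,
                                           bool op_offload) {
        ggml_backend_t backends[] = { backend_gpu, backend_cpu };
        // op_offload == true: ops on host weights may still be offloaded to a
        // higher-priority backend; false disables that path entirely.
        return ggml_backend_sched_new(backends, /*bufts=*/NULL, /*n_backends=*/2,
                                      GGML_DEFAULT_GRAPH_SIZE,
                                      /*parallel=*/false, op_offload);
    }

On the llama side, common_context_params_to_llama() now derives the positive flag as cparams.op_offload = !params.disable_op_offload.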
--- common/common.cpp | 2 +- ggml/include/ggml-backend.h | 2 +- ggml/src/ggml-backend.cpp | 8 ++++---- include/llama.h | 2 +- src/llama-context.cpp | 6 +++--- src/llama-cparams.h | 2 +- tools/llama-bench/llama-bench.cpp | 18 +++++++++--------- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index a758a06548bb8..82f87ccd28ae2 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1113,7 +1113,7 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.offload_kqv = !params.no_kv_offload; cparams.flash_attn = params.flash_attn; cparams.no_perf = params.no_perf; - cparams.disable_op_offload= params.disable_op_offload; + cparams.op_offload = !params.disable_op_offload; if (params.reranking) { cparams.embeddings = true; diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 7966ce1324168..51e5d9af19307 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -289,7 +289,7 @@ extern "C" { typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); // Initialize a backend scheduler, backends with low index are given priority over backends with high index - GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool disable_op_offload); + GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); // Initialize backend buffers from a measure graph diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 0da80393c8480..6f69d895f170d 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -674,7 +674,7 @@ struct ggml_backend_sched { char * context_buffer; size_t context_buffer_size; - bool disable_op_offload; + bool op_offload; int debug; }; @@ -768,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); // check if a backend with higher prio wants to offload the op - if (!sched->disable_op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { + if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { for (int b = 0; b < src_backend_id; b++) { if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) { SET_CAUSE(tensor, "1.off"); @@ -1455,7 +1455,7 @@ ggml_backend_sched_t ggml_backend_sched_new( int n_backends, size_t graph_size, bool parallel, - bool disable_op_offload) { + bool op_offload) { GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU); @@ -1500,7 +1500,7 @@ ggml_backend_sched_t ggml_backend_sched_new( } sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); - sched->disable_op_offload = disable_op_offload; + sched->op_offload = op_offload; ggml_backend_sched_reset(sched); diff --git a/include/llama.h b/include/llama.h index b26e62fb0e843..410930b4bde47 
100644 --- a/include/llama.h +++ b/include/llama.h @@ -362,7 +362,7 @@ extern "C" { bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool flash_attn; // whether to use flash attention [EXPERIMENTAL] bool no_perf; // whether to measure performance timings - bool disable_op_offload; // whether to disable offload host tensor operations to device globally + bool op_offload; // whether to offload host tensor operations to device }; // model quantization parameters diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 8c45d7e06635e..a12849f0e0df4 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -93,7 +93,7 @@ llama_context::llama_context( } cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); - cparams.disable_op_offload = params.disable_op_offload; + cparams.op_offload = params.op_offload; const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; @@ -244,7 +244,7 @@ llama_context::llama_context( } } - sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel, cparams.disable_op_offload)); + sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel, cparams.op_offload)); if (pipeline_parallel) { LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get())); @@ -1872,7 +1872,7 @@ llama_context_params llama_context_default_params() { /*.offload_kqv =*/ true, /*.flash_attn =*/ false, /*.no_perf =*/ true, - /*.disable_op_offload =*/ false, + /*.op_offload =*/ true, }; return result; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index da3c46c8db5cc..246fa5777deea 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -30,7 +30,7 @@ struct llama_cparams { bool flash_attn; bool no_perf; bool warmup; - bool disable_op_offload; + bool op_offload; enum llama_pooling_type pooling_type; diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index f166e71e4cdb1..a36374fa85863 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -908,15 +908,15 @@ struct cmd_params_instance { llama_context_params to_llama_cparams() const { llama_context_params cparams = llama_context_default_params(); - cparams.n_ctx = n_prompt + n_gen + n_depth; - cparams.n_batch = n_batch; - cparams.n_ubatch = n_ubatch; - cparams.type_k = type_k; - cparams.type_v = type_v; - cparams.offload_kqv = !no_kv_offload; - cparams.flash_attn = flash_attn; - cparams.embeddings = embeddings; - cparams.disable_op_offload = disable_op_offload; + cparams.n_ctx = n_prompt + n_gen + n_depth; + cparams.n_batch = n_batch; + cparams.n_ubatch = n_ubatch; + cparams.type_k = type_k; + cparams.type_v = type_v; + cparams.offload_kqv = !no_kv_offload; + cparams.flash_attn = flash_attn; + cparams.embeddings = embeddings; + cparams.op_offload = !disable_op_offload; return cparams; } From 0d53a04b8182169da0fb66e3b166ee096ef5a226 Mon Sep 17 00:00:00 2001 From: David Huang <1969802+hjc4869@users.noreply.github.com> Date: Fri, 9 May 2025 21:09:45 +0800 Subject: [PATCH 3/4] Fix default value of ggml_backend_sched_new --- ggml/include/ggml-backend.h | 2 +- tests/test-opt.cpp | 2 +- tools/mtmd/clip.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 51e5d9af19307..778927f68217a 100644 --- 
a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -248,7 +248,7 @@ extern "C" { // preferrably to run on the same backend as the buffer ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, false); + sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true); // initialize buffers from a max size graph (optional) reserve_graph = build_graph(sched, max_batch_size); diff --git a/tests/test-opt.cpp b/tests/test-opt.cpp index 8e55af81f624a..1bc1605113571 100644 --- a/tests/test-opt.cpp +++ b/tests/test-opt.cpp @@ -853,7 +853,7 @@ int main(void) { backends_modded.insert(backends_modded.end(), backends.begin(), backends.end()); ggml_backend_sched_t backend_sched = ggml_backend_sched_new( - backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false, false); + backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false, true); printf("Backend %zu/%zu: %s\n", i + 1, dev_count, ggml_backend_dev_name(devs[i])); printf(" Device description: %s\n", ggml_backend_dev_description(devs[i])); diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index f344e53edab29..10bc2577ac244 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -369,7 +369,7 @@ struct clip_ctx { backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu)); sched.reset( - ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, false) + ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true) ); } From eae3a319dae975249d16e18e525340bfc8d4c66d Mon Sep 17 00:00:00 2001 From: David Huang <1969802+hjc4869@users.noreply.github.com> Date: Sun, 11 May 2025 11:06:21 +0800 Subject: [PATCH 4/4] Rename to --no-op-offload for consistency --- common/arg.cpp | 6 ++-- common/common.cpp | 2 +- common/common.h | 2 +- tools/llama-bench/llama-bench.cpp | 46 +++++++++++++++---------------- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 779ac0ac48abd..a4493141ff196 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2437,10 +2437,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } )); add_opt(common_arg( - {"--disable-op-offload"}, - string_format("disable offloading host tensor operations to device (default: %s)", params.disable_op_offload ? "true" : "false"), + {"--no-op-offload"}, + string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? 
"true" : "false"), [](common_params & params) { - params.disable_op_offload = true; + params.no_op_offload = true; } )); add_opt(common_arg( diff --git a/common/common.cpp b/common/common.cpp index 82f87ccd28ae2..710bf1fe2a3c5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1113,7 +1113,7 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.offload_kqv = !params.no_kv_offload; cparams.flash_attn = params.flash_attn; cparams.no_perf = params.no_perf; - cparams.op_offload = !params.disable_op_offload; + cparams.op_offload = !params.no_op_offload; if (params.reranking) { cparams.embeddings = true; diff --git a/common/common.h b/common/common.h index b8c38b4ae8118..cc373050c2bed 100644 --- a/common/common.h +++ b/common/common.h @@ -332,7 +332,7 @@ struct common_params { bool no_kv_offload = false; // disable KV offloading bool warmup = true; // warmup run bool check_tensors = false; // validate tensor data - bool disable_op_offload= false; // globally disable offload host tensor operations to device + bool no_op_offload = false; // globally disable offload host tensor operations to device bool single_turn = false; // single turn chat conversation diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index a36374fa85863..5d26b506bd935 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -219,7 +219,7 @@ struct cmd_params { std::vector> tensor_buft_overrides; std::vector use_mmap; std::vector embeddings; - std::vector disable_op_offload; + std::vector no_op_offload; ggml_numa_strategy numa; int reps; ggml_sched_priority prio; @@ -254,7 +254,7 @@ static const cmd_params cmd_params_defaults = { /* tensor_buft_overrides*/ { std::vector{{nullptr,nullptr}} }, /* use_mmap */ { true }, /* embeddings */ { false }, - /* disable_op_offload */ { false }, + /* no_op_offload */ { false }, /* numa */ GGML_NUMA_STRATEGY_DISABLED, /* reps */ 5, /* prio */ GGML_SCHED_PRIO_NORMAL, @@ -313,7 +313,7 @@ static void print_usage(int /* argc */, char ** argv) { join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -ts, --tensor-split (default: 0)\n"); printf(" -ot --override-tensors =;... 
(default: disabled)\n"); - printf(" -dopo, --disable-op-offload (default: 0)\n"); + printf(" -nopo, --no-op-offload (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio); printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay); @@ -591,13 +591,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = string_split(argv[i], split_delim); params.embeddings.insert(params.embeddings.end(), p.begin(), p.end()); - } else if (arg == "-dopo" || arg == "--disable-op-offload") { + } else if (arg == "-nopo" || arg == "--no-op-offload") { if (++i >= argc) { invalid_param = true; break; } auto p = string_split(argv[i], split_delim); - params.disable_op_offload.insert(params.disable_op_offload.end(), p.begin(), p.end()); + params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end()); } else if (arg == "-ts" || arg == "--tensor-split") { if (++i >= argc) { invalid_param = true; @@ -804,8 +804,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; } - if (params.disable_op_offload.empty()) { - params.disable_op_offload = cmd_params_defaults.disable_op_offload; + if (params.no_op_offload.empty()) { + params.no_op_offload = cmd_params_defaults.no_op_offload; } if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; @@ -846,7 +846,7 @@ struct cmd_params_instance { std::vector tensor_buft_overrides; bool use_mmap; bool embeddings; - bool disable_op_offload; + bool no_op_offload; llama_model_params to_llama_mparams() const { llama_model_params mparams = llama_model_default_params(); @@ -916,7 +916,7 @@ struct cmd_params_instance { cparams.offload_kqv = !no_kv_offload; cparams.flash_attn = flash_attn; cparams.embeddings = embeddings; - cparams.op_offload = !disable_op_offload; + cparams.op_offload = !no_op_offload; return cparams; } @@ -936,7 +936,7 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & ot : params.tensor_buft_overrides) for (const auto & mmp : params.use_mmap) for (const auto & embd : params.embeddings) - for (const auto & dopo : params.disable_op_offload) + for (const auto & nopo : params.no_op_offload) for (const auto & nb : params.n_batch) for (const auto & nub : params.n_ubatch) for (const auto & tk : params.type_k) @@ -975,7 +975,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .tensor_buft_overrides = */ ot, /* .use_mmap = */ mmp, /* .embeddings = */ embd, - /* .disable_op_offload= */ dopo, + /* .no_op_offload= */ nopo, }; instances.push_back(instance); } @@ -1007,7 +1007,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .tensor_buft_overrides = */ ot, /* .use_mmap = */ mmp, /* .embeddings = */ embd, - /* .disable_op_offload= */ dopo, + /* .no_op_offload= */ nopo, }; instances.push_back(instance); } @@ -1039,7 +1039,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .tensor_buft_overrides = */ ot, /* .use_mmap = */ mmp, /* .embeddings = */ embd, - /* .disable_op_offload= */ dopo, + /* .no_op_offload= */ nopo, }; instances.push_back(instance); } @@ -1075,7 +1075,7 @@ struct test { std::vector tensor_buft_overrides; bool use_mmap; bool embeddings; - bool disable_op_offload; + bool no_op_offload; int n_prompt; int n_gen; int n_depth; @@ -1109,7 +1109,7 @@ struct test { tensor_buft_overrides = inst.tensor_buft_overrides; use_mmap = 
inst.use_mmap; embeddings = inst.embeddings; - disable_op_offload = inst.disable_op_offload; + no_op_offload = inst.no_op_offload; n_prompt = inst.n_prompt; n_gen = inst.n_gen; n_depth = inst.n_depth; @@ -1155,7 +1155,7 @@ struct test { "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", - "use_mmap", "embeddings", "disable_op_offload", "n_prompt", "n_gen", "n_depth", "test_time", + "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", }; return fields; @@ -1167,7 +1167,7 @@ struct test { if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || - field == "avg_ns" || field == "stddev_ns" || field == "disable_op_offload") { + field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") { return INT; } if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || @@ -1243,7 +1243,7 @@ struct test { tensor_buft_overrides_str, std::to_string(use_mmap), std::to_string(embeddings), - std::to_string(disable_op_offload), + std::to_string(no_op_offload), std::to_string(n_prompt), std::to_string(n_gen), std::to_string(n_depth), @@ -1426,7 +1426,7 @@ struct markdown_printer : public printer { if (field == "test") { return 15; } - if (field == "disable_op_offload") { + if (field == "no_op_offload") { return 4; } @@ -1460,8 +1460,8 @@ struct markdown_printer : public printer { if (field == "embeddings") { return "embd"; } - if (field == "disable_op_offload") { - return "dopo"; + if (field == "no_op_offload") { + return "nopo"; } if (field == "tensor_split") { return "ts"; @@ -1531,8 +1531,8 @@ struct markdown_printer : public printer { if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) { fields.emplace_back("embeddings"); } - if (params.disable_op_offload.size() > 1 || params.disable_op_offload != cmd_params_defaults.disable_op_offload) { - fields.emplace_back("disable_op_offload"); + if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) { + fields.emplace_back("no_op_offload"); } fields.emplace_back("test"); fields.emplace_back("t/s");
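The context-level switch introduced by this series can also be set programmatically; a hedged sketch follows (model loading and error handling omitted, and llama_init_from_model is assumed to be the context constructor in use — substitute whatever entry point the consuming code already calls):

    #include "llama.h"

    // Programmatic equivalent of --no-op-offload: create a context with
    // host-op offloading disabled. `model` is assumed to be loaded elsewhere.
    static struct llama_context * ctx_without_op_offload(struct llama_model * model) {
        struct llama_context_params cparams = llama_context_default_params();
        cparams.op_offload = false;  // default is true (offload host tensor ops to device)
        return llama_init_from_model(model, cparams);
    }

On the command line, tools that use the common argument parser gain --no-op-offload, and llama-bench takes a comma-separated list via -nopo (e.g. -nopo 0,1) so both settings can be compared in a single run.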