From 199a8384221d2ec11d66bc3ace7d0135f7151d1f Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 17 Apr 2025 14:13:29 -0700 Subject: [PATCH 1/3] threading: support for GGML_SCHED_PRIO_LOW, update thread info on Windows to avoid throttling We talked about adding LOW priority for GGML threads in the original threadpool PR. It might be useful for some cases to avoid contention. Latest Windows ARM64 releases started parking (offlining) the CPU cores more aggressively which results in suboptimal performance with n_threads > 4. To deal with that we now disable Power Throttling for our threads for the NORMAL and higher priorities. Co-authored-by: Diego Devesa --- common/arg.cpp | 4 ++-- common/common.cpp | 2 ++ ggml/include/ggml.h | 1 + ggml/src/ggml-cpu/ggml-cpu.c | 21 +++++++++++++++++++++ tools/llama-bench/llama-bench.cpp | 2 +- 5 files changed, 27 insertions(+), 3 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index b6cb0aa9180c1..cfa9878f90730 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1348,9 +1348,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); add_opt(common_arg( {"--prio"}, "N", - string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), + string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority), [](common_params & params, int prio) { - if (prio < 0 || prio > 3) { + if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) { throw std::invalid_argument("invalid value"); } params.cpuparams.priority = (enum ggml_sched_priority) prio; diff --git a/common/common.cpp b/common/common.cpp index d80c47bc393fa..4cc40ed8b37a4 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -203,6 +203,7 @@ bool set_process_priority(enum ggml_sched_priority prio) { DWORD p = NORMAL_PRIORITY_CLASS; switch (prio) { + case GGML_SCHED_PRIO_LOW: p = 
BELOW_NORMAL_PRIORITY_CLASS; break; case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break; case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break; case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break; @@ -228,6 +229,7 @@ bool set_process_priority(enum ggml_sched_priority prio) { int p = 0; switch (prio) { + case GGML_SCHED_PRIO_LOW: p = 5; break; case GGML_SCHED_PRIO_NORMAL: p = 0; break; case GGML_SCHED_PRIO_MEDIUM: p = -5; break; case GGML_SCHED_PRIO_HIGH: p = -10; break; diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 21ee8a1410a4e..2226aadcff893 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2181,6 +2181,7 @@ extern "C" { // scheduling priorities enum ggml_sched_priority { + GGML_SCHED_PRIO_LOW = -1, GGML_SCHED_PRIO_NORMAL, GGML_SCHED_PRIO_MEDIUM, GGML_SCHED_PRIO_HIGH, diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 1dc425fef1419..276112ef8eb48 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2418,12 +2418,30 @@ static bool ggml_thread_apply_priority(int32_t prio) { // This is up to the applications. DWORD p = THREAD_PRIORITY_NORMAL; switch (prio) { + case GGML_SCHED_PRIO_LOW: p = THREAD_PRIORITY_BELOW_NORMAL; break; case GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break; case GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break; case GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break; case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break; } + if (prio != GGML_SCHED_PRIO_LOW) { + // Tell Windows that this thread should not be throttled (needs its own CPU core). 
+ // Newer Windows 11 ARM64 versions aggresively park (offline) CPU cores and often place + // all our threads onto the first 4 cores which results in terrible performance with + // n_threads > 4 + THREAD_POWER_THROTTLING_STATE p; + ZeroMemory(&p, sizeof(p)); + p.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION; + p.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED; + p.StateMask = 0; + + if (!SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &p, sizeof(p))) { + GGML_LOG_DEBUG("failed to disable thread power throttling %d : (%d)\n", prio, (int) GetLastError()); + return false; + } + } + if (prio == GGML_SCHED_PRIO_NORMAL) { // Keep inherited policy/priority return true; @@ -2451,6 +2469,8 @@ static bool ggml_thread_apply_priority(int32_t prio) { struct sched_param p; int32_t policy = SCHED_OTHER; switch (prio) { + // TODO: there seems to be no way to set lower prio on Apple platforms + case GGML_SCHED_PRIO_LOW: policy = SCHED_OTHER; p.sched_priority = 0; break; case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; @@ -2507,6 +2527,7 @@ static bool ggml_thread_apply_priority(int32_t prio) { struct sched_param p; int32_t policy = SCHED_OTHER; switch (prio) { + case GGML_SCHED_PRIO_LOW: policy = SCHED_BATCH; p.sched_priority = 0; break; case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 06196cf24fc89..098cf853c7c20 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -315,7 +315,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" --numa numa 
mode (default: disabled)\n"); printf(" -r, --repetitions number of times to repeat each test (default: %d)\n", cmd_params_defaults.reps); - printf(" --prio <0|1|2|3> process/thread priority (default: %d)\n", + printf(" --prio <-1,0|1|2|3> process/thread priority (default: %d)\n", cmd_params_defaults.prio); printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n", cmd_params_defaults.delay); From 9087dd2664d259d768db68421e401d2a3f2ac071 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 17 Apr 2025 14:13:29 -0700 Subject: [PATCH 2/3] threading: disable SetThreadInfo() calls for older Windows versions --- ggml/src/ggml-cpu/ggml-cpu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 276112ef8eb48..c7426df2b851b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2427,19 +2427,21 @@ static bool ggml_thread_apply_priority(int32_t prio) { if (prio != GGML_SCHED_PRIO_LOW) { // Tell Windows that this thread should not be throttled (needs its own CPU core). 
- // Newer Windows 11 ARM64 versions aggresively park (offline) CPU cores and often place + // Newer Windows 11 versions aggresively park (offline) CPU cores and often place // all our threads onto the first 4 cores which results in terrible performance with // n_threads > 4 - THREAD_POWER_THROTTLING_STATE p; - ZeroMemory(&p, sizeof(p)); - p.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION; - p.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED; - p.StateMask = 0; - - if (!SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &p, sizeof(p))) { + #if _WIN32_WINNT >= 0x0602 + THREAD_POWER_THROTTLING_STATE t; + ZeroMemory(&t, sizeof(t)); + t.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION; + t.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED; + t.StateMask = 0; + + if (!SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &t, sizeof(t))) { GGML_LOG_DEBUG("failed to disable thread power throttling %d : (%d)\n", prio, (int) GetLastError()); return false; } + #endif } if (prio == GGML_SCHED_PRIO_NORMAL) { From ac35e50c161a5794a7f691944346e9abeb9aeb9c Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 31 May 2025 15:38:37 -0700 Subject: [PATCH 3/3] Update tools/llama-bench/llama-bench.cpp Co-authored-by: Diego Devesa --- tools/llama-bench/llama-bench.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 098cf853c7c20..803630d2650ed 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -315,7 +315,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" --numa numa mode (default: disabled)\n"); printf(" -r, --repetitions number of times to repeat each test (default: %d)\n", cmd_params_defaults.reps); - printf(" --prio <-1,0|1|2|3> process/thread priority (default: %d)\n", + printf(" --prio <-1|0|1|2|3> process/thread priority (default: %d)\n", cmd_params_defaults.prio); printf(" --delay <0...N> 
(seconds) delay between each test (default: %d)\n", cmd_params_defaults.delay);