From 07d6954f6374e6dfaaac33804d4c7df613d7a045 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 13 Oct 2025 13:07:37 +0300 Subject: [PATCH 1/2] server : dynamic token limit for prompt cache --- tools/server/server.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index cf12805b4998a..0537c7911942f 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -1586,12 +1586,19 @@ struct server_prompt_cache { } if (limit_tokens > 0) { - while (states.size() > 1 && n_tokens() > limit_tokens) { + // average size per token + const float size_per_token = std::max(1.0f, float(size()) / (std::max(1, n_tokens()))); + + // dynamically increase the token limit if it can fit in the memory limit + const size_t limit_tokens_cur = limit_size > 0 ? std::max<size_t>(limit_tokens, limit_size/size_per_token) : limit_tokens; + + while (states.size() > 1 && n_tokens() > limit_tokens_cur) { if (states.empty()) { break; } - SRV_WRN(" - cache token limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0)); + SRV_WRN(" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n", + limit_tokens, limit_tokens_cur, states.front().size() / (1024.0 * 1024.0)); states.pop_front(); } @@ -1601,7 +1608,8 @@ struct server_prompt_cache { states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens); for (const auto & state : states) { - SRV_WRN(" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n", (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0)); + SRV_WRN(" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n", + (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0)); } } }; From 63924a9daa541dd37a16ba98e055e680167acad6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 13 Oct 2025 22:29:23 +0300 
Subject: [PATCH 2/2] cont : print estimated token limit --- tools/server/server.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 0537c7911942f..77969d24e13e1 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -1585,13 +1585,13 @@ struct server_prompt_cache { } } - if (limit_tokens > 0) { - // average size per token - const float size_per_token = std::max(1.0f, float(size()) / (std::max(1, n_tokens()))); + // average size per token + const float size_per_token = std::max(1.0f, float(size()) / (std::max(1, n_tokens()))); - // dynamically increase the token limit if it can fit in the memory limit - const size_t limit_tokens_cur = limit_size > 0 ? std::max<size_t>(limit_tokens, limit_size/size_per_token) : limit_tokens; + // dynamically increase the token limit if it can fit in the memory limit + const size_t limit_tokens_cur = limit_size > 0 ? std::max<size_t>(limit_tokens, limit_size/size_per_token) : limit_tokens; + if (limit_tokens > 0) { while (states.size() > 1 && n_tokens() > limit_tokens_cur) { if (states.empty()) { break; @@ -1604,8 +1604,8 @@ struct server_prompt_cache { } } - SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens)\n", - states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens); + SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n", + states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens, limit_tokens_cur); for (const auto & state : states) { SRV_WRN(" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n",