From 07d6954f6374e6dfaaac33804d4c7df613d7a045 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 13 Oct 2025 13:07:37 +0300
Subject: [PATCH 1/2] server : dynamic token limit for prompt cache

---
 tools/server/server.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index cf12805b4998a..0537c7911942f 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -1586,12 +1586,19 @@ struct server_prompt_cache {
         }
 
         if (limit_tokens > 0) {
-            while (states.size() > 1 && n_tokens() > limit_tokens) {
+            // average size per token
+            const float size_per_token = std::max<float>(1.0f, float(size()) / (std::max<size_t>(1, n_tokens())));
+
+            // dynamically increase the token limit if it can fit in the memory limit
+            const size_t limit_tokens_cur = limit_size > 0 ? std::max<size_t>(limit_tokens, limit_size/size_per_token) : limit_tokens;
+
+            while (states.size() > 1 && n_tokens() > limit_tokens_cur) {
                 if (states.empty()) {
                     break;
                 }
 
-                SRV_WRN(" - cache token limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0));
+                SRV_WRN(" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n",
+                        limit_tokens, limit_tokens_cur, states.front().size() / (1024.0 * 1024.0));
 
                 states.pop_front();
             }
@@ -1601,7 +1608,8 @@ struct server_prompt_cache {
                 states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens);
 
         for (const auto & state : states) {
-            SRV_WRN("   - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n", (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
+            SRV_WRN("   - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n",
+                    (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
         }
     }
 };

From 63924a9daa541dd37a16ba98e055e680167acad6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 13 Oct 2025 22:29:23 +0300
Subject: [PATCH 2/2] cont : print estimated token limit

---
 tools/server/server.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 0537c7911942f..77969d24e13e1 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -1585,13 +1585,13 @@ struct server_prompt_cache {
             }
         }
 
-        if (limit_tokens > 0) {
-            // average size per token
-            const float size_per_token = std::max<float>(1.0f, float(size()) / (std::max<size_t>(1, n_tokens())));
+        // average size per token
+        const float size_per_token = std::max<float>(1.0f, float(size()) / (std::max<size_t>(1, n_tokens())));
 
-            // dynamically increase the token limit if it can fit in the memory limit
-            const size_t limit_tokens_cur = limit_size > 0 ? std::max<size_t>(limit_tokens, limit_size/size_per_token) : limit_tokens;
+        // dynamically increase the token limit if it can fit in the memory limit
+        const size_t limit_tokens_cur = limit_size > 0 ? std::max<size_t>(limit_tokens, limit_size/size_per_token) : limit_tokens;
 
+        if (limit_tokens > 0) {
             while (states.size() > 1 && n_tokens() > limit_tokens_cur) {
                 if (states.empty()) {
                     break;
@@ -1604,8 +1604,8 @@ struct server_prompt_cache {
             }
         }
 
-        SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens)\n",
-                states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens);
+        SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n",
+                states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens, limit_tokens_cur);
 
         for (const auto & state : states) {
             SRV_WRN("   - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n",