From b9511616089330c8bcbb2bccbcbdc6d1dc045f01 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 14 Nov 2025 12:02:37 +0200
Subject: [PATCH 1/2] server : handle context overflow during decode

---
 tools/server/server.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 535d2c450e21e..a1ffba1473a96 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -4107,6 +4107,9 @@ struct server_context {
                             if (slot.is_processing()) {
                                 send_error(slot, err);
                                 slot.release();
+
+                                llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
+                                slot.prompt.tokens.clear();
                             }
                         }
 

From 741baaf63e71950f523112aa0de75c9b19294ccf Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 14 Nov 2025 14:02:54 +0200
Subject: [PATCH 2/2] server : minor refactor

---
 tools/server/server.cpp | 60 ++++++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 31 deletions(-)

diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index a1ffba1473a96..9e2679999794a 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -1686,14 +1686,13 @@ struct server_slot {
         llama_state_seq_get_data_ext(ctx, cur->data.data(), cur_size, id, 0);
     }
 
-    void prompt_load(server_prompt_cache & prompt_cache, const server_tokens & tokens) {
+    bool prompt_load(server_prompt_cache & prompt_cache, const server_tokens & tokens) {
         bool res = prompt_cache.load(prompt, tokens, ctx, id);
         if (!res) {
             SLT_WRN(*this, "%s", "failed to load prompt from cache\n");
-
-            llama_memory_seq_rm(llama_get_memory(ctx), id, -1, -1);
-            prompt.tokens.clear();
         }
+
+        return res;
     }
 
     std::vector<common_adapter_lora_info> lora;
@@ -2339,7 +2338,6 @@ struct server_context {
 
     llama_batch batch {};
 
-    bool clean_kv_cache = true;
     bool add_bos_token = true;
 
     int32_t n_ctx; // total context for all clients / slots
@@ -2701,7 +2699,10 @@ struct server_context {
             const int64_t t_start = ggml_time_us();
 
             ret->prompt_save(*prompt_cache);
-            ret->prompt_load(*prompt_cache, task.tokens);
+
+            if (!ret->prompt_load(*prompt_cache, task.tokens)) {
+                clear_slot(*ret);
+            }
 
             prompt_cache->update();
 
@@ -2712,12 +2713,21 @@ struct server_context {
         return ret;
     }
 
-    // return true if at least one slot has been purged
+    void clear_slot(server_slot & slot) const {
+        GGML_ASSERT(!slot.is_processing());
+
+        SLT_WRN(slot, "clearing slot with %zu tokens\n", slot.prompt.tokens.size());
+
+        llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
+        slot.prompt.tokens.clear();
+    }
+
+    // return true if at least one slot has been cleared
     // TODO: improve logic
-    //       - smarter decision which slot to purge (LRU or longest prompt?)
+    //       - smarter decision which slot to clear (LRU or longest prompt?)
     //       - move slot to level 2 cache instead of removing?
     //       - instead of purging, try to store and resume later?
-    bool try_purge_idle_slots() {
+    bool try_clear_idle_slots() {
         bool res = false;
 
         if (!params_base.kv_unified) {
@@ -2732,12 +2742,11 @@ struct server_context {
             if (slot.prompt.n_tokens() > 0) {
                 SRV_WRN("purging slot %d with %zu tokens\n", slot.id, slot.prompt.tokens.size());
 
-                llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
-                slot.prompt.tokens.clear();
+                clear_slot(slot);
 
                 res = true;
 
-                // purge slots one by one
+                // clear slots one by one
                 break;
             }
         }
@@ -2847,14 +2856,6 @@ struct server_context {
         return true;
     }
 
-    void kv_cache_clear() {
-        SRV_DBG("%s", "clearing KV cache\n");
-
-        // clear the entire KV cache
-        llama_memory_clear(llama_get_memory(ctx), true);
-        clean_kv_cache = false;
-    }
-
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
         const std::string token_str = result.text_to_send;
@@ -3442,8 +3443,8 @@ struct server_context {
 
                 // Erase token cache
                 const size_t n_erased = slot->prompt.tokens.size();
-                llama_memory_seq_rm(llama_get_memory(ctx), slot->id, -1, -1);
-                slot->prompt.tokens.clear();
+
+                clear_slot(*slot);
 
                 auto res = std::make_unique<server_task_result_slot_erase>();
                 res->id = task.id;
@@ -3476,9 +3477,6 @@ struct server_context {
 
         if (all_idle) {
             SRV_INF("%s", "all slots are idle\n");
-            if (clean_kv_cache) {
-                kv_cache_clear();
-            }
             return;
         }
 
@@ -3872,12 +3870,11 @@ struct server_context {
 
                     if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
                         SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);
-                        llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
+
+                        clear_slot(slot);
 
                         // there is no common part left
                         slot.n_prompt_tokens_cache = 0;
-
-                        slot.prompt.tokens.clear();
                     }
 
                     // check if we should process the image
@@ -4108,8 +4105,9 @@ struct server_context {
                                 send_error(slot, err);
                                 slot.release();
 
-                                llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
-                                slot.prompt.tokens.clear();
+                                // note: it's complicated to keep track of how much of the current batch has been
+                                // processed before the error occurred, so we simply clear the entire context
+                                clear_slot(slot);
                             }
                         }
 
@@ -4118,7 +4116,7 @@ struct server_context {
                 }
 
                 // retry with half the batch size to try to find a free slot in the KV cache
-                if (!try_purge_idle_slots()) {
+                if (!try_clear_idle_slots()) {
                     n_batch /= 2;
                 }
 