tools/server/server.cpp (59 changes: 30 additions & 29 deletions)
@@ -1686,14 +1686,13 @@ struct server_slot {
         llama_state_seq_get_data_ext(ctx, cur->data.data(), cur_size, id, 0);
     }
 
-    void prompt_load(server_prompt_cache & prompt_cache, const server_tokens & tokens) {
+    bool prompt_load(server_prompt_cache & prompt_cache, const server_tokens & tokens) {
         bool res = prompt_cache.load(prompt, tokens, ctx, id);
         if (!res) {
             SLT_WRN(*this, "%s", "failed to load prompt from cache\n");
-
-            llama_memory_seq_rm(llama_get_memory(ctx), id, -1, -1);
-            prompt.tokens.clear();
         }
+
+        return res;
     }
 
     std::vector<common_adapter_lora_info> lora;
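The change in this hunk is a contract change: prompt_load now reports a cache miss instead of recovering internally, and cleanup moves to the caller (see the clear_slot call sites in the hunks below). A minimal sketch of that shape, using a hypothetical stand-in type rather than the real server_slot and server_prompt_cache:

```cpp
// Sketch of the new "report, don't recover" contract. Slot is a
// hypothetical stand-in, not the real server_slot.
#include <cstdio>
#include <vector>

struct Slot {
    std::vector<int> tokens;

    // returns false on a cache miss and leaves the slot untouched;
    // recovery is now the caller's responsibility
    bool prompt_load(bool cache_hit) {
        if (!cache_hit) {
            std::fprintf(stderr, "failed to load prompt from cache\n");
        }
        return cache_hit;
    }

    void clear() { tokens.clear(); } // stands in for clear_slot()
};

int main() {
    Slot slot{{1, 2, 3}};
    if (!slot.prompt_load(/*cache_hit=*/false)) {
        slot.clear(); // the call site decides how to recover
    }
    return slot.tokens.empty() ? 0 : 1;
}
```

Keeping recovery at the call site lets different callers react differently to the same failure, which the later hunks rely on.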
@@ -2339,7 +2338,6 @@ struct server_context {
 
     llama_batch batch {};
 
-    bool clean_kv_cache = true;
     bool add_bos_token = true;
 
     int32_t n_ctx; // total context for all clients / slots
@@ -2701,7 +2699,10 @@ struct server_context {
         const int64_t t_start = ggml_time_us();
 
         ret->prompt_save(*prompt_cache);
-        ret->prompt_load(*prompt_cache, task.tokens);
+
+        if (!ret->prompt_load(*prompt_cache, task.tokens)) {
+            clear_slot(*ret);
+        }
 
         prompt_cache->update();
 
@@ -2712,12 +2713,21 @@
         return ret;
     }
 
-    // return true if at least one slot has been purged
+    void clear_slot(server_slot & slot) const {
+        GGML_ASSERT(!slot.is_processing());
+
+        SLT_WRN(slot, "clearing slot with %zu tokens\n", slot.prompt.tokens.size());
+
+        llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
+        slot.prompt.tokens.clear();
+    }
+
+    // return true if at least one slot has been cleared
     // TODO: improve logic
-    //       - smarter decision which slot to purge (LRU or longest prompt?)
+    //       - smarter decision which slot to clear (LRU or longest prompt?)
     //       - move slot to level 2 cache instead of removing?
     //       - instead of purging, try to store and resume later?
-    bool try_purge_idle_slots() {
+    bool try_clear_idle_slots() {
         bool res = false;
 
         if (!params_base.kv_unified) {
@@ -2732,12 +2742,11 @@
             if (slot.prompt.n_tokens() > 0) {
                 SRV_WRN("purging slot %d with %zu tokens\n", slot.id, slot.prompt.tokens.size());
 
-                llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
-                slot.prompt.tokens.clear();
+                clear_slot(slot);
 
                 res = true;
 
-                // purge slots one by one
+                // clear slots one by one
                 break;
            }
        }
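These two hunks carry the core of the refactor: every inline llama_memory_seq_rm plus tokens.clear() pair is folded into the new clear_slot helper, and try_clear_idle_slots evicts at most one idle slot per call. A self-contained toy model of the pattern, with stand-in types in place of server_slot and the llama memory API:

```cpp
// Toy model of the consolidation: one helper owns both halves of the
// cleanup, and idle eviction clears at most one slot per call.
#include <cassert>
#include <vector>

struct Slot {
    int  id = 0;
    bool processing = false;
    std::vector<int> tokens; // cached prompt tokens
};

struct Server {
    std::vector<Slot> slots;

    // stand-in for llama_memory_seq_rm(ctx, id, -1, -1)
    void memory_seq_rm(int /*seq_id*/) {}

    void clear_slot(Slot & slot) {
        assert(!slot.processing);   // mirrors GGML_ASSERT in the patch
        memory_seq_rm(slot.id);     // drop the KV-cache sequence...
        slot.tokens.clear();        // ...and the token bookkeeping, together
    }

    bool try_clear_idle_slots() {
        for (Slot & slot : slots) {
            if (!slot.processing && !slot.tokens.empty()) {
                clear_slot(slot);
                return true;        // clear slots one by one
            }
        }
        return false;
    }
};

int main() {
    Server srv;
    srv.slots = {{0, false, {1, 2}}, {1, true, {3}}};
    return srv.try_clear_idle_slots() ? 0 : 1; // clears idle slot 0 only
}
```

Routing every call site through one helper removes the risk of clearing the token list while forgetting the KV sequence, or vice versa.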
@@ -2847,14 +2856,6 @@ struct server_context {
         return true;
     }
 
-    void kv_cache_clear() {
-        SRV_DBG("%s", "clearing KV cache\n");
-
-        // clear the entire KV cache
-        llama_memory_clear(llama_get_memory(ctx), true);
-        clean_kv_cache = false;
-    }
-
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
         const std::string token_str = result.text_to_send;
@@ -3442,8 +3443,8 @@ struct server_context {
 
                     // Erase token cache
                     const size_t n_erased = slot->prompt.tokens.size();
-                    llama_memory_seq_rm(llama_get_memory(ctx), slot->id, -1, -1);
-                    slot->prompt.tokens.clear();
+
+                    clear_slot(*slot);
 
                     auto res = std::make_unique<server_task_result_slot_erase>();
                     res->id = task.id;
@@ -3476,9 +3477,6 @@ struct server_context {
 
         if (all_idle) {
             SRV_INF("%s", "all slots are idle\n");
-            if (clean_kv_cache) {
-                kv_cache_clear();
-            }
 
             return;
         }
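Together with the kv_cache_clear removal above, this hunk drops the old whole-cache wipe that ran once all slots went idle; cleanup is now strictly per sequence. A toy contrast of the two behaviors, assuming a simplified map-based stand-in for the KV cache (not the llama.cpp memory API):

```cpp
// Toy contrast: the removed global clear vs. the per-sequence clear
// that replaces it.
#include <map>
#include <vector>

using SeqId   = int;
using KvCells = std::vector<int>;

struct ToyMemory {
    std::map<SeqId, KvCells> seqs;

    // old path: kv_cache_clear() wiped every sequence at once,
    // including other slots' still-useful cached prompts
    void clear_all() { seqs.clear(); }

    // new path: clear_slot() removes only the offending slot's
    // sequence, leaving the rest of the cache warm
    void seq_rm(SeqId id) { seqs.erase(id); }
};

int main() {
    ToyMemory mem;
    mem.seqs = {{0, {1, 2}}, {1, {3, 4}}};
    mem.seq_rm(0);                          // slot 1's cache survives
    return mem.seqs.count(1) == 1 ? 0 : 1;
}
```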
@@ -3872,12 +3870,11 @@ struct server_context {
 
                 if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
                     SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);
-                    llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
+
+                    clear_slot(slot);
 
                     // there is no common part left
                     slot.n_prompt_tokens_cache = 0;
-
-                    slot.prompt.tokens.clear();
                 }
 
                 // check if we should process the image
@@ -4107,6 +4104,10 @@ struct server_context {
                 if (slot.is_processing()) {
                     send_error(slot, err);
                     slot.release();
+
+                    // note: it's complicated to keep track of how much of the current batch has been
+                    //       processed before the error occurred, so we simply clear the entire context
+                    clear_slot(slot);
                 }
             }
 
@@ -4115,7 +4116,7 @@
             }
 
             // retry with half the batch size to try to find a free slot in the KV cache
-            if (!try_purge_idle_slots()) {
+            if (!try_clear_idle_slots()) {
                 n_batch /= 2;
            }
 
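The last hunk is the consumer of try_clear_idle_slots: when decoding cannot find a free KV-cache slot, the server first tries to evict an idle slot and only halves the batch size when nothing could be cleared. A self-contained sketch of that retry policy, with a stubbed decode standing in for llama_decode:

```cpp
// Sketch of the retry policy: evict an idle slot first, halve the
// batch only as a fallback (stubbed decode, not the real llama_decode).
#include <cstdio>

enum class Decode { ok, no_kv_slot };

static int  g_free_kv   = 8;    // pretend KV-cache headroom, in tokens
static bool g_idle_slot = true; // pretend one idle slot holds cache space

static Decode try_decode(int n_batch) {
    return n_batch <= g_free_kv ? Decode::ok : Decode::no_kv_slot;
}

static bool try_clear_idle_slots() {
    if (!g_idle_slot) return false;
    g_idle_slot = false;
    g_free_kv  += 8;            // eviction frees cache space
    return true;
}

int main() {
    int n_batch = 32;
    while (n_batch > 0) {
        if (try_decode(n_batch) == Decode::ok) {
            std::printf("decoded with n_batch = %d\n", n_batch);
            return 0;
        }
        // retry with half the batch size only if no idle slot could be cleared
        if (!try_clear_idle_slots()) {
            n_batch /= 2;
        }
    }
    return 1;
}
```

Trying eviction before shrinking the batch preserves throughput: halving n_batch slows every slot, while clearing one idle slot costs only a cold cache for that slot.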