From b7a42d8572dd8bde8ff7efe00b8efde7a3fa2e6b Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 8 Nov 2025 11:11:36 +0000 Subject: [PATCH 01/24] Add CLI commands for dumping and loading activations in GGUF format This commit adds two new CLI options to the main tool: - --dump-activations: Dumps intermediate layer activations to a GGUF file - --load-activations: Loads and displays activations from a GGUF file The implementation: - Adds activation collection callback that captures tensors from various operations (MUL_MAT, ADD, MUL, NORM, RMS_NORM) - Saves collected activations with metadata (version, count) to GGUF format - Loads and displays activation tensor information from GGUF files This is useful for debugging, analysis, and understanding model behavior by examining intermediate activations during inference. --- common/arg.cpp | 14 +++ common/common.h | 2 + tools/main/main.cpp | 221 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 237 insertions(+) diff --git a/common/arg.cpp b/common/arg.cpp index 5597de121c132..b2392b13f16eb 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1110,6 +1110,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.prompt_cache_ro = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"--dump-activations"}, "FNAME", + "file to dump activations to in GGUF format (default: none)", + [](common_params & params, const std::string & value) { + params.path_dump_activations = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"--load-activations"}, "FNAME", + "file to load activations from in GGUF format (default: none)", + [](common_params & params, const std::string & value) { + params.path_load_activations = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-r", "--reverse-prompt"}, "PROMPT", "halt generation at PROMPT, return control in interactive mode\n", diff --git a/common/common.h b/common/common.h index 54b7849b17448..2f6fdbaffff60 100644 --- a/common/common.h +++ b/common/common.h @@ -328,6 +328,8 @@ struct common_params { std::string system_prompt = ""; // NOLINT std::string prompt_file = ""; // store the external prompt file name // NOLINT std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT + std::string path_dump_activations = ""; // path to GGUF file for dumping activations // NOLINT + std::string path_load_activations = ""; // path to GGUF file for loading activations // NOLINT std::string input_prefix = ""; // string to prefix user inputs with // NOLINT std::string input_suffix = ""; // string to suffix user inputs with // NOLINT std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 498e00e3a5e58..05f5673f489af 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -5,6 +5,7 @@ #include "sampling.h" #include "llama.h" #include "chat.h" +#include "gguf.h" #include #include @@ -14,6 +15,8 @@ #include #include #include +#include +#include #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) #include @@ -41,6 +44,18 @@ static std::vector * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; +// Activation dumping structures +struct activation_tensor { + std::string name; + std::vector ne; // dimensions + ggml_type type; + std::vector data; +}; + +static std::unordered_map g_activations; +static std::mutex g_activations_mutex; +static bool g_dump_activations = false; + static void print_usage(int argc, char ** argv) { (void) argc; @@ -62,6 +77,189 @@ static bool file_is_empty(const std::string & path) { return f.tellg() == 0; } +// Filter tensor name (remove backend prefix and suffix) +static std::string filter_tensor_name(const char * name) { + std::string wname; + const char * p = strchr(name, '#'); + if (p != NULL) { + p = p + 1; + const char * q = strchr(p, '#'); + if (q != NULL) { + wname = std::string(p, q - p); + } else { + wname = p; + } + } else { + wname = name; + } + return wname; +} + +// Callback for collecting activations +static bool activation_collector(struct ggml_tensor * t, bool ask, void * user_data) { + (void) user_data; + + if (!g_dump_activations) { + return false; + } + + // Filter for interesting operations and tensors + if (ask) { + // We're interested in capturing activations from various operations + if (t->op == GGML_OP_MUL_MAT || t->op == GGML_OP_MUL_MAT_ID || + t->op == GGML_OP_ADD || t->op == GGML_OP_MUL || + t->op == GGML_OP_NORM || t->op == GGML_OP_RMS_NORM) { + return true; + } + return false; + } + + // Collect the tensor data + std::lock_guard lock(g_activations_mutex); + + std::string tensor_name = filter_tensor_name(t->name); + if (tensor_name.empty()) { + tensor_name = std::string(t->name); + } + + // Check if we already have this tensor (to avoid duplicates) + if (g_activations.find(tensor_name) != g_activations.end()) { + return true; + } + + activation_tensor act; + act.name = tensor_name; + act.type = t->type; + + // Store dimensions + for (int i = 0; i < GGML_MAX_DIMS; i++) { + act.ne.push_back(t->ne[i]); + } + + // Copy tensor data + const size_t tensor_size = ggml_nbytes(t); + act.data.resize(tensor_size); + + const bool is_host = ggml_backend_buffer_is_host(t->buffer); + if (is_host) { + memcpy(act.data.data(), t->data, tensor_size); + } else { + ggml_backend_tensor_get(t, act.data.data(), 0, tensor_size); + } + + g_activations[tensor_name] = std::move(act); + + return true; +} + +// Save collected activations to GGUF file +static bool save_activations_to_gguf(const std::string & filename) { + if (g_activations.empty()) { + LOG_ERR("No activations collected to save\n"); + return false; + } + + LOG("Saving %zu activations to %s\n", g_activations.size(), filename.c_str()); + + struct gguf_context * ctx = gguf_init_empty(); + + // Add metadata + gguf_set_val_u32(ctx, "activation.version", 1); + gguf_set_val_u32(ctx, "activation.count", (uint32_t)g_activations.size()); + + // Create a ggml context for tensor management + struct ggml_init_params params = { + /*.mem_size =*/ 1024ull*1024ull*1024ull, // 1GB should be enough for metadata + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, // We already have the data, don't allocate + }; + + struct ggml_context * ctx_data = ggml_init(params); + + // Add each activation tensor + for (auto & pair : g_activations) { + activation_tensor & act = pair.second; + + // Create tensor + struct ggml_tensor * tensor = ggml_new_tensor(ctx_data, act.type, GGML_MAX_DIMS, act.ne.data()); + ggml_set_name(tensor, act.name.c_str()); + + // Set tensor data pointer (non-const since ggml_tensor->data is void*) + tensor->data = act.data.data(); + + // Add to GGUF + gguf_add_tensor(ctx, tensor); + } + + // Write to file + gguf_write_to_file(ctx, filename.c_str(), false); + + LOG("Successfully saved %zu activations\n", g_activations.size()); + + ggml_free(ctx_data); + gguf_free(ctx); + + return true; +} + +// Load activations from GGUF file +static bool load_activations_from_gguf(const std::string & filename) { + LOG("Loading activations from %s\n", filename.c_str()); + + struct ggml_context * ctx_data = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ &ctx_data, + }; + + struct gguf_context * ctx = gguf_init_from_file(filename.c_str(), params); + + if (!ctx) { + LOG_ERR("Failed to load activations from %s\n", filename.c_str()); + return false; + } + + // Read metadata + const int n_kv = gguf_get_n_kv(ctx); + LOG("Activation file contains %d metadata entries\n", n_kv); + + for (int i = 0; i < n_kv; i++) { + const char * key = gguf_get_key(ctx, i); + const enum gguf_type type = gguf_get_kv_type(ctx, i); + + if (strcmp(key, "activation.count") == 0 && type == GGUF_TYPE_UINT32) { + uint32_t count = gguf_get_val_u32(ctx, i); + LOG(" activation.count = %u\n", count); + } else if (strcmp(key, "activation.version") == 0 && type == GGUF_TYPE_UINT32) { + uint32_t version = gguf_get_val_u32(ctx, i); + LOG(" activation.version = %u\n", version); + } + } + + // Read tensors + const int n_tensors = gguf_get_n_tensors(ctx); + LOG("Loaded %d activation tensors:\n", n_tensors); + + for (int i = 0; i < n_tensors; i++) { + const char * name = gguf_get_tensor_name(ctx, i); + struct ggml_tensor * tensor = ggml_get_tensor(ctx_data, name); + + if (tensor) { + LOG(" [%d] %s: type=%s, dims=[", i, name, ggml_type_name(tensor->type)); + for (int j = 0; j < GGML_MAX_DIMS; j++) { + if (j > 0) LOG(", "); + LOG("%lld", (long long)tensor->ne[j]); + } + LOG("], size=%zu bytes\n", ggml_nbytes(tensor)); + } + } + + gguf_free(ctx); + + return true; +} + #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) static void sigint_handler(int signo) { if (signo == SIGINT) { @@ -147,6 +345,21 @@ int main(int argc, char ** argv) { return 1; } + // Handle activation loading + if (!params.path_load_activations.empty()) { + if (!load_activations_from_gguf(params.path_load_activations)) { + LOG_ERR("%s: failed to load activations\n", __func__); + return 1; + } + } + + // Setup activation dumping callback + if (!params.path_dump_activations.empty()) { + g_dump_activations = true; + params.cb_eval = activation_collector; + LOG("Activation dumping enabled, will save to: %s\n", params.path_dump_activations.c_str()); + } + auto * mem = llama_get_memory(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -979,6 +1192,14 @@ int main(int argc, char ** argv) { } LOG("\n\n"); + + // Save activations if dumping was enabled + if (g_dump_activations && !params.path_dump_activations.empty()) { + if (!save_activations_to_gguf(params.path_dump_activations)) { + LOG_ERR("%s: failed to save activations\n", __func__); + } + } + common_perf_print(ctx, smpl); common_sampler_free(smpl); From 2ac5ded6eb433cafee938dbd808ee7b24bdaeff3 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 8 Nov 2025 11:21:38 +0000 Subject: [PATCH 02/24] Add interactive commands for saving/loading activations during conversation This commit adds two interactive commands that can be used during a chat session: - /\/save - Triggers activation collection for the next inference pass and saves the collected activations to a GGUF file - /\/load - Loads and displays activations from a GGUF file Key features: - Activations are collected only for one inference pass when /\/save is used - The callback is automatically enabled in interactive mode - Clear user feedback when activations are being collected and saved - Helpful command information displayed when entering interactive mode This allows users to interactively capture activations at specific points in a conversation, making it much more flexible for debugging and analysis. Example usage: > /\/save response1.gguf > What is the capital of France? [activations collected and saved after response] > /\/load response1.gguf [displays tensor information] --- tools/main/main.cpp | 63 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 05f5673f489af..a408b1f446bf4 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -55,6 +55,8 @@ struct activation_tensor { static std::unordered_map g_activations; static std::mutex g_activations_mutex; static bool g_dump_activations = false; +static bool g_dump_activations_once = false; +static std::string g_activation_save_path = ""; static void print_usage(int argc, char ** argv) { (void) argc; @@ -99,7 +101,7 @@ static std::string filter_tensor_name(const char * name) { static bool activation_collector(struct ggml_tensor * t, bool ask, void * user_data) { (void) user_data; - if (!g_dump_activations) { + if (!g_dump_activations && !g_dump_activations_once) { return false; } @@ -358,6 +360,9 @@ int main(int argc, char ** argv) { g_dump_activations = true; params.cb_eval = activation_collector; LOG("Activation dumping enabled, will save to: %s\n", params.path_dump_activations.c_str()); + } else if (params.interactive) { + // Enable callback in interactive mode for on-demand activation dumping + params.cb_eval = activation_collector; } auto * mem = llama_get_memory(ctx); @@ -637,6 +642,9 @@ int main(int argc, char ** argv) { if (params.interactive) { LOG_INF("%s: interactive mode on.\n", __func__); + LOG_INF("Special commands:\n"); + LOG_INF(" /\\/save - Save activations from next inference to GGUF file\n"); + LOG_INF(" /\\/load - Load and display activations from GGUF file\n"); if (!params.antiprompt.empty()) { for (const auto & antiprompt : params.antiprompt) { @@ -1026,6 +1034,18 @@ int main(int argc, char ** argv) { LOG_DBG("found an EOG token\n"); if (params.interactive) { + // Save activations if one-time dump was requested + if (g_dump_activations_once && !g_activation_save_path.empty()) { + LOG("\nSaving collected activations...\n"); + if (save_activations_to_gguf(g_activation_save_path)) { + LOG("Activations saved successfully!\n"); + } else { + LOG_ERR("Failed to save activations\n"); + } + g_dump_activations_once = false; + g_activation_save_path = ""; + } + if (!params.antiprompt.empty()) { // tokenize and inject first reverse prompt const auto first_antiprompt = common_tokenize(ctx, params.antiprompt.front(), false, true); @@ -1094,6 +1114,47 @@ int main(int argc, char ** argv) { buffer.pop_back(); } + // Handle special activation commands + if (buffer.rfind("/\\/save ", 0) == 0) { + // Extract filename + std::string filename = buffer.substr(8); // Skip "/\/save " + // Trim whitespace + filename.erase(0, filename.find_first_not_of(" \t\n\r\f\v")); + filename.erase(filename.find_last_not_of(" \t\n\r\f\v") + 1); + + if (!filename.empty()) { + LOG("Activations will be saved to: %s\n", filename.c_str()); + LOG("Collecting activations for the next inference pass...\n"); + + // Clear previous activations and prepare for new collection + { + std::lock_guard lock(g_activations_mutex); + g_activations.clear(); + } + + g_activation_save_path = filename; + g_dump_activations_once = true; + } else { + LOG_ERR("Error: No filename specified for /\\/save command\n"); + } + buffer.clear(); + } else if (buffer.rfind("/\\/load ", 0) == 0) { + // Extract filename + std::string filename = buffer.substr(8); // Skip "/\/load " + // Trim whitespace + filename.erase(0, filename.find_first_not_of(" \t\n\r\f\v")); + filename.erase(filename.find_last_not_of(" \t\n\r\f\v") + 1); + + if (!filename.empty()) { + if (!load_activations_from_gguf(filename)) { + LOG_ERR("Failed to load activations from: %s\n", filename.c_str()); + } + } else { + LOG_ERR("Error: No filename specified for /\\/load command\n"); + } + buffer.clear(); + } + if (buffer.empty()) { // Enter key on empty line lets the user pass control back LOG_DBG("empty line, passing control back\n"); } else { // Add tokens to embd only if the input buffer is non-empty From efac1f50b7abc09ae0286465d393fba3eaac9d67 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 8 Nov 2025 11:37:55 +0000 Subject: [PATCH 03/24] Fix activation save/load commands - prevent unwanted inference and add debugging The previous implementation had a bug where after entering the /\/save or /\/load command, the code would continue processing instead of looping back to wait for the next user input. Changes: - Added explicit 'continue' after processing commands to loop back properly - Set is_interacting=true to ensure we stay in interactive mode - Improved user feedback messages (clearer instructions) - Added debug logging to help diagnose activation collection issues: * Log when callback collects activations * Log activation count before saving * Better error messages if no activations collected This fixes the issue where /\/save would trigger inference immediately instead of waiting for the next user prompt. --- tools/main/main.cpp | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/tools/main/main.cpp b/tools/main/main.cpp index a408b1f446bf4..de89f24aca32d 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -119,6 +119,8 @@ static bool activation_collector(struct ggml_tensor * t, bool ask, void * user_d // Collect the tensor data std::lock_guard lock(g_activations_mutex); + LOG_DBG("Collecting activation from tensor: %s\n", t->name); + std::string tensor_name = filter_tensor_name(t->name); if (tensor_name.empty()) { tensor_name = std::string(t->name); @@ -156,8 +158,11 @@ static bool activation_collector(struct ggml_tensor * t, bool ask, void * user_d // Save collected activations to GGUF file static bool save_activations_to_gguf(const std::string & filename) { + LOG_DBG("save_activations_to_gguf called with %zu activations\n", g_activations.size()); + if (g_activations.empty()) { - LOG_ERR("No activations collected to save\n"); + LOG_ERR("No activations collected to save (collected %zu tensors)\n", g_activations.size()); + LOG_ERR("This might mean the callback wasn't triggered during inference.\n"); return false; } @@ -1036,11 +1041,13 @@ int main(int argc, char ** argv) { if (params.interactive) { // Save activations if one-time dump was requested if (g_dump_activations_once && !g_activation_save_path.empty()) { - LOG("\nSaving collected activations...\n"); + LOG("\nSaving collected activations to %s...\n", g_activation_save_path.c_str()); + LOG_DBG("g_dump_activations_once=%d, collected %zu activations\n", + g_dump_activations_once, g_activations.size()); if (save_activations_to_gguf(g_activation_save_path)) { - LOG("Activations saved successfully!\n"); + LOG("Activations saved successfully to %s!\n", g_activation_save_path.c_str()); } else { - LOG_ERR("Failed to save activations\n"); + LOG_ERR("Failed to save activations to %s\n", g_activation_save_path.c_str()); } g_dump_activations_once = false; g_activation_save_path = ""; @@ -1123,8 +1130,9 @@ int main(int argc, char ** argv) { filename.erase(filename.find_last_not_of(" \t\n\r\f\v") + 1); if (!filename.empty()) { + LOG("\n"); LOG("Activations will be saved to: %s\n", filename.c_str()); - LOG("Collecting activations for the next inference pass...\n"); + LOG("Please enter your next prompt to trigger activation collection.\n"); // Clear previous activations and prepare for new collection { @@ -1137,7 +1145,9 @@ int main(int argc, char ** argv) { } else { LOG_ERR("Error: No filename specified for /\\/save command\n"); } - buffer.clear(); + // Keep is_interacting true and continue to wait for next input + is_interacting = true; + continue; } else if (buffer.rfind("/\\/load ", 0) == 0) { // Extract filename std::string filename = buffer.substr(8); // Skip "/\/load " @@ -1146,13 +1156,16 @@ int main(int argc, char ** argv) { filename.erase(filename.find_last_not_of(" \t\n\r\f\v") + 1); if (!filename.empty()) { + LOG("\n"); if (!load_activations_from_gguf(filename)) { LOG_ERR("Failed to load activations from: %s\n", filename.c_str()); } } else { LOG_ERR("Error: No filename specified for /\\/load command\n"); } - buffer.clear(); + // Keep is_interacting true and continue to wait for next input + is_interacting = true; + continue; } if (buffer.empty()) { // Enter key on empty line lets the user pass control back From 453874390d88893b98f11849e1774f5131db2c32 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 8 Nov 2025 11:48:05 +0000 Subject: [PATCH 04/24] Critical fix: Set activation callback before context creation The previous code was setting params.cb_eval AFTER the context was created by common_init_from_params(params). This meant the context never received the callback, so no activations were being collected. This commit moves the callback setup to happen BEFORE context creation, which is essential for the callback to be properly registered with the llama context. Changes: - Moved callback setup before common_init_from_params() call - Added debug logging to confirm callback is enabled in interactive mode - Added comment explaining why the order matters This fixes the issue where 0 activations were being collected because the callback was never being triggered during inference. --- tools/main/main.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tools/main/main.cpp b/tools/main/main.cpp index de89f24aca32d..0641c1eed40dc 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -340,6 +340,18 @@ int main(int argc, char ** argv) { std::vector chat_msgs; + // Setup activation dumping callback BEFORE creating context + // The callback must be set on params before common_init_from_params is called + if (!params.path_dump_activations.empty()) { + g_dump_activations = true; + params.cb_eval = activation_collector; + LOG("Activation dumping enabled, will save to: %s\n", params.path_dump_activations.c_str()); + } else if (params.interactive) { + // Enable callback in interactive mode for on-demand activation dumping + params.cb_eval = activation_collector; + LOG_DBG("Activation callback enabled for interactive mode\n"); + } + // load the model and apply lora adapter, if any LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); common_init_result llama_init = common_init_from_params(params); @@ -360,16 +372,6 @@ int main(int argc, char ** argv) { } } - // Setup activation dumping callback - if (!params.path_dump_activations.empty()) { - g_dump_activations = true; - params.cb_eval = activation_collector; - LOG("Activation dumping enabled, will save to: %s\n", params.path_dump_activations.c_str()); - } else if (params.interactive) { - // Enable callback in interactive mode for on-demand activation dumping - params.cb_eval = activation_collector; - } - auto * mem = llama_get_memory(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); From de61f108064cf1006e936f96b9a82004fbd3dae2 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 8 Nov 2025 11:53:00 +0000 Subject: [PATCH 05/24] Add extensive debugging and disable warmup for activation callback This commit adds debugging to help diagnose why the callback isn't being triggered: 1. Set params.cb_eval_user_data explicitly to nullptr 2. Disable warmup (params.warmup = false) as warmup may interfere with callbacks - following the pattern from imatrix tool 3. Add debug logging to show when callback is first invoked 4. Add debug logging when callback asks about tensors This will help us understand if: - The callback is being invoked at all - The callback is seeing tensors but filtering them out - There's an issue with the callback registration To test with debug output, run: LLAMA_LOG_LEVEL=5 ./llama-cli -m model.gguf --interactive --- tools/main/main.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 0641c1eed40dc..d75638e390e7b 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -101,6 +101,13 @@ static std::string filter_tensor_name(const char * name) { static bool activation_collector(struct ggml_tensor * t, bool ask, void * user_data) { (void) user_data; + // Log that callback is being called + static bool first_call = true; + if (first_call) { + LOG_DBG("Activation callback is being invoked!\n"); + first_call = false; + } + if (!g_dump_activations && !g_dump_activations_once) { return false; } @@ -111,6 +118,7 @@ static bool activation_collector(struct ggml_tensor * t, bool ask, void * user_d if (t->op == GGML_OP_MUL_MAT || t->op == GGML_OP_MUL_MAT_ID || t->op == GGML_OP_ADD || t->op == GGML_OP_MUL || t->op == GGML_OP_NORM || t->op == GGML_OP_RMS_NORM) { + LOG_DBG("Callback asking about tensor %s (op=%d)\n", t->name, t->op); return true; } return false; @@ -345,10 +353,14 @@ int main(int argc, char ** argv) { if (!params.path_dump_activations.empty()) { g_dump_activations = true; params.cb_eval = activation_collector; + params.cb_eval_user_data = nullptr; + params.warmup = false; // Disable warmup to ensure callback works LOG("Activation dumping enabled, will save to: %s\n", params.path_dump_activations.c_str()); } else if (params.interactive) { // Enable callback in interactive mode for on-demand activation dumping params.cb_eval = activation_collector; + params.cb_eval_user_data = nullptr; + params.warmup = false; // Disable warmup to ensure callback works LOG_DBG("Activation callback enabled for interactive mode\n"); } From e728b4dee1e5be549fce26cf70282838b47d196f Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 8 Nov 2025 11:56:43 +0000 Subject: [PATCH 06/24] Improve activation collection logging and handle GPU tensors better This commit improves visibility into the activation collection process, especially when using GPU offloading: Changes: 1. Changed first callback log from LOG_DBG to LOG - now always visible to immediately show if callback is being invoked 2. Added collection counter to track how many activations are collected 3. Changed duplicate handling - instead of skipping, create unique names (tensor_1, tensor_2, etc.) to capture all activations 4. Added detailed logging showing if tensors are on CPU vs GPU 5. Added periodic progress logging every 10 collections 6. Improved debug output to show operation type The code already handles GPU tensors correctly using ggml_backend_tensor_get() to copy from GPU memory to host, following the same pattern as the imatrix tool. This will help diagnose why activations might not be collected when using GPU offloading. --- tools/main/main.cpp | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/tools/main/main.cpp b/tools/main/main.cpp index d75638e390e7b..bc7987a8af536 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -104,7 +104,7 @@ static bool activation_collector(struct ggml_tensor * t, bool ask, void * user_d // Log that callback is being called static bool first_call = true; if (first_call) { - LOG_DBG("Activation callback is being invoked!\n"); + LOG("Activation callback is being invoked!\n"); first_call = false; } @@ -127,20 +127,25 @@ static bool activation_collector(struct ggml_tensor * t, bool ask, void * user_d // Collect the tensor data std::lock_guard lock(g_activations_mutex); - LOG_DBG("Collecting activation from tensor: %s\n", t->name); + static int collect_count = 0; + collect_count++; + + LOG_DBG("Collecting activation #%d from tensor: %s (op=%d)\n", collect_count, t->name, t->op); std::string tensor_name = filter_tensor_name(t->name); if (tensor_name.empty()) { tensor_name = std::string(t->name); } - // Check if we already have this tensor (to avoid duplicates) - if (g_activations.find(tensor_name) != g_activations.end()) { - return true; + // Make unique name if we already have this tensor + std::string unique_name = tensor_name; + int counter = 1; + while (g_activations.find(unique_name) != g_activations.end()) { + unique_name = tensor_name + "_" + std::to_string(counter++); } activation_tensor act; - act.name = tensor_name; + act.name = unique_name; act.type = t->type; // Store dimensions @@ -148,18 +153,27 @@ static bool activation_collector(struct ggml_tensor * t, bool ask, void * user_d act.ne.push_back(t->ne[i]); } - // Copy tensor data + // Copy tensor data - handle both CPU and GPU tensors const size_t tensor_size = ggml_nbytes(t); act.data.resize(tensor_size); + // Check if tensor is on host (CPU) or device (GPU) const bool is_host = ggml_backend_buffer_is_host(t->buffer); + + LOG_DBG(" Tensor %s: size=%zu bytes, is_host=%d\n", unique_name.c_str(), tensor_size, is_host); + if (is_host) { memcpy(act.data.data(), t->data, tensor_size); } else { + // Tensor is on GPU, need to copy it to host memory ggml_backend_tensor_get(t, act.data.data(), 0, tensor_size); } - g_activations[tensor_name] = std::move(act); + g_activations[unique_name] = std::move(act); + + if (collect_count % 10 == 0) { + LOG_DBG(" Collected %d activations so far (total unique: %zu)\n", collect_count, g_activations.size()); + } return true; } From c76249963f5a71094977b54543bbc064191a0a3d Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 8 Nov 2025 12:08:58 +0000 Subject: [PATCH 07/24] CRITICAL FIX: Disable graph reuse to enable activation callbacks ROOT CAUSE FOUND: The callback was never being invoked because llama.cpp reuses the computation graph on subsequent inferences for performance. When the graph is reused, the callback is NOT set (see llama-context.cpp:778 - callback only set when graph is rebuilt). SOLUTION: Set the environment variable LLAMA_GRAPH_REUSE_DISABLE=1 when activation collection is enabled. This forces the graph to be rebuilt on every inference, which ensures the callback gets set and invoked. Changes: 1. Call setenv("LLAMA_GRAPH_REUSE_DISABLE", "1", 1) in both: - When --dump-activations is used - When in interactive mode (for /\/save command) 2. Added log message to inform users graph reuse is disabled Note: Disabling graph reuse will slightly reduce performance, but this is necessary for the callback mechanism to work. This only affects runs where activation dumping is enabled. This should finally fix the issue where 0 activations were being collected! --- tools/main/main.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/main/main.cpp b/tools/main/main.cpp index bc7987a8af536..fcfeced0fc4b9 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -364,18 +364,25 @@ int main(int argc, char ** argv) { // Setup activation dumping callback BEFORE creating context // The callback must be set on params before common_init_from_params is called + // IMPORTANT: Graph reuse must be disabled for callbacks to work properly if (!params.path_dump_activations.empty()) { g_dump_activations = true; params.cb_eval = activation_collector; params.cb_eval_user_data = nullptr; params.warmup = false; // Disable warmup to ensure callback works + // Disable graph reuse so callback gets set on every inference + setenv("LLAMA_GRAPH_REUSE_DISABLE", "1", 1); LOG("Activation dumping enabled, will save to: %s\n", params.path_dump_activations.c_str()); + LOG("Graph reuse disabled to ensure callback is invoked\n"); } else if (params.interactive) { // Enable callback in interactive mode for on-demand activation dumping params.cb_eval = activation_collector; params.cb_eval_user_data = nullptr; params.warmup = false; // Disable warmup to ensure callback works + // Disable graph reuse so callback gets set on every inference + setenv("LLAMA_GRAPH_REUSE_DISABLE", "1", 1); LOG_DBG("Activation callback enabled for interactive mode\n"); + LOG("Graph reuse disabled to ensure callback works\n"); } // load the model and apply lora adapter, if any From 219497831a2d15f523540631b3e222f93b0759e2 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 8 Nov 2025 12:25:04 +0000 Subject: [PATCH 08/24] Implement clean LLM state save/load using llama_state API Replace callback-based approach with simpler state serialization: - Remove all callback and graph reuse disabling code - Use llama_state_get_data/set_data for complete state capture - Save/restore: KV cache, RNG state, logits, embeddings - No performance penalty from disabled optimizations Features: - CLI flags: --dump-activations, --load-activations - Interactive: /\/save , /\/load - GGUF format with metadata (version, size, type) - Exact state restoration for conversation continuity This provides the ability to save and restore the LLM to exactly the same conversational state, enabling conversation checkpointing and experimentation with different conversation branches. --- tools/main/main-state-save.cpp | 125 +++++++++++++ tools/main/main.cpp | 328 ++++++++++----------------------- 2 files changed, 219 insertions(+), 234 deletions(-) create mode 100644 tools/main/main-state-save.cpp diff --git a/tools/main/main-state-save.cpp b/tools/main/main-state-save.cpp new file mode 100644 index 0000000000000..331570d517125 --- /dev/null +++ b/tools/main/main-state-save.cpp @@ -0,0 +1,125 @@ +// State save/load functions for main.cpp +// This file contains the simplified implementation using llama_state_get_data/set_data + +#include "llama.h" +#include "log.h" +#include "gguf.h" +#include +#include + +// Save complete LLM state to GGUF file +// This includes: KV cache, logits, embeddings, RNG state +static bool save_llm_state_to_gguf(llama_context * ctx, const std::string & filename) { + LOG("\nSaving LLM state to %s...\n", filename.c_str()); + + // Get the size of the state + const size_t state_size = llama_state_get_size(ctx); + LOG("State size: %zu bytes (%.2f MB)\n", state_size, state_size / (1024.0 * 1024.0)); + + // Allocate buffer and get state data + std::vector state_data(state_size); + const size_t written = llama_state_get_data(ctx, state_data.data(), state_size); + + if (written != state_size) { + LOG_ERR("Failed to get state data: got %zu bytes, expected %zu\n", written, state_size); + return false; + } + + // Create GGUF context + struct gguf_context * gguf_ctx = gguf_init_empty(); + + // Add metadata + gguf_set_val_u32(gguf_ctx, "llm_state.version", 1); + gguf_set_val_u64(gguf_ctx, "llm_state.size", state_size); + gguf_set_val_str(gguf_ctx, "llm_state.type", "kv_cache_rng_logits_embeddings"); + + // For GGUF, we need to add the state as a tensor + // Create a ggml context for the tensor + struct ggml_init_params params = { + /*.mem_size =*/ state_size + 1024*1024, // Extra space for tensor metadata + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, // We already have the data + }; + + struct ggml_context * ggml_ctx = ggml_init(params); + + // Create a 1D tensor to hold the state data + int64_t ne[4] = {(int64_t)state_size, 1, 1, 1}; + struct ggml_tensor * state_tensor = ggml_new_tensor(ggml_ctx, GGML_TYPE_I8, 1, ne); + ggml_set_name(state_tensor, "llm_state_data"); + state_tensor->data = state_data.data(); + + // Add tensor to GGUF + gguf_add_tensor(gguf_ctx, state_tensor); + + // Write to file + gguf_write_to_file(gguf_ctx, filename.c_str(), false); + + LOG("Successfully saved LLM state (%zu bytes)\n", written); + + // Cleanup + ggml_free(ggml_ctx); + gguf_free(gguf_ctx); + + return true; +} + +// Load complete LLM state from GGUF file +static bool load_llm_state_from_gguf(llama_context * ctx, const std::string & filename) { + LOG("\nLoading LLM state from %s...\n", filename.c_str()); + + struct ggml_context * ggml_ctx = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ &ggml_ctx, + }; + + struct gguf_context * gguf_ctx = gguf_init_from_file(filename.c_str(), params); + + if (!gguf_ctx) { + LOG_ERR("Failed to load state file: %s\n", filename.c_str()); + return false; + } + + // Read metadata + const int n_kv = gguf_get_n_kv(gguf_ctx); + uint32_t version = 0; + uint64_t state_size = 0; + + for (int i = 0; i < n_kv; i++) { + const char * key = gguf_get_key(gguf_ctx, i); + const enum gguf_type type = gguf_get_kv_type(gguf_ctx, i); + + if (strcmp(key, "llm_state.version") == 0 && type == GGUF_TYPE_UINT32) { + version = gguf_get_val_u32(gguf_ctx, i); + } else if (strcmp(key, "llm_state.size") == 0 && type == GGUF_TYPE_UINT64) { + state_size = gguf_get_val_u64(gguf_ctx, i); + } + } + + LOG("State version: %u, size: %lu bytes (%.2f MB)\n", version, state_size, state_size / (1024.0 * 1024.0)); + + // Get the state tensor + struct ggml_tensor * state_tensor = ggml_get_tensor(ggml_ctx, "llm_state_data"); + if (!state_tensor) { + LOG_ERR("State tensor not found in file\n"); + gguf_free(gguf_ctx); + return false; + } + + // Set the state + const size_t loaded = llama_state_set_data(ctx, (const uint8_t*)state_tensor->data, ggml_nbytes(state_tensor)); + + if (loaded == 0) { + LOG_ERR("Failed to set state data\n"); + gguf_free(gguf_ctx); + return false; + } + + LOG("Successfully loaded LLM state (%zu bytes)\n", loaded); + + gguf_free(gguf_ctx); + + return true; +} diff --git a/tools/main/main.cpp b/tools/main/main.cpp index fcfeced0fc4b9..00ea5ff06d138 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -15,8 +15,6 @@ #include #include #include -#include -#include #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) #include @@ -44,19 +42,9 @@ static std::vector * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; -// Activation dumping structures -struct activation_tensor { - std::string name; - std::vector ne; // dimensions - ggml_type type; - std::vector data; -}; - -static std::unordered_map g_activations; -static std::mutex g_activations_mutex; -static bool g_dump_activations = false; -static bool g_dump_activations_once = false; -static std::string g_activation_save_path = ""; +// State save/load flags for interactive commands +static bool g_save_state_next = false; +static std::string g_state_save_path = ""; static void print_usage(int argc, char ** argv) { (void) argc; @@ -79,212 +67,118 @@ static bool file_is_empty(const std::string & path) { return f.tellg() == 0; } -// Filter tensor name (remove backend prefix and suffix) -static std::string filter_tensor_name(const char * name) { - std::string wname; - const char * p = strchr(name, '#'); - if (p != NULL) { - p = p + 1; - const char * q = strchr(p, '#'); - if (q != NULL) { - wname = std::string(p, q - p); - } else { - wname = p; - } - } else { - wname = name; - } - return wname; -} - -// Callback for collecting activations -static bool activation_collector(struct ggml_tensor * t, bool ask, void * user_data) { - (void) user_data; +// Save complete LLM state (KV cache + RNG + logits + embeddings) to GGUF file +static bool save_llm_state_to_gguf(llama_context * ctx, const std::string & filename) { + LOG("\nSaving LLM state to %s...\n", filename.c_str()); - // Log that callback is being called - static bool first_call = true; - if (first_call) { - LOG("Activation callback is being invoked!\n"); - first_call = false; - } + // Get the size of the state + const size_t state_size = llama_state_get_size(ctx); + LOG("State size: %zu bytes (%.2f MB)\n", state_size, state_size / (1024.0 * 1024.0)); - if (!g_dump_activations && !g_dump_activations_once) { - return false; - } + // Allocate buffer and get state data + std::vector state_data(state_size); + const size_t written = llama_state_get_data(ctx, state_data.data(), state_size); - // Filter for interesting operations and tensors - if (ask) { - // We're interested in capturing activations from various operations - if (t->op == GGML_OP_MUL_MAT || t->op == GGML_OP_MUL_MAT_ID || - t->op == GGML_OP_ADD || t->op == GGML_OP_MUL || - t->op == GGML_OP_NORM || t->op == GGML_OP_RMS_NORM) { - LOG_DBG("Callback asking about tensor %s (op=%d)\n", t->name, t->op); - return true; - } + if (written != state_size) { + LOG_ERR("Failed to get state data: got %zu bytes, expected %zu\n", written, state_size); return false; } - // Collect the tensor data - std::lock_guard lock(g_activations_mutex); - - static int collect_count = 0; - collect_count++; - - LOG_DBG("Collecting activation #%d from tensor: %s (op=%d)\n", collect_count, t->name, t->op); - - std::string tensor_name = filter_tensor_name(t->name); - if (tensor_name.empty()) { - tensor_name = std::string(t->name); - } - - // Make unique name if we already have this tensor - std::string unique_name = tensor_name; - int counter = 1; - while (g_activations.find(unique_name) != g_activations.end()) { - unique_name = tensor_name + "_" + std::to_string(counter++); - } - - activation_tensor act; - act.name = unique_name; - act.type = t->type; - - // Store dimensions - for (int i = 0; i < GGML_MAX_DIMS; i++) { - act.ne.push_back(t->ne[i]); - } - - // Copy tensor data - handle both CPU and GPU tensors - const size_t tensor_size = ggml_nbytes(t); - act.data.resize(tensor_size); - - // Check if tensor is on host (CPU) or device (GPU) - const bool is_host = ggml_backend_buffer_is_host(t->buffer); - - LOG_DBG(" Tensor %s: size=%zu bytes, is_host=%d\n", unique_name.c_str(), tensor_size, is_host); - - if (is_host) { - memcpy(act.data.data(), t->data, tensor_size); - } else { - // Tensor is on GPU, need to copy it to host memory - ggml_backend_tensor_get(t, act.data.data(), 0, tensor_size); - } - - g_activations[unique_name] = std::move(act); - - if (collect_count % 10 == 0) { - LOG_DBG(" Collected %d activations so far (total unique: %zu)\n", collect_count, g_activations.size()); - } - - return true; -} - -// Save collected activations to GGUF file -static bool save_activations_to_gguf(const std::string & filename) { - LOG_DBG("save_activations_to_gguf called with %zu activations\n", g_activations.size()); - - if (g_activations.empty()) { - LOG_ERR("No activations collected to save (collected %zu tensors)\n", g_activations.size()); - LOG_ERR("This might mean the callback wasn't triggered during inference.\n"); - return false; - } - - LOG("Saving %zu activations to %s\n", g_activations.size(), filename.c_str()); - - struct gguf_context * ctx = gguf_init_empty(); + // Create GGUF context + struct gguf_context * gguf_ctx = gguf_init_empty(); // Add metadata - gguf_set_val_u32(ctx, "activation.version", 1); - gguf_set_val_u32(ctx, "activation.count", (uint32_t)g_activations.size()); + gguf_set_val_u32(gguf_ctx, "llm_state.version", 1); + gguf_set_val_u64(gguf_ctx, "llm_state.size", state_size); + gguf_set_val_str(gguf_ctx, "llm_state.type", "kv_cache_rng_logits_embeddings"); - // Create a ggml context for tensor management + // Create a ggml context for the tensor struct ggml_init_params params = { - /*.mem_size =*/ 1024ull*1024ull*1024ull, // 1GB should be enough for metadata + /*.mem_size =*/ state_size + 1024*1024, // Extra space for tensor metadata /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, // We already have the data, don't allocate + /*.no_alloc =*/ true, // We already have the data }; - struct ggml_context * ctx_data = ggml_init(params); - - // Add each activation tensor - for (auto & pair : g_activations) { - activation_tensor & act = pair.second; + struct ggml_context * ggml_ctx = ggml_init(params); - // Create tensor - struct ggml_tensor * tensor = ggml_new_tensor(ctx_data, act.type, GGML_MAX_DIMS, act.ne.data()); - ggml_set_name(tensor, act.name.c_str()); + // Create a 1D tensor to hold the state data + int64_t ne[4] = {(int64_t)state_size, 1, 1, 1}; + struct ggml_tensor * state_tensor = ggml_new_tensor(ggml_ctx, GGML_TYPE_I8, 1, ne); + ggml_set_name(state_tensor, "llm_state_data"); + state_tensor->data = state_data.data(); - // Set tensor data pointer (non-const since ggml_tensor->data is void*) - tensor->data = act.data.data(); - - // Add to GGUF - gguf_add_tensor(ctx, tensor); - } + // Add tensor to GGUF + gguf_add_tensor(gguf_ctx, state_tensor); // Write to file - gguf_write_to_file(ctx, filename.c_str(), false); + gguf_write_to_file(gguf_ctx, filename.c_str(), false); - LOG("Successfully saved %zu activations\n", g_activations.size()); + LOG("Successfully saved LLM state (%zu bytes)\n", written); - ggml_free(ctx_data); - gguf_free(ctx); + // Cleanup + ggml_free(ggml_ctx); + gguf_free(gguf_ctx); return true; } -// Load activations from GGUF file -static bool load_activations_from_gguf(const std::string & filename) { - LOG("Loading activations from %s\n", filename.c_str()); +// Load complete LLM state from GGUF file +static bool load_llm_state_from_gguf(llama_context * ctx, const std::string & filename) { + LOG("\nLoading LLM state from %s...\n", filename.c_str()); - struct ggml_context * ctx_data = NULL; + struct ggml_context * ggml_ctx = NULL; struct gguf_init_params params = { /*.no_alloc = */ false, - /*.ctx = */ &ctx_data, + /*.ctx = */ &ggml_ctx, }; - struct gguf_context * ctx = gguf_init_from_file(filename.c_str(), params); + struct gguf_context * gguf_ctx = gguf_init_from_file(filename.c_str(), params); - if (!ctx) { - LOG_ERR("Failed to load activations from %s\n", filename.c_str()); + if (!gguf_ctx) { + LOG_ERR("Failed to load state file: %s\n", filename.c_str()); return false; } // Read metadata - const int n_kv = gguf_get_n_kv(ctx); - LOG("Activation file contains %d metadata entries\n", n_kv); + const int n_kv = gguf_get_n_kv(gguf_ctx); + uint32_t version = 0; + uint64_t state_size = 0; for (int i = 0; i < n_kv; i++) { - const char * key = gguf_get_key(ctx, i); - const enum gguf_type type = gguf_get_kv_type(ctx, i); - - if (strcmp(key, "activation.count") == 0 && type == GGUF_TYPE_UINT32) { - uint32_t count = gguf_get_val_u32(ctx, i); - LOG(" activation.count = %u\n", count); - } else if (strcmp(key, "activation.version") == 0 && type == GGUF_TYPE_UINT32) { - uint32_t version = gguf_get_val_u32(ctx, i); - LOG(" activation.version = %u\n", version); + const char * key = gguf_get_key(gguf_ctx, i); + const enum gguf_type type = gguf_get_kv_type(gguf_ctx, i); + + if (strcmp(key, "llm_state.version") == 0 && type == GGUF_TYPE_UINT32) { + version = gguf_get_val_u32(gguf_ctx, i); + } else if (strcmp(key, "llm_state.size") == 0 && type == GGUF_TYPE_UINT64) { + state_size = gguf_get_val_u64(gguf_ctx, i); } } - // Read tensors - const int n_tensors = gguf_get_n_tensors(ctx); - LOG("Loaded %d activation tensors:\n", n_tensors); + LOG("State version: %u, size: %lu bytes (%.2f MB)\n", version, state_size, state_size / (1024.0 * 1024.0)); - for (int i = 0; i < n_tensors; i++) { - const char * name = gguf_get_tensor_name(ctx, i); - struct ggml_tensor * tensor = ggml_get_tensor(ctx_data, name); + // Get the state tensor + struct ggml_tensor * state_tensor = ggml_get_tensor(ggml_ctx, "llm_state_data"); + if (!state_tensor) { + LOG_ERR("State tensor not found in file\n"); + gguf_free(gguf_ctx); + return false; + } - if (tensor) { - LOG(" [%d] %s: type=%s, dims=[", i, name, ggml_type_name(tensor->type)); - for (int j = 0; j < GGML_MAX_DIMS; j++) { - if (j > 0) LOG(", "); - LOG("%lld", (long long)tensor->ne[j]); - } - LOG("], size=%zu bytes\n", ggml_nbytes(tensor)); - } + // Set the state + const size_t loaded = llama_state_set_data(ctx, (const uint8_t*)state_tensor->data, ggml_nbytes(state_tensor)); + + if (loaded == 0) { + LOG_ERR("Failed to set state data\n"); + gguf_free(gguf_ctx); + return false; } - gguf_free(ctx); + LOG("Successfully loaded LLM state (%zu bytes)\n", loaded); + LOG("LLM has been restored to the exact state when the save was made\n"); + + gguf_free(gguf_ctx); return true; } @@ -362,29 +256,6 @@ int main(int argc, char ** argv) { std::vector chat_msgs; - // Setup activation dumping callback BEFORE creating context - // The callback must be set on params before common_init_from_params is called - // IMPORTANT: Graph reuse must be disabled for callbacks to work properly - if (!params.path_dump_activations.empty()) { - g_dump_activations = true; - params.cb_eval = activation_collector; - params.cb_eval_user_data = nullptr; - params.warmup = false; // Disable warmup to ensure callback works - // Disable graph reuse so callback gets set on every inference - setenv("LLAMA_GRAPH_REUSE_DISABLE", "1", 1); - LOG("Activation dumping enabled, will save to: %s\n", params.path_dump_activations.c_str()); - LOG("Graph reuse disabled to ensure callback is invoked\n"); - } else if (params.interactive) { - // Enable callback in interactive mode for on-demand activation dumping - params.cb_eval = activation_collector; - params.cb_eval_user_data = nullptr; - params.warmup = false; // Disable warmup to ensure callback works - // Disable graph reuse so callback gets set on every inference - setenv("LLAMA_GRAPH_REUSE_DISABLE", "1", 1); - LOG_DBG("Activation callback enabled for interactive mode\n"); - LOG("Graph reuse disabled to ensure callback works\n"); - } - // load the model and apply lora adapter, if any LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); common_init_result llama_init = common_init_from_params(params); @@ -397,10 +268,10 @@ int main(int argc, char ** argv) { return 1; } - // Handle activation loading + // Handle state loading if (!params.path_load_activations.empty()) { - if (!load_activations_from_gguf(params.path_load_activations)) { - LOG_ERR("%s: failed to load activations\n", __func__); + if (!load_llm_state_from_gguf(ctx, params.path_load_activations)) { + LOG_ERR("%s: failed to load LLM state\n", __func__); return 1; } } @@ -683,8 +554,8 @@ int main(int argc, char ** argv) { if (params.interactive) { LOG_INF("%s: interactive mode on.\n", __func__); LOG_INF("Special commands:\n"); - LOG_INF(" /\\/save - Save activations from next inference to GGUF file\n"); - LOG_INF(" /\\/load - Load and display activations from GGUF file\n"); + LOG_INF(" /\\/save - Save complete LLM state (KV cache, etc.) to GGUF file\n"); + LOG_INF(" /\\/load - Load LLM state from GGUF file to restore exact conversation state\n"); if (!params.antiprompt.empty()) { for (const auto & antiprompt : params.antiprompt) { @@ -1074,18 +945,13 @@ int main(int argc, char ** argv) { LOG_DBG("found an EOG token\n"); if (params.interactive) { - // Save activations if one-time dump was requested - if (g_dump_activations_once && !g_activation_save_path.empty()) { - LOG("\nSaving collected activations to %s...\n", g_activation_save_path.c_str()); - LOG_DBG("g_dump_activations_once=%d, collected %zu activations\n", - g_dump_activations_once, g_activations.size()); - if (save_activations_to_gguf(g_activation_save_path)) { - LOG("Activations saved successfully to %s!\n", g_activation_save_path.c_str()); - } else { - LOG_ERR("Failed to save activations to %s\n", g_activation_save_path.c_str()); + // Save LLM state if requested + if (g_save_state_next && !g_state_save_path.empty()) { + if (!save_llm_state_to_gguf(ctx, g_state_save_path)) { + LOG_ERR("Failed to save LLM state to %s\n", g_state_save_path.c_str()); } - g_dump_activations_once = false; - g_activation_save_path = ""; + g_save_state_next = false; + g_state_save_path = ""; } if (!params.antiprompt.empty()) { @@ -1156,7 +1022,7 @@ int main(int argc, char ** argv) { buffer.pop_back(); } - // Handle special activation commands + // Handle special state save/load commands if (buffer.rfind("/\\/save ", 0) == 0) { // Extract filename std::string filename = buffer.substr(8); // Skip "/\/save " @@ -1166,17 +1032,11 @@ int main(int argc, char ** argv) { if (!filename.empty()) { LOG("\n"); - LOG("Activations will be saved to: %s\n", filename.c_str()); - LOG("Please enter your next prompt to trigger activation collection.\n"); + LOG("LLM state will be saved to: %s\n", filename.c_str()); + LOG("State will be saved after your next prompt and response.\n"); - // Clear previous activations and prepare for new collection - { - std::lock_guard lock(g_activations_mutex); - g_activations.clear(); - } - - g_activation_save_path = filename; - g_dump_activations_once = true; + g_state_save_path = filename; + g_save_state_next = true; } else { LOG_ERR("Error: No filename specified for /\\/save command\n"); } @@ -1192,8 +1052,8 @@ int main(int argc, char ** argv) { if (!filename.empty()) { LOG("\n"); - if (!load_activations_from_gguf(filename)) { - LOG_ERR("Failed to load activations from: %s\n", filename.c_str()); + if (!load_llm_state_from_gguf(ctx, filename)) { + LOG_ERR("Failed to load LLM state from: %s\n", filename.c_str()); } } else { LOG_ERR("Error: No filename specified for /\\/load command\n"); @@ -1302,10 +1162,10 @@ int main(int argc, char ** argv) { LOG("\n\n"); - // Save activations if dumping was enabled - if (g_dump_activations && !params.path_dump_activations.empty()) { - if (!save_activations_to_gguf(params.path_dump_activations)) { - LOG_ERR("%s: failed to save activations\n", __func__); + // Save LLM state if dumping was enabled via CLI flag + if (!params.path_dump_activations.empty()) { + if (!save_llm_state_to_gguf(ctx, params.path_dump_activations)) { + LOG_ERR("%s: failed to save LLM state\n", __func__); } } From 8a709a6bbd3488b1d07a13d3d5ed5c4861d9726f Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 8 Nov 2025 13:38:46 +0000 Subject: [PATCH 09/24] Add runtime temperature control with /\/temp commands Implements interactive temperature get/set functionality: - /\/temp - displays current temperature setting - /\/temp - changes temperature on the fly Technical implementation: - Add common_sampler_get_temp() and common_sampler_set_temp() to sampling API - Set temp works by removing old temperature sampler from chain and replacing it - Preserves dynamic temperature settings (dynatemp_range, exponent) when set - Validates temperature values (must be >= 0.0) This allows users to experiment with different temperature values during a conversation without restarting the program, enabling exploration of how temperature affects model outputs in real-time. --- common/sampling.cpp | 72 +++++++++++++++++++++++++++++++++++++++++++++ common/sampling.h | 4 +++ tools/main/main.cpp | 36 +++++++++++++++++++++++ 3 files changed, 112 insertions(+) diff --git a/common/sampling.cpp b/common/sampling.cpp index c69d525b5b358..9051f08bcb5eb 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -4,6 +4,7 @@ #include "log.h" #include +#include #include #include @@ -599,3 +600,74 @@ std::vector common_sampler_types_from_chars(const std::stri return samplers; } + +// Get current temperature from the sampler +float common_sampler_get_temp(const struct common_sampler * gsmpl) { + if (!gsmpl) { + return 0.0f; + } + return gsmpl->params.temp; +} + +// Set temperature at runtime by replacing the temperature sampler in the chain +bool common_sampler_set_temp(struct common_sampler * gsmpl, float new_temp) { + if (!gsmpl || !gsmpl->chain) { + return false; + } + + // Find the temperature sampler in the chain + const int n_samplers = llama_sampler_chain_n(gsmpl->chain); + int temp_idx = -1; + + for (int i = 0; i < n_samplers; i++) { + struct llama_sampler * s = llama_sampler_chain_get(gsmpl->chain, i); + const char * name = llama_sampler_name(s); + + // Look for "temp" or "temp-ext" sampler + if (strcmp(name, "temp") == 0 || strcmp(name, "temp-ext") == 0) { + temp_idx = i; + break; + } + } + + if (temp_idx == -1) { + // No temperature sampler found - this might happen with mirostat + return false; + } + + // Remove the old temperature sampler + struct llama_sampler * old_temp = llama_sampler_chain_remove(gsmpl->chain, temp_idx); + if (old_temp) { + llama_sampler_free(old_temp); + } + + // Create new temperature sampler + struct llama_sampler * new_temp_sampler; + + // Use temp_ext if dynamic temperature was enabled, otherwise use simple temp + if (gsmpl->params.dynatemp_range > 0.0f) { + new_temp_sampler = llama_sampler_init_temp_ext(new_temp, gsmpl->params.dynatemp_range, gsmpl->params.dynatemp_exponent); + } else { + new_temp_sampler = llama_sampler_init_temp(new_temp); + } + + // Add at the end first + llama_sampler_chain_add(gsmpl->chain, new_temp_sampler); + + // Move it to the correct position by swapping + const int current_n = llama_sampler_chain_n(gsmpl->chain); + for (int i = current_n - 1; i > temp_idx; i--) { + // Remove from end + struct llama_sampler * s = llama_sampler_chain_remove(gsmpl->chain, i); + // Remove the one before + struct llama_sampler * s_prev = llama_sampler_chain_remove(gsmpl->chain, i - 1); + // Add them back in swapped order + llama_sampler_chain_add(gsmpl->chain, s); + llama_sampler_chain_add(gsmpl->chain, s_prev); + } + + // Update the params to reflect the new temperature + gsmpl->params.temp = new_temp; + + return true; +} diff --git a/common/sampling.h b/common/sampling.h index e198eecda3810..b1cadbb133f58 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -99,6 +99,10 @@ std::string common_sampler_print(const struct common_sampler * gsmpl); // get a string representation of the last accepted tokens std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n); +// get/set temperature at runtime +float common_sampler_get_temp(const struct common_sampler * gsmpl); +bool common_sampler_set_temp(struct common_sampler * gsmpl, float temp); + char common_sampler_type_to_chr(enum common_sampler_type cnstr); std::string common_sampler_type_to_str(enum common_sampler_type cnstr); diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 00ea5ff06d138..8f781e70fccbb 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -556,6 +556,8 @@ int main(int argc, char ** argv) { LOG_INF("Special commands:\n"); LOG_INF(" /\\/save - Save complete LLM state (KV cache, etc.) to GGUF file\n"); LOG_INF(" /\\/load - Load LLM state from GGUF file to restore exact conversation state\n"); + LOG_INF(" /\\/temp - Show current temperature setting\n"); + LOG_INF(" /\\/temp - Set temperature to a new value (e.g., /\\/temp 0.7)\n"); if (!params.antiprompt.empty()) { for (const auto & antiprompt : params.antiprompt) { @@ -1061,6 +1063,40 @@ int main(int argc, char ** argv) { // Keep is_interacting true and continue to wait for next input is_interacting = true; continue; + } else if (buffer.rfind("/\\/temp", 0) == 0) { + // Handle temperature get/set command + std::string temp_arg = buffer.substr(7); // Skip "/\/temp" + // Trim whitespace + temp_arg.erase(0, temp_arg.find_first_not_of(" \t\n\r\f\v")); + temp_arg.erase(temp_arg.find_last_not_of(" \t\n\r\f\v") + 1); + + if (temp_arg.empty()) { + // Show current temperature + LOG("\n"); + LOG("Current temperature: %.2f\n", common_sampler_get_temp(smpl)); + } else { + // Set new temperature + try { + float new_temp = std::stof(temp_arg); + if (new_temp < 0.0f) { + LOG_ERR("Error: Temperature must be >= 0.0\n"); + } else { + LOG("\n"); + float old_temp = common_sampler_get_temp(smpl); + LOG("Changing temperature from %.2f to %.2f\n", old_temp, new_temp); + if (common_sampler_set_temp(smpl, new_temp)) { + LOG("Temperature successfully updated to %.2f\n", new_temp); + } else { + LOG_ERR("Failed to update temperature\n"); + } + } + } catch (const std::exception & e) { + LOG_ERR("Error: Invalid temperature value '%s'\n", temp_arg.c_str()); + } + } + // Keep is_interacting true and continue to wait for next input + is_interacting = true; + continue; } if (buffer.empty()) { // Enter key on empty line lets the user pass control back From 5c3afd22b84e89ec4d5f1e6a043123c681f51349 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 8 Nov 2025 14:06:14 +0000 Subject: [PATCH 10/24] Fix temperature sampler replacement logic The previous swapping logic to reposition the sampler in the chain was buggy and not actually changing the temperature. Fixed approach: 1. Remove old temperature sampler at position temp_idx 2. Collect all samplers that come after that position 3. Add new temperature sampler with new value 4. Add back all collected samplers in original order This correctly preserves the sampler chain order while replacing the temperature sampler, so temperature changes now actually take effect. Users should now see dramatic differences in output randomness: - temp 0.1: very deterministic - temp 1.0: balanced - temp 10.0+: increasingly random/creative --- common/sampling.cpp | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 9051f08bcb5eb..c8ae762a0a547 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -641,7 +641,14 @@ bool common_sampler_set_temp(struct common_sampler * gsmpl, float new_temp) { llama_sampler_free(old_temp); } - // Create new temperature sampler + // Collect all samplers that come after the temp position + std::vector samplers_after; + int n_after = llama_sampler_chain_n(gsmpl->chain) - temp_idx; + for (int i = 0; i < n_after; i++) { + samplers_after.push_back(llama_sampler_chain_remove(gsmpl->chain, temp_idx)); + } + + // Create and add new temperature sampler struct llama_sampler * new_temp_sampler; // Use temp_ext if dynamic temperature was enabled, otherwise use simple temp @@ -651,19 +658,11 @@ bool common_sampler_set_temp(struct common_sampler * gsmpl, float new_temp) { new_temp_sampler = llama_sampler_init_temp(new_temp); } - // Add at the end first llama_sampler_chain_add(gsmpl->chain, new_temp_sampler); - // Move it to the correct position by swapping - const int current_n = llama_sampler_chain_n(gsmpl->chain); - for (int i = current_n - 1; i > temp_idx; i--) { - // Remove from end - struct llama_sampler * s = llama_sampler_chain_remove(gsmpl->chain, i); - // Remove the one before - struct llama_sampler * s_prev = llama_sampler_chain_remove(gsmpl->chain, i - 1); - // Add them back in swapped order + // Add back the samplers that came after + for (auto * s : samplers_after) { llama_sampler_chain_add(gsmpl->chain, s); - llama_sampler_chain_add(gsmpl->chain, s_prev); } // Update the params to reflect the new temperature From aa0f238c9ef76f41d86f604b10fbe5507b2defd0 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 8 Nov 2025 14:24:08 +0000 Subject: [PATCH 11/24] Add debug logging to diagnose temperature change issue User reports temperature changes aren't taking effect. Added detailed logging to trace: - Sampler chain composition - Temperature sampler detection - Removal and replacement process - Final chain state This will help identify why temperature modifications aren't affecting model output behavior. --- common/sampling.cpp | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index c8ae762a0a547..31ea25de1c8d8 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -612,6 +612,7 @@ float common_sampler_get_temp(const struct common_sampler * gsmpl) { // Set temperature at runtime by replacing the temperature sampler in the chain bool common_sampler_set_temp(struct common_sampler * gsmpl, float new_temp) { if (!gsmpl || !gsmpl->chain) { + LOG_ERR("%s: invalid sampler or chain\n", __func__); return false; } @@ -619,33 +620,46 @@ bool common_sampler_set_temp(struct common_sampler * gsmpl, float new_temp) { const int n_samplers = llama_sampler_chain_n(gsmpl->chain); int temp_idx = -1; + LOG_INF("%s: searching for temperature sampler in chain of %d samplers\n", __func__, n_samplers); + for (int i = 0; i < n_samplers; i++) { struct llama_sampler * s = llama_sampler_chain_get(gsmpl->chain, i); const char * name = llama_sampler_name(s); + LOG_INF("%s: sampler[%d] = '%s'\n", __func__, i, name); // Look for "temp" or "temp-ext" sampler if (strcmp(name, "temp") == 0 || strcmp(name, "temp-ext") == 0) { temp_idx = i; + LOG_INF("%s: found temperature sampler '%s' at index %d\n", __func__, name, i); break; } } if (temp_idx == -1) { // No temperature sampler found - this might happen with mirostat + LOG_ERR("%s: no temperature sampler found in chain\n", __func__); return false; } + LOG_INF("%s: removing old temperature sampler at index %d\n", __func__, temp_idx); + // Remove the old temperature sampler struct llama_sampler * old_temp = llama_sampler_chain_remove(gsmpl->chain, temp_idx); if (old_temp) { llama_sampler_free(old_temp); + LOG_INF("%s: freed old temperature sampler\n", __func__); } // Collect all samplers that come after the temp position std::vector samplers_after; int n_after = llama_sampler_chain_n(gsmpl->chain) - temp_idx; + LOG_INF("%s: collecting %d samplers after temp position\n", __func__, n_after); + for (int i = 0; i < n_after; i++) { - samplers_after.push_back(llama_sampler_chain_remove(gsmpl->chain, temp_idx)); + struct llama_sampler * s = llama_sampler_chain_remove(gsmpl->chain, temp_idx); + const char * name = llama_sampler_name(s); + LOG_INF("%s: removed sampler '%s'\n", __func__, name); + samplers_after.push_back(s); } // Create and add new temperature sampler @@ -653,20 +667,29 @@ bool common_sampler_set_temp(struct common_sampler * gsmpl, float new_temp) { // Use temp_ext if dynamic temperature was enabled, otherwise use simple temp if (gsmpl->params.dynatemp_range > 0.0f) { + LOG_INF("%s: creating temp-ext sampler with temp=%.2f, range=%.2f, exp=%.2f\n", + __func__, new_temp, gsmpl->params.dynatemp_range, gsmpl->params.dynatemp_exponent); new_temp_sampler = llama_sampler_init_temp_ext(new_temp, gsmpl->params.dynatemp_range, gsmpl->params.dynatemp_exponent); } else { + LOG_INF("%s: creating temp sampler with temp=%.2f\n", __func__, new_temp); new_temp_sampler = llama_sampler_init_temp(new_temp); } llama_sampler_chain_add(gsmpl->chain, new_temp_sampler); + LOG_INF("%s: added new temperature sampler\n", __func__); // Add back the samplers that came after for (auto * s : samplers_after) { + const char * name = llama_sampler_name(s); llama_sampler_chain_add(gsmpl->chain, s); + LOG_INF("%s: re-added sampler '%s'\n", __func__, name); } // Update the params to reflect the new temperature gsmpl->params.temp = new_temp; + LOG_INF("%s: final chain has %d samplers\n", __func__, llama_sampler_chain_n(gsmpl->chain)); + LOG_INF("%s: temperature update complete\n", __func__); + return true; } From bbe6799db7a73d278289cfecb98491c25a38b9b8 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 8 Nov 2025 14:34:32 +0000 Subject: [PATCH 12/24] Add auto KV cache save/load for llama-server faster restarts Implements automatic KV cache persistence for debugging workflows: New CLI flags: - --kv-cache-auto-save Automatically saves all slot KV caches to timestamped directory on server shutdown: _YYYYMMDD_HHMMSS/ - --kv-cache-auto-load Automatically loads all slot KV caches from specified directory on server startup Implementation: - auto_save_kv_cache(): Saves each non-empty slot to slot_N.bin in timestamped directory on server shutdown - auto_load_kv_cache(): Loads each slot from slot_N.bin files on server initialization - Uses llama_state_seq_save_file/load_file for per-slot persistence - Integrated into server lifecycle: load after init(), save in cleanup Usage example: # First run - build up KV cache state llama-server -m model.gguf --kv-cache-auto-save my_cache # Server shutdown creates: my_cache_20250108_143022/ # Second run - restore state instantly llama-server -m model.gguf --kv-cache-auto-load my_cache_20250108_143022 This enables fast server restarts during debugging by preserving the complete conversation/context state across sessions. --- common/arg.cpp | 14 ++++++ common/common.h | 4 ++ tools/server/server.cpp | 109 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 127 insertions(+) diff --git a/common/arg.cpp b/common/arg.cpp index b2392b13f16eb..71e346c4d711d 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2467,6 +2467,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--kv-cache-auto-save"}, "BASE_NAME", + "automatically save all KV cache to BASE_NAME_/ directory on server shutdown (default: disabled)", + [](common_params & params, const std::string & value) { + params.kv_cache_auto_save_base = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--kv-cache-auto-load"}, "DIRNAME", + "automatically load KV cache from specified timestamped directory on server startup (default: disabled)", + [](common_params & params, const std::string & value) { + params.kv_cache_auto_load = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--jinja"}, "use jinja template for chat (default: disabled)", diff --git a/common/common.h b/common/common.h index 2f6fdbaffff60..a5cfcc0c77bed 100644 --- a/common/common.h +++ b/common/common.h @@ -459,6 +459,10 @@ struct common_params { std::string slot_save_path; + // Auto KV cache save/load for faster server restarts + std::string kv_cache_auto_save_base; // base name for auto-saving KV cache on shutdown (with timestamp) + std::string kv_cache_auto_load; // specific timestamped name to load on startup + float slot_prompt_similarity = 0.1f; // batched-bench params diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 164e8cf4e7084..338ff8d6def9a 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -2596,6 +2597,110 @@ struct server_context { /* allow_audio */ mctx ? mtmd_support_audio (mctx) : false, /* enable_thinking */ enable_thinking, }; + + // Auto-load KV cache if requested + if (!params_base.kv_cache_auto_load.empty()) { + auto_load_kv_cache(); + } + } + + // Auto-save KV cache on shutdown with timestamp + void auto_save_kv_cache() { + if (params_base.kv_cache_auto_save_base.empty()) { + return; + } + + // Generate timestamp directory name + auto now = std::chrono::system_clock::now(); + auto time_t = std::chrono::system_clock::to_time_t(now); + std::tm tm_time; +#ifdef _WIN32 + localtime_s(&tm_time, &time_t); +#else + localtime_r(&time_t, &tm_time); +#endif + char timestamp[64]; + std::strftime(timestamp, sizeof(timestamp), "%Y%m%d_%H%M%S", &tm_time); + + std::string dir_name = params_base.kv_cache_auto_save_base + "_" + timestamp; + + SRV_INF("auto-saving KV cache to directory: %s\n", dir_name.c_str()); + + // Create directory +#ifdef _WIN32 + _mkdir(dir_name.c_str()); +#else + mkdir(dir_name.c_str(), 0755); +#endif + + // Save each slot + int saved_count = 0; + for (const server_slot & slot : slots) { + if (slot.prompt.tokens.empty()) { + continue; // Skip empty slots + } + + std::string filepath = dir_name + DIRECTORY_SEPARATOR + "slot_" + std::to_string(slot.id) + ".bin"; + + const llama_tokens & tokens = slot.prompt.tokens.get_text_tokens(); + const size_t token_count = tokens.size(); + + const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot.id, tokens.data(), token_count); + + if (nwrite > 0) { + SRV_INF("saved slot %d: %zu tokens, %zu bytes to %s\n", slot.id, token_count, nwrite, filepath.c_str()); + saved_count++; + } else { + SRV_WRN("failed to save slot %d to %s\n", slot.id, filepath.c_str()); + } + } + + SRV_INF("KV cache auto-save complete: %d slots saved to %s\n", saved_count, dir_name.c_str()); + } + + // Auto-load KV cache on startup from specified directory + void auto_load_kv_cache() { + if (params_base.kv_cache_auto_load.empty()) { + return; + } + + std::string dir_name = params_base.kv_cache_auto_load; + + SRV_INF("auto-loading KV cache from directory: %s\n", dir_name.c_str()); + + int loaded_count = 0; + + // Try to load each slot + for (server_slot & slot : slots) { + std::string filepath = dir_name + DIRECTORY_SEPARATOR + "slot_" + std::to_string(slot.id) + ".bin"; + + // Check if file exists + std::ifstream file(filepath); + if (!file.good()) { + SRV_DBG("slot %d file not found: %s - skipping\n", slot.id, filepath.c_str()); + continue; + } + file.close(); + + llama_tokens tokens; + tokens.resize(slot.n_ctx); + size_t token_count = 0; + + const size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot.id, tokens.data(), tokens.size(), &token_count); + + if (nread > 0 && token_count > 0) { + tokens.resize(token_count); + slot.prompt.tokens.clear(); + slot.prompt.tokens.insert(tokens); + + SRV_INF("loaded slot %d: %zu tokens, %zu bytes from %s\n", slot.id, token_count, nread, filepath.c_str()); + loaded_count++; + } else { + SRV_WRN("failed to load slot %d from %s\n", slot.id, filepath.c_str()); + } + } + + SRV_INF("KV cache auto-load complete: %d slots loaded from %s\n", loaded_count, dir_name.c_str()); } server_slot * get_slot_by_id(int id) { @@ -5672,6 +5777,10 @@ int main(int argc, char ** argv) { // clean up function, to be called before exit auto clean_up = [&svr, &ctx_server]() { SRV_INF("%s: cleaning up before exit...\n", __func__); + + // Auto-save KV cache if enabled + ctx_server.auto_save_kv_cache(); + svr->stop(); ctx_server.queue_results.terminate(); llama_backend_free(); From 21db687e98cb4cfe088c561e643718257ddc884e Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 8 Nov 2025 14:39:31 +0000 Subject: [PATCH 13/24] Add HTTP endpoint to trigger KV cache save on demand New endpoint: POST /save-kv-cache Allows saving KV cache at any point during server operation, not just on shutdown. Useful for creating checkpoints during interactive debugging sessions. Request body (optional): { "dirname": "my_checkpoint" // Custom directory name } Response: { "success": true, "directory": "my_checkpoint", // or timestamped if not specified "message": "KV cache saved successfully" } If dirname is not provided, automatically generates timestamped directory name using --kv-cache-auto-save base name. Implementation: - Refactored auto_save_kv_cache() into save_kv_cache_to_dir(dirname) - save_kv_cache_to_dir() accepts optional custom directory name - Returns directory name on success, empty string on failure - New endpoint handler parse JSON body and calls save function - Registered at: POST /save-kv-cache Usage examples: # Save with custom name curl -X POST http://localhost:8080/save-kv-cache \ -H "Content-Type: application/json" \ -d '{"dirname": "checkpoint_before_fix"}' # Save with auto-generated timestamp curl -X POST http://localhost:8080/save-kv-cache \ -H "Content-Type: application/json" \ -d '{}' --- tools/server/server.cpp | 77 +++++++++++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 15 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 338ff8d6def9a..53e98ffc45bbe 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2604,25 +2604,33 @@ struct server_context { } } - // Auto-save KV cache on shutdown with timestamp - void auto_save_kv_cache() { - if (params_base.kv_cache_auto_save_base.empty()) { - return; - } + // Save KV cache to specified directory (or generate timestamped name if empty) + // Returns the directory name used, or empty string on failure + std::string save_kv_cache_to_dir(const std::string & custom_dir = "") { + std::string dir_name; + + if (custom_dir.empty()) { + // Auto-generate timestamp directory name + if (params_base.kv_cache_auto_save_base.empty()) { + SRV_ERR("%s", "no directory specified and no auto-save base configured\n"); + return ""; + } - // Generate timestamp directory name - auto now = std::chrono::system_clock::now(); - auto time_t = std::chrono::system_clock::to_time_t(now); - std::tm tm_time; + auto now = std::chrono::system_clock::now(); + auto time_t = std::chrono::system_clock::to_time_t(now); + std::tm tm_time; #ifdef _WIN32 - localtime_s(&tm_time, &time_t); + localtime_s(&tm_time, &time_t); #else - localtime_r(&time_t, &tm_time); + localtime_r(&time_t, &tm_time); #endif - char timestamp[64]; - std::strftime(timestamp, sizeof(timestamp), "%Y%m%d_%H%M%S", &tm_time); + char timestamp[64]; + std::strftime(timestamp, sizeof(timestamp), "%Y%m%d_%H%M%S", &tm_time); - std::string dir_name = params_base.kv_cache_auto_save_base + "_" + timestamp; + dir_name = params_base.kv_cache_auto_save_base + "_" + timestamp; + } else { + dir_name = custom_dir; + } SRV_INF("auto-saving KV cache to directory: %s\n", dir_name.c_str()); @@ -2655,7 +2663,16 @@ struct server_context { } } - SRV_INF("KV cache auto-save complete: %d slots saved to %s\n", saved_count, dir_name.c_str()); + SRV_INF("KV cache save complete: %d slots saved to %s\n", saved_count, dir_name.c_str()); + return dir_name; + } + + // Auto-save KV cache on shutdown with timestamp (convenience wrapper) + void auto_save_kv_cache() { + if (params_base.kv_cache_auto_save_base.empty()) { + return; + } + save_kv_cache_to_dir(); // Use default timestamped directory } // Auto-load KV cache on startup from specified directory @@ -4978,6 +4995,34 @@ int main(int argc, char ** argv) { res_ok(res, result->to_json()); }; + const auto handle_kv_cache_save = [&ctx_server, &res_ok, &res_error](const httplib::Request & req, httplib::Response & res) { + std::string dirname; + + // Parse request body if provided + if (!req.body.empty()) { + json request_data = json::parse(req.body); + if (request_data.contains("dirname") && request_data["dirname"].is_string()) { + dirname = request_data["dirname"]; + } + } + // If dirname is empty, save_kv_cache_to_dir will generate a timestamped name + + std::string saved_dir = ctx_server.save_kv_cache_to_dir(dirname); + + if (saved_dir.empty()) { + res_error(res, format_error_response("Failed to save KV cache - check server logs", ERROR_TYPE_SERVER)); + return; + } + + json response = { + {"success", true}, + {"directory", saved_dir}, + {"message", "KV cache saved successfully"} + }; + + res_ok(res, response); + }; + const auto handle_slots_action = [¶ms, &res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) { if (params.slot_save_path.empty()) { res_error(res, format_error_response("This server does not support slots action. Start it with `--slot-save-path`", ERROR_TYPE_NOT_SUPPORTED)); @@ -5763,6 +5808,8 @@ int main(int argc, char ** argv) { // Save & load slots svr->Get (params.api_prefix + "/slots", handle_slots); svr->Post(params.api_prefix + "/slots/:id_slot", handle_slots_action); + // Save KV cache on demand + svr->Post(params.api_prefix + "/save-kv-cache", handle_kv_cache_save); // // Start the server From 3faa493cad2c74cf4fe6edfe8496bbb884cc5c68 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 8 Nov 2025 15:58:03 +0000 Subject: [PATCH 14/24] Add streaming activation capture system for intermediate tensor analysis Implements a production-ready activation capture system that allows real-time streaming of intermediate layer activations to disk for analysis: - Queue-based async I/O with background writer thread to avoid blocking inference - GPU tensor support via automatic ggml_backend_tensor_get() transfers - Flexible filtering by regex patterns and layer ranges - Binary file format (LLMACT01) with timestamped metadata entries - Size limits to prevent unbounded disk usage - HTTP endpoints: * POST /activations/start - Begin capture with filters and limits * POST /activations/stop - Stop capture and finalize output * GET /activations/status - Query current capture statistics Implementation details: - Callback set via params.cb_eval at model initialization - Global pointer g_activation_capture enables thread-safe dynamic control - Producer-consumer pattern with condition variables for queue management - Atomic counters for bytes_written and entries_captured statistics This enables debugging and analysis workflows like: - Comparing activations between model versions - Identifying problematic layers causing inference issues - Analyzing attention patterns and intermediate representations - Debugging quantization effects on specific layers --- tools/server/server.cpp | 309 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 309 insertions(+) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 53e98ffc45bbe..6190d8640952c 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include #include #include @@ -2315,9 +2317,45 @@ struct server_response { } }; +// Activation capture entry for streaming to disk +struct activation_entry { + uint64_t timestamp_us; // microseconds since epoch + std::string label; // tensor name like "blk.0.attn_q" + enum ggml_type type; // tensor data type + int64_t ne[4]; // tensor dimensions + std::vector data; // tensor data (copied from GPU if needed) +}; + +// Activation capture system for streaming intermediate tensors +struct activation_capture { + std::atomic active{false}; + std::string output_file; + std::vector filters; // regex patterns for tensor names + int layer_start = -1; // -1 = all layers + int layer_end = -1; + size_t max_size_bytes = 0; // 0 = unlimited + + std::mutex queue_mutex; + std::condition_variable queue_cv; + std::queue entry_queue; + std::thread writer_thread; + std::atomic should_stop{false}; + std::atomic bytes_written{0}; + std::atomic entries_captured{0}; + + // Callback user data + llama_context * ctx = nullptr; +}; + +// Global pointer for activation capture (accessed by callback) +static activation_capture * g_activation_capture = nullptr; + struct server_context { common_params params_base; + // Activation capture system + std::unique_ptr act_capture; + // note: keep these alive - they determine the lifetime of the model, context, etc. common_init_result llama_init; common_init_result llama_init_dft; @@ -2385,6 +2423,10 @@ struct server_context { params_base = params; + // Set up activation capture callback (inactive until explicitly started) + params_base.cb_eval = activation_capture_callback; + params_base.cb_eval_user_data = nullptr; + llama_init = common_init_from_params(params_base); model = llama_init.model.get(); @@ -2720,6 +2762,232 @@ struct server_context { SRV_INF("KV cache auto-load complete: %d slots loaded from %s\n", loaded_count, dir_name.c_str()); } + // Activation capture: background writer thread + static void activation_writer_thread(activation_capture * capture) { + std::ofstream file(capture->output_file, std::ios::binary); + if (!file.is_open()) { + SRV_ERR("failed to open activation file: %s\n", capture->output_file.c_str()); + return; + } + + // Write magic header + const char magic[9] = "LLMACT01"; + file.write(magic, 8); + + while (true) { + activation_entry entry; + { + std::unique_lock lock(capture->queue_mutex); + capture->queue_cv.wait(lock, [capture] { + return !capture->entry_queue.empty() || capture->should_stop.load(); + }); + + if (capture->should_stop.load() && capture->entry_queue.empty()) { + break; + } + + if (capture->entry_queue.empty()) { + continue; + } + + entry = std::move(capture->entry_queue.front()); + capture->entry_queue.pop(); + } + + // Write entry to file + // Format: timestamp(8) + label_len(4) + label + type(1) + dims(4*8) + data_size(8) + data + file.write(reinterpret_cast(&entry.timestamp_us), sizeof(uint64_t)); + + uint32_t label_len = entry.label.size(); + file.write(reinterpret_cast(&label_len), sizeof(uint32_t)); + file.write(entry.label.data(), label_len); + + int8_t type_byte = static_cast(entry.type); + file.write(reinterpret_cast(&type_byte), sizeof(int8_t)); + + file.write(reinterpret_cast(entry.ne), sizeof(entry.ne)); + + uint64_t data_size = entry.data.size(); + file.write(reinterpret_cast(&data_size), sizeof(uint64_t)); + file.write(reinterpret_cast(entry.data.data()), data_size); + + capture->bytes_written.fetch_add(sizeof(uint64_t) + sizeof(uint32_t) + label_len + + sizeof(int8_t) + sizeof(entry.ne) + sizeof(uint64_t) + data_size); + + // Check size limit + if (capture->max_size_bytes > 0 && capture->bytes_written.load() >= capture->max_size_bytes) { + SRV_INF("activation capture reached size limit: %zu bytes\n", capture->bytes_written.load()); + capture->active.store(false); + break; + } + } + + file.close(); + SRV_INF("activation writer thread finished: %zu entries, %zu bytes written to %s\n", + capture->entries_captured.load(), capture->bytes_written.load(), capture->output_file.c_str()); + } + + // Activation capture: callback for tensor evaluation + static bool activation_capture_callback(struct ggml_tensor * t, bool ask, void * user_data) { + (void)user_data; // unused + if (!ask) return true; // We only care about the "ask" phase + + activation_capture * capture = g_activation_capture; + if (!capture || !capture->active.load()) { + return true; + } + + const char * name = ggml_get_name(t); + if (!name || strlen(name) == 0) { + return true; // Skip unnamed tensors + } + + std::string tensor_name(name); + + // Apply filters + if (!capture->filters.empty()) { + bool matches = false; + for (const auto & filter : capture->filters) { + if (std::regex_match(tensor_name, filter)) { + matches = true; + break; + } + } + if (!matches) { + return true; // Doesn't match any filter + } + } + + // Apply layer range filter (extract layer number from name like "blk.5.attn_q") + if (capture->layer_start >= 0) { + std::regex layer_regex(R"(blk\.(\d+)\.)"); + std::smatch match; + if (std::regex_search(tensor_name, match, layer_regex)) { + int layer_num = std::stoi(match[1]); + if (layer_num < capture->layer_start || layer_num > capture->layer_end) { + return true; // Outside layer range + } + } + } + + // Create entry + activation_entry entry; + entry.timestamp_us = ggml_time_us(); + entry.label = tensor_name; + entry.type = t->type; + for (int i = 0; i < 4; i++) { + entry.ne[i] = t->ne[i]; + } + + // Copy tensor data (handles GPU->CPU transfer automatically) + size_t nbytes = ggml_nbytes(t); + entry.data.resize(nbytes); + ggml_backend_tensor_get(t, entry.data.data(), 0, nbytes); + + // Queue entry for writing + { + std::lock_guard lock(capture->queue_mutex); + capture->entry_queue.push(std::move(entry)); + capture->entries_captured.fetch_add(1); + } + capture->queue_cv.notify_one(); + + return true; // Continue graph evaluation + } + + // Start activation capture + bool start_activation_capture(const std::string & output_file, + const std::vector & filter_patterns, + int layer_start = -1, + int layer_end = -1, + size_t max_size_mb = 0) { + if (act_capture && act_capture->active.load()) { + SRV_WRN("%s", "activation capture already active\n"); + return false; + } + + act_capture = std::make_unique(); + act_capture->output_file = output_file; + act_capture->layer_start = layer_start; + act_capture->layer_end = layer_end; + act_capture->max_size_bytes = max_size_mb * 1024 * 1024; + act_capture->ctx = ctx; + + // Compile regex filters + for (const auto & pattern : filter_patterns) { + try { + act_capture->filters.emplace_back(pattern); + } catch (const std::regex_error & e) { + SRV_ERR("invalid regex pattern '%s': %s\n", pattern.c_str(), e.what()); + return false; + } + } + + // Start writer thread + act_capture->should_stop.store(false); + act_capture->writer_thread = std::thread(activation_writer_thread, act_capture.get()); + + // Set global pointer for callback + g_activation_capture = act_capture.get(); + + act_capture->active.store(true); + + SRV_INF("activation capture started: file=%s, filters=%zu, layers=[%d,%d], max_size=%zu MB\n", + output_file.c_str(), filter_patterns.size(), layer_start, layer_end, max_size_mb); + + return true; + } + + // Stop activation capture + json stop_activation_capture() { + if (!act_capture || !act_capture->active.load()) { + return { + {"error", "no active capture"} + }; + } + + act_capture->active.store(false); + + // Clear global pointer + g_activation_capture = nullptr; + + // Stop writer thread + act_capture->should_stop.store(true); + act_capture->queue_cv.notify_one(); + if (act_capture->writer_thread.joinable()) { + act_capture->writer_thread.join(); + } + + json result = { + {"success", true}, + {"file", act_capture->output_file}, + {"entries_captured", act_capture->entries_captured.load()}, + {"bytes_written", act_capture->bytes_written.load()}, + {"message", "Activation capture stopped"} + }; + + act_capture.reset(); + + return result; + } + + // Get activation capture status + json get_activation_capture_status() const { + if (!act_capture) { + return { + {"active", false} + }; + } + + return { + {"active", act_capture->active.load()}, + {"file", act_capture->output_file}, + {"entries_captured", act_capture->entries_captured.load()}, + {"bytes_written", act_capture->bytes_written.load()}, + {"queue_size", act_capture->entry_queue.size()} + }; + } + server_slot * get_slot_by_id(int id) { for (server_slot & slot : slots) { if (slot.id == id) { @@ -5811,6 +6079,47 @@ int main(int argc, char ** argv) { // Save KV cache on demand svr->Post(params.api_prefix + "/save-kv-cache", handle_kv_cache_save); + // Activation capture endpoints + const auto handle_activations_start = [&ctx_server, &res_ok, &res_error](const httplib::Request & req, httplib::Response & res) { + json request_data = json::parse(req.body); + + std::string output_file = request_data.value("output_file", "activations.bin"); + std::vector filters = request_data.value("filters", std::vector()); + int layer_start = request_data.value("layer_start", -1); + int layer_end = request_data.value("layer_end", -1); + size_t max_size_mb = request_data.value("max_size_mb", 0); + + bool success = ctx_server.start_activation_capture(output_file, filters, layer_start, layer_end, max_size_mb); + + if (success) { + json response = { + {"success", true}, + {"message", "Activation capture started"}, + {"output_file", output_file}, + {"filters", filters}, + {"layer_range", {layer_start, layer_end}}, + {"max_size_mb", max_size_mb} + }; + res_ok(res, response); + } else { + res_error(res, format_error_response("Failed to start activation capture", ERROR_TYPE_SERVER)); + } + }; + + const auto handle_activations_stop = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { + json result = ctx_server.stop_activation_capture(); + res_ok(res, result); + }; + + const auto handle_activations_status = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { + json status = ctx_server.get_activation_capture_status(); + res_ok(res, status); + }; + + svr->Post(params.api_prefix + "/activations/start", handle_activations_start); + svr->Post(params.api_prefix + "/activations/stop", handle_activations_stop); + svr->Get (params.api_prefix + "/activations/status", handle_activations_status); + // // Start the server // From 32607286777457d10670b9a532002097e3815f6d Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 9 Nov 2025 01:04:26 +0000 Subject: [PATCH 15/24] Add tool-calling capability to llama-cli Implements a flexible tool-calling system that allows LLMs to invoke external executables from a "tools" directory: Features: - LLM can request tool list by outputting * Automatically scans "tools" directory for executables * Runs each with "help" parameter to collect usage info * Injects concatenated help text back into conversation - LLM can execute tools by outputting tool-name args * Executes the specified tool from tools/ directory * Captures stdout/stderr and exit code * Injects output back into conversation for LLM to process Implementation: - Platform-specific: Full support on Unix/macOS, stub on Windows - Uses popen() for command execution with output capture - Alphabetically sorted tool listing for consistency - Robust parsing of tool-launch tags with argument extraction - Checks recent output buffer (128 tokens) for tag detection Example tools directory structure: tools/ calculator (executable) web_search (executable) file_reader (executable) This enables LLMs to: - Access external data sources - Perform calculations - Query databases or APIs - Interact with system utilities - Extend capabilities without retraining Security note: Only executables in the "tools" directory are accessible, providing a sandboxed environment for tool execution. --- tools/main/main.cpp | 234 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 227 insertions(+), 7 deletions(-) diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 8f781e70fccbb..974b3e9a2f195 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -15,10 +15,14 @@ #include #include #include +#include #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) #include #include +#include +#include +#include #elif defined (_WIN32) #define WIN32_LEAN_AND_MEAN #ifndef NOMINMAX @@ -67,6 +71,171 @@ static bool file_is_empty(const std::string & path) { return f.tellg() == 0; } +// Tool calling support functions +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +static bool is_executable(const std::string & path) { + struct stat st; + if (stat(path.c_str(), &st) != 0) { + return false; + } + return (st.st_mode & S_IXUSR) != 0; +} + +static std::string execute_command(const std::string & command) { + std::string result; + FILE* pipe = popen(command.c_str(), "r"); + if (!pipe) { + return "Error: Failed to execute command\n"; + } + + char buffer[256]; + while (fgets(buffer, sizeof(buffer), pipe) != nullptr) { + result += buffer; + } + + int status = pclose(pipe); + if (WIFEXITED(status) && WEXITSTATUS(status) != 0) { + result += "\n[Tool exited with code " + std::to_string(WEXITSTATUS(status)) + "]\n"; + } + + return result; +} + +static std::vector get_tool_executables(const std::string & tools_dir) { + std::vector executables; + + DIR* dir = opendir(tools_dir.c_str()); + if (!dir) { + return executables; + } + + struct dirent* entry; + while ((entry = readdir(dir)) != nullptr) { + if (entry->d_name[0] == '.') { + continue; // Skip hidden files and . / .. + } + + std::string full_path = tools_dir + "/" + entry->d_name; + if (is_executable(full_path)) { + executables.push_back(entry->d_name); + } + } + + closedir(dir); + + // Sort alphabetically + std::sort(executables.begin(), executables.end()); + + return executables; +} + +static std::string collect_tools_help(const std::string & tools_dir) { + std::vector executables = get_tool_executables(tools_dir); + + if (executables.empty()) { + return "No executable tools found in the 'tools' directory.\n"; + } + + std::ostringstream help_text; + help_text << "Available tools:\n\n"; + + for (const auto & tool_name : executables) { + help_text << "=== " << tool_name << " ===\n"; + std::string command = tools_dir + "/" + tool_name + " help"; + std::string output = execute_command(command); + help_text << output; + if (!output.empty() && output.back() != '\n') { + help_text << "\n"; + } + help_text << "\nTo use this tool: " << tool_name << " [arguments]\n\n"; + } + + return help_text.str(); +} + +static std::string execute_tool(const std::string & tools_dir, const std::string & tool_name, const std::string & args) { + std::string full_path = tools_dir + "/" + tool_name; + + if (!is_executable(full_path)) { + return "Error: Tool '" + tool_name + "' not found or not executable\n"; + } + + std::string command = full_path; + if (!args.empty()) { + // Simple shell escaping - wrap in quotes if contains spaces + command += " " + args; + } + + LOG("\n[Executing tool: %s]\n", command.c_str()); + std::string output = execute_command(command); + LOG("[Tool output follows]\n"); + + return output; +} +#elif defined (_WIN32) +// Windows implementations (simplified - no tool support on Windows for now) +static bool is_executable(const std::string & path) { + return false; +} + +static std::string execute_command(const std::string & command) { + return "Error: Tool execution not supported on Windows\n"; +} + +static std::vector get_tool_executables(const std::string & tools_dir) { + return std::vector(); +} + +static std::string collect_tools_help(const std::string & tools_dir) { + return "Tool execution is not supported on Windows.\n"; +} + +static std::string execute_tool(const std::string & tools_dir, const std::string & tool_name, const std::string & args) { + return "Error: Tool execution not supported on Windows\n"; +} +#endif + +// Check if the recent output contains +static bool check_for_tools_help(const std::string & text) { + return text.find("") != std::string::npos; +} + +// Check if the recent output contains ... and extract tool name and args +static bool check_for_tool_launch(const std::string & text, std::string & tool_name, std::string & args) { + size_t start = text.find(""); + if (start == std::string::npos) { + return false; + } + + size_t end = text.find("", start); + if (end == std::string::npos) { + return false; + } + + // Extract the content between tags + start += 13; // length of "" + std::string content = text.substr(start, end - start); + + // Trim whitespace + content.erase(0, content.find_first_not_of(" \t\n\r")); + content.erase(content.find_last_not_of(" \t\n\r") + 1); + + // Split into tool name and args + size_t space_pos = content.find(' '); + if (space_pos == std::string::npos) { + tool_name = content; + args = ""; + } else { + tool_name = content.substr(0, space_pos); + args = content.substr(space_pos + 1); + // Trim args + args.erase(0, args.find_first_not_of(" \t\n\r")); + args.erase(args.find_last_not_of(" \t\n\r") + 1); + } + + return !tool_name.empty(); +} + // Save complete LLM state (KV cache + RNG + logits + embeddings) to GGUF file static bool save_llm_state_to_gguf(llama_context * ctx, const std::string & filename) { LOG("\nSaving LLM state to %s...\n", filename.c_str()); @@ -558,6 +727,10 @@ int main(int argc, char ** argv) { LOG_INF(" /\\/load - Load LLM state from GGUF file to restore exact conversation state\n"); LOG_INF(" /\\/temp - Show current temperature setting\n"); LOG_INF(" /\\/temp - Set temperature to a new value (e.g., /\\/temp 0.7)\n"); + LOG_INF("\n"); + LOG_INF("Tool calling (when 'tools' directory exists):\n"); + LOG_INF(" Model can output to get list of available tools\n"); + LOG_INF(" Model can output tool-name args to execute a tool\n"); if (!params.antiprompt.empty()) { for (const auto & antiprompt : params.antiprompt) { @@ -898,10 +1071,57 @@ int main(int argc, char ** argv) { // if not currently processing queued inputs; if ((int) embd_inp.size() <= n_consumed) { + // Check for tool requests in recent output + const int n_prev = 128; // Look back further to catch full tags + const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev); + + // Check for request + if (check_for_tools_help(last_output)) { + LOG_DBG("Detected request\n"); + + // Check if tools directory exists + if (file_exists("tools")) { + std::string help_text = collect_tools_help("tools"); + + LOG("\n[Tools Help Requested]\n"); + LOG("%s", help_text.c_str()); + LOG("[End of Tools Help]\n\n"); + + // Inject the help text back into the conversation + auto help_tokens = common_tokenize(ctx, "\n\n" + help_text, false, true); + embd_inp.insert(embd_inp.end(), help_tokens.begin(), help_tokens.end()); + + // Continue generation after injecting help + is_interacting = false; + } else { + LOG("\n[Tools Help Requested but 'tools' directory not found]\n\n"); + auto msg_tokens = common_tokenize(ctx, "\n\nNo 'tools' directory found.\n\n", false, true); + embd_inp.insert(embd_inp.end(), msg_tokens.begin(), msg_tokens.end()); + } + } + + // Check for ... request + std::string tool_name, tool_args; + if (check_for_tool_launch(last_output, tool_name, tool_args)) { + LOG_DBG("Detected request: tool=%s, args=%s\n", tool_name.c_str(), tool_args.c_str()); + + // Execute the tool + std::string tool_output = execute_tool("tools", tool_name, tool_args); + + LOG("%s", tool_output.c_str()); + LOG("[End of Tool Output]\n\n"); + + // Inject the tool output back into the conversation + auto output_tokens = common_tokenize(ctx, "\n\n" + tool_output + "\n\n", false, true); + embd_inp.insert(embd_inp.end(), output_tokens.begin(), output_tokens.end()); + + // Continue generation after injecting output + is_interacting = false; + } + // check for reverse prompt in the last n_prev tokens if (!params.antiprompt.empty()) { - const int n_prev = 32; - const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev); + const std::string last_output_for_antiprompt = common_sampler_prev_str(smpl, ctx, 32); is_antiprompt = false; // Check if each of the reverse prompts appears at the end of the output. @@ -909,11 +1129,11 @@ int main(int argc, char ** argv) { // so we'll compensate for that by widening the search window a bit. for (std::string & antiprompt : params.antiprompt) { size_t extra_padding = params.interactive ? 0 : 2; - size_t search_start_pos = last_output.length() > static_cast(antiprompt.length() + extra_padding) - ? last_output.length() - static_cast(antiprompt.length() + extra_padding) + size_t search_start_pos = last_output_for_antiprompt.length() > static_cast(antiprompt.length() + extra_padding) + ? last_output_for_antiprompt.length() - static_cast(antiprompt.length() + extra_padding) : 0; - if (last_output.find(antiprompt, search_start_pos) != std::string::npos) { + if (last_output_for_antiprompt.find(antiprompt, search_start_pos) != std::string::npos) { if (params.interactive) { is_interacting = true; } @@ -923,8 +1143,8 @@ int main(int argc, char ** argv) { } // check for reverse prompt using special tokens - // avoid calling common_sampler_last() if last_output is empty - if (!last_output.empty()) { + // avoid calling common_sampler_last() if last_output_for_antiprompt is empty + if (!last_output_for_antiprompt.empty()) { llama_token last_token = common_sampler_last(smpl); for (auto token : antiprompt_token) { if (token == last_token) { From 66669d6a369e934f6434e24e87c3e29532900676 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 9 Nov 2025 01:33:10 +0000 Subject: [PATCH 16/24] Fix tool execution to prevent help examples from triggering Changed tool detection logic to use mutually exclusive checks (if/else) instead of independent checks. This prevents tool help text containing examples from being accidentally executed. Previously: - Both and were checked in the same iteration - If help text contained example usage like "calc 2+2", it would be detected and executed immediately after being injected Now: - Only one tool action is processed per iteration - If is detected, skip check until next iteration - Help examples remain as documentation without triggering execution --- tools/main/main.cpp | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 974b3e9a2f195..3531c49613033 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -1076,6 +1076,7 @@ int main(int argc, char ** argv) { const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev); // Check for request + // Note: Only one tool action per iteration to prevent help examples from being executed if (check_for_tools_help(last_output)) { LOG_DBG("Detected request\n"); @@ -1098,25 +1099,25 @@ int main(int argc, char ** argv) { auto msg_tokens = common_tokenize(ctx, "\n\nNo 'tools' directory found.\n\n", false, true); embd_inp.insert(embd_inp.end(), msg_tokens.begin(), msg_tokens.end()); } - } - - // Check for ... request - std::string tool_name, tool_args; - if (check_for_tool_launch(last_output, tool_name, tool_args)) { - LOG_DBG("Detected request: tool=%s, args=%s\n", tool_name.c_str(), tool_args.c_str()); + } else { + // Check for ... request only if we didn't handle tools-help + std::string tool_name, tool_args; + if (check_for_tool_launch(last_output, tool_name, tool_args)) { + LOG_DBG("Detected request: tool=%s, args=%s\n", tool_name.c_str(), tool_args.c_str()); - // Execute the tool - std::string tool_output = execute_tool("tools", tool_name, tool_args); + // Execute the tool + std::string tool_output = execute_tool("tools", tool_name, tool_args); - LOG("%s", tool_output.c_str()); - LOG("[End of Tool Output]\n\n"); + LOG("%s", tool_output.c_str()); + LOG("[End of Tool Output]\n\n"); - // Inject the tool output back into the conversation - auto output_tokens = common_tokenize(ctx, "\n\n" + tool_output + "\n\n", false, true); - embd_inp.insert(embd_inp.end(), output_tokens.begin(), output_tokens.end()); + // Inject the tool output back into the conversation + auto output_tokens = common_tokenize(ctx, "\n\n" + tool_output + "\n\n", false, true); + embd_inp.insert(embd_inp.end(), output_tokens.begin(), output_tokens.end()); - // Continue generation after injecting output - is_interacting = false; + // Continue generation after injecting output + is_interacting = false; + } } // check for reverse prompt in the last n_prev tokens From 0ce2fcc9c368cd00dd766bb712dbc83eb942e289 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 9 Nov 2025 01:46:15 +0000 Subject: [PATCH 17/24] Prevent tool execution inside think tags and duplicate runs Fixes two critical issues with tool calling: 1. Think tag filtering: - Tool tags inside ... are now ignored - Added is_inside_think_tags() to check if a position is within think blocks - Prevents accidental tool execution during model reasoning - Recursively searches for tool-launch tags outside think blocks 2. Duplicate execution prevention: - Tracks last executed tool signature (tool_name|args) - Skips re-execution if same tool call detected in buffer - Resets signature on new user input to allow reuse in conversation - Prevents multiple executions when tag remains in 128-token lookback window Example scenarios now handled correctly: Scenario 1 - Think tags: Model: Maybe I should use calc 2+2 Result: Tool NOT executed (inside think block) Model: Let me calculate this. calc 2+2 Result: Tool executed (outside think block) Scenario 2 - Duplicates: Model generates: search foo Iteration 1: Tool executed, output injected Iteration 2: Same tag still in buffer -> skipped User types new input Model generates: search foo Result: Tool executed again (signature reset on user input) This ensures tools are only executed when the model explicitly intends to use them outside of reasoning blocks, and each tool call executes exactly once. --- tools/main/main.cpp | 75 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 61 insertions(+), 14 deletions(-) diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 3531c49613033..6bdba57745a6a 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -50,6 +50,9 @@ static bool need_insert_eot = false; static bool g_save_state_next = false; static std::string g_state_save_path = ""; +// Tool execution tracking to prevent duplicate executions +static std::string g_last_executed_tool_signature = ""; + static void print_usage(int argc, char ** argv) { (void) argc; @@ -195,18 +198,48 @@ static std::string execute_tool(const std::string & tools_dir, const std::string } #endif -// Check if the recent output contains +// Check if a position in text is inside ... tags +static bool is_inside_think_tags(const std::string & text, size_t pos) { + // Find the most recent before pos + size_t think_start = text.rfind("", pos); + if (think_start == std::string::npos) { + return false; // No tag before this position + } + + // Check if there's a between think_start and pos + size_t think_end = text.find("", think_start); + if (think_end == std::string::npos || think_end > pos) { + return true; // We're inside an unclosed or currently open think block + } + + return false; // The think block was closed before pos +} + +// Check if the recent output contains (outside of think tags) static bool check_for_tools_help(const std::string & text) { - return text.find("") != std::string::npos; + size_t pos = text.find(""); + if (pos == std::string::npos) { + return false; + } + + // Make sure it's not inside think tags + return !is_inside_think_tags(text, pos); } // Check if the recent output contains ... and extract tool name and args -static bool check_for_tool_launch(const std::string & text, std::string & tool_name, std::string & args) { - size_t start = text.find(""); +// Returns false if inside think tags or if already processed +static bool check_for_tool_launch(const std::string & text, std::string & tool_name, std::string & args, size_t search_from = 0) { + size_t start = text.find("", search_from); if (start == std::string::npos) { return false; } + // Check if this tag is inside think tags + if (is_inside_think_tags(text, start)) { + // Try to find the next one after this + return check_for_tool_launch(text, tool_name, args, start + 1); + } + size_t end = text.find("", start); if (end == std::string::npos) { return false; @@ -1103,20 +1136,31 @@ int main(int argc, char ** argv) { // Check for ... request only if we didn't handle tools-help std::string tool_name, tool_args; if (check_for_tool_launch(last_output, tool_name, tool_args)) { - LOG_DBG("Detected request: tool=%s, args=%s\n", tool_name.c_str(), tool_args.c_str()); + // Create signature to check for duplicate execution + std::string tool_signature = tool_name + "|" + tool_args; - // Execute the tool - std::string tool_output = execute_tool("tools", tool_name, tool_args); + // Only execute if this is a new tool call (not the same as last execution) + if (tool_signature != g_last_executed_tool_signature) { + LOG_DBG("Detected request: tool=%s, args=%s\n", tool_name.c_str(), tool_args.c_str()); - LOG("%s", tool_output.c_str()); - LOG("[End of Tool Output]\n\n"); + // Execute the tool + std::string tool_output = execute_tool("tools", tool_name, tool_args); - // Inject the tool output back into the conversation - auto output_tokens = common_tokenize(ctx, "\n\n" + tool_output + "\n\n", false, true); - embd_inp.insert(embd_inp.end(), output_tokens.begin(), output_tokens.end()); + LOG("%s", tool_output.c_str()); + LOG("[End of Tool Output]\n\n"); - // Continue generation after injecting output - is_interacting = false; + // Inject the tool output back into the conversation + auto output_tokens = common_tokenize(ctx, "\n\n" + tool_output + "\n\n", false, true); + embd_inp.insert(embd_inp.end(), output_tokens.begin(), output_tokens.end()); + + // Remember this execution to prevent duplicates + g_last_executed_tool_signature = tool_signature; + + // Continue generation after injecting output + is_interacting = false; + } else { + LOG_DBG("Skipping duplicate tool execution: tool=%s, args=%s\n", tool_name.c_str(), tool_args.c_str()); + } } } @@ -1377,6 +1421,9 @@ int main(int argc, char ** argv) { // reset assistant message assistant_ss.str(""); + // Reset tool execution tracking on new user input + g_last_executed_tool_signature = ""; + n_remain -= line_inp.size(); LOG_DBG("n_remain: %d\n", n_remain); } From cb933ad9c33878e120a6a8514cd9fde1d3ed3456 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 9 Nov 2025 02:04:12 +0000 Subject: [PATCH 18/24] Add --idle-action-interval flag for automatic agent thinking Implements a feature to auto-submit empty input after a period of user inactivity, allowing the agent to continue thinking without user interaction. New CLI flag: --idle-action-interval N Auto-submit empty input after N minutes of idle time (default: 0 = disabled) How it works: - Tracks last activity timestamp (updated on any user input) - Before waiting for user input, checks if idle interval has elapsed - If idle timeout reached, automatically submits empty input - Resets timer after auto-submission for next iteration - Any keystroke/input from user resets the idle timer Use cases: - Agent continues reasoning/thinking during long idle periods - Useful for autonomous workflows where agent should self-prompt - Allows agent to work through complex problems without waiting Example usage: llama-cli -m model.gguf --idle-action-interval 5 (Agent will auto-submit empty input after 5 minutes of no user activity) Implementation notes: - Activity time tracked globally via g_last_activity_time - Idle check happens when interactive mode waits for input - Auto-submitted input is distinguishable from Ctrl+D (EOF) - Console readline is bypassed when idle timeout triggers - Timer resets on both manual and automatic input submission Changes: - common/common.h: Added idle_action_interval parameter - common/arg.cpp: Added --idle-action-interval argument parser - tools/main/main.cpp: Implemented idle timeout logic and tracking --- common/arg.cpp | 7 ++++++ common/common.h | 1 + tools/main/main.cpp | 55 +++++++++++++++++++++++++++++++++++++++------ 3 files changed, 56 insertions(+), 7 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 71e346c4d711d..b20996396c05d 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1178,6 +1178,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.interactive_first = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"--idle-action-interval"}, "N", + "auto-submit empty input after N minutes of idle time with no keystrokes (default: 0 = disabled)", + [](common_params & params, const std::string & value) { + params.idle_action_interval = std::stoi(value); + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-mli", "--multiline-input"}, "allows you to write or paste multiple lines without ending each in '\\'", diff --git a/common/common.h b/common/common.h index a5cfcc0c77bed..3161b0c4c2bb5 100644 --- a/common/common.h +++ b/common/common.h @@ -372,6 +372,7 @@ struct common_params { bool special = false; // enable special token output bool interactive = false; // interactive mode bool interactive_first = false; // wait for user input immediately + int32_t idle_action_interval = 0; // auto-submit empty input after N minutes of idle (0 = disabled) bool prompt_cache_all = false; // save user input and generations to prompt cache bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 6bdba57745a6a..e36d397de83aa 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -53,6 +53,32 @@ static std::string g_state_save_path = ""; // Tool execution tracking to prevent duplicate executions static std::string g_last_executed_tool_signature = ""; +// Idle timeout tracking +static time_t g_last_activity_time = 0; + +// Check if idle timeout has elapsed and we should auto-submit empty input +static bool should_auto_submit_on_idle(int idle_interval_minutes) { + if (idle_interval_minutes <= 0) { + return false; // Feature disabled + } + + time_t current_time = time(nullptr); + if (g_last_activity_time == 0) { + g_last_activity_time = current_time; + return false; + } + + int elapsed_seconds = (int)(current_time - g_last_activity_time); + int idle_threshold_seconds = idle_interval_minutes * 60; + + return elapsed_seconds >= idle_threshold_seconds; +} + +// Update activity timestamp +static void update_activity_time() { + g_last_activity_time = time(nullptr); +} + static void print_usage(int argc, char ** argv) { (void) argc; @@ -1265,18 +1291,33 @@ int main(int argc, char ** argv) { console::set_display(console::user_input); display = params.display_prompt; - std::string line; - bool another_line = true; - do { - another_line = console::readline(line, params.multiline_input); - buffer += line; - } while (another_line); + // Check for idle timeout before reading input + bool auto_submitted = false; + if (should_auto_submit_on_idle(params.idle_action_interval)) { + // Auto-submit empty input due to idle timeout + LOG_DBG("Auto-submitting empty input after %d minutes of idle time\n", params.idle_action_interval); + LOG("\n[Idle timeout - auto-submitting empty input]\n"); + buffer = ""; // Empty input + auto_submitted = true; + update_activity_time(); // Reset timer for next iteration + } else { + // Normal input reading + std::string line; + bool another_line = true; + do { + another_line = console::readline(line, params.multiline_input); + buffer += line; + } while (another_line); + + // User provided input, update activity time + update_activity_time(); + } // done taking input, reset color console::set_display(console::reset); display = true; - if (buffer.empty()) { // Ctrl+D on empty line exits + if (buffer.empty() && !auto_submitted) { // Ctrl+D on empty line exits (but not auto-submit) LOG("EOF by user\n"); break; } From 7d55eaa32cdf9aeb15b79aa9988dceff509f55c3 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 9 Nov 2025 02:25:31 +0000 Subject: [PATCH 19/24] Fix idle timer to only count time waiting for user input MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the idle timer would continue running while the agent was generating output, which could cause immediate auto-submission if the agent took longer than the idle interval to respond. Now the timer resets when transitioning to interactive mode (when is_interacting becomes true), ensuring it only measures time spent waiting for user input, not time spent generating. Behavior before fix: 1. User enters input → timer updates (t=0) 2. Agent generates for 10 minutes → timer running (t=10m) 3. Agent finishes → idle check 4. If idle_interval=5m → triggers immediately (10m > 5m) Behavior after fix: 1. User enters input → timer updates (t=0) 2. Agent generates for 10 minutes → timer running but will be reset 3. Agent finishes, enters interactive mode → timer resets (t=0) 4. User idle for 5 minutes → idle timeout triggers correctly (5m >= 5m) The fix adds update_activity_time() call at line 1277 when entering the "waiting for user input" state, right before displaying the prompt. --- tools/main/main.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/main/main.cpp b/tools/main/main.cpp index e36d397de83aa..fb6b598493bea 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -1272,6 +1272,10 @@ int main(int argc, char ** argv) { if ((n_past > 0 || waiting_for_first_input) && is_interacting) { LOG_DBG("waiting for user input\n"); + // Reset idle timer when we start waiting for user input + // This ensures we only count time spent waiting, not time spent generating + update_activity_time(); + if (params.conversation_mode) { LOG("\n> "); } From 4f94acbb99163fbaf19bf0ca48ead9623b0f1349 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 9 Nov 2025 02:38:26 +0000 Subject: [PATCH 20/24] Implement actual timeout support for idle action interval Previous implementation checked for timeout but then blocked indefinitely on readline. Now uses select() (Unix) / WaitForSingleObject (Windows) to check if input is available before blocking. Changes: - common/console.h: Added readline_with_timeout() function - common/console.cpp: Implemented timeout using select()/WaitForSingleObject - tools/main/main.cpp: Use readline_with_timeout with calculated timeout How it works: 1. Calculate remaining timeout based on idle_action_interval and elapsed time 2. Call readline_with_timeout() with remaining seconds 3. If timeout occurs, auto-submit empty input 4. If user types anything, reset timer and disable timeout for continuation lines Unix implementation: - Uses select() on STDIN_FILENO with timeout - Returns immediately if input available or timeout elapsed Windows implementation: - Uses WaitForSingleObject() on stdin handle with timeout - Returns immediately if input available or timeout elapsed This fixes the issue where idle timeout would never trigger because readline() was blocking indefinitely waiting for input. --- common/console.cpp | 56 +++++++++++++++++++++++++++++++++++++++++++ common/console.h | 1 + tools/main/main.cpp | 58 +++++++++++++++++++++++++++++---------------- 3 files changed, 94 insertions(+), 21 deletions(-) diff --git a/common/console.cpp b/common/console.cpp index 078a8d678d933..a435504c2adfa 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -501,4 +501,60 @@ namespace console { return readline_advanced(line, multiline_input); } + bool readline_with_timeout(std::string & line, bool multiline_input, int timeout_seconds, bool & timed_out) { + timed_out = false; + + if (timeout_seconds <= 0) { + // No timeout, use regular readline + return readline(line, multiline_input); + } + +#if defined(_WIN32) + // Windows: check if input is available with timeout + HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE); + DWORD result = WaitForSingleObject(hStdin, timeout_seconds * 1000); + + if (result == WAIT_TIMEOUT) { + timed_out = true; + line.clear(); + return false; + } + + if (result != WAIT_OBJECT_0) { + // Error occurred + line.clear(); + return false; + } + + // Input is available, use regular readline + return readline(line, multiline_input); +#else + // Unix: use select() to check for input with timeout + fd_set readfds; + struct timeval tv; + + FD_ZERO(&readfds); + FD_SET(STDIN_FILENO, &readfds); + + tv.tv_sec = timeout_seconds; + tv.tv_usec = 0; + + int retval = select(STDIN_FILENO + 1, &readfds, NULL, NULL, &tv); + + if (retval == -1) { + // Error occurred + line.clear(); + return false; + } else if (retval == 0) { + // Timeout occurred + timed_out = true; + line.clear(); + return false; + } + + // Input is available, use regular readline + return readline(line, multiline_input); +#endif + } + } diff --git a/common/console.h b/common/console.h index ec175269b9d8a..e7a17618563e6 100644 --- a/common/console.h +++ b/common/console.h @@ -16,4 +16,5 @@ namespace console { void cleanup(); void set_display(display_t display); bool readline(std::string & line, bool multiline_input); + bool readline_with_timeout(std::string & line, bool multiline_input, int timeout_seconds, bool & timed_out); } diff --git a/tools/main/main.cpp b/tools/main/main.cpp index fb6b598493bea..5284f554a7b95 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -1295,33 +1295,49 @@ int main(int argc, char ** argv) { console::set_display(console::user_input); display = params.display_prompt; - // Check for idle timeout before reading input - bool auto_submitted = false; - if (should_auto_submit_on_idle(params.idle_action_interval)) { - // Auto-submit empty input due to idle timeout - LOG_DBG("Auto-submitting empty input after %d minutes of idle time\n", params.idle_action_interval); - LOG("\n[Idle timeout - auto-submitting empty input]\n"); - buffer = ""; // Empty input - auto_submitted = true; - update_activity_time(); // Reset timer for next iteration - } else { - // Normal input reading - std::string line; - bool another_line = true; - do { - another_line = console::readline(line, params.multiline_input); - buffer += line; - } while (another_line); - - // User provided input, update activity time - update_activity_time(); + // Calculate remaining timeout for readline + int timeout_seconds = 0; + if (params.idle_action_interval > 0) { + time_t current_time = time(nullptr); + int elapsed_seconds = (int)(current_time - g_last_activity_time); + int idle_threshold_seconds = params.idle_action_interval * 60; + int remaining_seconds = idle_threshold_seconds - elapsed_seconds; + + if (remaining_seconds > 0) { + timeout_seconds = remaining_seconds; + } else { + timeout_seconds = 1; // Will timeout immediately + } } + // Read input with timeout support + std::string line; + bool another_line = true; + bool timed_out = false; + + do { + another_line = console::readline_with_timeout(line, params.multiline_input, timeout_seconds, timed_out); + buffer += line; + + if (timed_out) { + // Idle timeout occurred + LOG_DBG("Idle timeout triggered during input wait\n"); + LOG("\n[Idle timeout - auto-submitting empty input]\n"); + update_activity_time(); // Reset timer for next iteration + another_line = false; // Stop reading more lines + break; + } + + // User provided input, update activity time and disable timeout for continuation lines + update_activity_time(); + timeout_seconds = 0; // No timeout for continuation lines + } while (another_line); + // done taking input, reset color console::set_display(console::reset); display = true; - if (buffer.empty() && !auto_submitted) { // Ctrl+D on empty line exits (but not auto-submit) + if (buffer.empty() && !timed_out) { // Ctrl+D on empty line exits (but not timeout) LOG("EOF by user\n"); break; } From b3c0681e69453b09c7cdfbd6c930fbd500520145 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 9 Nov 2025 02:56:53 +0000 Subject: [PATCH 21/24] Reset tool deduplication on idle timeout to enable tools during agent thinking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, tool deduplication signature was only reset on explicit user input, preventing the LLM from reusing tools during idle-triggered thinking sessions. Now the signature is also reset when idle timeout triggers auto-submission, treating it as a new conversational turn where tools can be used again. Behavior before: 1. LLM uses calculator 5+5 → executes 2. User idles, timeout triggers → empty input submitted 3. LLM tries calculator 10+10 → blocked (duplicate) Behavior after: 1. LLM uses calculator 5+5 → executes 2. User idles, timeout triggers → signature reset, empty input submitted 3. LLM tries calculator 10+10 → executes (new turn) This allows the agent to fully utilize tools during autonomous thinking sessions triggered by the --idle-action-interval feature. --- tools/main/main.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 5284f554a7b95..5b068d3c93dd2 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -1324,6 +1324,10 @@ int main(int argc, char ** argv) { LOG_DBG("Idle timeout triggered during input wait\n"); LOG("\n[Idle timeout - auto-submitting empty input]\n"); update_activity_time(); // Reset timer for next iteration + + // Reset tool execution tracking to allow tools during idle thinking + g_last_executed_tool_signature = ""; + another_line = false; // Stop reading more lines break; } From a6a76a730ed3a88d4b07df855dc7d0a72ee69fec Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 9 Nov 2025 15:31:59 +0000 Subject: [PATCH 22/24] Add /\/timeout command to control idle action interval during session Adds an interactive command to view and change the idle action interval without restarting llama-cli. Usage: /\/timeout - Show current timeout and disable if enabled /\/timeout - Set idle timeout to N minutes /\/timeout 0 - Disable idle timeout Behavior: - /\/timeout with no args displays current setting and disables if enabled - /\/timeout N sets the timeout to N minutes (0 = disabled) - When setting a new non-zero timeout, timer resets immediately - Complements --idle-action-interval CLI flag with runtime control Example session: > /\/timeout Idle timeout is currently disabled (0 minutes) > /\/timeout 5 Changing idle timeout from 0 to 5 minutes Idle timeout set to 5 minutes > /\/timeout Current idle timeout: 5 minutes Disabling idle timeout This allows users to dynamically enable/disable autonomous agent thinking during a session based on their workflow needs. --- tools/main/main.cpp | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 5b068d3c93dd2..076cedba8a2ce 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -786,6 +786,8 @@ int main(int argc, char ** argv) { LOG_INF(" /\\/load - Load LLM state from GGUF file to restore exact conversation state\n"); LOG_INF(" /\\/temp - Show current temperature setting\n"); LOG_INF(" /\\/temp - Set temperature to a new value (e.g., /\\/temp 0.7)\n"); + LOG_INF(" /\\/timeout - Show or disable idle timeout (0 = disabled)\n"); + LOG_INF(" /\\/timeout - Set idle timeout to N minutes (e.g., /\\/timeout 5)\n"); LOG_INF("\n"); LOG_INF("Tool calling (when 'tools' directory exists):\n"); LOG_INF(" Model can output to get list of available tools\n"); @@ -1427,6 +1429,49 @@ int main(int argc, char ** argv) { // Keep is_interacting true and continue to wait for next input is_interacting = true; continue; + } else if (buffer.rfind("/\\/timeout", 0) == 0) { + // Handle idle timeout get/set command + std::string timeout_arg = buffer.substr(10); // Skip "/\/timeout" + // Trim whitespace + timeout_arg.erase(0, timeout_arg.find_first_not_of(" \t\n\r\f\v")); + timeout_arg.erase(timeout_arg.find_last_not_of(" \t\n\r\f\v") + 1); + + if (timeout_arg.empty()) { + // Show current timeout or disable it + LOG("\n"); + if (params.idle_action_interval > 0) { + LOG("Current idle timeout: %d minutes\n", params.idle_action_interval); + LOG("Disabling idle timeout\n"); + params.idle_action_interval = 0; + } else { + LOG("Idle timeout is currently disabled (0 minutes)\n"); + } + } else { + // Set new timeout + try { + int new_timeout = std::stoi(timeout_arg); + if (new_timeout < 0) { + LOG_ERR("Error: Timeout must be >= 0\n"); + } else { + LOG("\n"); + int old_timeout = params.idle_action_interval; + LOG("Changing idle timeout from %d to %d minutes\n", old_timeout, new_timeout); + params.idle_action_interval = new_timeout; + if (new_timeout == 0) { + LOG("Idle timeout disabled\n"); + } else { + LOG("Idle timeout set to %d minutes\n", new_timeout); + // Reset timer to start counting from now + update_activity_time(); + } + } + } catch (const std::exception & e) { + LOG_ERR("Error: Invalid timeout value '%s'\n", timeout_arg.c_str()); + } + } + // Keep is_interacting true and continue to wait for next input + is_interacting = true; + continue; } if (buffer.empty()) { // Enter key on empty line lets the user pass control back From e9685b01d84552a98ab78f21573841bc56bfe4f7 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 9 Nov 2025 19:59:35 +0000 Subject: [PATCH 23/24] Fix buffer.back() undefined behavior on idle timeout with empty input When idle timeout triggered with no user input, buffer was empty but the code attempted to access buffer.back() which is undefined behavior. The issue occurred because: 1. Timeout triggers with empty buffer 2. Condition `buffer.empty() && !timed_out` is FALSE (timed_out is true) 3. Code continues to access buffer.back() on empty string 4. Undefined behavior reads garbage memory 5. LLM sees random characters instead of empty input This explains why the LLM reported receiving a "one-word string" instead of empty input - it was reading uninitialized memory. Fix: Add check to ensure buffer is not empty before accessing buffer.back() Changed from: if (buffer.back() == '\n') { To: if (!buffer.empty() && buffer.back() == '\n') { Now idle timeout correctly sends truly empty input to the model. --- tools/main/main.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 076cedba8a2ce..3862a88d37872 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -1348,7 +1348,8 @@ int main(int argc, char ** argv) { break; } - if (buffer.back() == '\n') { + // Process newline handling only if buffer is not empty + if (!buffer.empty() && buffer.back() == '\n') { // Implement #587: // If the user wants the text to end in a newline, // this should be accomplished by explicitly adding a newline by using \ followed by return, From a19a01bc7f68cb016212b726092e8233e028053c Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 9 Nov 2025 22:36:19 +0000 Subject: [PATCH 24/24] Add debug logging for tool output token count Added LOG_DBG to show how many tokens are in tool output before injection. This helps diagnose KV cache allocation failures with large contexts and flash attention when tool output is injected. Related to investigating "decode: failed to find a memory slot" errors when using very large contexts (e.g., 96000) with flash attention enabled. --- tools/main/main.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 3862a88d37872..1beb619a76849 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -1179,6 +1179,10 @@ int main(int argc, char ** argv) { // Inject the tool output back into the conversation auto output_tokens = common_tokenize(ctx, "\n\n" + tool_output + "\n\n", false, true); + + // For large outputs with flash attention and big contexts, inject in chunks + // to avoid KV cache allocation failures + LOG_DBG("Tool output: %zu tokens\n", output_tokens.size()); embd_inp.insert(embd_inp.end(), output_tokens.begin(), output_tokens.end()); // Remember this execution to prevent duplicates