Closed · 24 commits
b7a42d8
Add CLI commands for dumping and loading activations in GGUF format
claude Nov 8, 2025
2ac5ded
Add interactive commands for saving/loading activations during conver…
claude Nov 8, 2025
efac1f5
Fix activation save/load commands - prevent unwanted inference and ad…
claude Nov 8, 2025
4538743
Critical fix: Set activation callback before context creation
claude Nov 8, 2025
de61f10
Add extensive debugging and disable warmup for activation callback
claude Nov 8, 2025
e728b4d
Improve activation collection logging and handle GPU tensors better
claude Nov 8, 2025
c762499
CRITICAL FIX: Disable graph reuse to enable activation callbacks
claude Nov 8, 2025
2194978
Implement clean LLM state save/load using llama_state API
claude Nov 8, 2025
8a709a6
Add runtime temperature control with /temp commands
claude Nov 8, 2025
5c3afd2
Fix temperature sampler replacement logic
claude Nov 8, 2025
aa0f238
Add debug logging to diagnose temperature change issue
claude Nov 8, 2025
bbe6799
Add auto KV cache save/load for llama-server faster restarts
claude Nov 8, 2025
21db687
Add HTTP endpoint to trigger KV cache save on demand
claude Nov 8, 2025
3faa493
Add streaming activation capture system for intermediate tensor analysis
claude Nov 8, 2025
3260728
Add tool-calling capability to llama-cli
claude Nov 9, 2025
66669d6
Fix tool execution to prevent help examples from triggering
claude Nov 9, 2025
0ce2fcc
Prevent tool execution inside think tags and duplicate runs
claude Nov 9, 2025
cb933ad
Add --idle-action-interval flag for automatic agent thinking
claude Nov 9, 2025
7d55eaa
Fix idle timer to only count time waiting for user input
claude Nov 9, 2025
4f94acb
Implement actual timeout support for idle action interval
claude Nov 9, 2025
b3c0681
Reset tool deduplication on idle timeout to enable tools during agent…
claude Nov 9, 2025
a6a76a7
Add /timeout command to control idle action interval during session
claude Nov 9, 2025
e9685b0
Fix buffer.back() undefined behavior on idle timeout with empty input
claude Nov 9, 2025
a19a01b
Add debug logging for tool output token count
claude Nov 9, 2025
35 changes: 35 additions & 0 deletions common/arg.cpp
@@ -1110,6 +1110,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.prompt_cache_ro = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
{"--dump-activations"}, "FNAME",
"file to dump activations to in GGUF format (default: none)",
[](common_params & params, const std::string & value) {
params.path_dump_activations = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
{"--load-activations"}, "FNAME",
"file to load activations from in GGUF format (default: none)",
[](common_params & params, const std::string & value) {
params.path_load_activations = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
{"-r", "--reverse-prompt"}, "PROMPT",
"halt generation at PROMPT, return control in interactive mode\n",
@@ -1164,6 +1178,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.interactive_first = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
{"--idle-action-interval"}, "N",
"auto-submit empty input after N minutes of idle time with no keystrokes (default: 0 = disabled)",
[](common_params & params, const std::string & value) {
params.idle_action_interval = std::stoi(value);
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
{"-mli", "--multiline-input"},
"allows you to write or paste multiple lines without ending each in '\\'",
@@ -2453,6 +2474,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--kv-cache-auto-save"}, "BASE_NAME",
"automatically save all KV cache to BASE_NAME_<timestamp>/ directory on server shutdown (default: disabled)",
[](common_params & params, const std::string & value) {
params.kv_cache_auto_save_base = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--kv-cache-auto-load"}, "DIRNAME",
"automatically load KV cache from specified timestamped directory on server startup (default: disabled)",
[](common_params & params, const std::string & value) {
params.kv_cache_auto_load = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--jinja"},
"use jinja template for chat (default: disabled)",
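For illustration only, the new command-line flags might be combined as below; the model path, file names, and the timestamp suffix are placeholders. Per the .set_examples() calls above, the activation and idle flags target llama-cli (LLAMA_EXAMPLE_MAIN) and the KV-cache flags target llama-server (LLAMA_EXAMPLE_SERVER).

# hypothetical invocations, all paths are placeholders
llama-cli    -m model.gguf -p "hello" --dump-activations activations.gguf
llama-cli    -m model.gguf -i --idle-action-interval 5               # auto-submit empty input after 5 idle minutes
llama-server -m model.gguf --kv-cache-auto-save my_cache             # writes my_cache_<timestamp>/ on shutdown
llama-server -m model.gguf --kv-cache-auto-load my_cache_<timestamp>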
7 changes: 7 additions & 0 deletions common/common.h
@@ -328,6 +328,8 @@ struct common_params {
std::string system_prompt = ""; // NOLINT
std::string prompt_file = ""; // store the external prompt file name // NOLINT
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
std::string path_dump_activations = ""; // path to GGUF file for dumping activations // NOLINT
std::string path_load_activations = ""; // path to GGUF file for loading activations // NOLINT
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
@@ -370,6 +372,7 @@ struct common_params {
bool special = false; // enable special token output
bool interactive = false; // interactive mode
bool interactive_first = false; // wait for user input immediately
int32_t idle_action_interval = 0; // auto-submit empty input after N minutes of idle (0 = disabled)
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

@@ -457,6 +460,10 @@ struct common_params {

std::string slot_save_path;

// Auto KV cache save/load for faster server restarts
std::string kv_cache_auto_save_base; // base name for auto-saving KV cache on shutdown (with timestamp)
std::string kv_cache_auto_load; // specific timestamped name to load on startup

float slot_prompt_similarity = 0.1f;

// batched-bench params
56 changes: 56 additions & 0 deletions common/console.cpp
@@ -501,4 +501,60 @@ namespace console {
return readline_advanced(line, multiline_input);
}

bool readline_with_timeout(std::string & line, bool multiline_input, int timeout_seconds, bool & timed_out) {
timed_out = false;

if (timeout_seconds <= 0) {
// No timeout, use regular readline
return readline(line, multiline_input);
}

#if defined(_WIN32)
// Windows: check if input is available with timeout
HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
DWORD result = WaitForSingleObject(hStdin, timeout_seconds * 1000);

if (result == WAIT_TIMEOUT) {
timed_out = true;
line.clear();
return false;
}

if (result != WAIT_OBJECT_0) {
// Error occurred
line.clear();
return false;
}

// Input is available, use regular readline
return readline(line, multiline_input);
#else
// Unix: use select() to check for input with timeout
fd_set readfds;
struct timeval tv;

FD_ZERO(&readfds);
FD_SET(STDIN_FILENO, &readfds);

tv.tv_sec = timeout_seconds;
tv.tv_usec = 0;

int retval = select(STDIN_FILENO + 1, &readfds, NULL, NULL, &tv);

if (retval == -1) {
// Error occurred
line.clear();
return false;
} else if (retval == 0) {
// Timeout occurred
timed_out = true;
line.clear();
return false;
}

// Input is available, use regular readline
return readline(line, multiline_input);
#endif
}

}
1 change: 1 addition & 0 deletions common/console.h
@@ -16,4 +16,5 @@ namespace console {
void cleanup();
void set_display(display_t display);
bool readline(std::string & line, bool multiline_input);
bool readline_with_timeout(std::string & line, bool multiline_input, int timeout_seconds, bool & timed_out);
}
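A minimal sketch of how the interactive loop in main.cpp might drive readline_with_timeout(); the minute-to-second conversion of --idle-action-interval and the variable names are assumptions about the intended wiring, not code taken from this PR.

// hedged sketch: poll for user input, but hand the turn to the model after an idle window
std::string line;
bool timed_out = false;
const int timeout_s = params.idle_action_interval > 0 ? params.idle_action_interval * 60 : 0;

const bool another_line = console::readline_with_timeout(line, params.multiline_input, timeout_s, timed_out);
if (timed_out) {
    // no keystrokes before the deadline: submit empty input so the model can act on its own
    line.clear();
}
// `another_line` keeps the same multiline-continuation meaning as console::readline()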
94 changes: 94 additions & 0 deletions common/sampling.cpp
@@ -4,6 +4,7 @@
#include "log.h"

#include <cmath>
#include <cstring>
#include <unordered_map>
#include <algorithm>

@@ -599,3 +600,96 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri

return samplers;
}

// Get current temperature from the sampler
float common_sampler_get_temp(const struct common_sampler * gsmpl) {
if (!gsmpl) {
return 0.0f;
}
return gsmpl->params.temp;
}

// Set temperature at runtime by replacing the temperature sampler in the chain
bool common_sampler_set_temp(struct common_sampler * gsmpl, float new_temp) {
if (!gsmpl || !gsmpl->chain) {
LOG_ERR("%s: invalid sampler or chain\n", __func__);
return false;
}

// Find the temperature sampler in the chain
const int n_samplers = llama_sampler_chain_n(gsmpl->chain);
int temp_idx = -1;

LOG_INF("%s: searching for temperature sampler in chain of %d samplers\n", __func__, n_samplers);

for (int i = 0; i < n_samplers; i++) {
struct llama_sampler * s = llama_sampler_chain_get(gsmpl->chain, i);
const char * name = llama_sampler_name(s);
LOG_INF("%s: sampler[%d] = '%s'\n", __func__, i, name);

// Look for "temp" or "temp-ext" sampler
if (strcmp(name, "temp") == 0 || strcmp(name, "temp-ext") == 0) {
temp_idx = i;
LOG_INF("%s: found temperature sampler '%s' at index %d\n", __func__, name, i);
break;
}
}

if (temp_idx == -1) {
// No temperature sampler found - this might happen with mirostat
LOG_ERR("%s: no temperature sampler found in chain\n", __func__);
return false;
}

LOG_INF("%s: removing old temperature sampler at index %d\n", __func__, temp_idx);

// Remove the old temperature sampler
struct llama_sampler * old_temp = llama_sampler_chain_remove(gsmpl->chain, temp_idx);
if (old_temp) {
llama_sampler_free(old_temp);
LOG_INF("%s: freed old temperature sampler\n", __func__);
}

// Collect all samplers that come after the temp position
std::vector<struct llama_sampler *> samplers_after;
int n_after = llama_sampler_chain_n(gsmpl->chain) - temp_idx;
LOG_INF("%s: collecting %d samplers after temp position\n", __func__, n_after);

for (int i = 0; i < n_after; i++) {
struct llama_sampler * s = llama_sampler_chain_remove(gsmpl->chain, temp_idx);
const char * name = llama_sampler_name(s);
LOG_INF("%s: removed sampler '%s'\n", __func__, name);
samplers_after.push_back(s);
}

// Create and add new temperature sampler
struct llama_sampler * new_temp_sampler;

// Use temp_ext if dynamic temperature was enabled, otherwise use simple temp
if (gsmpl->params.dynatemp_range > 0.0f) {
LOG_INF("%s: creating temp-ext sampler with temp=%.2f, range=%.2f, exp=%.2f\n",
__func__, new_temp, gsmpl->params.dynatemp_range, gsmpl->params.dynatemp_exponent);
new_temp_sampler = llama_sampler_init_temp_ext(new_temp, gsmpl->params.dynatemp_range, gsmpl->params.dynatemp_exponent);
} else {
LOG_INF("%s: creating temp sampler with temp=%.2f\n", __func__, new_temp);
new_temp_sampler = llama_sampler_init_temp(new_temp);
}

llama_sampler_chain_add(gsmpl->chain, new_temp_sampler);
LOG_INF("%s: added new temperature sampler\n", __func__);

// Add back the samplers that came after
for (auto * s : samplers_after) {
const char * name = llama_sampler_name(s);
llama_sampler_chain_add(gsmpl->chain, s);
LOG_INF("%s: re-added sampler '%s'\n", __func__, name);
}

// Update the params to reflect the new temperature
gsmpl->params.temp = new_temp;

LOG_INF("%s: final chain has %d samplers\n", __func__, llama_sampler_chain_n(gsmpl->chain));
LOG_INF("%s: temperature update complete\n", __func__);

return true;
}
4 changes: 4 additions & 0 deletions common/sampling.h
@@ -99,6 +99,10 @@ std::string common_sampler_print(const struct common_sampler * gsmpl);
// get a string representation of the last accepted tokens
std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);

// get/set temperature at runtime
float common_sampler_get_temp(const struct common_sampler * gsmpl);
bool common_sampler_set_temp(struct common_sampler * gsmpl, float temp);

char common_sampler_type_to_chr(enum common_sampler_type cnstr);
std::string common_sampler_type_to_str(enum common_sampler_type cnstr);

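A minimal sketch of how an interactive /temp command handler could use the new accessors; the gsmpl and arg variables and the command parsing are illustrative assumptions, only the two common_sampler_* calls come from this PR.

// hedged sketch: runtime temperature control
const float cur = common_sampler_get_temp(gsmpl);   // reads params.temp
const float req = std::stof(arg);                    // e.g. "/temp 0.4" -> 0.4f

if (common_sampler_set_temp(gsmpl, req)) {
    LOG("temperature changed from %.2f to %.2f\n", cur, req);
} else {
    LOG_ERR("temperature unchanged: no temp/temp-ext sampler in the chain (e.g. with mirostat)\n");
}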
125 changes: 125 additions & 0 deletions tools/main/main-state-save.cpp
@@ -0,0 +1,125 @@
// State save/load functions for main.cpp
// This file contains the simplified implementation using llama_state_get_data/set_data

#include "llama.h"
#include "log.h"
#include "gguf.h"
#include <vector>
#include <fstream>
#include <cstring>  // strcmp() is used below when scanning GGUF metadata keys

// Save complete LLM state to GGUF file
// This includes: KV cache, logits, embeddings, RNG state
static bool save_llm_state_to_gguf(llama_context * ctx, const std::string & filename) {
LOG("\nSaving LLM state to %s...\n", filename.c_str());

// Get the size of the state
const size_t state_size = llama_state_get_size(ctx);
LOG("State size: %zu bytes (%.2f MB)\n", state_size, state_size / (1024.0 * 1024.0));

// Allocate buffer and get state data
std::vector<uint8_t> state_data(state_size);
const size_t written = llama_state_get_data(ctx, state_data.data(), state_size);

if (written != state_size) {
LOG_ERR("Failed to get state data: got %zu bytes, expected %zu\n", written, state_size);
return false;
}

// Create GGUF context
struct gguf_context * gguf_ctx = gguf_init_empty();

// Add metadata
gguf_set_val_u32(gguf_ctx, "llm_state.version", 1);
gguf_set_val_u64(gguf_ctx, "llm_state.size", state_size);
gguf_set_val_str(gguf_ctx, "llm_state.type", "kv_cache_rng_logits_embeddings");

// For GGUF, we need to add the state as a tensor
// Create a ggml context for the tensor
struct ggml_init_params params = {
/*.mem_size =*/ state_size + 1024*1024, // Extra space for tensor metadata
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true, // We already have the data
};

struct ggml_context * ggml_ctx = ggml_init(params);

// Create a 1D tensor to hold the state data
int64_t ne[4] = {(int64_t)state_size, 1, 1, 1};
struct ggml_tensor * state_tensor = ggml_new_tensor(ggml_ctx, GGML_TYPE_I8, 1, ne);
ggml_set_name(state_tensor, "llm_state_data");
state_tensor->data = state_data.data();

// Add tensor to GGUF
gguf_add_tensor(gguf_ctx, state_tensor);

// Write to file
gguf_write_to_file(gguf_ctx, filename.c_str(), false);

LOG("Successfully saved LLM state (%zu bytes)\n", written);

// Cleanup
ggml_free(ggml_ctx);
gguf_free(gguf_ctx);

return true;
}

// Load complete LLM state from GGUF file
static bool load_llm_state_from_gguf(llama_context * ctx, const std::string & filename) {
LOG("\nLoading LLM state from %s...\n", filename.c_str());

struct ggml_context * ggml_ctx = NULL;

struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ggml_ctx,
};

struct gguf_context * gguf_ctx = gguf_init_from_file(filename.c_str(), params);

if (!gguf_ctx) {
LOG_ERR("Failed to load state file: %s\n", filename.c_str());
return false;
}

// Read metadata
const int n_kv = gguf_get_n_kv(gguf_ctx);
uint32_t version = 0;
uint64_t state_size = 0;

for (int i = 0; i < n_kv; i++) {
const char * key = gguf_get_key(gguf_ctx, i);
const enum gguf_type type = gguf_get_kv_type(gguf_ctx, i);

if (strcmp(key, "llm_state.version") == 0 && type == GGUF_TYPE_UINT32) {
version = gguf_get_val_u32(gguf_ctx, i);
} else if (strcmp(key, "llm_state.size") == 0 && type == GGUF_TYPE_UINT64) {
state_size = gguf_get_val_u64(gguf_ctx, i);
}
}

LOG("State version: %u, size: %lu bytes (%.2f MB)\n", version, state_size, state_size / (1024.0 * 1024.0));

// Get the state tensor
struct ggml_tensor * state_tensor = ggml_get_tensor(ggml_ctx, "llm_state_data");
if (!state_tensor) {
LOG_ERR("State tensor not found in file\n");
ggml_free(ggml_ctx);  // free the data context created by gguf_init_from_file
gguf_free(gguf_ctx);
return false;
}

// Set the state
const size_t loaded = llama_state_set_data(ctx, (const uint8_t*)state_tensor->data, ggml_nbytes(state_tensor));

if (loaded == 0) {
LOG_ERR("Failed to set state data\n");
ggml_free(ggml_ctx);
gguf_free(gguf_ctx);
return false;
}

LOG("Successfully loaded LLM state (%zu bytes)\n", loaded);

ggml_free(ggml_ctx);  // the ggml context owns the loaded tensor data
gguf_free(gguf_ctx);

return true;
}
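A minimal usage sketch; the file name is a placeholder and the call sites are an assumption (in the PR these helpers are meant to back state save/load from main.cpp). Because llama_state_set_data restores the KV cache, logits, embeddings, and RNG verbatim, loading only makes sense into a context created from the same model with a compatible context size.

// hedged sketch: persist and later restore the full llama_context state
if (!save_llm_state_to_gguf(ctx, "session_state.gguf")) {
    LOG_ERR("state save failed\n");
}

// ... later, or in a fresh process that recreated the same model/context ...
if (!load_llm_state_from_gguf(ctx, "session_state.gguf")) {
    LOG_ERR("state load failed\n");
}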