Closed · 24 commits
b7a42d8
Add CLI commands for dumping and loading activations in GGUF format
claude Nov 8, 2025
2ac5ded
Add interactive commands for saving/loading activations during conver…
claude Nov 8, 2025
efac1f5
Fix activation save/load commands - prevent unwanted inference and ad…
claude Nov 8, 2025
4538743
Critical fix: Set activation callback before context creation
claude Nov 8, 2025
de61f10
Add extensive debugging and disable warmup for activation callback
claude Nov 8, 2025
e728b4d
Improve activation collection logging and handle GPU tensors better
claude Nov 8, 2025
c762499
CRITICAL FIX: Disable graph reuse to enable activation callbacks
claude Nov 8, 2025
2194978
Implement clean LLM state save/load using llama_state API
claude Nov 8, 2025
8a709a6
Add runtime temperature control with /temp commands
claude Nov 8, 2025
5c3afd2
Fix temperature sampler replacement logic
claude Nov 8, 2025
aa0f238
Add debug logging to diagnose temperature change issue
claude Nov 8, 2025
bbe6799
Add auto KV cache save/load for llama-server faster restarts
claude Nov 8, 2025
21db687
Add HTTP endpoint to trigger KV cache save on demand
claude Nov 8, 2025
3faa493
Add streaming activation capture system for intermediate tensor analysis
claude Nov 8, 2025
3260728
Add tool-calling capability to llama-cli
claude Nov 9, 2025
66669d6
Fix tool execution to prevent help examples from triggering
claude Nov 9, 2025
0ce2fcc
Prevent tool execution inside think tags and duplicate runs
claude Nov 9, 2025
cb933ad
Add --idle-action-interval flag for automatic agent thinking
claude Nov 9, 2025
7d55eaa
Fix idle timer to only count time waiting for user input
claude Nov 9, 2025
4f94acb
Implement actual timeout support for idle action interval
claude Nov 9, 2025
b3c0681
Reset tool deduplication on idle timeout to enable tools during agent…
claude Nov 9, 2025
a6a76a7
Add /timeout command to control idle action interval during session
claude Nov 9, 2025
e9685b0
Fix buffer.back() undefined behavior on idle timeout with empty input
claude Nov 9, 2025
a19a01b
Add debug logging for tool output token count
claude Nov 9, 2025
35 changes: 35 additions & 0 deletions common/arg.cpp
@@ -1110,6 +1110,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.prompt_cache_ro = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
{"--dump-activations"}, "FNAME",
"file to dump activations to in GGUF format (default: none)",
[](common_params & params, const std::string & value) {
params.path_dump_activations = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
{"--load-activations"}, "FNAME",
"file to load activations from in GGUF format (default: none)",
[](common_params & params, const std::string & value) {
params.path_load_activations = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
{"-r", "--reverse-prompt"}, "PROMPT",
"halt generation at PROMPT, return control in interactive mode\n",
@@ -1164,6 +1178,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.interactive_first = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
{"--idle-action-interval"}, "N",
"auto-submit empty input after N minutes of idle time with no keystrokes (default: 0 = disabled)",
[](common_params & params, const std::string & value) {
params.idle_action_interval = std::stoi(value);
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
{"-mli", "--multiline-input"},
"allows you to write or paste multiple lines without ending each in '\\'",
@@ -2453,6 +2474,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--kv-cache-auto-save"}, "BASE_NAME",
"automatically save all KV cache to BASE_NAME_<timestamp>/ directory on server shutdown (default: disabled)",
[](common_params & params, const std::string & value) {
params.kv_cache_auto_save_base = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--kv-cache-auto-load"}, "DIRNAME",
"automatically load KV cache from specified timestamped directory on server startup (default: disabled)",
[](common_params & params, const std::string & value) {
params.kv_cache_auto_load = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--jinja"},
"use jinja template for chat (default: disabled)",
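For illustration only, the new command-line flags might be combined as below; the model path, file names, and the timestamp suffix are placeholders. Per the .set_examples() calls above, the activation and idle flags target llama-cli (LLAMA_EXAMPLE_MAIN) and the KV-cache flags target llama-server (LLAMA_EXAMPLE_SERVER).

# hypothetical invocations, all paths are placeholders
llama-cli    -m model.gguf -p "hello" --dump-activations activations.gguf
llama-cli    -m model.gguf -i --idle-action-interval 5               # auto-submit empty input after 5 idle minutes
llama-server -m model.gguf --kv-cache-auto-save my_cache             # writes my_cache_<timestamp>/ on shutdown
llama-server -m model.gguf --kv-cache-auto-load my_cache_<timestamp>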
7 changes: 7 additions & 0 deletions common/common.h
@@ -328,6 +328,8 @@ struct common_params {
std::string system_prompt = ""; // NOLINT
std::string prompt_file = ""; // store the external prompt file name // NOLINT
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
std::string path_dump_activations = ""; // path to GGUF file for dumping activations // NOLINT
std::string path_load_activations = ""; // path to GGUF file for loading activations // NOLINT
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
@@ -370,6 +372,7 @@ struct common_params {
bool special = false; // enable special token output
bool interactive = false; // interactive mode
bool interactive_first = false; // wait for user input immediately
int32_t idle_action_interval = 0; // auto-submit empty input after N minutes of idle (0 = disabled)
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

@@ -457,6 +460,10 @@ struct common_params {

std::string slot_save_path;

// Auto KV cache save/load for faster server restarts
std::string kv_cache_auto_save_base; // base name for auto-saving KV cache on shutdown (with timestamp)
std::string kv_cache_auto_load; // specific timestamped name to load on startup

float slot_prompt_similarity = 0.1f;

// batched-bench params
56 changes: 56 additions & 0 deletions common/console.cpp
@@ -501,4 +501,60 @@ namespace console {
return readline_advanced(line, multiline_input);
}

bool readline_with_timeout(std::string & line, bool multiline_input, int timeout_seconds, bool & timed_out) {
timed_out = false;

if (timeout_seconds <= 0) {
// No timeout, use regular readline
return readline(line, multiline_input);
}

#if defined(_WIN32)
// Windows: check if input is available with timeout
HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
DWORD result = WaitForSingleObject(hStdin, timeout_seconds * 1000);

if (result == WAIT_TIMEOUT) {
timed_out = true;
line.clear();
return false;
}

if (result != WAIT_OBJECT_0) {
// Error occurred
line.clear();
return false;
}

// Input is available, use regular readline
return readline(line, multiline_input);
#else
// Unix: use select() to check for input with timeout
fd_set readfds;
struct timeval tv;

FD_ZERO(&readfds);
FD_SET(STDIN_FILENO, &readfds);

tv.tv_sec = timeout_seconds;
tv.tv_usec = 0;

int retval = select(STDIN_FILENO + 1, &readfds, NULL, NULL, &tv);

if (retval == -1) {
// Error occurred
line.clear();
return false;
} else if (retval == 0) {
// Timeout occurred
timed_out = true;
line.clear();
return false;
}

// Input is available, use regular readline
return readline(line, multiline_input);
#endif
}

}
1 change: 1 addition & 0 deletions common/console.h
@@ -16,4 +16,5 @@ namespace console {
void cleanup();
void set_display(display_t display);
bool readline(std::string & line, bool multiline_input);
bool readline_with_timeout(std::string & line, bool multiline_input, int timeout_seconds, bool & timed_out);
}
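A minimal sketch of how the interactive loop in main.cpp might drive readline_with_timeout(); the minute-to-second conversion of --idle-action-interval and the variable names are assumptions about the intended wiring, not code taken from this PR.

// hedged sketch: poll for user input, but hand the turn to the model after an idle window
std::string line;
bool timed_out = false;
const int timeout_s = params.idle_action_interval > 0 ? params.idle_action_interval * 60 : 0;

const bool another_line = console::readline_with_timeout(line, params.multiline_input, timeout_s, timed_out);
if (timed_out) {
    // no keystrokes before the deadline: submit empty input so the model can act on its own
    line.clear();
}
// `another_line` keeps the same multiline-continuation meaning as console::readline()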
94 changes: 94 additions & 0 deletions common/sampling.cpp
@@ -4,6 +4,7 @@
#include "log.h"

#include <cmath>
#include <cstring>
#include <unordered_map>
#include <algorithm>

@@ -599,3 +600,96 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri

return samplers;
}

// Get current temperature from the sampler
float common_sampler_get_temp(const struct common_sampler * gsmpl) {
if (!gsmpl) {
return 0.0f;
}
return gsmpl->params.temp;
}

// Set temperature at runtime by replacing the temperature sampler in the chain
bool common_sampler_set_temp(struct common_sampler * gsmpl, float new_temp) {
if (!gsmpl || !gsmpl->chain) {
LOG_ERR("%s: invalid sampler or chain\n", __func__);
return false;
}

// Find the temperature sampler in the chain
const int n_samplers = llama_sampler_chain_n(gsmpl->chain);
int temp_idx = -1;

LOG_INF("%s: searching for temperature sampler in chain of %d samplers\n", __func__, n_samplers);

for (int i = 0; i < n_samplers; i++) {
struct llama_sampler * s = llama_sampler_chain_get(gsmpl->chain, i);
const char * name = llama_sampler_name(s);
LOG_INF("%s: sampler[%d] = '%s'\n", __func__, i, name);

// Look for "temp" or "temp-ext" sampler
if (strcmp(name, "temp") == 0 || strcmp(name, "temp-ext") == 0) {
temp_idx = i;
LOG_INF("%s: found temperature sampler '%s' at index %d\n", __func__, name, i);
break;
}
}

if (temp_idx == -1) {
// No temperature sampler found - this might happen with mirostat
LOG_ERR("%s: no temperature sampler found in chain\n", __func__);
return false;
}

LOG_INF("%s: removing old temperature sampler at index %d\n", __func__, temp_idx);

// Remove the old temperature sampler
struct llama_sampler * old_temp = llama_sampler_chain_remove(gsmpl->chain, temp_idx);
if (old_temp) {
llama_sampler_free(old_temp);
LOG_INF("%s: freed old temperature sampler\n", __func__);
}

// Collect all samplers that come after the temp position
std::vector<struct llama_sampler *> samplers_after;
int n_after = llama_sampler_chain_n(gsmpl->chain) - temp_idx;
LOG_INF("%s: collecting %d samplers after temp position\n", __func__, n_after);

for (int i = 0; i < n_after; i++) {
struct llama_sampler * s = llama_sampler_chain_remove(gsmpl->chain, temp_idx);
const char * name = llama_sampler_name(s);
LOG_INF("%s: removed sampler '%s'\n", __func__, name);
samplers_after.push_back(s);
}

// Create and add new temperature sampler
struct llama_sampler * new_temp_sampler;

// Use temp_ext if dynamic temperature was enabled, otherwise use simple temp
if (gsmpl->params.dynatemp_range > 0.0f) {
LOG_INF("%s: creating temp-ext sampler with temp=%.2f, range=%.2f, exp=%.2f\n",
__func__, new_temp, gsmpl->params.dynatemp_range, gsmpl->params.dynatemp_exponent);
new_temp_sampler = llama_sampler_init_temp_ext(new_temp, gsmpl->params.dynatemp_range, gsmpl->params.dynatemp_exponent);
} else {
LOG_INF("%s: creating temp sampler with temp=%.2f\n", __func__, new_temp);
new_temp_sampler = llama_sampler_init_temp(new_temp);
}

llama_sampler_chain_add(gsmpl->chain, new_temp_sampler);
LOG_INF("%s: added new temperature sampler\n", __func__);

// Add back the samplers that came after
for (auto * s : samplers_after) {
const char * name = llama_sampler_name(s);
llama_sampler_chain_add(gsmpl->chain, s);
LOG_INF("%s: re-added sampler '%s'\n", __func__, name);
}

// Update the params to reflect the new temperature
gsmpl->params.temp = new_temp;

LOG_INF("%s: final chain has %d samplers\n", __func__, llama_sampler_chain_n(gsmpl->chain));
LOG_INF("%s: temperature update complete\n", __func__);

return true;
}
4 changes: 4 additions & 0 deletions common/sampling.h
@@ -99,6 +99,10 @@ std::string common_sampler_print(const struct common_sampler * gsmpl);
// get a string representation of the last accepted tokens
std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);

// get/set temperature at runtime
float common_sampler_get_temp(const struct common_sampler * gsmpl);
bool common_sampler_set_temp(struct common_sampler * gsmpl, float temp);

char common_sampler_type_to_chr(enum common_sampler_type cnstr);
std::string common_sampler_type_to_str(enum common_sampler_type cnstr);

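A minimal sketch of how an interactive /temp command handler could use the new accessors; the gsmpl and arg variables and the command parsing are illustrative assumptions, only the two common_sampler_* calls come from this PR.

// hedged sketch: runtime temperature control
const float cur = common_sampler_get_temp(gsmpl);   // reads params.temp
const float req = std::stof(arg);                    // e.g. "/temp 0.4" -> 0.4f

if (common_sampler_set_temp(gsmpl, req)) {
    LOG("temperature changed from %.2f to %.2f\n", cur, req);
} else {
    LOG_ERR("temperature unchanged: no temp/temp-ext sampler in the chain (e.g. with mirostat)\n");
}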
125 changes: 125 additions & 0 deletions tools/main/main-state-save.cpp
@@ -0,0 +1,125 @@
// State save/load functions for main.cpp
// This file contains the simplified implementation using llama_state_get_data/set_data

#include "llama.h"
#include "log.h"
#include "gguf.h"
#include <vector>
#include <fstream>
#include <cstring>  // strcmp() is used below when scanning GGUF metadata keys

// Save complete LLM state to GGUF file
// This includes: KV cache, logits, embeddings, RNG state
static bool save_llm_state_to_gguf(llama_context * ctx, const std::string & filename) {
LOG("\nSaving LLM state to %s...\n", filename.c_str());

// Get the size of the state
const size_t state_size = llama_state_get_size(ctx);
LOG("State size: %zu bytes (%.2f MB)\n", state_size, state_size / (1024.0 * 1024.0));

// Allocate buffer and get state data
std::vector<uint8_t> state_data(state_size);
const size_t written = llama_state_get_data(ctx, state_data.data(), state_size);

if (written != state_size) {
LOG_ERR("Failed to get state data: got %zu bytes, expected %zu\n", written, state_size);
return false;
}

// Create GGUF context
struct gguf_context * gguf_ctx = gguf_init_empty();

// Add metadata
gguf_set_val_u32(gguf_ctx, "llm_state.version", 1);
gguf_set_val_u64(gguf_ctx, "llm_state.size", state_size);
gguf_set_val_str(gguf_ctx, "llm_state.type", "kv_cache_rng_logits_embeddings");

// For GGUF, we need to add the state as a tensor
// Create a ggml context for the tensor
struct ggml_init_params params = {
/*.mem_size =*/ state_size + 1024*1024, // Extra space for tensor metadata
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true, // We already have the data
};

struct ggml_context * ggml_ctx = ggml_init(params);

// Create a 1D tensor to hold the state data
int64_t ne[4] = {(int64_t)state_size, 1, 1, 1};
struct ggml_tensor * state_tensor = ggml_new_tensor(ggml_ctx, GGML_TYPE_I8, 1, ne);
ggml_set_name(state_tensor, "llm_state_data");
state_tensor->data = state_data.data();

// Add tensor to GGUF
gguf_add_tensor(gguf_ctx, state_tensor);

// Write to file
gguf_write_to_file(gguf_ctx, filename.c_str(), false);

LOG("Successfully saved LLM state (%zu bytes)\n", written);

// Cleanup
ggml_free(ggml_ctx);
gguf_free(gguf_ctx);

return true;
}

// Load complete LLM state from GGUF file
static bool load_llm_state_from_gguf(llama_context * ctx, const std::string & filename) {
LOG("\nLoading LLM state from %s...\n", filename.c_str());

struct ggml_context * ggml_ctx = NULL;

struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ggml_ctx,
};

struct gguf_context * gguf_ctx = gguf_init_from_file(filename.c_str(), params);

if (!gguf_ctx) {
LOG_ERR("Failed to load state file: %s\n", filename.c_str());
return false;
}

// Read metadata
const int n_kv = gguf_get_n_kv(gguf_ctx);
uint32_t version = 0;
uint64_t state_size = 0;

for (int i = 0; i < n_kv; i++) {
const char * key = gguf_get_key(gguf_ctx, i);
const enum gguf_type type = gguf_get_kv_type(gguf_ctx, i);

if (strcmp(key, "llm_state.version") == 0 && type == GGUF_TYPE_UINT32) {
version = gguf_get_val_u32(gguf_ctx, i);
} else if (strcmp(key, "llm_state.size") == 0 && type == GGUF_TYPE_UINT64) {
state_size = gguf_get_val_u64(gguf_ctx, i);
}
}

LOG("State version: %u, size: %lu bytes (%.2f MB)\n", version, state_size, state_size / (1024.0 * 1024.0));

// Get the state tensor
struct ggml_tensor * state_tensor = ggml_get_tensor(ggml_ctx, "llm_state_data");
if (!state_tensor) {
LOG_ERR("State tensor not found in file\n");
ggml_free(ggml_ctx);  // free the data context created by gguf_init_from_file
gguf_free(gguf_ctx);
return false;
}

// Set the state
const size_t loaded = llama_state_set_data(ctx, (const uint8_t*)state_tensor->data, ggml_nbytes(state_tensor));

if (loaded == 0) {
LOG_ERR("Failed to set state data\n");
ggml_free(ggml_ctx);
gguf_free(gguf_ctx);
return false;
}

LOG("Successfully loaded LLM state (%zu bytes)\n", loaded);

ggml_free(ggml_ctx);  // the ggml context owns the loaded tensor data
gguf_free(gguf_ctx);

return true;
}
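A minimal usage sketch; the file name is a placeholder and the call sites are an assumption (in the PR these helpers are meant to back state save/load from main.cpp). Because llama_state_set_data restores the KV cache, logits, embeddings, and RNG verbatim, loading only makes sense into a context created from the same model with a compatible context size.

// hedged sketch: persist and later restore the full llama_context state
if (!save_llm_state_to_gguf(ctx, "session_state.gguf")) {
    LOG_ERR("state save failed\n");
}

// ... later, or in a fresh process that recreated the same model/context ...
if (!load_llm_state_from_gguf(ctx, "session_state.gguf")) {
    LOG_ERR("state load failed\n");
}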