14 changes: 12 additions & 2 deletions common/arg.cpp
@@ -2574,12 +2574,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
add_opt(common_arg(
{"--reasoning-budget"}, "N",
"controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
"controls the maximum number of thinking tokens allowed; -1 for unlimited, 0 to disable thinking, or a positive value to limit thinking tokens (default: -1)",
[](common_params & params, int value) {
if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
if (value < -1) { throw std::invalid_argument("invalid value: must be -1 (unlimited), 0 (disabled), or a positive number"); }
params.reasoning_budget = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
add_opt(common_arg(
{"--reasoning-force-close-message"}, "STRING",
string_format(
"if specified, forces the model to close its reasoning/thoughts when generating this message (default: %s)\n",
params.reasoning_force_close_message.c_str()
),
[](common_params & params, const std::string & value) {
params.reasoning_force_close_message = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_FORCE_CLOSE_MESSAGE"));
add_opt(common_arg(
{"--chat-template"}, "JINJA_TEMPLATE",
string_format(
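The relaxed validation accepts any value greater than or equal to -1. A minimal sketch of which values pass, mirroring the check in the diff; `parse_reasoning_budget` is a hypothetical helper, not part of the PR:

```cpp
// Hypothetical helper mirroring the validation added above: anything below
// -1 is rejected; -1, 0, and positive values are accepted.
#include <cstdio>
#include <stdexcept>

static int parse_reasoning_budget(int value) {
    if (value < -1) { throw std::invalid_argument("invalid value: must be -1 (unlimited), 0 (disabled), or a positive number"); }
    return value;
}

int main() {
    for (int v : {-2, -1, 0, 128}) {
        try {
            printf("%4d -> accepted\n", parse_reasoning_budget(v));
        } catch (const std::invalid_argument & e) {
            printf("%4d -> rejected: %s\n", v, e.what());
        }
    }
}
```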
7 changes: 7 additions & 0 deletions common/chat-parser-xml-toolcall.cpp
@@ -705,6 +705,9 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons

// Parse content
bool reasoning_unclosed = builder.syntax().thinking_forced_open;
if (reasoning_unclosed) {
builder.mark_reasoning_active(end_think);
}
std::string unclosed_reasoning_content("");
for (;;) {
auto tc = try_find_2_literal_splited_by_spaces(builder, form.scope_start, form.tool_start);
@@ -730,6 +733,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
}
} else {
reasoning_unclosed = false;
builder.mark_reasoning_closed();
std::string reasoning_content;
if (pos == std::string::npos) {
reasoning_content = std::move(content);
@@ -766,13 +770,15 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
bool toolcall_in_think = false;
for (auto think_start = content.find(start_think); think_start != std::string::npos; think_start = content.find(start_think, think_start)) {
if (auto think_end = content.find(end_think, think_start + start_think.size()); think_end != std::string::npos) {
builder.mark_reasoning_active(end_think);
if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
auto reasoning_content = content.substr(think_start + start_think.size(), think_end - think_start - start_think.size());
builder.add_reasoning_content(reasoning_content);
think_start = erase_spaces(content, think_start, think_end + end_think.size() - 1);
} else {
think_start = think_end + end_think.size() - 1;
}
builder.mark_reasoning_closed();
} else {
// This <tool_call> start is in thinking block, skip this tool call
// This <tool_call> start is in thinking block
@@ -782,6 +788,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
unclosed_reasoning_content = content.substr(think_start + start_think.size()) + tool_call_start;
}
reasoning_unclosed = true;
builder.mark_reasoning_active(end_think);
content.resize(think_start);
toolcall_in_think = true;
}
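The new calls bracket each thinking block: `mark_reasoning_active` fires when a `<think>` opens (or when thinking was forced open by the template), and `mark_reasoning_closed` fires once the end tag is found. A toy, self-contained trace of those transitions, using a local copy of the struct added in `common/chat.h` further down; the scan is a simplified stand-in for the real parser:

```cpp
// Toy trace of reasoning_status transitions while scanning a transcript.
#include <cstdio>
#include <string>

struct common_chat_reasoning_status {
    bool detected = false;
    bool active = false;
    std::string end_tag;
};

int main() {
    common_chat_reasoning_status st;
    const std::string input = "<think>plan the answer</think>final answer";

    if (auto open = input.find("<think>"); open != std::string::npos) {
        st.detected = st.active = true;            // mark_reasoning_active
        st.end_tag = "</think>";
        if (input.find("</think>", open) != std::string::npos) {
            st.active = false;                     // mark_reasoning_closed
        }
    }
    printf("detected=%d active=%d end_tag=%s\n",
           (int) st.detected, (int) st.active, st.end_tag.c_str());
    // -> detected=1 active=0 end_tag=</think>
}
```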
28 changes: 28 additions & 0 deletions common/chat-parser.cpp
@@ -156,6 +156,20 @@ void common_chat_msg_parser::add_reasoning_content(const std::string &reasoning_
result_.reasoning_content += reasoning_content;
}

void common_chat_msg_parser::mark_reasoning_active(const std::string & end_tag) {
result_.reasoning_status.detected = true;
result_.reasoning_status.active = true;
if (!end_tag.empty()) {
result_.reasoning_status.end_tag = end_tag;
}
}

void common_chat_msg_parser::mark_reasoning_closed() {
if (result_.reasoning_status.detected) {
result_.reasoning_status.active = false;
}
}

bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::string & id, const std::string & arguments) {
if (name.empty()) {
return false;
@@ -329,11 +343,13 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
const size_t saved_pos = pos_;
const size_t saved_content_size = result_.content.size();
const size_t saved_reasoning_size = result_.reasoning_content.size();
const auto saved_reasoning_status = result_.reasoning_status;

auto restore_state = [&]() {
move_to(saved_pos);
result_.content.resize(saved_content_size);
result_.reasoning_content.resize(saved_reasoning_size);
result_.reasoning_status = saved_reasoning_status;
};

// Allow leading whitespace to be preserved as content when reasoning is present at the start
@@ -370,9 +386,11 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
if (whitespace_end > pos_) {
add_content(input_.substr(pos_, whitespace_end - pos_));
}
mark_reasoning_active(end_think);
set_reasoning_prefix(cursor);
cursor += start_think.size();
} else if (syntax_.thinking_forced_open) {
mark_reasoning_active(end_think);
cursor = whitespace_end;
} else {
restore_state();
@@ -398,8 +416,10 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think

if (end_pos > cursor) {
handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
mark_reasoning_closed();
} else {
handle_reasoning("", /* closed */ true);
mark_reasoning_closed();
}

cursor = end_pos + end_think.size();
@@ -420,6 +440,7 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
move_to(input_.size());
return true;
}
mark_reasoning_active(end_think);
set_reasoning_prefix(cursor);
cursor += start_think.size();
continue;
@@ -1492,17 +1513,24 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
return common_chat_peg_parse(syntax.parser, input, is_partial, syntax);
}
common_chat_msg_parser builder(input, is_partial, syntax);
bool partial_exception_caught = false;
try {
common_chat_parse(builder);
} catch (const common_chat_msg_partial_exception & ex) {
LOG_DBG("Partial parse: %s\n", ex.what());
partial_exception_caught = true;
if (!is_partial) {
builder.clear_tools();
builder.move_to(0);
common_chat_parse_content_only(builder);
}
}
auto msg = builder.result();
// Mark tool_call_in_progress if we caught a partial exception during partial parsing
// and there are tool calls in progress (indicates incomplete tool call parsing)
if (is_partial && partial_exception_caught && !msg.tool_calls.empty()) {
msg.tool_call_in_progress = true;
}
if (!is_partial) {
LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
}
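A hedged sketch of how a streaming caller might consume the new `tool_call_in_progress` flag; whether the flag trips depends on the active chat format hitting a partial tool call, and `on_stream_chunk` is an illustrative function, not part of the PR:

```cpp
// Illustrative caller (not part of the PR). Assumes llama.cpp's common
// headers; common_chat_parse and the fields used here appear in this diff.
#include <string>
#include "chat.h"

void on_stream_chunk(const std::string & accumulated_text) {
    common_chat_syntax syntax;
    syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;

    // is_partial = true: incomplete structures raise a partial-parse
    // exception internally instead of failing the whole parse.
    common_chat_msg msg = common_chat_parse(accumulated_text, /* is_partial */ true, syntax);

    if (msg.tool_call_in_progress) {
        // A partial exception was caught while tool calls exist: hold back
        // the tool-call delta until parsing completes on a later chunk.
    }
}
```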
4 changes: 4 additions & 0 deletions common/chat-parser.h
@@ -56,6 +56,10 @@ class common_chat_msg_parser {
// Appends to the result.reasoning_content field
void add_reasoning_content(const std::string & reasoning_content);

// Track reasoning status to expose start/end markers to callers
void mark_reasoning_active(const std::string & end_tag);
void mark_reasoning_closed();

// Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything.
bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);

16 changes: 16 additions & 0 deletions common/chat.h
@@ -22,6 +22,19 @@ struct common_chat_tool_call {
}
};

struct common_chat_reasoning_status {
bool detected = false; // a reasoning block start was observed
bool active = false; // we are currently inside a reasoning block (not closed yet)
std::string end_tag; // closing tag to use when forcing a close

bool operator==(const common_chat_reasoning_status & other) const {
return detected == other.detected && active == other.active && end_tag == other.end_tag;
}
bool operator!=(const common_chat_reasoning_status & other) const {
return !(*this == other);
}
};

struct common_chat_msg_content_part {
std::string type;
std::string text;
@@ -37,6 +50,8 @@ struct common_chat_msg {
std::vector<common_chat_msg_content_part> content_parts;
std::vector<common_chat_tool_call> tool_calls;
std::string reasoning_content;
common_chat_reasoning_status reasoning_status;
bool tool_call_in_progress = false;
std::string tool_name;
std::string tool_call_id;

@@ -63,6 +78,7 @@ struct common_chat_msg {
&& content_parts == other.content_parts
&& tool_calls == other.tool_calls
&& reasoning_content == other.reasoning_content
&& reasoning_status == other.reasoning_status
&& tool_name == other.tool_name
&& tool_call_id == other.tool_call_id;
}
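A hedged sketch of the intended consumer of `reasoning_status`: when generation must stop thinking mid-block (for example, the reasoning budget ran out), the recorded `end_tag` tells the caller what to inject. The helper is illustrative, not part of this PR:

```cpp
// Illustrative helper (not part of the PR): pick the text to inject when the
// reasoning budget is exhausted while a reasoning block is still open.
#include <string>
#include "chat.h"

std::string reasoning_close_injection(const common_chat_msg & msg,
                                      const std::string & force_close_message) {
    if (msg.reasoning_status.active) {
        // Nudge the model with the configured message, then close the block
        // with the tag the parser recorded (e.g. "</think>").
        return force_close_message + msg.reasoning_status.end_tag;
    }
    return ""; // no open reasoning block, nothing to inject
}
```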
8 changes: 8 additions & 0 deletions common/common.cpp
@@ -1078,6 +1078,14 @@ struct common_init_result common_init_from_params(common_params & params) {

common_init_sampler_from_model(model, params.sampling);

// Allow models to override the forced reasoning close message via GGUF metadata
if (params.reasoning_force_close_message == COMMON_DEFAULT_REASONING_FORCE_CLOSE_MESSAGE) {
char buf[512] = {0};
if (llama_model_meta_val_str(model, "tokenizer.ggml.reasoning_force_close_message", buf, sizeof(buf)) > 0) {
params.reasoning_force_close_message = buf;
}
}

const llama_vocab * vocab = llama_model_get_vocab(model);

auto cparams = common_context_params_to_llama(params);
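The guard above gives the override a clear precedence: an explicit CLI/env value (anything different from the compiled default) wins, then the GGUF metadata key, then the default. A compact restatement of that precedence as a standalone function, for illustration only:

```cpp
// Precedence of the force-close message, as implemented above (illustrative).
#include <string>

static const std::string k_default =
    "... I now conclude my reasoning and will provide the final answer.";

// meta holds "tokenizer.ggml.reasoning_force_close_message" if present, else "".
std::string resolve_force_close(const std::string & cli_value, const std::string & meta) {
    if (cli_value != k_default) return cli_value; // user override wins
    if (!meta.empty())          return meta;      // then model metadata
    return k_default;                             // then the compiled default
}
```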
3 changes: 3 additions & 0 deletions common/common.h
@@ -102,6 +102,8 @@ enum llama_example {
LLAMA_EXAMPLE_COUNT,
};

inline constexpr const char * COMMON_DEFAULT_REASONING_FORCE_CLOSE_MESSAGE = "... I now conclude my reasoning and will provide the final answer.";

enum common_sampler_type {
COMMON_SAMPLER_TYPE_NONE = 0,
COMMON_SAMPLER_TYPE_DRY = 1,
@@ -466,6 +468,7 @@ struct common_params {
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int reasoning_budget = -1;
std::string reasoning_force_close_message = COMMON_DEFAULT_REASONING_FORCE_CLOSE_MESSAGE;
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response

std::vector<std::string> api_keys;
6 changes: 6 additions & 0 deletions tests/test-chat-parser.cpp
@@ -119,6 +119,9 @@ static void test_reasoning() {
auto msg = common_chat_parse(input, false, syntax);
assert_equals(variant, std::string("Pense"), msg.reasoning_content);
assert_equals(variant, std::string("Bonjour"), msg.content);
assert_equals(variant, true, msg.reasoning_status.detected);
assert_equals(variant, false, msg.reasoning_status.active);
assert_equals(variant, std::string("</think>"), msg.reasoning_status.end_tag);
}
{
const std::string variant("llama_3_inline_think");
@@ -133,6 +136,9 @@ static void test_reasoning() {
auto msg = common_chat_parse(input, false, syntax);
assert_equals(variant, std::string("Plan"), msg.reasoning_content);
assert_equals(variant, std::string("Réponse"), msg.content);
assert_equals(variant, true, msg.reasoning_status.detected);
assert_equals(variant, false, msg.reasoning_status.active);
assert_equals(variant, std::string("</think>"), msg.reasoning_status.end_tag);
}
// Test DeepSeek V3.1 parsing - reasoning content followed by "</think>" and then regular content
{
3 changes: 2 additions & 1 deletion tools/server/README.md
@@ -203,7 +203,8 @@ For the full list of features, please refer to [server's changelog](https://githu
| `--jinja` | use jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_JINJA) |
| `--no-jinja` | disable jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_NO_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--reasoning-budget N` | controls the maximum number of thinking tokens allowed; -1 for unlimited, 0 to disable thinking, or a positive value to limit thinking tokens. When the budget is exceeded, the server automatically injects a closing `</think>` and continues with the final answer. Individual OpenAI-compatible requests can override this value with `thinking_budget_tokens`. (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--reasoning-force-close-message STRING` | when the reasoning budget is exceeded, this message is appended to the current user message to signal the model to close any open thought tags. (default: '... I now conclude my reasoning and will provide the final answer.')<br/>(env: LLAMA_ARG_THINK_FORCE_CLOSE_MESSAGE) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) |
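The README row above also mentions a per-request `thinking_budget_tokens` override. A hedged sketch of such a request body built with nlohmann::json (the field name is taken from the README text; the rest of the payload is a generic OpenAI-compatible example):

```cpp
// Sketch of an OpenAI-compatible request overriding --reasoning-budget for a
// single call via "thinking_budget_tokens" (field name per the README row).
#include <cstdio>
#include <nlohmann/json.hpp>

int main() {
    nlohmann::json req;
    req["messages"] = nlohmann::json::array({
        { {"role", "user"}, {"content", "Briefly: why is the sky blue?"} }
    });
    req["thinking_budget_tokens"] = 256; // per-request cap on thinking tokens
    // POST req.dump() to the server's /v1/chat/completions endpoint.
    printf("%s\n", req.dump(2).c_str());
}
```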