14 changes: 12 additions & 2 deletions common/arg.cpp
@@ -2574,12 +2574,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
add_opt(common_arg(
{"--reasoning-budget"}, "N",
"controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
"controls the maximum number of thinking tokens allowed; -1 for unlimited, 0 to disable thinking, or a positive value to limit thinking tokens (default: -1)",
[](common_params & params, int value) {
if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
if (value < -1) { throw std::invalid_argument("invalid value: must be -1 (unlimited), 0 (disabled), or a positive number"); }
params.reasoning_budget = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
add_opt(common_arg(
{"--reasoning-force-close-message"}, "STRING",
string_format(
"if specified, forces the model to close its reasoning/thoughts when generating this message (default: %s)\n",
params.reasoning_force_close_message.c_str()
),
[](common_params & params, const std::string & value) {
params.reasoning_force_close_message = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_FORCE_CLOSE_MESSAGE"));
add_opt(common_arg(
{"--chat-template"}, "JINJA_TEMPLATE",
string_format(
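The relaxed validation accepts any value greater than or equal to -1. A minimal sketch of which values pass, mirroring the check in the diff; `parse_reasoning_budget` is a hypothetical helper, not part of the PR:

```cpp
// Hypothetical helper mirroring the validation added above: anything below
// -1 is rejected; -1, 0, and positive values are accepted.
#include <cstdio>
#include <stdexcept>

static int parse_reasoning_budget(int value) {
    if (value < -1) { throw std::invalid_argument("invalid value: must be -1 (unlimited), 0 (disabled), or a positive number"); }
    return value;
}

int main() {
    for (int v : {-2, -1, 0, 128}) {
        try {
            printf("%4d -> accepted\n", parse_reasoning_budget(v));
        } catch (const std::invalid_argument & e) {
            printf("%4d -> rejected: %s\n", v, e.what());
        }
    }
}
```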
7 changes: 7 additions & 0 deletions common/chat-parser-xml-toolcall.cpp
@@ -705,6 +705,9 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons

// Parse content
bool reasoning_unclosed = builder.syntax().thinking_forced_open;
if (reasoning_unclosed) {
builder.mark_reasoning_active(end_think);
}
std::string unclosed_reasoning_content("");
for (;;) {
auto tc = try_find_2_literal_splited_by_spaces(builder, form.scope_start, form.tool_start);
@@ -730,6 +733,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
}
} else {
reasoning_unclosed = false;
builder.mark_reasoning_closed();
std::string reasoning_content;
if (pos == std::string::npos) {
reasoning_content = std::move(content);
@@ -766,13 +770,15 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
bool toolcall_in_think = false;
for (auto think_start = content.find(start_think); think_start != std::string::npos; think_start = content.find(start_think, think_start)) {
if (auto think_end = content.find(end_think, think_start + start_think.size()); think_end != std::string::npos) {
builder.mark_reasoning_active(end_think);
if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
auto reasoning_content = content.substr(think_start + start_think.size(), think_end - think_start - start_think.size());
builder.add_reasoning_content(reasoning_content);
think_start = erase_spaces(content, think_start, think_end + end_think.size() - 1);
} else {
think_start = think_end + end_think.size() - 1;
}
builder.mark_reasoning_closed();
} else {
// This <tool_call> start is in thinking block, skip this tool call
// This <tool_call> start is in thinking block
@@ -782,6 +788,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
unclosed_reasoning_content = content.substr(think_start + start_think.size()) + tool_call_start;
}
reasoning_unclosed = true;
builder.mark_reasoning_active(end_think);
content.resize(think_start);
toolcall_in_think = true;
}
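The new calls bracket each thinking block: `mark_reasoning_active` fires when a `<think>` opens (or when thinking was forced open by the template), and `mark_reasoning_closed` fires once the end tag is found. A toy, self-contained trace of those transitions, using a local copy of the struct added in `common/chat.h` further down; the scan is a simplified stand-in for the real parser:

```cpp
// Toy trace of reasoning_status transitions while scanning a transcript.
#include <cstdio>
#include <string>

struct common_chat_reasoning_status {
    bool detected = false;
    bool active = false;
    std::string end_tag;
};

int main() {
    common_chat_reasoning_status st;
    const std::string input = "<think>plan the answer</think>final answer";

    if (auto open = input.find("<think>"); open != std::string::npos) {
        st.detected = st.active = true;            // mark_reasoning_active
        st.end_tag = "</think>";
        if (input.find("</think>", open) != std::string::npos) {
            st.active = false;                     // mark_reasoning_closed
        }
    }
    printf("detected=%d active=%d end_tag=%s\n",
           (int) st.detected, (int) st.active, st.end_tag.c_str());
    // -> detected=1 active=0 end_tag=</think>
}
```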
28 changes: 28 additions & 0 deletions common/chat-parser.cpp
@@ -156,6 +156,20 @@ void common_chat_msg_parser::add_reasoning_content(const std::string &reasoning_
result_.reasoning_content += reasoning_content;
}

void common_chat_msg_parser::mark_reasoning_active(const std::string & end_tag) {
result_.reasoning_status.detected = true;
result_.reasoning_status.active = true;
if (!end_tag.empty()) {
result_.reasoning_status.end_tag = end_tag;
}
}

void common_chat_msg_parser::mark_reasoning_closed() {
if (result_.reasoning_status.detected) {
result_.reasoning_status.active = false;
}
}

bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::string & id, const std::string & arguments) {
if (name.empty()) {
return false;
@@ -329,11 +343,13 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
const size_t saved_pos = pos_;
const size_t saved_content_size = result_.content.size();
const size_t saved_reasoning_size = result_.reasoning_content.size();
const auto saved_reasoning_status = result_.reasoning_status;

auto restore_state = [&]() {
move_to(saved_pos);
result_.content.resize(saved_content_size);
result_.reasoning_content.resize(saved_reasoning_size);
result_.reasoning_status = saved_reasoning_status;
};

// Allow leading whitespace to be preserved as content when reasoning is present at the start
@@ -370,9 +386,11 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
if (whitespace_end > pos_) {
add_content(input_.substr(pos_, whitespace_end - pos_));
}
mark_reasoning_active(end_think);
set_reasoning_prefix(cursor);
cursor += start_think.size();
} else if (syntax_.thinking_forced_open) {
mark_reasoning_active(end_think);
cursor = whitespace_end;
} else {
restore_state();
@@ -398,8 +416,10 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think

if (end_pos > cursor) {
handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
mark_reasoning_closed();
} else {
handle_reasoning("", /* closed */ true);
mark_reasoning_closed();
}

cursor = end_pos + end_think.size();
@@ -420,6 +440,7 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
move_to(input_.size());
return true;
}
mark_reasoning_active(end_think);
set_reasoning_prefix(cursor);
cursor += start_think.size();
continue;
@@ -1492,17 +1513,24 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
return common_chat_peg_parse(syntax.parser, input, is_partial, syntax);
}
common_chat_msg_parser builder(input, is_partial, syntax);
bool partial_exception_caught = false;
try {
common_chat_parse(builder);
} catch (const common_chat_msg_partial_exception & ex) {
LOG_DBG("Partial parse: %s\n", ex.what());
partial_exception_caught = true;
if (!is_partial) {
builder.clear_tools();
builder.move_to(0);
common_chat_parse_content_only(builder);
}
}
auto msg = builder.result();
// Mark tool_call_in_progress if we caught a partial exception during partial parsing
// and there are tool calls in progress (indicates incomplete tool call parsing)
if (is_partial && partial_exception_caught && !msg.tool_calls.empty()) {
msg.tool_call_in_progress = true;
}
if (!is_partial) {
LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
}
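A hedged sketch of how a streaming caller might consume the new `tool_call_in_progress` flag; whether the flag trips depends on the active chat format hitting a partial tool call, and `on_stream_chunk` is an illustrative function, not part of the PR:

```cpp
// Illustrative caller (not part of the PR). Assumes llama.cpp's common
// headers; common_chat_parse and the fields used here appear in this diff.
#include <string>
#include "chat.h"

void on_stream_chunk(const std::string & accumulated_text) {
    common_chat_syntax syntax;
    syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;

    // is_partial = true: incomplete structures raise a partial-parse
    // exception internally instead of failing the whole parse.
    common_chat_msg msg = common_chat_parse(accumulated_text, /* is_partial */ true, syntax);

    if (msg.tool_call_in_progress) {
        // A partial exception was caught while tool calls exist: hold back
        // the tool-call delta until parsing completes on a later chunk.
    }
}
```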
4 changes: 4 additions & 0 deletions common/chat-parser.h
@@ -56,6 +56,10 @@ class common_chat_msg_parser {
// Appends to the result.reasoning_content field
void add_reasoning_content(const std::string & reasoning_content);

// Track reasoning status to expose start/end markers to callers
void mark_reasoning_active(const std::string & end_tag);
void mark_reasoning_closed();

// Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything.
bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);

16 changes: 16 additions & 0 deletions common/chat.h
@@ -22,6 +22,19 @@ struct common_chat_tool_call {
}
};

struct common_chat_reasoning_status {
bool detected = false; // a reasoning block start was observed
bool active = false; // we are currently inside a reasoning block (not closed yet)
std::string end_tag; // closing tag to use when forcing a close

bool operator==(const common_chat_reasoning_status & other) const {
return detected == other.detected && active == other.active && end_tag == other.end_tag;
}
bool operator!=(const common_chat_reasoning_status & other) const {
return !(*this == other);
}
};

struct common_chat_msg_content_part {
std::string type;
std::string text;
@@ -37,6 +50,8 @@ struct common_chat_msg {
std::vector<common_chat_msg_content_part> content_parts;
std::vector<common_chat_tool_call> tool_calls;
std::string reasoning_content;
common_chat_reasoning_status reasoning_status;
bool tool_call_in_progress = false;
std::string tool_name;
std::string tool_call_id;

@@ -63,6 +78,7 @@ struct common_chat_msg {
&& content_parts == other.content_parts
&& tool_calls == other.tool_calls
&& reasoning_content == other.reasoning_content
&& reasoning_status == other.reasoning_status
&& tool_name == other.tool_name
&& tool_call_id == other.tool_call_id;
}
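A hedged sketch of the intended consumer of `reasoning_status`: when generation must stop thinking mid-block (for example, the reasoning budget ran out), the recorded `end_tag` tells the caller what to inject. The helper is illustrative, not part of this PR:

```cpp
// Illustrative helper (not part of the PR): pick the text to inject when the
// reasoning budget is exhausted while a reasoning block is still open.
#include <string>
#include "chat.h"

std::string reasoning_close_injection(const common_chat_msg & msg,
                                      const std::string & force_close_message) {
    if (msg.reasoning_status.active) {
        // Nudge the model with the configured message, then close the block
        // with the tag the parser recorded (e.g. "</think>").
        return force_close_message + msg.reasoning_status.end_tag;
    }
    return ""; // no open reasoning block, nothing to inject
}
```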
8 changes: 8 additions & 0 deletions common/common.cpp
@@ -1078,6 +1078,14 @@ struct common_init_result common_init_from_params(common_params & params) {

common_init_sampler_from_model(model, params.sampling);

// Allow models to override the forced reasoning close message via GGUF metadata
if (params.reasoning_force_close_message == COMMON_DEFAULT_REASONING_FORCE_CLOSE_MESSAGE) {
char buf[512] = {0};
if (llama_model_meta_val_str(model, "tokenizer.ggml.reasoning_force_close_message", buf, sizeof(buf)) > 0) {
params.reasoning_force_close_message = buf;
}
}

const llama_vocab * vocab = llama_model_get_vocab(model);

auto cparams = common_context_params_to_llama(params);
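The guard above gives the override a clear precedence: an explicit CLI/env value (anything different from the compiled default) wins, then the GGUF metadata key, then the default. A compact restatement of that precedence as a standalone function, for illustration only:

```cpp
// Precedence of the force-close message, as implemented above (illustrative).
#include <string>

static const std::string k_default =
    "... I now conclude my reasoning and will provide the final answer.";

// meta holds "tokenizer.ggml.reasoning_force_close_message" if present, else "".
std::string resolve_force_close(const std::string & cli_value, const std::string & meta) {
    if (cli_value != k_default) return cli_value; // user override wins
    if (!meta.empty())          return meta;      // then model metadata
    return k_default;                             // then the compiled default
}
```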
3 changes: 3 additions & 0 deletions common/common.h
@@ -102,6 +102,8 @@ enum llama_example {
LLAMA_EXAMPLE_COUNT,
};

inline constexpr const char * COMMON_DEFAULT_REASONING_FORCE_CLOSE_MESSAGE = "... I now conclude my reasoning and will provide the final answer.";

enum common_sampler_type {
COMMON_SAMPLER_TYPE_NONE = 0,
COMMON_SAMPLER_TYPE_DRY = 1,
@@ -466,6 +468,7 @@ struct common_params {
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int reasoning_budget = -1;
std::string reasoning_force_close_message = COMMON_DEFAULT_REASONING_FORCE_CLOSE_MESSAGE;
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response

std::vector<std::string> api_keys;
6 changes: 6 additions & 0 deletions tests/test-chat-parser.cpp
@@ -119,6 +119,9 @@ static void test_reasoning() {
auto msg = common_chat_parse(input, false, syntax);
assert_equals(variant, std::string("Pense"), msg.reasoning_content);
assert_equals(variant, std::string("Bonjour"), msg.content);
assert_equals(variant, true, msg.reasoning_status.detected);
assert_equals(variant, false, msg.reasoning_status.active);
assert_equals(variant, std::string("</think>"), msg.reasoning_status.end_tag);
}
{
const std::string variant("llama_3_inline_think");
@@ -133,6 +136,9 @@ static void test_reasoning() {
auto msg = common_chat_parse(input, false, syntax);
assert_equals(variant, std::string("Plan"), msg.reasoning_content);
assert_equals(variant, std::string("Réponse"), msg.content);
assert_equals(variant, true, msg.reasoning_status.detected);
assert_equals(variant, false, msg.reasoning_status.active);
assert_equals(variant, std::string("</think>"), msg.reasoning_status.end_tag);
}
// Test DeepSeek V3.1 parsing - reasoning content followed by "</think>" and then regular content
{
3 changes: 2 additions & 1 deletion tools/server/README.md
@@ -203,7 +203,8 @@ For the full list of features, please refer to [server's changelog](https://githu
| `--jinja` | use jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_JINJA) |
| `--no-jinja` | disable jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_NO_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--reasoning-budget N` | controls the maximum number of thinking tokens allowed; -1 for unlimited, 0 to disable thinking, or a positive value to limit thinking tokens. When the budget is exceeded, the server automatically injects a closing `</think>` and continues with the final answer. Individual OpenAI-compatible requests can override this value with `thinking_budget_tokens`. (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--reasoning-force-close-message STRING` | when the reasoning budget is exceeded, this message is appended to the current user message to signal the model to close any open thought tags. (default: '... I now conclude my reasoning and will provide the final answer.')<br/>(env: LLAMA_ARG_THINK_FORCE_CLOSE_MESSAGE) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) |
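The README row above also mentions a per-request `thinking_budget_tokens` override. A hedged sketch of such a request body built with nlohmann::json (the field name is taken from the README text; the rest of the payload is a generic OpenAI-compatible example):

```cpp
// Sketch of an OpenAI-compatible request overriding --reasoning-budget for a
// single call via "thinking_budget_tokens" (field name per the README row).
#include <cstdio>
#include <nlohmann/json.hpp>

int main() {
    nlohmann::json req;
    req["messages"] = nlohmann::json::array({
        { {"role", "user"}, {"content", "Briefly: why is the sky blue?"} }
    });
    req["thinking_budget_tokens"] = 256; // per-request cap on thinking tokens
    // POST req.dump() to the server's /v1/chat/completions endpoint.
    printf("%s\n", req.dump(2).c_str());
}
```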