diff --git a/common/arg.cpp b/common/arg.cpp
index ecc296485cb47..4204f6c6908fb 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3432,7 +3432,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--reasoning-format"}, "FORMAT",
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
"- none: leaves thoughts unparsed in `message.content`\n"
- "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
+ "- deepseek: puts thoughts in `message.reasoning_content`\n"
+ "- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`\n"
"(default: auto)",
[](common_params & params, const std::string & value) {
params.reasoning_format = common_reasoning_format_from_name(value);
diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp
index b3362519a68f3..7365782e7d6d8 100644
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
@@ -3,9 +3,12 @@
#include "log.h"
#include "regex-partial.h"
+#include <algorithm>
+#include <cctype>
#include <optional>
#include <stdexcept>
#include <string>
+#include <string_view>
#include <vector>
using json = nlohmann::ordered_json;
@@ -166,6 +169,27 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
}
bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
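+ // Verbatim copy of the opening tag; re-emitted ahead of the first reasoning chunk when the template forced the tag open (see set_reasoning_prefix).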
+ std::string pending_reasoning_prefix;
+
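+ // Reasoning extraction disabled: leave any thought tags untouched in the content.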
+ if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+ return false;
+ }
+
+ auto set_reasoning_prefix = [&](size_t prefix_pos) {
+ if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
+ return;
+ }
+ if (prefix_pos + start_think.size() > input_.size()) {
+ pending_reasoning_prefix.clear();
+ return;
+ }
+ // Capture the exact literal that opened the reasoning section so we can
+ // surface it back to callers. This ensures formats that force the
+ // reasoning tag open (e.g. DeepSeek R1) retain their original prefix
+ // instead of dropping it during parsing.
+ pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
+ };
+
auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
auto stripped_reasoning = string_strip(reasoning);
if (stripped_reasoning.empty()) {
@@ -178,28 +202,116 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
}
} else {
+ if (!pending_reasoning_prefix.empty()) {
+ add_reasoning_content(pending_reasoning_prefix);
+ pending_reasoning_prefix.clear();
+ }
add_reasoning_content(stripped_reasoning);
}
};
- if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
- if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
- if (auto res = try_find_literal(end_think)) {
- handle_reasoning(res->prelude, /* closed */ true);
- consume_spaces();
- return true;
- }
- auto rest = consume_rest();
+
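+ // Snapshot the parser position and output sizes so a failed match can be rolled back without leaking partial output.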
+ const size_t saved_pos = pos_;
+ const size_t saved_content_size = result_.content.size();
+ const size_t saved_reasoning_size = result_.reasoning_content.size();
+
+ auto restore_state = [&]() {
+ move_to(saved_pos);
+ result_.content.resize(saved_content_size);
+ result_.reasoning_content.resize(saved_reasoning_size);
+ };
+
+ // Allow leading whitespace to be preserved as content when reasoning is present at the start
+ size_t cursor = pos_;
+ size_t whitespace_end = cursor;
+ while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
+ ++whitespace_end;
+ }
+
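+ // Only whitespace remains: with the tag forced open, consume it as (empty) reasoning; otherwise there is nothing to parse.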
+ if (whitespace_end >= input_.size()) {
+ restore_state();
+ if (syntax_.thinking_forced_open) {
+ auto rest = input_.substr(saved_pos);
if (!rest.empty()) {
handle_reasoning(rest, /* closed */ !is_partial());
}
- // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
- // if (!syntax_.thinking_forced_open) {
- // throw common_chat_msg_partial_exception(end_think);
- // }
+ move_to(input_.size());
return true;
}
+ return false;
+ }
+
+ cursor = whitespace_end;
+ const size_t remaining = input_.size() - cursor;
+ const size_t start_prefix = std::min(start_think.size(), remaining);
+ const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;
+
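+ // The input ends part-way through the opening tag (streaming): consume it and wait for the next chunk.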
+ if (has_start_tag && start_prefix < start_think.size()) {
+ move_to(input_.size());
+ return true;
+ }
+
+ if (has_start_tag) {
+ if (whitespace_end > pos_) {
+ add_content(input_.substr(pos_, whitespace_end - pos_));
+ }
+ set_reasoning_prefix(cursor);
+ cursor += start_think.size();
+ } else if (syntax_.thinking_forced_open) {
+ cursor = whitespace_end;
+ } else {
+ restore_state();
+ return false;
+ }
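+ // Scan for the closing tag, handling partial closers at the end of input and back-to-back reasoning blocks.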
+ while (true) {
+ if (cursor >= input_.size()) {
+ move_to(input_.size());
+ return true;
+ }
+
+ size_t end_pos = input_.find(end_think, cursor);
+ if (end_pos == std::string::npos) {
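+ // No complete closing tag yet; withhold any partial closer at the tail (e.g. a streamed "</thi") instead of emitting it as reasoning.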
+ std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
+ size_t partial_off = string_find_partial_stop(remaining_view, end_think);
+ size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
+ if (reasoning_end > cursor) {
+ handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
+ }
+ move_to(input_.size());
+ return true;
+ }
+
+ if (end_pos > cursor) {
+ handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
+ } else {
+ handle_reasoning("", /* closed */ true);
+ }
+
+ cursor = end_pos + end_think.size();
+
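+ // Skip whitespace between this closing tag and whatever follows.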
+ while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
+ ++cursor;
+ }
+
+ const size_t next_remaining = input_.size() - cursor;
+ if (next_remaining == 0) {
+ move_to(cursor);
+ return true;
+ }
+
+ const size_t next_prefix = std::min(start_think.size(), next_remaining);
+ if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
+ if (next_prefix < start_think.size()) {
+ move_to(input_.size());
+ return true;
+ }
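+ // Another full opening tag follows immediately: loop to parse the next reasoning block.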
+ set_reasoning_prefix(cursor);
+ cursor += start_think.size();
+ continue;
+ }
+
+ move_to(cursor);
+ return true;
}
- return false;
}
std::string common_chat_msg_parser::consume_rest() {
diff --git a/common/chat.cpp b/common/chat.cpp
index afbb2a2bdd3c4..8587140e1ff0a 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1408,6 +1408,8 @@ static common_chat_params common_chat_params_init_apertus(const common_chat_temp
return data;
}
static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
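+ // Some Llama 3.x finetunes emit <think> blocks; try to extract them before tool-call parsing.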
+ builder.try_parse_reasoning("", "");
+
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
@@ -2862,6 +2864,7 @@ common_chat_params common_chat_templates_apply(
}
static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
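+ // Content-only responses may still open with a <think> block; extract it before taking the rest as content.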
+ builder.try_parse_reasoning("", "");
builder.add_content(builder.consume_rest());
}
diff --git a/common/common.h b/common/common.h
index 8a8ecd667f2cc..0d3638c9c6228 100644
--- a/common/common.h
+++ b/common/common.h
@@ -433,7 +433,7 @@ struct common_params {
std::string chat_template = ""; // NOLINT
bool use_jinja = false; // NOLINT
bool enable_chat_template = true;
- common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+ common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int reasoning_budget = -1;
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
diff --git a/tests/test-chat-parser.cpp b/tests/test-chat-parser.cpp
index 547ebb4871cd4..0b275befb8bf4 100644
--- a/tests/test-chat-parser.cpp
+++ b/tests/test-chat-parser.cpp
@@ -106,6 +106,34 @@ static void test_reasoning() {
assert_equals("Cogito", builder.result().content);
assert_equals("Ergo sum", builder.consume_rest());
}
+ {
+ const std::string variant("content_only_inline_think");
+ common_chat_syntax syntax = {
+ /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
+ /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
+ /* .reasoning_in_content = */ false,
+ /* .thinking_forced_open = */ false,
+ /* .parse_tool_calls = */ false,
+ };
+ const std::string input = "PenseBonjour";
+ auto msg = common_chat_parse(input, false, syntax);
+ assert_equals(variant, std::string("Pense"), msg.reasoning_content);
+ assert_equals(variant, std::string("Bonjour"), msg.content);
+ }
+ {
+ const std::string variant("llama_3_inline_think");
+ common_chat_syntax syntax = {
+ /* .format = */ COMMON_CHAT_FORMAT_LLAMA_3_X,
+ /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
+ /* .reasoning_in_content = */ false,
+ /* .thinking_forced_open = */ false,
+ /* .parse_tool_calls = */ false,
+ };
+ const std::string input = "PlanRéponse";
+ auto msg = common_chat_parse(input, false, syntax);
+ assert_equals(variant, std::string("Plan"), msg.reasoning_content);
+ assert_equals(variant, std::string("Réponse"), msg.content);
+ }
// Test DeepSeek V3.1 parsing - reasoning content followed by "</think>" and then regular content
{
common_chat_syntax syntax = {
diff --git a/tools/server/README.md b/tools/server/README.md
index 6825c8bf300c6..c47e15c5ade50 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -190,7 +190,7 @@ The project is under active development, and we are [looking for feedback and co
| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
| `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
-| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte
index c923bf9e040de..fed0cf712695f 100644
--- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte
@@ -1,7 +1,6 @@