common/arg.cpp (2 additions & 1 deletion)
@@ -3425,7 +3425,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--reasoning-format"}, "FORMAT",
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
"- none: leaves thoughts unparsed in `message.content`\n"
"- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
"- deepseek: puts thoughts in `message.reasoning_content`\n"
"- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
"(default: auto)",
[](common_params & params, const std::string & value) {
params.reasoning_format = common_reasoning_format_from_name(value);
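For context, a minimal sketch of the name-to-enum mapping the handler above relies on. `common_reasoning_format_from_name` is not part of this diff, so the `deepseek-legacy` enumerator name and the error behavior here are assumptions:

```cpp
#include <stdexcept>
#include <string>

enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_AUTO,
    COMMON_REASONING_FORMAT_DEEPSEEK,
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // assumed enumerator for the new mode
};

// Hypothetical mapping; the real implementation lives outside this diff.
static common_reasoning_format common_reasoning_format_from_name(const std::string & name) {
    if (name == "none")            { return COMMON_REASONING_FORMAT_NONE; }
    if (name == "auto")            { return COMMON_REASONING_FORMAT_AUTO; }
    if (name == "deepseek")        { return COMMON_REASONING_FORMAT_DEEPSEEK; }
    if (name == "deepseek-legacy") { return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
    throw std::runtime_error("unknown reasoning format: " + name);
}
```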
common/chat-parser.cpp (125 additions & 13 deletions)
@@ -3,9 +3,12 @@
#include "log.h"
#include "regex-partial.h"

#include <algorithm>
#include <cctype>
#include <optional>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>

using json = nlohmann::ordered_json;
@@ -166,6 +169,27 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
}

bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
std::string pending_reasoning_prefix;

if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
return false;
}

auto set_reasoning_prefix = [&](size_t prefix_pos) {
if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
return;
}
if (prefix_pos + start_think.size() > input_.size()) {
pending_reasoning_prefix.clear();
return;
}
// Capture the exact literal that opened the reasoning section so we can
// surface it back to callers. This ensures formats that force the
// reasoning tag open (e.g. DeepSeek R1) retain their original prefix
// instead of dropping it during parsing.
pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
};

auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
auto stripped_reasoning = string_strip(reasoning);
if (stripped_reasoning.empty()) {
@@ -178,28 +202,116 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
}
} else {
if (!pending_reasoning_prefix.empty()) {
add_reasoning_content(pending_reasoning_prefix);
pending_reasoning_prefix.clear();
}
add_reasoning_content(stripped_reasoning);
}
};
-if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
-if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
-if (auto res = try_find_literal(end_think)) {
-handle_reasoning(res->prelude, /* closed */ true);
-consume_spaces();
-return true;
-}
-auto rest = consume_rest();

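// Snapshot the parser state so it can be restored if this turns out not to
// be a reasoning section, or if a partial chunk ends before anything useful.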
const size_t saved_pos = pos_;
const size_t saved_content_size = result_.content.size();
const size_t saved_reasoning_size = result_.reasoning_content.size();

auto restore_state = [&]() {
move_to(saved_pos);
result_.content.resize(saved_content_size);
result_.reasoning_content.resize(saved_reasoning_size);
};

// Allow leading whitespace to be preserved as content when reasoning is present at the start
size_t cursor = pos_;
size_t whitespace_end = cursor;
while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
++whitespace_end;
}

if (whitespace_end >= input_.size()) {
restore_state();
if (syntax_.thinking_forced_open) {
auto rest = input_.substr(saved_pos);
if (!rest.empty()) {
handle_reasoning(rest, /* closed */ !is_partial());
}
// Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
// if (!syntax_.thinking_forced_open) {
// throw common_chat_msg_partial_exception(end_think);
// }
move_to(input_.size());
return true;
}
return false;
}

cursor = whitespace_end;
const size_t remaining = input_.size() - cursor;
const size_t start_prefix = std::min(start_think.size(), remaining);
const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;

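// The buffer may end in the middle of the opening tag (e.g. "<thi"):
// consume everything and wait for the next chunk before deciding.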
if (has_start_tag && start_prefix < start_think.size()) {
move_to(input_.size());
return true;
}

if (has_start_tag) {
if (whitespace_end > pos_) {
add_content(input_.substr(pos_, whitespace_end - pos_));
}
set_reasoning_prefix(cursor);
cursor += start_think.size();
} else if (syntax_.thinking_forced_open) {
cursor = whitespace_end;
} else {
restore_state();
return false;
}
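// Main scan: each iteration consumes one reasoning section, so several
// <think>...</think> blocks separated only by whitespace are all captured.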
while (true) {
if (cursor >= input_.size()) {
move_to(input_.size());
return true;
}

size_t end_pos = input_.find(end_think, cursor);
if (end_pos == std::string::npos) {
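// No complete closing tag yet: check whether the input ends with a prefix
// of end_think (a partially streamed "</think>") and keep that tail out of
// the reasoning text until the rest of the tag arrives.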
std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
size_t partial_off = string_find_partial_stop(remaining_view, end_think);
size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
if (reasoning_end > cursor) {
handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
}
move_to(input_.size());
return true;
}

if (end_pos > cursor) {
handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
} else {
handle_reasoning("", /* closed */ true);
}

cursor = end_pos + end_think.size();

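// Swallow whitespace after the closing tag before checking whether another
// reasoning block follows.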
while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
++cursor;
}

const size_t next_remaining = input_.size() - cursor;
if (next_remaining == 0) {
move_to(cursor);
return true;
}

const size_t next_prefix = std::min(start_think.size(), next_remaining);
if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
if (next_prefix < start_think.size()) {
move_to(input_.size());
return true;
}
set_reasoning_prefix(cursor);
cursor += start_think.size();
continue;
}

move_to(cursor);
return true;
}
return false;
}

std::string common_chat_msg_parser::consume_rest() {
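The practical effect of the rewrite is easiest to see on a streamed chunk that stops partway through the closing tag. A minimal sketch, reusing the syntax setup from the tests added below; the expected values follow from the partial-stop handling above rather than from an assertion in this diff:

```cpp
common_chat_syntax syntax = {
    /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
    /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
    /* .reasoning_in_content = */ false,
    /* .thinking_forced_open = */ false,
    /* .parse_tool_calls = */ false,
};

// A streaming chunk that ends partway through "</think>".
auto msg = common_chat_parse("<think>Let me check</th", /* is_partial = */ true, syntax);
// Expected: msg.reasoning_content == "Let me check" and msg.content stays empty;
// the dangling "</th" is detected as a partial stop string and held back.
```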
common/chat.cpp (3 additions)
@@ -1408,6 +1408,8 @@ static common_chat_params common_chat_params_init_apertus(const common_chat_temp
return data;
}
static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
builder.try_parse_reasoning("<think>", "</think>");

if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
@@ -2862,6 +2864,7 @@ common_chat_params common_chat_templates_apply(
}

static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
builder.try_parse_reasoning("<think>", "</think>");
builder.add_content(builder.consume_rest());
}

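With `try_parse_reasoning` now invoked on the Llama 3.x and content-only paths, inline `<think>` blocks are extracted instead of passing through verbatim. A sketch of the new content-only behavior, mirroring the `content_only_inline_think` test added below:

```cpp
common_chat_syntax syntax = {
    /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
    /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
    /* .reasoning_in_content = */ false,
    /* .thinking_forced_open = */ false,
    /* .parse_tool_calls = */ false,
};

auto msg = common_chat_parse("<think>Pense</think>Bonjour", /* is_partial = */ false, syntax);
// msg.reasoning_content == "Pense", msg.content == "Bonjour";
// before this change the whole string stayed in msg.content.
```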
common/common.h (1 addition & 1 deletion)
@@ -432,7 +432,7 @@ struct common_params {
std::string chat_template = ""; // NOLINT
bool use_jinja = false; // NOLINT
bool enable_chat_template = true;
-common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int reasoning_budget = -1;
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response

tests/test-chat-parser.cpp (28 additions)
@@ -106,6 +106,34 @@ static void test_reasoning() {
assert_equals("<think>Cogito</think>", builder.result().content);
assert_equals("Ergo sum", builder.consume_rest());
}
{
const std::string variant("content_only_inline_think");
common_chat_syntax syntax = {
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
/* .parse_tool_calls = */ false,
};
const std::string input = "<think>Pense</think>Bonjour";
auto msg = common_chat_parse(input, false, syntax);
assert_equals(variant, std::string("Pense"), msg.reasoning_content);
assert_equals(variant, std::string("Bonjour"), msg.content);
}
{
const std::string variant("llama_3_inline_think");
common_chat_syntax syntax = {
/* .format = */ COMMON_CHAT_FORMAT_LLAMA_3_X,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
/* .parse_tool_calls = */ false,
};
const std::string input = "<think>Plan</think>Réponse";
auto msg = common_chat_parse(input, false, syntax);
assert_equals(variant, std::string("Plan"), msg.reasoning_content);
assert_equals(variant, std::string("Réponse"), msg.content);
}
// Test DeepSeek V3.1 parsing - reasoning content followed by "</think>" and then regular content
{
common_chat_syntax syntax = {
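A hypothetical companion case, not part of this diff, illustrating the `deepseek-legacy` mode described in the `--reasoning-format` help text; the enumerator name and the `reasoning_in_content` pairing are assumptions, and the expectations paraphrase the documented behavior rather than an existing assertion:

```cpp
{
    common_chat_syntax syntax = {
        /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
        /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // assumed name
        /* .reasoning_in_content = */ true,
        /* .thinking_forced_open = */ false,
        /* .parse_tool_calls = */ false,
    };
    auto msg = common_chat_parse("<think>Plan</think>Réponse", false, syntax);
    // Documented intent: msg.content keeps the <think> tags while
    // msg.reasoning_content is also populated with "Plan".
}
```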
tools/server/README.md (1 addition & 1 deletion)
@@ -190,7 +190,7 @@ The project is under active development, and we are [looking for feedback and co
| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
| `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
-| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
@@ -1,7 +1,6 @@
<script lang="ts">
import { getDeletionInfo } from '$lib/stores/chat.svelte';
import { copyToClipboard } from '$lib/utils/copy';
-import { parseThinkingContent } from '$lib/utils/thinking';
import ChatMessageAssistant from './ChatMessageAssistant.svelte';
import ChatMessageUser from './ChatMessageUser.svelte';

@@ -47,26 +46,13 @@

let thinkingContent = $derived.by(() => {
if (message.role === 'assistant') {
-if (message.thinking) {
-return message.thinking;
-}

-const parsed = parseThinkingContent(message.content);
+const trimmedThinking = message.thinking?.trim();

-return parsed.thinking;
+return trimmedThinking ? trimmedThinking : null;
}
return null;
});

-let messageContent = $derived.by(() => {
-if (message.role === 'assistant') {
-const parsed = parseThinkingContent(message.content);
-return parsed.cleanContent?.replace('<|channel|>analysis', '');
-}
-
-return message.content?.replace('<|channel|>analysis', '');
-});

function handleCancelEdit() {
isEditing = false;
editedContent = message.content;
@@ -165,7 +151,7 @@
{editedContent}
{isEditing}
{message}
-{messageContent}
+messageContent={message.content}
onCancelEdit={handleCancelEdit}
onConfirmDelete={handleConfirmDelete}
onCopy={handleCopy}
@@ -131,7 +131,11 @@
</div>
</div>
{:else if message.role === 'assistant'}
-<MarkdownContent content={messageContent || ''} />
+{#if config().disableReasoningFormat}
+<pre class="raw-output">{messageContent || ''}</pre>
+{:else}
+<MarkdownContent content={messageContent || ''} />
+{/if}
{:else}
<div class="text-sm whitespace-pre-wrap">
{messageContent}
@@ -203,4 +207,21 @@
background-position: -200% 0;
}
}

.raw-output {
width: 100%;
max-width: 48rem;
margin-top: 1.5rem;
padding: 1rem 1.25rem;
border-radius: 1rem;
background: hsl(var(--muted) / 0.3);
color: var(--foreground);
font-family:
ui-monospace, SFMono-Regular, 'SF Mono', Monaco, 'Cascadia Code', 'Roboto Mono', Consolas,
'Liberation Mono', Menlo, monospace;
font-size: 0.875rem;
line-height: 1.6;
white-space: pre-wrap;
word-break: break-word;
}
</style>
@@ -148,6 +148,12 @@
key: 'showThoughtInProgress',
label: 'Show thought in progress',
type: 'checkbox'
},
{
key: 'disableReasoningFormat',
label:
'Show raw LLM output without backend parsing and frontend Markdown rendering to inspect streaming across different models.',
type: 'checkbox'
}
]
},
tools/server/webui/src/lib/constants/settings-config.ts (3 additions)
@@ -6,6 +6,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
theme: 'system',
showTokensPerSecond: false,
showThoughtInProgress: false,
disableReasoningFormat: false,
keepStatsVisible: false,
askForTitleConfirmation: false,
pasteLongTextToFileLen: 2500,
@@ -76,6 +77,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
custom: 'Custom JSON parameters to send to the API. Must be valid JSON format.',
showTokensPerSecond: 'Display generation speed in tokens per second during streaming.',
showThoughtInProgress: 'Expand thought process by default when generating messages.',
disableReasoningFormat:
'Show raw LLM output without backend parsing and frontend Markdown rendering to inspect streaming across different models.',
keepStatsVisible: 'Keep processing statistics visible after generation finishes.',
askForTitleConfirmation:
'Ask for confirmation before automatically changing conversation title when editing the first message.',