Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 39 additions & 37 deletions bun.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@

outputHashMode = "recursive";
# Marker used by scripts/update_flake_hash.sh to update this hash in place.
outputHash = "sha256-WvzB3zFWrWA2mPCWIg/vVlDZbUFTWNTgL52TumiWvyM="; # mux-offline-cache-hash
outputHash = "sha256-ZrgZ+Fsj+sGVbe1ZDhzwxSsk2DIwcRmmRflFSQGslLY="; # mux-offline-cache-hash
};

configurePhase = ''
Expand Down
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
"@1password/sdk": "^0.4.0",
"@agentclientprotocol/sdk": "^0.14.1",
"@ai-sdk/amazon-bedrock": "^4.0.49",
"@ai-sdk/anthropic": "^3.0.37",
"@ai-sdk/anthropic": "^3.0.74",
"@ai-sdk/deepseek": "^2.0.17",
"@ai-sdk/google": "^3.0.21",
"@ai-sdk/mcp": "^1.0.18",
Expand Down Expand Up @@ -127,8 +127,8 @@
"rehype-harden": "^1.1.5",
"rehype-sanitize": "^6.0.0",
"remark-breaks": "^4.0.0",
"shescape": "^2.1.6",
"sharp": "^0.34.5",
"shescape": "^2.1.6",
"source-map-support": "^0.5.21",
"ssh-config": "^5.0.4",
"ssh2": "^1.17.0",
Expand Down
11 changes: 9 additions & 2 deletions src/browser/features/Messages/ReasoningMessage.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,13 @@ export const ReasoningMessage: React.FC<ReasoningMessageProps> = ({
const wasStreamingRef = useRef(isStreaming);
const isLastPartOfMessage =
"isLastPartOfMessage" in message ? message.isLastPartOfMessage : false;
// When the parent message contains *only* reasoning (e.g. the stream was
// truncated at max_tokens before any text/tool emerged), collapsing leaves
// the user staring at a single "Thinking" header and nothing else. Skip the
// auto-collapse in that case so the work the model did is still readable;
// the accompanying stream-error row from SMA explains why the turn stopped.
const isOnlyMessageContent =
"isOnlyMessageContent" in message ? message.isOnlyMessageContent === true : false;

// Auto-collapse only when reasoning reached *natural* completion β€” i.e. the
// stream ended while this reasoning part was still the terminal block of the
Expand All @@ -97,10 +104,10 @@ export const ReasoningMessage: React.FC<ReasoningMessageProps> = ({
const wasStreaming = wasStreamingRef.current;
wasStreamingRef.current = isStreaming;

if (wasStreaming && !isStreaming && isLastPartOfMessage) {
if (wasStreaming && !isStreaming && isLastPartOfMessage && !isOnlyMessageContent) {
setIsExpanded(false);
}
}, [isStreaming, isLastPartOfMessage]);
}, [isStreaming, isLastPartOfMessage, isOnlyMessageContent]);

const toggleExpanded = () => {
if (!isCollapsible) {
Expand Down
5 changes: 4 additions & 1 deletion src/browser/features/Messages/StreamErrorMessage.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ const StreamErrorMessageBase: React.FC<StreamErrorMessageBaseProps> = (props) =>
message.errorType === "server_error" &&
/\bHTTP\s*529\b|overloaded/i.test(message.error);
const isEmptyOutputError = message.errorType === "empty_output";
const isMaxOutputTokensError = message.errorType === "max_output_tokens";
// Gateway quota failures need explicit attribution so users know mux gateway credits,
// not a provider quota, are blocking the request.
const isMuxGatewayQuotaError =
Expand All @@ -92,7 +93,9 @@ const StreamErrorMessageBase: React.FC<StreamErrorMessageBaseProps> = (props) =>
? "Service overloaded"
: isEmptyOutputError
? "No assistant output"
: "Stream Error";
: isMaxOutputTokensError
? "Response truncated"
: "Stream Error";
const pill = isAnthropicOverloaded ? "overloaded" : message.errorType;
const body = isMuxGatewayQuotaError
? "Your Mux Gateway credits have been depleted. Add credits or configure another provider to continue."
Expand Down
178 changes: 178 additions & 0 deletions src/browser/utils/messages/StreamingMessageAggregator.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2739,4 +2739,182 @@ describe("StreamingMessageAggregator", () => {
expect(aggregator.getLastAbortReason()).toBeNull();
});
});

  describe("max output tokens (finishReason: length)", () => {
    // Regression: an assistant turn that hits max_tokens mid-thinking has
    // `finishReason: "length"` and only reasoning parts. Without explicit UI
    // surfacing, ReasoningMessage auto-collapses on stream-end, leaving the
    // user staring at a single "Thinking" header with no signal that the turn
    // was truncated. SMA must synthesize a stream-error row and mark the
    // reasoning row as the only renderable content so collapse is suppressed.
    test("synthesizes a max_output_tokens stream-error row for reasoning-only truncated turns", () => {
      const aggregator = new StreamingMessageAggregator(TEST_CREATED_AT);

      // Fixture: a finished (non-streaming) assistant message whose only part
      // is reasoning and whose finishReason signals provider-side truncation.
      aggregator.addMessage({
        id: "asst-truncated",
        role: "assistant",
        parts: [{ type: "reasoning" as const, text: "I need to think about this..." }],
        metadata: {
          historySequence: 1,
          timestamp: 1,
          model: "anthropic:claude-opus-4-7",
          finishReason: "length",
        },
      });

      const displayed = aggregator.getDisplayedMessages();

      // The synthesized banner must carry the truncation error type and link
      // back to the originating history entry and its model.
      const errorRow = displayed.find((m) => m.type === "stream-error");
      expect(errorRow).toBeDefined();
      if (errorRow?.type === "stream-error") {
        expect(errorRow.errorType).toBe("max_output_tokens");
        expect(errorRow.historyId).toBe("asst-truncated");
        expect(errorRow.model).toBe("anthropic:claude-opus-4-7");
      }

      // Reasoning-only turn: the collapse-suppression hint must be set.
      const reasoningRow = displayed.find((m) => m.type === "reasoning");
      expect(reasoningRow).toBeDefined();
      if (reasoningRow?.type === "reasoning") {
        expect(reasoningRow.isOnlyMessageContent).toBe(true);
      }
    });

    test("still synthesizes the row when the turn also has text/tool parts", () => {
      // A truncated turn can include earlier text or tool calls; the user still
      // needs the banner so they know the response was cut off mid-flight.
      const aggregator = new StreamingMessageAggregator(TEST_CREATED_AT);

      aggregator.addMessage({
        id: "asst-mixed-truncated",
        role: "assistant",
        parts: [
          { type: "reasoning" as const, text: "thinking" },
          { type: "text" as const, text: "Partial reply" },
        ],
        metadata: {
          historySequence: 1,
          timestamp: 1,
          model: "anthropic:claude-opus-4-7",
          finishReason: "length",
        },
      });

      const displayed = aggregator.getDisplayedMessages();
      const errorRow = displayed.find((m) => m.type === "stream-error");
      expect(errorRow).toBeDefined();
      expect(errorRow?.type === "stream-error" && errorRow.errorType).toBe("max_output_tokens");

      // Reasoning here is *not* the only content, so collapse should be allowed.
      const reasoningRow = displayed.find((m) => m.type === "reasoning");
      if (reasoningRow?.type === "reasoning") {
        expect(reasoningRow.isOnlyMessageContent).toBe(false);
      }
    });

    test("treats reasoning + skipped (empty-text) parts as reasoning-only", () => {
      // Regression: assistant turns can contain non-renderable parts like empty
      // text (the renderer's predicate filters them out). The
      // `isOnlyMessageContent` flag must use that same predicate, otherwise a
      // turn that visually consists of only a reasoning block still gets
      // auto-collapsed when it ends — exactly the silent-end UX this PR fixes.
      const aggregator = new StreamingMessageAggregator(TEST_CREATED_AT);

      aggregator.addMessage({
        id: "asst-reasoning-and-empty-text",
        role: "assistant",
        parts: [
          { type: "reasoning" as const, text: "thinking..." },
          { type: "text" as const, text: "" },
        ],
        metadata: {
          historySequence: 1,
          timestamp: 1,
          model: "anthropic:claude-opus-4-7",
          finishReason: "length",
        },
      });

      const displayed = aggregator.getDisplayedMessages();
      const reasoningRow = displayed.find((m) => m.type === "reasoning");
      expect(reasoningRow).toBeDefined();
      if (reasoningRow?.type === "reasoning") {
        expect(reasoningRow.isOnlyMessageContent).toBe(true);
      }
      // Empty text part should not produce an assistant row at all.
      expect(displayed.find((m) => m.type === "assistant")).toBeUndefined();
    });

    test("survives malformed text parts in persisted history", () => {
      // Self-healing: chat.jsonl is loaded via plain JSON parse, so an entry
      // with `type: "text"` but a missing/non-string `text` field is reachable.
      // getDisplayedMessages must not crash on this — it should just skip the
      // bad part and continue.
      const aggregator = new StreamingMessageAggregator(TEST_CREATED_AT);

      aggregator.addMessage({
        id: "asst-malformed",
        role: "assistant",
        parts: [
          { type: "reasoning" as const, text: "thinking" },
          // Cast through unknown to model a malformed history entry.
          { type: "text" } as unknown as { type: "text"; text: string },
        ],
        metadata: {
          historySequence: 1,
          timestamp: 1,
          model: "anthropic:claude-opus-4-7",
          finishReason: "length",
        },
      });

      // The call itself must not throw.
      const displayed = aggregator.getDisplayedMessages();
      expect(displayed.find((m) => m.type === "reasoning")).toBeDefined();
      expect(displayed.find((m) => m.type === "stream-error")).toBeDefined();
    });

    test("does not synthesize the row when finishReason is a normal stop", () => {
      // Control case: a clean "stop" finish must not grow a truncation banner.
      const aggregator = new StreamingMessageAggregator(TEST_CREATED_AT);

      aggregator.addMessage({
        id: "asst-normal",
        role: "assistant",
        parts: [{ type: "text" as const, text: "All done." }],
        metadata: {
          historySequence: 1,
          timestamp: 1,
          model: "anthropic:claude-opus-4-7",
          finishReason: "stop",
        },
      });

      const displayed = aggregator.getDisplayedMessages();
      expect(displayed.find((m) => m.type === "stream-error")).toBeUndefined();
    });

    test("does not stack a length banner on top of an existing stream error", () => {
      // If the message already failed with a real error, that takes precedence —
      // we don't want two banners on the same turn.
      const aggregator = new StreamingMessageAggregator(TEST_CREATED_AT);

      aggregator.addMessage({
        id: "asst-errored",
        role: "assistant",
        parts: [{ type: "reasoning" as const, text: "hmm" }],
        metadata: {
          historySequence: 1,
          timestamp: 1,
          model: "anthropic:claude-opus-4-7",
          finishReason: "length",
          error: "Network timed out",
          errorType: "network",
        },
      });

      // Exactly one error row, and it is the persisted network error — the
      // synthesized max_output_tokens row must yield to it.
      const displayed = aggregator.getDisplayedMessages();
      const errorRows = displayed.filter((m) => m.type === "stream-error");
      expect(errorRows).toHaveLength(1);
      expect(errorRows[0]?.type === "stream-error" && errorRows[0].errorType).toBe("network");
    });
  });
});
66 changes: 55 additions & 11 deletions src/browser/utils/messages/StreamingMessageAggregator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2901,26 +2901,44 @@ export class StreamingMessageAggregator {
// Merge adjacent text/reasoning parts for display
const mergedParts = mergeAdjacentParts(message.parts);

// Find the last part that will produce a DisplayedMessage
// (reasoning, text parts with content, OR tool parts)
// A part is "renderable" when getDisplayedMessages will emit a row for
// it. Empty text parts and other unsupported types are silently skipped
// by the loop below, so any flag derived from "what the user sees" must
// share this predicate to stay in sync. The text check uses a truthy
// test (rather than `.length > 0`) to keep self-healing behavior for
// malformed history entries where `part.text` may be undefined.
const isRenderablePart = (part: (typeof mergedParts)[number]): boolean =>
part.type === "reasoning" ||
(part.type === "text" && Boolean(part.text)) ||
isDynamicToolPart(part);

// Find the last part that will produce a DisplayedMessage and tally
// renderable parts to detect reasoning-only turns. Done in a single
// pass so the two derivations can't drift.
let lastPartIndex = -1;
for (let i = mergedParts.length - 1; i >= 0; i--) {
let renderableCount = 0;
let renderableReasoningCount = 0;
for (let i = 0; i < mergedParts.length; i++) {
const part = mergedParts[i];
if (
part.type === "reasoning" ||
(part.type === "text" && part.text) ||
isDynamicToolPart(part)
) {
lastPartIndex = i;
break;
}
if (!isRenderablePart(part)) continue;
lastPartIndex = i;
renderableCount++;
if (part.type === "reasoning") renderableReasoningCount++;
}

const isCompactionBoundarySummary = this.isCompactionBoundarySummaryMessage(message);
if (isCompactionBoundarySummary) {
displayedMessages.push(this.createCompactionBoundaryRow(message, historySequence));
}

// A turn whose *renderable* parts are entirely reasoning (no text, no tool
// calls) is the visible signature of a max_tokens truncation mid-thinking.
// We pass this hint down so ReasoningMessage can skip its auto-collapse β€”
// otherwise the user is left looking at a single collapsed "Thinking"
// header with no other output to read.
const isReasoningOnlyMessage =
renderableCount > 0 && renderableCount === renderableReasoningCount;

mergedParts.forEach((part, partIndex) => {
const isLastPart = partIndex === lastPartIndex;
// Part is streaming if: active stream exists AND this is the last part
Expand All @@ -2938,6 +2956,7 @@ export class StreamingMessageAggregator {
isStreaming,
isPartial,
isLastPartOfMessage: isLastPart,
isOnlyMessageContent: isReasoningOnlyMessage,
timestamp: part.timestamp ?? baseTimestamp,
streamPresentation: isStreaming
? { source: streamContext?.isReplay ? "replay" : "live" }
Expand Down Expand Up @@ -3059,6 +3078,31 @@ export class StreamingMessageAggregator {
routedThroughGateway: message.metadata?.routedThroughGateway,
timestamp: baseTimestamp,
});
} else if (
// Stream ended cleanly *but* the provider truncated us at max_tokens.
// The backend's stream-end path treats this as a successful completion
// (no error metadata), so without this synthesis the chat appears to
// silently end β€” especially painful for reasoning-only turns where
// ReasoningMessage would otherwise auto-collapse the only output.
// Skip while still streaming: finishReason is only authoritative once
// the stream has settled.
message.role === "assistant" &&
!hasActiveStream &&
message.metadata?.finishReason === "length"
) {
displayedMessages.push({
type: "stream-error",
id: `${message.id}-length`,
historyId: message.id,
error:
"The model hit its max output token limit before finishing this response. " +
"Lower the thinking level (or split the turn into smaller steps) to give it more headroom.",
errorType: "max_output_tokens",
historySequence,
model: message.metadata.model,
routedThroughGateway: message.metadata?.routedThroughGateway,
timestamp: baseTimestamp,
});
}
}

Expand Down
1 change: 1 addition & 0 deletions src/common/orpc/schemas/errors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ export const StreamErrorTypeSchema = z.enum([
"runtime_not_ready", // Container/runtime doesn't exist or failed to start (permanent)
"runtime_start_failed", // Runtime is starting or temporarily unavailable (retryable)
"empty_output", // Provider ended the stream without any assistant-visible output
"max_output_tokens", // Provider truncated the response at max_tokens (finishReason: "length")
"unknown", // Catch-all
]);

Expand Down
5 changes: 5 additions & 0 deletions src/common/types/message.ts
Original file line number Diff line number Diff line change
Expand Up @@ -745,6 +745,11 @@ export type DisplayedMessage =
isStreaming: boolean;
isPartial: boolean; // Whether the parent message was interrupted
isLastPartOfMessage?: boolean; // True if this is the last part of a multi-part message
/** True when this is the only renderable content in the parent assistant message
* (no text or tool parts). Used to suppress auto-collapse so a reasoning-only
* turn (e.g. one truncated mid-thinking by max_tokens) doesn't visually disappear
* the moment the stream ends. */
isOnlyMessageContent?: boolean;
timestamp?: number;
tokens?: number; // Reasoning tokens if available
/** Presentation hint for smooth streaming β€” indicates if this is live or replayed content. */
Expand Down
Loading