Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 39 additions & 37 deletions bun.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@

outputHashMode = "recursive";
# Marker used by scripts/update_flake_hash.sh to update this hash in place.
outputHash = "sha256-WvzB3zFWrWA2mPCWIg/vVlDZbUFTWNTgL52TumiWvyM="; # mux-offline-cache-hash
outputHash = "sha256-ZrgZ+Fsj+sGVbe1ZDhzwxSsk2DIwcRmmRflFSQGslLY="; # mux-offline-cache-hash
};

configurePhase = ''
Expand Down
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
"@1password/sdk": "^0.4.0",
"@agentclientprotocol/sdk": "^0.14.1",
"@ai-sdk/amazon-bedrock": "^4.0.49",
"@ai-sdk/anthropic": "^3.0.37",
"@ai-sdk/anthropic": "^3.0.74",
"@ai-sdk/deepseek": "^2.0.17",
"@ai-sdk/google": "^3.0.21",
"@ai-sdk/mcp": "^1.0.18",
Expand Down Expand Up @@ -127,8 +127,8 @@
"rehype-harden": "^1.1.5",
"rehype-sanitize": "^6.0.0",
"remark-breaks": "^4.0.0",
"shescape": "^2.1.6",
"sharp": "^0.34.5",
"shescape": "^2.1.6",
"source-map-support": "^0.5.21",
"ssh-config": "^5.0.4",
"ssh2": "^1.17.0",
Expand Down
11 changes: 9 additions & 2 deletions src/browser/features/Messages/ReasoningMessage.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,13 @@ export const ReasoningMessage: React.FC<ReasoningMessageProps> = ({
const wasStreamingRef = useRef(isStreaming);
const isLastPartOfMessage =
"isLastPartOfMessage" in message ? message.isLastPartOfMessage : false;
// When the parent message contains *only* reasoning (e.g. the stream was
// truncated at max_tokens before any text/tool emerged), collapsing leaves
// the user staring at a single "Thinking" header and nothing else. Skip the
// auto-collapse in that case so the work the model did is still readable;
// the accompanying stream-error row from SMA explains why the turn stopped.
const isOnlyMessageContent =
"isOnlyMessageContent" in message ? message.isOnlyMessageContent === true : false;

// Auto-collapse only when reasoning reached *natural* completion β€” i.e. the
// stream ended while this reasoning part was still the terminal block of the
Expand All @@ -97,10 +104,10 @@ export const ReasoningMessage: React.FC<ReasoningMessageProps> = ({
const wasStreaming = wasStreamingRef.current;
wasStreamingRef.current = isStreaming;

if (wasStreaming && !isStreaming && isLastPartOfMessage) {
if (wasStreaming && !isStreaming && isLastPartOfMessage && !isOnlyMessageContent) {
setIsExpanded(false);
}
}, [isStreaming, isLastPartOfMessage]);
}, [isStreaming, isLastPartOfMessage, isOnlyMessageContent]);

const toggleExpanded = () => {
if (!isCollapsible) {
Expand Down
5 changes: 4 additions & 1 deletion src/browser/features/Messages/StreamErrorMessage.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ const StreamErrorMessageBase: React.FC<StreamErrorMessageBaseProps> = (props) =>
message.errorType === "server_error" &&
/\bHTTP\s*529\b|overloaded/i.test(message.error);
const isEmptyOutputError = message.errorType === "empty_output";
const isMaxOutputTokensError = message.errorType === "max_output_tokens";
// Gateway quota failures need explicit attribution so users know mux gateway credits,
// not a provider quota, are blocking the request.
const isMuxGatewayQuotaError =
Expand All @@ -92,7 +93,9 @@ const StreamErrorMessageBase: React.FC<StreamErrorMessageBaseProps> = (props) =>
? "Service overloaded"
: isEmptyOutputError
? "No assistant output"
: "Stream Error";
: isMaxOutputTokensError
? "Response truncated"
: "Stream Error";
const pill = isAnthropicOverloaded ? "overloaded" : message.errorType;
const body = isMuxGatewayQuotaError
? "Your Mux Gateway credits have been depleted. Add credits or configure another provider to continue."
Expand Down
178 changes: 178 additions & 0 deletions src/browser/utils/messages/StreamingMessageAggregator.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2739,4 +2739,182 @@ describe("StreamingMessageAggregator", () => {
expect(aggregator.getLastAbortReason()).toBeNull();
});
});

  describe("max output tokens (finishReason: length)", () => {
    // Regression: an assistant turn that hits max_tokens mid-thinking has
    // `finishReason: "length"` and only reasoning parts. Without explicit UI
    // surfacing, ReasoningMessage auto-collapses on stream-end, leaving the
    // user staring at a single "Thinking" header with no signal that the turn
    // was truncated. SMA must synthesize a stream-error row and mark the
    // reasoning row as the only renderable content so collapse is suppressed.
    test("synthesizes a max_output_tokens stream-error row for reasoning-only truncated turns", () => {
      const aggregator = new StreamingMessageAggregator(TEST_CREATED_AT);

      // Fixture: a finished (non-streaming) assistant message whose only part
      // is reasoning and whose finishReason signals provider-side truncation.
      aggregator.addMessage({
        id: "asst-truncated",
        role: "assistant",
        parts: [{ type: "reasoning" as const, text: "I need to think about this..." }],
        metadata: {
          historySequence: 1,
          timestamp: 1,
          model: "anthropic:claude-opus-4-7",
          finishReason: "length",
        },
      });

      const displayed = aggregator.getDisplayedMessages();

      // The synthesized banner must carry the truncation error type and link
      // back to the originating history entry and its model.
      const errorRow = displayed.find((m) => m.type === "stream-error");
      expect(errorRow).toBeDefined();
      if (errorRow?.type === "stream-error") {
        expect(errorRow.errorType).toBe("max_output_tokens");
        expect(errorRow.historyId).toBe("asst-truncated");
        expect(errorRow.model).toBe("anthropic:claude-opus-4-7");
      }

      // Reasoning-only turn: the collapse-suppression hint must be set.
      const reasoningRow = displayed.find((m) => m.type === "reasoning");
      expect(reasoningRow).toBeDefined();
      if (reasoningRow?.type === "reasoning") {
        expect(reasoningRow.isOnlyMessageContent).toBe(true);
      }
    });

    test("still synthesizes the row when the turn also has text/tool parts", () => {
      // A truncated turn can include earlier text or tool calls; the user still
      // needs the banner so they know the response was cut off mid-flight.
      const aggregator = new StreamingMessageAggregator(TEST_CREATED_AT);

      aggregator.addMessage({
        id: "asst-mixed-truncated",
        role: "assistant",
        parts: [
          { type: "reasoning" as const, text: "thinking" },
          { type: "text" as const, text: "Partial reply" },
        ],
        metadata: {
          historySequence: 1,
          timestamp: 1,
          model: "anthropic:claude-opus-4-7",
          finishReason: "length",
        },
      });

      const displayed = aggregator.getDisplayedMessages();
      const errorRow = displayed.find((m) => m.type === "stream-error");
      expect(errorRow).toBeDefined();
      expect(errorRow?.type === "stream-error" && errorRow.errorType).toBe("max_output_tokens");

      // Reasoning here is *not* the only content, so collapse should be allowed.
      const reasoningRow = displayed.find((m) => m.type === "reasoning");
      if (reasoningRow?.type === "reasoning") {
        expect(reasoningRow.isOnlyMessageContent).toBe(false);
      }
    });

    test("treats reasoning + skipped (empty-text) parts as reasoning-only", () => {
      // Regression: assistant turns can contain non-renderable parts like empty
      // text (the renderer's predicate filters them out). The
      // `isOnlyMessageContent` flag must use that same predicate, otherwise a
      // turn that visually consists of only a reasoning block still gets
      // auto-collapsed when it ends — exactly the silent-end UX this PR fixes.
      const aggregator = new StreamingMessageAggregator(TEST_CREATED_AT);

      aggregator.addMessage({
        id: "asst-reasoning-and-empty-text",
        role: "assistant",
        parts: [
          { type: "reasoning" as const, text: "thinking..." },
          { type: "text" as const, text: "" },
        ],
        metadata: {
          historySequence: 1,
          timestamp: 1,
          model: "anthropic:claude-opus-4-7",
          finishReason: "length",
        },
      });

      const displayed = aggregator.getDisplayedMessages();
      const reasoningRow = displayed.find((m) => m.type === "reasoning");
      expect(reasoningRow).toBeDefined();
      if (reasoningRow?.type === "reasoning") {
        expect(reasoningRow.isOnlyMessageContent).toBe(true);
      }
      // Empty text part should not produce an assistant row at all.
      expect(displayed.find((m) => m.type === "assistant")).toBeUndefined();
    });

    test("survives malformed text parts in persisted history", () => {
      // Self-healing: chat.jsonl is loaded via plain JSON parse, so an entry
      // with `type: "text"` but a missing/non-string `text` field is reachable.
      // getDisplayedMessages must not crash on this — it should just skip the
      // bad part and continue.
      const aggregator = new StreamingMessageAggregator(TEST_CREATED_AT);

      aggregator.addMessage({
        id: "asst-malformed",
        role: "assistant",
        parts: [
          { type: "reasoning" as const, text: "thinking" },
          // Cast through unknown to model a malformed history entry.
          { type: "text" } as unknown as { type: "text"; text: string },
        ],
        metadata: {
          historySequence: 1,
          timestamp: 1,
          model: "anthropic:claude-opus-4-7",
          finishReason: "length",
        },
      });

      // The call itself must not throw.
      const displayed = aggregator.getDisplayedMessages();
      expect(displayed.find((m) => m.type === "reasoning")).toBeDefined();
      expect(displayed.find((m) => m.type === "stream-error")).toBeDefined();
    });

    test("does not synthesize the row when finishReason is a normal stop", () => {
      // Control case: a clean "stop" finish must not grow a truncation banner.
      const aggregator = new StreamingMessageAggregator(TEST_CREATED_AT);

      aggregator.addMessage({
        id: "asst-normal",
        role: "assistant",
        parts: [{ type: "text" as const, text: "All done." }],
        metadata: {
          historySequence: 1,
          timestamp: 1,
          model: "anthropic:claude-opus-4-7",
          finishReason: "stop",
        },
      });

      const displayed = aggregator.getDisplayedMessages();
      expect(displayed.find((m) => m.type === "stream-error")).toBeUndefined();
    });

    test("does not stack a length banner on top of an existing stream error", () => {
      // If the message already failed with a real error, that takes precedence —
      // we don't want two banners on the same turn.
      const aggregator = new StreamingMessageAggregator(TEST_CREATED_AT);

      aggregator.addMessage({
        id: "asst-errored",
        role: "assistant",
        parts: [{ type: "reasoning" as const, text: "hmm" }],
        metadata: {
          historySequence: 1,
          timestamp: 1,
          model: "anthropic:claude-opus-4-7",
          finishReason: "length",
          error: "Network timed out",
          errorType: "network",
        },
      });

      // Exactly one error row, and it is the persisted network error — the
      // synthesized max_output_tokens row must yield to it.
      const displayed = aggregator.getDisplayedMessages();
      const errorRows = displayed.filter((m) => m.type === "stream-error");
      expect(errorRows).toHaveLength(1);
      expect(errorRows[0]?.type === "stream-error" && errorRows[0].errorType).toBe("network");
    });
  });
});
66 changes: 55 additions & 11 deletions src/browser/utils/messages/StreamingMessageAggregator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2901,26 +2901,44 @@ export class StreamingMessageAggregator {
// Merge adjacent text/reasoning parts for display
const mergedParts = mergeAdjacentParts(message.parts);

// Find the last part that will produce a DisplayedMessage
// (reasoning, text parts with content, OR tool parts)
// A part is "renderable" when getDisplayedMessages will emit a row for
// it. Empty text parts and other unsupported types are silently skipped
// by the loop below, so any flag derived from "what the user sees" must
// share this predicate to stay in sync. The text check uses a truthy
// test (rather than `.length > 0`) to keep self-healing behavior for
// malformed history entries where `part.text` may be undefined.
const isRenderablePart = (part: (typeof mergedParts)[number]): boolean =>
part.type === "reasoning" ||
(part.type === "text" && Boolean(part.text)) ||
isDynamicToolPart(part);

// Find the last part that will produce a DisplayedMessage and tally
// renderable parts to detect reasoning-only turns. Done in a single
// pass so the two derivations can't drift.
let lastPartIndex = -1;
for (let i = mergedParts.length - 1; i >= 0; i--) {
let renderableCount = 0;
let renderableReasoningCount = 0;
for (let i = 0; i < mergedParts.length; i++) {
const part = mergedParts[i];
if (
part.type === "reasoning" ||
(part.type === "text" && part.text) ||
isDynamicToolPart(part)
) {
lastPartIndex = i;
break;
}
if (!isRenderablePart(part)) continue;
lastPartIndex = i;
renderableCount++;
if (part.type === "reasoning") renderableReasoningCount++;
}

const isCompactionBoundarySummary = this.isCompactionBoundarySummaryMessage(message);
if (isCompactionBoundarySummary) {
displayedMessages.push(this.createCompactionBoundaryRow(message, historySequence));
}

// A turn whose *renderable* parts are entirely reasoning (no text, no tool
// calls) is the visible signature of a max_tokens truncation mid-thinking.
// We pass this hint down so ReasoningMessage can skip its auto-collapse β€”
// otherwise the user is left looking at a single collapsed "Thinking"
// header with no other output to read.
const isReasoningOnlyMessage =
renderableCount > 0 && renderableCount === renderableReasoningCount;

mergedParts.forEach((part, partIndex) => {
const isLastPart = partIndex === lastPartIndex;
// Part is streaming if: active stream exists AND this is the last part
Expand All @@ -2938,6 +2956,7 @@ export class StreamingMessageAggregator {
isStreaming,
isPartial,
isLastPartOfMessage: isLastPart,
isOnlyMessageContent: isReasoningOnlyMessage,
timestamp: part.timestamp ?? baseTimestamp,
streamPresentation: isStreaming
? { source: streamContext?.isReplay ? "replay" : "live" }
Expand Down Expand Up @@ -3059,6 +3078,31 @@ export class StreamingMessageAggregator {
routedThroughGateway: message.metadata?.routedThroughGateway,
timestamp: baseTimestamp,
});
} else if (
// Stream ended cleanly *but* the provider truncated us at max_tokens.
// The backend's stream-end path treats this as a successful completion
// (no error metadata), so without this synthesis the chat appears to
// silently end β€” especially painful for reasoning-only turns where
// ReasoningMessage would otherwise auto-collapse the only output.
// Skip while still streaming: finishReason is only authoritative once
// the stream has settled.
message.role === "assistant" &&
!hasActiveStream &&
message.metadata?.finishReason === "length"
) {
displayedMessages.push({
type: "stream-error",
id: `${message.id}-length`,
historyId: message.id,
error:
"The model hit its max output token limit before finishing this response. " +
"Lower the thinking level (or split the turn into smaller steps) to give it more headroom.",
errorType: "max_output_tokens",
historySequence,
model: message.metadata.model,
routedThroughGateway: message.metadata?.routedThroughGateway,
timestamp: baseTimestamp,
});
}
}

Expand Down
1 change: 1 addition & 0 deletions src/common/orpc/schemas/errors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ export const StreamErrorTypeSchema = z.enum([
"runtime_not_ready", // Container/runtime doesn't exist or failed to start (permanent)
"runtime_start_failed", // Runtime is starting or temporarily unavailable (retryable)
"empty_output", // Provider ended the stream without any assistant-visible output
"max_output_tokens", // Provider truncated the response at max_tokens (finishReason: "length")
"unknown", // Catch-all
]);

Expand Down
5 changes: 5 additions & 0 deletions src/common/types/message.ts
Original file line number Diff line number Diff line change
Expand Up @@ -745,6 +745,11 @@ export type DisplayedMessage =
isStreaming: boolean;
isPartial: boolean; // Whether the parent message was interrupted
isLastPartOfMessage?: boolean; // True if this is the last part of a multi-part message
/** True when this is the only renderable content in the parent assistant message
* (no text or tool parts). Used to suppress auto-collapse so a reasoning-only
* turn (e.g. one truncated mid-thinking by max_tokens) doesn't visually disappear
* the moment the stream ends. */
isOnlyMessageContent?: boolean;
timestamp?: number;
tokens?: number; // Reasoning tokens if available
/** Presentation hint for smooth streaming β€” indicates if this is live or replayed content. */
Expand Down
Loading