Skip to content

Commit 1a5e8b5

Browse files
🤖 fix: accurate cost estimation for multi-step tool usage (#831)
## Problem

The application was severely underestimating costs for conversations involving tool calls.

### Root Cause

The Vercel AI SDK provides two usage metrics:

- `streamResult.usage` — Token usage from **the last step only**
- `streamResult.totalUsage` — Sum of token usage across **all steps**

The application was using `usage` instead of `totalUsage`. For a conversation with 10 tool calls, only ~1/10th of actual consumption was reported. A $5 conversation would display as $0.50.

### The Complicating Factor

Two UI elements use token data with different semantic requirements:

| Display | Needs | Why |
|---------|-------|-----|
| **Cost** | Sum of all steps | If model read context 10 times, you paid for 10 reads |
| **Context window** | Last step only | Shows "how full is the conversation now" for the next request |

Simply switching to `totalUsage` would fix costs but break context display (showing 500% utilization after many tool calls).

### Cache Creation Tokens

Anthropic's cache creation tokens (`cacheCreationInputTokens`) are:

- Only in provider-specific metadata, not normalized usage
- Need to be summed across all steps
- Not automatically aggregated by the AI SDK

Even with `totalUsage`, cache creation costs were lost unless manually aggregated from each step's provider metadata.

## Solution

Track both values with different semantic purposes:

**For cost calculation:**
- `usage` / `cumulativeUsage` — total across all steps
- `providerMetadata` / `cumulativeProviderMetadata` — aggregated cache creation tokens

**For context window display:**
- `contextUsage` / `lastContextUsage` — last step only
- `contextProviderMetadata` — last step only

### Key Changes

1. **Backend** (`streamManager.ts`): Use `totalUsage` for cost, track `lastStepUsage` for context, aggregate provider metadata across steps
2. **Types**: Extended `StreamEndEvent`, `MuxMetadata`, `UsageDeltaEvent` with dual fields
3. **Frontend**: `StreamingMessageAggregator` tracks both cumulative and per-step usage
4. **Store**: `WorkspaceUsageState` provides `usageHistory` (cost) and `lastContextUsage` (context window)
5. **UI**: Components use appropriate field for their purpose

### Also Fixed

- **OpenAI cached token double-counting**: Gateway models (`mux-gateway:openai/gpt-5.1`) weren't recognized as OpenAI, causing cached tokens to be counted in both "Cache Read" and "Input". Now normalizes gateway model strings before provider detection.
- **Google/Gemini cached token double-counting**: Google, like OpenAI, reports `inputTokens` inclusive of `cachedInputTokens`. Extended the subtraction logic to handle Google models.

---

_Generated with `mux`_
1 parent 1e3dce5 commit 1a5e8b5

File tree

15 files changed

+847
-61
lines changed

15 files changed

+847
-61
lines changed

‎src/browser/components/ChatMetaSidebar.tsx‎

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ const ChatMetaSidebarComponent: React.FC<ChatMetaSidebarProps> = ({ workspaceId,
1919
const use1M = options.anthropic?.use1MContext ?? false;
2020
const chatAreaSize = useResizeObserver(chatAreaRef);
2121

22-
const lastUsage = usage?.liveUsage ?? usage?.usageHistory[usage.usageHistory.length - 1];
22+
// Use lastContextUsage for context window display (last step = actual context size)
23+
const lastUsage = usage?.liveUsage ?? usage?.lastContextUsage;
2324

2425
// Memoize vertical meter data calculation to prevent unnecessary re-renders
2526
const verticalMeterData = React.useMemo(() => {

‎src/browser/components/RightSidebar.tsx‎

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,8 @@ const RightSidebarComponent: React.FC<RightSidebarProps> = ({
135135
const costsPanelId = `${baseId}-panel-costs`;
136136
const reviewPanelId = `${baseId}-panel-review`;
137137

138-
const lastUsage = usage?.liveUsage ?? usage?.usageHistory[usage.usageHistory.length - 1];
138+
// Use lastContextUsage for context window display (last step = actual context size)
139+
const lastUsage = usage?.liveUsage ?? usage?.lastContextUsage;
139140
const model = lastUsage?.model ?? null;
140141

141142
// Auto-compaction settings: threshold per-model

‎src/browser/components/RightSidebar/CostsTab.tsx‎

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -65,22 +65,28 @@ const CostsTabComponent: React.FC<CostsTabProps> = ({ workspaceId }) => {
6565
const use1M = options.anthropic?.use1MContext ?? false;
6666

6767
// Get model from context usage for per-model threshold storage
68-
const contextUsage = usage.liveUsage ?? usage.usageHistory[usage.usageHistory.length - 1];
69-
const currentModel = contextUsage?.model ?? null;
68+
// Use lastContextUsage for context window display (last step's usage)
69+
const contextUsageForModel = usage.liveUsage ?? usage.lastContextUsage;
70+
const currentModel = contextUsageForModel?.model ?? null;
7071

7172
// Auto-compaction settings: threshold per-model (100 = disabled)
7273
const { threshold: autoCompactThreshold, setThreshold: setAutoCompactThreshold } =
7374
useAutoCompactionSettings(workspaceId, currentModel);
7475

75-
// Session usage for cost
76+
// Session usage for cost calculation
77+
// Uses usageHistory (total across all steps) + liveCostUsage (cumulative during streaming)
7678
const sessionUsage = React.useMemo(() => {
7779
const historicalSum = sumUsageHistory(usage.usageHistory);
78-
if (!usage.liveUsage) return historicalSum;
79-
if (!historicalSum) return usage.liveUsage;
80-
return sumUsageHistory([historicalSum, usage.liveUsage]);
81-
}, [usage.usageHistory, usage.liveUsage]);
82-
83-
const hasUsageData = usage && (usage.usageHistory.length > 0 || usage.liveUsage !== undefined);
80+
if (!usage.liveCostUsage) return historicalSum;
81+
if (!historicalSum) return usage.liveCostUsage;
82+
return sumUsageHistory([historicalSum, usage.liveCostUsage]);
83+
}, [usage.usageHistory, usage.liveCostUsage]);
84+
85+
const hasUsageData =
86+
usage &&
87+
(usage.usageHistory.length > 0 ||
88+
usage.lastContextUsage !== undefined ||
89+
usage.liveUsage !== undefined);
8490
const hasConsumerData = consumers && (consumers.totalTokens > 0 || consumers.isCalculating);
8591
const hasAnyData = hasUsageData || hasConsumerData;
8692

@@ -109,8 +115,8 @@ const CostsTabComponent: React.FC<CostsTabProps> = ({ workspaceId }) => {
109115
<div data-testid="context-usage-list" className="flex flex-col gap-3">
110116
{(() => {
111117
// Context usage: live when streaming, else last historical
112-
const contextUsage =
113-
usage.liveUsage ?? usage.usageHistory[usage.usageHistory.length - 1];
118+
// Uses lastContextUsage (last step) for accurate context window size
119+
const contextUsage = usage.liveUsage ?? usage.lastContextUsage;
114120
const model = contextUsage?.model ?? "unknown";
115121

116122
// Get max tokens for the model from the model stats database

‎src/browser/stores/WorkspaceStore.ts‎

Lines changed: 51 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,21 @@ type DerivedState = Record<string, number>;
5959
/**
6060
* Usage metadata extracted from API responses (no tokenization).
6161
* Updates instantly when usage metadata arrives.
62+
*
63+
* For multi-step tool calls, cost and context usage differ:
64+
* - usageHistory: Total usage per message (sum of all steps) for cost calculation
65+
* - lastContextUsage: Last step's usage for context window display (inputTokens = actual context size)
6266
*/
6367
export interface WorkspaceUsageState {
68+
/** Usage history for cost calculation (total across all steps per message) */
6469
usageHistory: ChatUsageDisplay[];
70+
/** Last message's context usage (last step only, for context window display) */
71+
lastContextUsage?: ChatUsageDisplay;
6572
totalTokens: number;
66-
/** Live usage during streaming (inputTokens = current context window) */
73+
/** Live context usage during streaming (last step's inputTokens = current context window) */
6774
liveUsage?: ChatUsageDisplay;
75+
/** Live cost usage during streaming (cumulative across all steps) */
76+
liveCostUsage?: ChatUsageDisplay;
6877
}
6978

7079
/**
@@ -441,6 +450,8 @@ export class WorkspaceStore {
441450

442451
const messages = aggregator.getAllMessages();
443452
const model = aggregator.getCurrentModel();
453+
454+
// Collect usage history for cost calculation (total across all steps per message)
444455
const usageHistory = collectUsageHistory(messages, model);
445456

446457
// Calculate total from usage history (now includes historical)
@@ -455,12 +466,47 @@ export class WorkspaceStore {
455466
0
456467
);
457468

458-
// Include active stream usage if currently streaming (already converted)
469+
// Get last message's context usage for context window display
470+
// Uses contextUsage (last step) if available, falls back to usage for old messages
471+
const lastContextUsage = (() => {
472+
for (let i = messages.length - 1; i >= 0; i--) {
473+
const msg = messages[i];
474+
if (msg.role === "assistant") {
475+
const rawUsage = msg.metadata?.contextUsage ?? msg.metadata?.usage;
476+
const providerMeta =
477+
msg.metadata?.contextProviderMetadata ?? msg.metadata?.providerMetadata;
478+
if (rawUsage) {
479+
const msgModel = msg.metadata?.model ?? model ?? "unknown";
480+
return createDisplayUsage(rawUsage, msgModel, providerMeta);
481+
}
482+
}
483+
}
484+
return undefined;
485+
})();
486+
487+
// Include active stream usage if currently streaming
459488
const activeStreamId = aggregator.getActiveStreamMessageId();
460-
const rawUsage = activeStreamId ? aggregator.getActiveStreamUsage(activeStreamId) : undefined;
461-
const liveUsage = rawUsage && model ? createDisplayUsage(rawUsage, model) : undefined;
462489

463-
return { usageHistory, totalTokens, liveUsage };
490+
// Live context usage (last step's inputTokens = current context window)
491+
const rawContextUsage = activeStreamId
492+
? aggregator.getActiveStreamUsage(activeStreamId)
493+
: undefined;
494+
const liveUsage =
495+
rawContextUsage && model ? createDisplayUsage(rawContextUsage, model) : undefined;
496+
497+
// Live cost usage (cumulative across all steps, with accumulated cache creation tokens)
498+
const rawCumulativeUsage = activeStreamId
499+
? aggregator.getActiveStreamCumulativeUsage(activeStreamId)
500+
: undefined;
501+
const rawCumulativeProviderMetadata = activeStreamId
502+
? aggregator.getActiveStreamCumulativeProviderMetadata(activeStreamId)
503+
: undefined;
504+
const liveCostUsage =
505+
rawCumulativeUsage && model
506+
? createDisplayUsage(rawCumulativeUsage, model, rawCumulativeProviderMetadata)
507+
: undefined;
508+
509+
return { usageHistory, lastContextUsage, totalTokens, liveUsage, liveCostUsage };
464510
});
465511
}
466512

‎src/browser/utils/compaction/autoCompactionCheck.test.ts‎

Lines changed: 29 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,11 @@ const createMockUsage = (
4040
}
4141

4242
// Add recent usage
43-
usageHistory.push(createUsageEntry(lastEntryTokens, model));
43+
const recentUsage = createUsageEntry(lastEntryTokens, model);
44+
usageHistory.push(recentUsage);
4445

45-
return { usageHistory, totalTokens: 0, liveUsage };
46+
// lastContextUsage is the most recent context window state
47+
return { usageHistory, lastContextUsage: recentUsage, totalTokens: 0, liveUsage };
4648
};
4749

4850
describe("checkAutoCompaction", () => {
@@ -136,17 +138,17 @@ describe("checkAutoCompaction", () => {
136138

137139
test("includes all token types in calculation", () => {
138140
// Create usage with all token types specified
141+
const usageEntry = {
142+
input: { tokens: 10_000 },
143+
cached: { tokens: 5_000 },
144+
cacheCreate: { tokens: 2_000 },
145+
output: { tokens: 3_000 },
146+
reasoning: { tokens: 1_000 },
147+
model: KNOWN_MODELS.SONNET.id,
148+
};
139149
const usage: WorkspaceUsageState = {
140-
usageHistory: [
141-
{
142-
input: { tokens: 10_000 },
143-
cached: { tokens: 5_000 },
144-
cacheCreate: { tokens: 2_000 },
145-
output: { tokens: 3_000 },
146-
reasoning: { tokens: 1_000 },
147-
model: KNOWN_MODELS.SONNET.id,
148-
},
149-
],
150+
usageHistory: [usageEntry],
151+
lastContextUsage: usageEntry,
150152
totalTokens: 0,
151153
};
152154

@@ -232,17 +234,17 @@ describe("checkAutoCompaction", () => {
232234
});
233235

234236
test("handles zero tokens gracefully", () => {
237+
const zeroEntry = {
238+
input: { tokens: 0 },
239+
cached: { tokens: 0 },
240+
cacheCreate: { tokens: 0 },
241+
output: { tokens: 0 },
242+
reasoning: { tokens: 0 },
243+
model: KNOWN_MODELS.SONNET.id,
244+
};
235245
const usage: WorkspaceUsageState = {
236-
usageHistory: [
237-
{
238-
input: { tokens: 0 },
239-
cached: { tokens: 0 },
240-
cacheCreate: { tokens: 0 },
241-
output: { tokens: 0 },
242-
reasoning: { tokens: 0 },
243-
model: KNOWN_MODELS.SONNET.id,
244-
},
245-
],
246+
usageHistory: [zeroEntry],
247+
lastContextUsage: zeroEntry,
246248
totalTokens: 0,
247249
};
248250

@@ -357,7 +359,11 @@ describe("checkAutoCompaction", () => {
357359
test("shouldForceCompact triggers with empty history but liveUsage near limit", () => {
358360
// Bug fix: empty history but liveUsage should still trigger
359361
const liveUsage = createUsageEntry(SONNET_MAX_TOKENS - BUFFER);
360-
const usage: WorkspaceUsageState = { usageHistory: [], totalTokens: 0, liveUsage };
362+
const usage: WorkspaceUsageState = {
363+
usageHistory: [],
364+
totalTokens: 0,
365+
liveUsage,
366+
};
361367
const result = checkAutoCompaction(usage, KNOWN_MODELS.SONNET.id, false);
362368

363369
expect(result.shouldForceCompact).toBe(true);

‎src/browser/utils/compaction/autoCompactionCheck.ts‎

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,9 @@ export function checkAutoCompaction(
9494
};
9595
}
9696

97-
// Current usage: live when streaming, else last historical (pattern from CostsTab)
98-
const lastUsage = usage.usageHistory[usage.usageHistory.length - 1];
97+
// Current usage: live when streaming, else last historical
98+
// Use lastContextUsage (last step) for accurate context window size
99+
const lastUsage = usage.lastContextUsage;
99100
const currentUsage = usage.liveUsage ?? lastUsage;
100101

101102
// Force-compact when approaching context limit (can trigger even with empty history if streaming)

0 commit comments

Comments (0)