
Commit cecb36c

feat(agent): expose per-call maxTokens on agent() config
BaseAgentConfig now carries an optional maxTokens field. When set, it's forwarded to every generateText / streamText call the agent makes: generate(), session.send(), and stream(). Unset preserves current behavior (providers fall back to their defaults, typically 4-8k tokens).

The driver: agency-style sessions (e.g. paracosm's commander + department agents) had no way to cap tail spend on a misbehaving model. Per-call maxTokens was accessible on generateText / generateObject directly, but not on agent sessions, so high-volume session-backed call sites defaulted to provider caps on every invocation.

Two tests pin the behavior: maxTokens flows through agent.generate() and session.send() when configured, and stays undefined on the generateText call when omitted.
1 parent 7885a91 commit cecb36c

3 files changed: 41 additions & 0 deletions
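
For a quick sense of the surface this adds, here is a minimal usage sketch. The field names and values mirror the test file below; the import path and prompts are illustrative assumptions, not part of the commit.

import { agent } from './src/api/agent';

// Cap every completion this agent produces at 1500 tokens, across
// generate(), stream(), and session.send().
const assistant = agent({
  model: 'openai:gpt-4.1-mini',
  instructions: 'be brief',
  maxTokens: 1500,
});

await assistant.generate('Hello.');            // generateText receives maxTokens: 1500
await assistant.session('s1').send('Hello.');  // so does the session-backed call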


src/api/agent.ts

Lines changed: 4 additions & 0 deletions
@@ -380,6 +380,10 @@ export function agent(opts: AgentOptions): Agent {
     system: opts.systemBlocks ?? buildSystemPrompt(opts),
     tools: opts.tools,
     maxSteps: opts.maxSteps ?? 5,
+    // Per-call completion-token cap applied to every generate /
+    // session.send / stream invocation this agent makes. Unset means
+    // the underlying generateText falls back to the provider default.
+    maxTokens: opts.maxTokens,
     chainOfThought: opts.chainOfThought ?? true,
     apiKey: opts.apiKey,
     baseUrl: opts.baseUrl,
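
The hunk above only touches the config object; how that value reaches the model call is outside this diff. A rough sketch of the forwarding, assuming an AI SDK-style generateText that accepts maxTokens — the callModel helper and config shape here are hypothetical, not this repo's runtime code:

import { generateText } from 'ai';

// Hypothetical runtime helper: the agent's stored config is threaded into
// each model call, so an unset maxTokens stays undefined and the
// provider's own default cap applies.
async function callModel(
  config: { model: Parameters<typeof generateText>[0]['model']; maxTokens?: number },
  prompt: string,
) {
  return generateText({
    model: config.model,
    prompt,
    maxTokens: config.maxTokens, // undefined when the agent config omits it
  });
}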

src/api/runtime/__tests__/agent.test.ts

Lines changed: 25 additions & 0 deletions
@@ -202,4 +202,29 @@ describe('agent', () => {
       })
     );
   });
+
+  it('forwards maxTokens from agent config to every generateText call (generate / session.send)', async () => {
+    const assistant = agent({
+      model: 'openai:gpt-4.1-mini',
+      instructions: 'be brief',
+      maxTokens: 1500,
+    });
+
+    await assistant.generate('Hello.');
+    expect(hoisted.generateText).toHaveBeenLastCalledWith(
+      expect.objectContaining({ maxTokens: 1500 }),
+    );
+
+    await assistant.session('s1').send('Hello again.');
+    expect(hoisted.generateText).toHaveBeenLastCalledWith(
+      expect.objectContaining({ maxTokens: 1500 }),
+    );
+  });
+
+  it('omits maxTokens from the generateText call when agent config does not set it', async () => {
+    const assistant = agent({ model: 'openai:gpt-4.1-mini', instructions: 'be brief' });
+    await assistant.generate('Hello.');
+    const callArgs = hoisted.generateText.mock.calls.at(-1)?.[0];
+    expect(callArgs?.maxTokens).toBeUndefined();
+  });
 });
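
The tests lean on a hoisted.generateText spy that lives outside this hunk. A plausible Vitest setup for it would look like the following; the mocked module path ('ai') and the resolved value are assumptions:

import { vi } from 'vitest';

// vi.hoisted runs before vi.mock factories are evaluated, so the spy
// can be shared between the mock below and the assertions above.
const hoisted = vi.hoisted(() => ({
  generateText: vi.fn().mockResolvedValue({ text: 'ok' }),
}));

vi.mock('ai', () => ({
  generateText: hoisted.generateText,
  streamText: vi.fn(),
}));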

src/api/types.ts

Lines changed: 12 additions & 0 deletions
@@ -1191,6 +1191,18 @@ export interface BaseAgentConfig {
   tools?: AdaptableToolInput;
   /** Maximum number of agentic steps (LLM calls) per invocation. Defaults to `5`. */
   maxSteps?: number;
+  /**
+   * Upper bound on completion tokens for each LLM call the agent makes.
+   * Forwarded to the underlying `generateText` / `streamText` call on
+   * every `generate()`, `stream()`, and `session.send()` invocation.
+   *
+   * Caps tail spend when a model misbehaves and yaps past the intended
+   * output size — without it, calls fall back to the provider default
+   * (OpenAI 4096, Anthropic 4096-8192). Set to ~2× the agent's typical
+   * response size so normal calls finish naturally and only runaway
+   * generations hit the cap. Omit to use the provider default.
+   */
+  maxTokens?: number;
   /**
    * Memory configuration.
    * - `true` — enable in-memory conversation history with default settings.
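
To make the doc comment's ~2x sizing guidance concrete: an agent whose replies usually land around 600 completion tokens would get a 1200-token cap. The numbers and config below are illustrative only.

// Typical replies run ~600 tokens, so cap at roughly 2x that.
const support = agent({
  model: 'openai:gpt-4.1-mini',
  instructions: 'answer in one short paragraph',
  maxTokens: 1200, // normal replies finish naturally; only runaways get cut
});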
