diff --git a/cagent-schema.json b/cagent-schema.json index 3665d0913..85afb19d0 100644 --- a/cagent-schema.json +++ b/cagent-schema.json @@ -182,7 +182,7 @@ "description": "Whether to track usage" }, "thinking_budget": { - "description": "Controls reasoning effort/budget. For OpenAI: string levels ('minimal', 'low', 'medium', 'high'). For Anthropic: integer token budget (1024-32768)", + "description": "Controls reasoning effort/budget. OpenAI: string levels ('minimal','low','medium','high'). Anthropic: integer token budget (1024-32768). Gemini: integer token budget (-1 for unlimited, 0 to disable, 24576 max).", "oneOf": [ { "type": "string", @@ -191,12 +191,12 @@ }, { "type": "integer", - "minimum": 1024, + "minimum": -1, "maximum": 32768, - "description": "Token budget for extended thinking (Anthropic)" + "description": "Token budget for extended thinking (Anthropic, Google)" } ], - "examples": ["minimal", "low", "medium", "high", 1024, 32768] + "examples": ["minimal", "low", "medium", "high", -1, 0, 1024, 24576, 32768] } }, "additionalProperties": false diff --git a/docs/USAGE.md b/docs/USAGE.md index 235698715..f57b8b0ec 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -139,17 +139,17 @@ cagent run ./agent.yaml --command ls ### Model Properties -| Property | Type | Description | Required | -|---------------------|------------|-----------------------------------------------------------------------|----------| -| `provider` | string | Provider: `openai`, `anthropic`, `dmr` | ✓ | -| `model` | string | Model name (e.g., `gpt-4o`, `claude-sonnet-4-0`) | ✓ | -| `temperature` | float | Randomness (0.0-1.0) | ✗ | -| `max_tokens` | integer | Response length limit | ✗ | -| `top_p` | float | Nucleus sampling (0.0-1.0) | ✗ | -| `frequency_penalty` | float | Repetition penalty (0.0-2.0) | ✗ | -| `presence_penalty` | float | Topic repetition penalty (0.0-2.0) | ✗ | -| `base_url` | string | Custom API endpoint | ✗ | -| `thinking_budget` | string/int | Reasoning effort — 
OpenAI: effort string, Anthropic: token budget int | ✗ | +| Property | Type | Description | Required | +|---------------------|------------|------------------------------------------------------------------------------|----------| +| `provider` | string | Provider: `openai`, `anthropic`, `google`, `dmr` | ✓ | +| `model` | string | Model name (e.g., `gpt-4o`, `claude-sonnet-4-0`, `gemini-2.5-flash`) | ✓ | +| `temperature` | float | Randomness (0.0-1.0) | ✗ | +| `max_tokens` | integer | Response length limit | ✗ | +| `top_p` | float | Nucleus sampling (0.0-1.0) | ✗ | +| `frequency_penalty` | float | Repetition penalty (0.0-2.0) | ✗ | +| `presence_penalty` | float | Topic repetition penalty (0.0-2.0) | ✗ | +| `base_url` | string | Custom API endpoint | ✗ | +| `thinking_budget` | string/int | Reasoning effort — OpenAI: effort string, Anthropic/Google: token budget int | ✗ | #### Example @@ -164,7 +164,7 @@ models: frequency_penalty: float # Repetition penalty (0.0-2.0) presence_penalty: float # Topic repetition penalty (0.0-2.0) parallel_tool_calls: boolean - thinking_budget: string|integer # OpenAI: effort level string; Anthropic: integer token budget + thinking_budget: string|integer # OpenAI: effort level string; Anthropic/Google: integer token budget ``` ### Reasoning Effort (thinking_budget) @@ -172,7 +172,8 @@ models: Determine how much the model should think by setting the `thinking_budget` - **OpenAI**: use effort levels — `minimal`, `low`, `medium`, `high` -- **Anthropic**: set an integer token budget. Minimum is 1024; range is 1024–32768; must be strictly less than `max_tokens`. When set, cagent uses Anthropic's Beta Messages API with interleaved thinking enabled. +- **Anthropic**: set an integer token budget. Range is 1024–32768; must be strictly less than `max_tokens`. +- **Google (Gemini)**: set an integer token budget. `0` -> disable thinking, `-1` -> dynamic thinking (model decides). Most models: 0–24576 tokens. 
Gemini 2.5 Pro: 128–32768 tokens (and thinking cannot be disabled). Examples (OpenAI): @@ -204,6 +205,31 @@ agents: instruction: you are a helpful assistant that doesn't think very much ``` +Examples (Google): + +```yaml +models: + gemini-no-thinking: + provider: google + model: gemini-2.5-flash + thinking_budget: 0 # Disable thinking + + gemini-dynamic: + provider: google + model: gemini-2.5-flash + thinking_budget: -1 # Dynamic thinking (model decides) + + gemini-fixed: + provider: google + model: gemini-2.5-flash + thinking_budget: 8192 # Fixed token budget + +agents: + root: + model: gemini-fixed + instruction: you are a helpful assistant +``` + #### Interleaved Thinking (Anthropic) Anthropic's interleaved thinking feature uses the Beta Messages API to provide tool calling during model reasoning. You can control this behavior using the `interleaved_thinking` provider option: @@ -220,11 +246,14 @@ models: Notes: -- If an invalid OpenAI effort value is set, the request will fail with a clear error -- For Anthropic, values < 1024 or ≥ `max_tokens` are ignored (warning logged) -- When `interleaved_thinking` is enabled, cagent uses Anthropic's Beta Messages API with a default thinking budget of 16384 tokens if not specified +- **OpenAI**: If an invalid effort value is set, the request will fail with a clear error +- **Anthropic**: Values < 1024 or ≥ `max_tokens` are ignored (warning logged). When `interleaved_thinking` is enabled, cagent uses Anthropic's Beta Messages API with a default thinking budget of 16384 tokens if not specified +- **Google**: + - Most models support values between -1 and 24576 tokens. Set to `0` to disable, `-1` for dynamic thinking + - Gemini 2.5 Pro: supports 128–32768 tokens. Cannot be disabled (minimum 128) + - Gemini 2.5 Flash-Lite: supports 512–24576 tokens. 
Set to `0` to disable, `-1` for dynamic thinking - For unsupported providers, `thinking_budget` has no effect -- Debug logs include the applied effort (e.g., "OpenAI request using thinking_budget", "Anthropic Beta API using thinking_budget") +- Debug logs include the applied effort (e.g., "OpenAI request using thinking_budget", "Gemini request using thinking_budget") See `examples/thinking_budget.yaml` for a complete runnable demo. diff --git a/examples/thinking_budget.yaml b/examples/thinking_budget.yaml index bd94ca43d..fb5034209 100644 --- a/examples/thinking_budget.yaml +++ b/examples/thinking_budget.yaml @@ -9,8 +9,9 @@ agents: root: model: gpt-5-mini-min # <- try with gpt-5-mini-high # model: claude-4-5-sonnet-min # <- try with claude-4-5-sonnet-high + # model: gemini-2-5-flash-dynamic-thinking # <- try with -no-thinking, -low or -high variants description: a helpful assistant that thinks - instruction: you are a helpful assistant + instruction: you are a helpful assistant who can also use tools, but only if you need to commands: demo: "hey i need python code for a mandelbrot fractal" toolsets: @@ -35,6 +36,26 @@ models: claude-4-5-sonnet-high: provider: anthropic model: claude-sonnet-4-5-20250929 - thinking_budget: 32768 # <- tokens, 32768 is the suggested maximum without batching + thinking_budget: 32768 # <- tokens, 32768 is the Anthropic suggested maximum without batching provider_opts: - interleaved_thinking: true # <- enable interleaved thinking, aka tool calling during model reasoning + interleaved_thinking: true # <- enables interleaved thinking, aka tool calling during model reasoning + + gemini-2-5-flash-dynamic-thinking: + provider: google + model: gemini-2.5-flash + thinking_budget: -1 # <- google only, dynamic thinking + + gemini-2-5-flash-no-thinking: + provider: google + model: gemini-2.5-flash + thinking_budget: 0 # <- google only, no thinking + + gemini-2-5-flash-low: + provider: google + model: gemini-2.5-flash + thinking_budget: 1024 + + 
gemini-2-5-flash-high: + provider: google + model: gemini-2.5-flash + thinking_budget: 24576 # <- google's maximum thinking budget for all models except Gemini 2.5 Pro (max 32768) diff --git a/pkg/chat/chat.go b/pkg/chat/chat.go index 504614c79..6b66c3f1f 100644 --- a/pkg/chat/chat.go +++ b/pkg/chat/chat.go @@ -119,6 +119,7 @@ type Usage struct { OutputTokens int `json:"output_tokens"` CachedInputTokens int `json:"cached_input_tokens"` CachedOutputTokens int `json:"cached_output_tokens"` + ReasoningTokens int `json:"reasoning_tokens,omitempty"` } // MessageStream interface represents a stream of chat completions diff --git a/pkg/model/provider/gemini/adapter.go b/pkg/model/provider/gemini/adapter.go index 5de2149f2..146f0a445 100644 --- a/pkg/model/provider/gemini/adapter.go +++ b/pkg/model/provider/gemini/adapter.go @@ -179,20 +179,29 @@ func (g *StreamAdapter) Recv() (chat.MessageStreamResponse, error) { OutputTokens: int(res.resp.UsageMetadata.CandidatesTokenCount), CachedInputTokens: int(res.resp.UsageMetadata.CachedContentTokenCount), CachedOutputTokens: 0, // Gemini doesn't provide cached output tokens + ReasoningTokens: int(res.resp.UsageMetadata.ThoughtsTokenCount), } } - // Handle text content without using Text() to avoid warnings + // Handle text and thoughts separately so TUI can render them distinctly var textContent string + var reasoningText string for _, candidate := range res.resp.Candidates { if candidate.Content != nil { for _, part := range candidate.Content.Parts { if part.Text != "" { - textContent += part.Text + if part.Thought { + reasoningText += part.Text + } else { + textContent += part.Text + } } } } } + if reasoningText != "" { + resp.Choices[0].Delta.ReasoningContent = reasoningText + } if textContent != "" { resp.Choices[0].Delta.Content = textContent } diff --git a/pkg/model/provider/gemini/client.go b/pkg/model/provider/gemini/client.go index 232c33538..147d398bd 100644 --- a/pkg/model/provider/gemini/client.go +++ 
b/pkg/model/provider/gemini/client.go @@ -220,6 +220,30 @@ func (c *Client) buildConfig() *genai.GenerateContentConfig { if c.config.MaxTokens > 0 { config.MaxOutputTokens = int32(c.config.MaxTokens) } + + // Apply thinking budget for Gemini models using token-based configuration. + // Per official docs: https://ai.google.dev/gemini-api/docs/thinking + // - Set thinkingBudget to 0 to disable thinking + // - Set thinkingBudget to -1 for dynamic thinking (model decides) + // - Set to a specific value for a fixed token budget, + // maximum is 24576 for all models except Gemini 2.5 Pro (max 32768) + if c.config.ThinkingBudget != nil { + if config.ThinkingConfig == nil { + config.ThinkingConfig = &genai.ThinkingConfig{} + } + config.ThinkingConfig.IncludeThoughts = true + tokens := c.config.ThinkingBudget.Tokens + config.ThinkingConfig.ThinkingBudget = genai.Ptr(int32(tokens)) + + switch tokens { + case 0: + slog.Debug("Gemini request with thinking disabled", "budget_tokens", tokens) + case -1: + slog.Debug("Gemini request with dynamic thinking", "budget_tokens", tokens) + default: + slog.Debug("Gemini request using thinking_budget", "budget_tokens", tokens) + } + } return config } diff --git a/pkg/model/provider/oaistream/adapter.go b/pkg/model/provider/oaistream/adapter.go index 2edf55a25..174aab7ef 100644 --- a/pkg/model/provider/oaistream/adapter.go +++ b/pkg/model/provider/oaistream/adapter.go @@ -49,10 +49,14 @@ func (a *StreamAdapter) Recv() (chat.MessageStreamResponse, error) { OutputTokens: openaiResponse.Usage.CompletionTokens, CachedInputTokens: 0, CachedOutputTokens: 0, + ReasoningTokens: 0, } if openaiResponse.Usage.PromptTokensDetails != nil { response.Usage.CachedInputTokens = openaiResponse.Usage.PromptTokensDetails.CachedTokens } + if openaiResponse.Usage.CompletionTokensDetails != nil { + response.Usage.ReasoningTokens = openaiResponse.Usage.CompletionTokensDetails.ReasoningTokens + } // Use the tracked finish reason instead of hardcoding stop 
finishReason := a.lastFinishReason if finishReason == "" { diff --git a/pkg/runtime/runtime.go b/pkg/runtime/runtime.go index a6e9387cd..d1f5abd4f 100644 --- a/pkg/runtime/runtime.go +++ b/pkg/runtime/runtime.go @@ -481,19 +481,19 @@ func (r *runtime) handleStream(ctx context.Context, stream chat.MessageStream, a if response.Usage != nil { if m != nil { sess.Cost += (float64(response.Usage.InputTokens)*m.Cost.Input + - float64(response.Usage.OutputTokens)*m.Cost.Output + + float64(response.Usage.OutputTokens+response.Usage.ReasoningTokens)*m.Cost.Output + float64(response.Usage.CachedInputTokens)*m.Cost.CacheRead + float64(response.Usage.CachedOutputTokens)*m.Cost.CacheWrite) / 1e6 } sess.InputTokens = response.Usage.InputTokens + response.Usage.CachedInputTokens - sess.OutputTokens = response.Usage.OutputTokens + response.Usage.CachedOutputTokens + sess.OutputTokens = response.Usage.OutputTokens + response.Usage.CachedOutputTokens + response.Usage.ReasoningTokens modelName := "unknown" if m != nil { modelName = m.Name } - telemetry.RecordTokenUsage(ctx, modelName, int64(response.Usage.InputTokens), int64(response.Usage.OutputTokens), sess.Cost) + telemetry.RecordTokenUsage(ctx, modelName, int64(response.Usage.InputTokens), int64(response.Usage.OutputTokens+response.Usage.ReasoningTokens), sess.Cost) } if len(response.Choices) == 0 {