From 8ab5c961e8b40814e5d61734daf19b0dda2e1da7 Mon Sep 17 00:00:00 2001 From: damithsenanayake Date: Thu, 2 Oct 2025 13:04:37 +1000 Subject: [PATCH 1/8] feat: include token streaming and thoughts rendering for anthropic models. --- src/google/adk/models/anthropic_llm.py | 210 +++++++++++++- tests/unittests/models/test_anthropic_llm.py | 4 +- .../models/test_anthropic_max_tokens.py | 268 ++++++++++++++++++ .../models/test_anthropic_streaming.py | 132 +++++++++ .../models/test_anthropic_thinking.py | 254 +++++++++++++++++ 5 files changed, 855 insertions(+), 13 deletions(-) create mode 100644 tests/unittests/models/test_anthropic_max_tokens.py create mode 100644 tests/unittests/models/test_anthropic_streaming.py create mode 100644 tests/unittests/models/test_anthropic_thinking.py diff --git a/src/google/adk/models/anthropic_llm.py b/src/google/adk/models/anthropic_llm.py index 6c20b1b9a5..2df020d0e8 100644 --- a/src/google/adk/models/anthropic_llm.py +++ b/src/google/adk/models/anthropic_llm.py @@ -30,6 +30,7 @@ from typing import Union from anthropic import AnthropicVertex +from anthropic import AsyncAnthropicVertex from anthropic import NOT_GIVEN from anthropic import types as anthropic_types from google.genai import types @@ -166,7 +167,83 @@ def content_block_to_part( ) part.function_call.id = content_block.id return part - raise NotImplementedError("Not supported yet.") + + # Handle thinking blocks from Anthropic extended thinking feature + # Thinking blocks have a 'thinking' attribute containing the reasoning text + if hasattr(content_block, "thinking"): + thinking_text = content_block.thinking + logger.info(f"Received thinking block ({len(thinking_text)} chars)") + # Return as Part with thought=True (standard GenAI format) + return types.Part(text=thinking_text, thought=True) + + # Alternative check: some versions may use type attribute + if ( + hasattr(content_block, "type") + and getattr(content_block, "type", None) == "thinking" + ): + thinking_text = str(content_block) + logger.info( + f"Received thinking block via type check ({len(thinking_text)} chars)" + ) + # Return as Part with thought=True (standard GenAI format) + return types.Part(text=thinking_text, thought=True) + + raise NotImplementedError( + f"Not supported yet: {type(content_block).__name__}" + ) + + +def streaming_event_to_llm_response( + event: anthropic_types.MessageStreamEvent, +) -> Optional[LlmResponse]: + """Convert Anthropic streaming events to ADK LlmResponse format. 
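+
+  Illustrative only: this helper reads just a few attributes off the event, so
+  a duck-typed stand-in is enough to show the mapping (the names below are
+  placeholders, not SDK types):
+
+    from types import SimpleNamespace
+    event = SimpleNamespace(
+        type="content_block_delta",
+        delta=SimpleNamespace(type="text_delta", text="Hello"),
+    )
+    streaming_event_to_llm_response(event)
+    # -> LlmResponse(partial=True,
+    #      content=Content(role="model", parts=[Part(text="Hello")]))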
+ + Args: + event: Anthropic streaming event + + Returns: + LlmResponse or None if event should be skipped + """ + # Handle content block deltas + if event.type == "content_block_delta": + delta = event.delta + + # Text delta + if hasattr(delta, "type") and delta.type == "text_delta": + return LlmResponse( + content=types.Content( + role="model", + parts=[types.Part.from_text(text=delta.text)], + ), + partial=True, + ) + + # Thinking delta + if hasattr(delta, "type") and delta.type == "thinking_delta": + return LlmResponse( + content=types.Content( + role="model", + parts=[types.Part(text=delta.thinking, thought=True)], + ), + partial=True, + ) + + # Handle message deltas (usage updates) + elif event.type == "message_delta": + if hasattr(event, "usage"): + return LlmResponse( + usage_metadata=types.GenerateContentResponseUsageMetadata( + prompt_token_count=getattr(event.usage, "input_tokens", 0), + candidates_token_count=getattr(event.usage, "output_tokens", 0), + total_token_count=( + getattr(event.usage, "input_tokens", 0) + + getattr(event.usage, "output_tokens", 0) + ), + ), + ) + + # Ignore start/stop events + return None def message_to_generate_content_response( @@ -283,19 +360,128 @@ async def generate_content_async( if llm_request.tools_dict else NOT_GIVEN ) - # TODO(b/421255973): Enable streaming for anthropic models. - message = self._anthropic_client.messages.create( - model=llm_request.model, - system=llm_request.config.system_instruction, - messages=messages, - tools=tools, - tool_choice=tool_choice, - max_tokens=self.max_tokens, + + # Extract and convert thinking config from ADK to Anthropic format + thinking = NOT_GIVEN + if llm_request.config and llm_request.config.thinking_config: + budget = llm_request.config.thinking_config.thinking_budget + if budget and budget != 0: + if budget == -1: + # Automatic thinking budget - use recommended default of 10000 tokens + thinking = {"type": "enabled", "budget_tokens": 1024} + logger.info( + "Extended thinking enabled (automatic budget: 10000 tokens)" + ) + elif budget > 0: + # Specific budget - enforce minimum 1024 tokens + actual_budget = max(budget, 1024) + thinking = {"type": "enabled", "budget_tokens": actual_budget} + logger.info( + f"Extended thinking enabled (budget: {actual_budget} tokens)" + ) + + # Determine if streaming should be used + use_streaming = ( + stream # From runtime context (streaming_mode == SSE) + or thinking + != NOT_GIVEN # Extended thinking requires streaming (Anthropic-specific) + or self.max_tokens + >= 8192 # Large max_tokens may exceed 10min timeout (Anthropic SDK requirement) ) - yield message_to_generate_content_response(message) + + if use_streaming: + # Use streaming mode + logger.info( + f"Using streaming mode (stream={stream}, " + f"has_thinking={thinking != NOT_GIVEN}, " + f"large_max_tokens={self.max_tokens >= 8192})" + ) + + # Accumulators for text and thinking + accumulated_text = "" + accumulated_thinking = "" + + async with self._anthropic_client.messages.stream( + model=llm_request.model, + system=llm_request.config.system_instruction, + messages=messages, + tools=tools, + tool_choice=tool_choice, + max_tokens=self.max_tokens, + thinking=thinking, + ) as anthropic_stream: + # Process streaming events + async for event in anthropic_stream: + # Convert Anthropic event to LlmResponse + if llm_response := streaming_event_to_llm_response(event): + # Track accumulated content + is_thought = False + if llm_response.content and llm_response.content.parts: + for part in llm_response.content.parts: 
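+                # Route each streamed part into the thinking or text
+                # accumulator; is_thought marks thought deltas so they are
+                # not re-yielded individually further down.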
+ if part.text: + if hasattr(part, "thought") and part.thought: + accumulated_thinking += part.text + is_thought = True + else: + accumulated_text += part.text + + # If we have accumulated thinking and now getting text, + # yield the accumulated thinking first + if accumulated_thinking and accumulated_text and not is_thought: + yield LlmResponse( + content=types.Content( + role="model", + parts=[ + types.Part(text=accumulated_thinking, thought=True) + ], + ), + partial=True, + ) + accumulated_thinking = "" # Reset after yielding + + # Yield partial response (but skip individual thought deltas) + if not is_thought: + yield llm_response + + # Get final message to extract usage metadata + final_message = await anthropic_stream.get_final_message() + + # Build final aggregated response with complete content + parts = [] + if accumulated_thinking: + parts.append(types.Part(text=accumulated_thinking, thought=True)) + if accumulated_text: + parts.append(types.Part.from_text(text=accumulated_text)) + + # Only yield final aggregated response if we have content + if parts: + yield LlmResponse( + content=types.Content(role="model", parts=parts), + usage_metadata=types.GenerateContentResponseUsageMetadata( + prompt_token_count=final_message.usage.input_tokens, + candidates_token_count=final_message.usage.output_tokens, + total_token_count=( + final_message.usage.input_tokens + + final_message.usage.output_tokens + ), + ), + ) + + else: + # Non-streaming mode (simple requests without thinking) + logger.info("Using non-streaming mode") + message = await self._anthropic_client.messages.create( + model=llm_request.model, + system=llm_request.config.system_instruction, + messages=messages, + tools=tools, + tool_choice=tool_choice, + max_tokens=self.max_tokens, + ) + yield message_to_generate_content_response(message) @cached_property - def _anthropic_client(self) -> AnthropicVertex: + def _anthropic_client(self) -> AsyncAnthropicVertex: if ( "GOOGLE_CLOUD_PROJECT" not in os.environ or "GOOGLE_CLOUD_LOCATION" not in os.environ @@ -305,7 +491,7 @@ def _anthropic_client(self) -> AnthropicVertex: " Anthropic on Vertex." ) - return AnthropicVertex( + return AsyncAnthropicVertex( project_id=os.environ["GOOGLE_CLOUD_PROJECT"], region=os.environ["GOOGLE_CLOUD_LOCATION"], ) diff --git a/tests/unittests/models/test_anthropic_llm.py b/tests/unittests/models/test_anthropic_llm.py index a81fbc7252..f892c8ddf2 100644 --- a/tests/unittests/models/test_anthropic_llm.py +++ b/tests/unittests/models/test_anthropic_llm.py @@ -293,8 +293,10 @@ async def test_function_declaration_to_tool_param( @pytest.mark.asyncio async def test_generate_content_async( - claude_llm, llm_request, generate_content_response, generate_llm_response + llm_request, generate_content_response, generate_llm_response ): + # Use max_tokens < 8192 to trigger non-streaming mode + claude_llm = Claude(model="claude-3-5-sonnet-v2@20241022", max_tokens=4096) with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: with mock.patch.object( anthropic_llm, diff --git a/tests/unittests/models/test_anthropic_max_tokens.py b/tests/unittests/models/test_anthropic_max_tokens.py new file mode 100644 index 0000000000..ccd3c66359 --- /dev/null +++ b/tests/unittests/models/test_anthropic_max_tokens.py @@ -0,0 +1,268 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for Anthropic max_tokens streaming threshold behavior.""" + +import os +from unittest import mock + +from anthropic import types as anthropic_types +from google.adk.models.anthropic_llm import Claude +from google.adk.models.llm_request import LlmRequest +from google.genai import types +import pytest + + +@pytest.fixture +def base_llm_request(): + return LlmRequest( + model="claude-opus-4-1@20250805", + contents=[ + types.Content( + role="user", parts=[types.Part.from_text(text="What is 2+2?")] + ) + ], + config=types.GenerateContentConfig( + system_instruction="You are a helpful assistant" + ), + ) + + +@pytest.mark.asyncio +async def test_max_tokens_below_threshold_uses_non_streaming(base_llm_request): + """Test that max_tokens < 8192 uses non-streaming mode.""" + claude_llm = Claude(model="claude-opus-4-1@20250805", max_tokens=4096) + + with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: + mock_client.messages.create = mock.AsyncMock( + return_value=anthropic_types.Message( + id="test", + content=[anthropic_types.TextBlock(text="4", type="text")], + model="claude-opus-4-1@20250805", + role="assistant", + stop_reason="end_turn", + type="message", + usage=anthropic_types.Usage(input_tokens=10, output_tokens=2), + ) + ) + + responses = [] + async for response in claude_llm.generate_content_async( + base_llm_request, stream=False + ): + responses.append(response) + + # Verify non-streaming mode was used + mock_client.messages.create.assert_called_once() + mock_client.messages.stream.assert_not_called() + + +@pytest.mark.asyncio +async def test_max_tokens_at_threshold_uses_streaming(base_llm_request): + """Test that max_tokens >= 8192 uses streaming mode.""" + claude_llm = Claude(model="claude-opus-4-1@20250805", max_tokens=8192) + + with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: + mock_stream = mock.AsyncMock() + mock_stream.__aenter__ = mock.AsyncMock(return_value=mock_stream) + mock_stream.__aexit__ = mock.AsyncMock() + mock_stream.__aiter__ = mock.AsyncMock(return_value=iter([])) + mock_stream.get_final_message = mock.AsyncMock( + return_value=anthropic_types.Message( + id="test", + content=[], + model="claude-opus-4-1@20250805", + role="assistant", + stop_reason="end_turn", + type="message", + usage=anthropic_types.Usage(input_tokens=10, output_tokens=2), + ) + ) + mock_client.messages.stream = mock.Mock(return_value=mock_stream) + + responses = [] + async for response in claude_llm.generate_content_async( + base_llm_request, stream=False + ): + responses.append(response) + + # Verify streaming mode was used + mock_client.messages.stream.assert_called_once() + assert not mock_client.messages.create.called + + +@pytest.mark.asyncio +async def test_max_tokens_above_threshold_uses_streaming(base_llm_request): + """Test that max_tokens > 8192 uses streaming mode.""" + claude_llm = Claude(model="claude-opus-4-1@20250805", max_tokens=16000) + + with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: + mock_stream = mock.AsyncMock() + mock_stream.__aenter__ = mock.AsyncMock(return_value=mock_stream) + 
mock_stream.__aexit__ = mock.AsyncMock() + mock_stream.__aiter__ = mock.AsyncMock(return_value=iter([])) + mock_stream.get_final_message = mock.AsyncMock( + return_value=anthropic_types.Message( + id="test", + content=[], + model="claude-opus-4-1@20250805", + role="assistant", + stop_reason="end_turn", + type="message", + usage=anthropic_types.Usage(input_tokens=10, output_tokens=2), + ) + ) + mock_client.messages.stream = mock.Mock(return_value=mock_stream) + + responses = [] + async for response in claude_llm.generate_content_async( + base_llm_request, stream=False + ): + responses.append(response) + + # Verify streaming mode was used + mock_client.messages.stream.assert_called_once() + assert not mock_client.messages.create.called + + +@pytest.mark.asyncio +async def test_stream_flag_overrides_max_tokens(base_llm_request): + """Test that stream=True forces streaming regardless of max_tokens.""" + claude_llm = Claude(model="claude-opus-4-1@20250805", max_tokens=1024) + + with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: + mock_stream = mock.AsyncMock() + mock_stream.__aenter__ = mock.AsyncMock(return_value=mock_stream) + mock_stream.__aexit__ = mock.AsyncMock() + mock_stream.__aiter__ = mock.AsyncMock(return_value=iter([])) + mock_stream.get_final_message = mock.AsyncMock( + return_value=anthropic_types.Message( + id="test", + content=[], + model="claude-opus-4-1@20250805", + role="assistant", + stop_reason="end_turn", + type="message", + usage=anthropic_types.Usage(input_tokens=10, output_tokens=2), + ) + ) + mock_client.messages.stream = mock.Mock(return_value=mock_stream) + + responses = [] + async for response in claude_llm.generate_content_async( + base_llm_request, stream=True + ): + responses.append(response) + + # Verify streaming mode was used even with low max_tokens + mock_client.messages.stream.assert_called_once() + assert not mock_client.messages.create.called + + +@pytest.mark.asyncio +async def test_thinking_enables_streaming_regardless_max_tokens( + base_llm_request, +): + """Test that thinking config enables streaming regardless of max_tokens.""" + claude_llm = Claude(model="claude-opus-4-1@20250805", max_tokens=1024) + base_llm_request.config.thinking_config = types.ThinkingConfig( + include_thoughts=True, thinking_budget=-1 + ) + + with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: + mock_stream = mock.AsyncMock() + mock_stream.__aenter__ = mock.AsyncMock(return_value=mock_stream) + mock_stream.__aexit__ = mock.AsyncMock() + mock_stream.__aiter__ = mock.AsyncMock(return_value=iter([])) + mock_stream.get_final_message = mock.AsyncMock( + return_value=anthropic_types.Message( + id="test", + content=[], + model="claude-opus-4-1@20250805", + role="assistant", + stop_reason="end_turn", + type="message", + usage=anthropic_types.Usage(input_tokens=10, output_tokens=2), + ) + ) + mock_client.messages.stream = mock.Mock(return_value=mock_stream) + + responses = [] + async for response in claude_llm.generate_content_async( + base_llm_request, stream=False + ): + responses.append(response) + + # Verify streaming mode was used due to thinking config + mock_client.messages.stream.assert_called_once() + assert not mock_client.messages.create.called + + +@pytest.mark.asyncio +async def test_streaming_decision_logic(base_llm_request): + """Test the complete streaming decision logic.""" + # Case 1: No streaming triggers + claude_llm_1 = Claude(model="claude-opus-4-1@20250805", max_tokens=4096) + + with mock.patch.object(claude_llm_1, 
"_anthropic_client") as mock_client: + mock_client.messages.create = mock.AsyncMock( + return_value=anthropic_types.Message( + id="test", + content=[anthropic_types.TextBlock(text="4", type="text")], + model="claude-opus-4-1@20250805", + role="assistant", + stop_reason="end_turn", + type="message", + usage=anthropic_types.Usage(input_tokens=10, output_tokens=2), + ) + ) + + responses = [] + async for response in claude_llm_1.generate_content_async( + base_llm_request, stream=False + ): + responses.append(response) + + # Should use non-streaming + mock_client.messages.create.assert_called_once() + + # Case 2: Large max_tokens trigger + claude_llm_2 = Claude(model="claude-opus-4-1@20250805", max_tokens=10000) + + with mock.patch.object(claude_llm_2, "_anthropic_client") as mock_client: + mock_stream = mock.AsyncMock() + mock_stream.__aenter__ = mock.AsyncMock(return_value=mock_stream) + mock_stream.__aexit__ = mock.AsyncMock() + mock_stream.__aiter__ = mock.AsyncMock(return_value=iter([])) + mock_stream.get_final_message = mock.AsyncMock( + return_value=anthropic_types.Message( + id="test", + content=[], + model="claude-opus-4-1@20250805", + role="assistant", + stop_reason="end_turn", + type="message", + usage=anthropic_types.Usage(input_tokens=10, output_tokens=2), + ) + ) + mock_client.messages.stream = mock.Mock(return_value=mock_stream) + + responses = [] + async for response in claude_llm_2.generate_content_async( + base_llm_request, stream=False + ): + responses.append(response) + + # Should use streaming + mock_client.messages.stream.assert_called_once() diff --git a/tests/unittests/models/test_anthropic_streaming.py b/tests/unittests/models/test_anthropic_streaming.py new file mode 100644 index 0000000000..0f0f05e5dc --- /dev/null +++ b/tests/unittests/models/test_anthropic_streaming.py @@ -0,0 +1,132 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for Anthropic streaming with thinking support.""" + +import os +from unittest import mock + +from anthropic import types as anthropic_types +from google.adk.models.anthropic_llm import Claude +from google.adk.models.anthropic_llm import streaming_event_to_llm_response +from google.adk.models.llm_request import LlmRequest +from google.genai import types +import pytest + + +@pytest.fixture +def claude_llm(): + return Claude(model="claude-opus-4-1@20250805", max_tokens=16000) + + +@pytest.fixture +def llm_request_with_thinking(): + return LlmRequest( + model="claude-opus-4-1@20250805", + contents=[ + types.Content( + role="user", + parts=[types.Part.from_text(text="Explain quantum computing")], + ) + ], + config=types.GenerateContentConfig( + system_instruction="You are a helpful assistant", + thinking_config=types.ThinkingConfig( + include_thoughts=True, thinking_budget=-1 + ), + ), + ) + + +def test_streaming_event_text_delta(): + """Test that text_delta events are converted correctly.""" + + class TextDelta: + type = "text_delta" + text = "Hello " + + class ContentBlockDeltaEvent: + type = "content_block_delta" + delta = TextDelta() + + event = ContentBlockDeltaEvent() + llm_response = streaming_event_to_llm_response(event) + + assert llm_response is not None + assert llm_response.partial is True + assert llm_response.content is not None + assert llm_response.content.role == "model" + assert len(llm_response.content.parts) == 1 + assert llm_response.content.parts[0].text == "Hello " + assert ( + not hasattr(llm_response.content.parts[0], "thought") + or not llm_response.content.parts[0].thought + ) + + +def test_streaming_event_thinking_delta(): + """Test that thinking_delta events are converted with thought=True.""" + + class ThinkingDelta: + type = "thinking_delta" + thinking = "Let me think... " + + class ContentBlockDeltaEvent: + type = "content_block_delta" + delta = ThinkingDelta() + + event = ContentBlockDeltaEvent() + llm_response = streaming_event_to_llm_response(event) + + assert llm_response is not None + assert llm_response.partial is True + assert llm_response.content is not None + assert llm_response.content.role == "model" + assert len(llm_response.content.parts) == 1 + assert llm_response.content.parts[0].text == "Let me think... 
" + assert hasattr(llm_response.content.parts[0], "thought") + assert llm_response.content.parts[0].thought is True + + +def test_streaming_event_message_delta_usage(): + """Test that message_delta events with usage are converted correctly.""" + + class Usage: + input_tokens = 100 + output_tokens = 50 + + class MessageDeltaEvent: + type = "message_delta" + usage = Usage() + + event = MessageDeltaEvent() + llm_response = streaming_event_to_llm_response(event) + + assert llm_response is not None + assert llm_response.usage_metadata is not None + assert llm_response.usage_metadata.prompt_token_count == 100 + assert llm_response.usage_metadata.candidates_token_count == 50 + assert llm_response.usage_metadata.total_token_count == 150 + + +def test_streaming_event_ignored(): + """Test that start/stop events are ignored.""" + + class MessageStartEvent: + type = "message_start" + + event = MessageStartEvent() + llm_response = streaming_event_to_llm_response(event) + + assert llm_response is None diff --git a/tests/unittests/models/test_anthropic_thinking.py b/tests/unittests/models/test_anthropic_thinking.py new file mode 100644 index 0000000000..19c3feedb5 --- /dev/null +++ b/tests/unittests/models/test_anthropic_thinking.py @@ -0,0 +1,254 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for Anthropic Extended Thinking configuration.""" + +import os +from unittest import mock + +from anthropic import types as anthropic_types +from google.adk.models.anthropic_llm import Claude +from google.adk.models.anthropic_llm import content_block_to_part +from google.adk.models.llm_request import LlmRequest +from google.genai import types +import pytest + + +@pytest.fixture +def claude_llm(): + return Claude(model="claude-opus-4-1@20250805", max_tokens=4096) + + +@pytest.fixture +def base_llm_request(): + return LlmRequest( + model="claude-opus-4-1@20250805", + contents=[ + types.Content( + role="user", parts=[types.Part.from_text(text="What is 2+2?")] + ) + ], + config=types.GenerateContentConfig( + system_instruction="You are a helpful assistant" + ), + ) + + +@pytest.mark.asyncio +async def test_thinking_budget_automatic(claude_llm, base_llm_request): + """Test that thinking_budget=-1 uses automatic budget (1024 tokens).""" + base_llm_request.config.thinking_config = types.ThinkingConfig( + include_thoughts=True, thinking_budget=-1 + ) + + with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: + mock_stream = mock.AsyncMock() + mock_stream.__aenter__ = mock.AsyncMock(return_value=mock_stream) + mock_stream.__aexit__ = mock.AsyncMock() + mock_stream.__aiter__ = mock.AsyncMock(return_value=iter([])) + mock_stream.get_final_message = mock.AsyncMock( + return_value=anthropic_types.Message( + id="test", + content=[], + model="claude-opus-4-1@20250805", + role="assistant", + stop_reason="end_turn", + type="message", + usage=anthropic_types.Usage(input_tokens=10, output_tokens=20), + ) + ) + mock_client.messages.stream = mock.Mock(return_value=mock_stream) + + responses = [] + async for response in claude_llm.generate_content_async( + base_llm_request, stream=False + ): + responses.append(response) + + # Verify thinking parameter was passed with 1024 tokens + mock_client.messages.stream.assert_called_once() + call_kwargs = mock_client.messages.stream.call_args.kwargs + assert call_kwargs["thinking"] == {"type": "enabled", "budget_tokens": 1024} + + +@pytest.mark.asyncio +async def test_thinking_budget_disabled(claude_llm, base_llm_request): + """Test that thinking_budget=0 disables thinking.""" + base_llm_request.config.thinking_config = types.ThinkingConfig( + include_thoughts=True, thinking_budget=0 + ) + + with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: + mock_client.messages.create = mock.AsyncMock( + return_value=anthropic_types.Message( + id="test", + content=[anthropic_types.TextBlock(text="4", type="text")], + model="claude-opus-4-1@20250805", + role="assistant", + stop_reason="end_turn", + type="message", + usage=anthropic_types.Usage(input_tokens=10, output_tokens=2), + ) + ) + + responses = [] + async for response in claude_llm.generate_content_async( + base_llm_request, stream=False + ): + responses.append(response) + + # Verify non-streaming mode was used (thinking disabled) + mock_client.messages.create.assert_called_once() + # Thinking should not be in the call + assert "thinking" not in mock_client.messages.create.call_args.kwargs + + +@pytest.mark.asyncio +async def test_thinking_budget_specific_value(claude_llm, base_llm_request): + """Test that thinking_budget with specific value is used.""" + base_llm_request.config.thinking_config = types.ThinkingConfig( + include_thoughts=True, thinking_budget=5000 + ) + + with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: + mock_stream = mock.AsyncMock() + 
mock_stream.__aenter__ = mock.AsyncMock(return_value=mock_stream) + mock_stream.__aexit__ = mock.AsyncMock() + mock_stream.__aiter__ = mock.AsyncMock(return_value=iter([])) + mock_stream.get_final_message = mock.AsyncMock( + return_value=anthropic_types.Message( + id="test", + content=[], + model="claude-opus-4-1@20250805", + role="assistant", + stop_reason="end_turn", + type="message", + usage=anthropic_types.Usage(input_tokens=10, output_tokens=20), + ) + ) + mock_client.messages.stream = mock.Mock(return_value=mock_stream) + + responses = [] + async for response in claude_llm.generate_content_async( + base_llm_request, stream=False + ): + responses.append(response) + + # Verify thinking parameter was passed with 5000 tokens + mock_client.messages.stream.assert_called_once() + call_kwargs = mock_client.messages.stream.call_args.kwargs + assert call_kwargs["thinking"] == {"type": "enabled", "budget_tokens": 5000} + + +@pytest.mark.asyncio +async def test_thinking_budget_minimum_enforced(claude_llm, base_llm_request): + """Test that thinking budget below 1024 is enforced to minimum 1024.""" + base_llm_request.config.thinking_config = types.ThinkingConfig( + include_thoughts=True, thinking_budget=500 + ) + + with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: + mock_stream = mock.AsyncMock() + mock_stream.__aenter__ = mock.AsyncMock(return_value=mock_stream) + mock_stream.__aexit__ = mock.AsyncMock() + mock_stream.__aiter__ = mock.AsyncMock(return_value=iter([])) + mock_stream.get_final_message = mock.AsyncMock( + return_value=anthropic_types.Message( + id="test", + content=[], + model="claude-opus-4-1@20250805", + role="assistant", + stop_reason="end_turn", + type="message", + usage=anthropic_types.Usage(input_tokens=10, output_tokens=20), + ) + ) + mock_client.messages.stream = mock.Mock(return_value=mock_stream) + + responses = [] + async for response in claude_llm.generate_content_async( + base_llm_request, stream=False + ): + responses.append(response) + + # Verify thinking parameter was enforced to minimum 1024 tokens + mock_client.messages.stream.assert_called_once() + call_kwargs = mock_client.messages.stream.call_args.kwargs + assert call_kwargs["thinking"] == {"type": "enabled", "budget_tokens": 1024} + + +def test_thinking_block_parsing(): + """Test that thinking blocks are parsed as Part(thought=True).""" + + # Create a mock thinking block + class ThinkingBlock: + + def __init__(self): + self.thinking = "Let me think about this step by step..." + self.type = "thinking" + + thinking_block = ThinkingBlock() + part = content_block_to_part(thinking_block) + + assert part.text == "Let me think about this step by step..." 
+ assert hasattr(part, "thought") + assert part.thought is True + + +def test_thinking_block_type_check(): + """Test alternative thinking block detection via type attribute.""" + + # Create a mock thinking block with only type attribute + class ThinkingBlockTypeOnly: + + def __init__(self): + self.type = "thinking" + + def __str__(self): + return "Thinking content here" + + thinking_block = ThinkingBlockTypeOnly() + part = content_block_to_part(thinking_block) + + assert part.text == "Thinking content here" + assert hasattr(part, "thought") + assert part.thought is True + + +@pytest.mark.asyncio +async def test_no_thinking_config(claude_llm, base_llm_request): + """Test that no thinking config results in normal operation.""" + # No thinking_config set + + with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: + mock_client.messages.create = mock.AsyncMock( + return_value=anthropic_types.Message( + id="test", + content=[anthropic_types.TextBlock(text="4", type="text")], + model="claude-opus-4-1@20250805", + role="assistant", + stop_reason="end_turn", + type="message", + usage=anthropic_types.Usage(input_tokens=10, output_tokens=2), + ) + ) + + responses = [] + async for response in claude_llm.generate_content_async( + base_llm_request, stream=False + ): + responses.append(response) + + # Verify non-streaming mode was used + mock_client.messages.create.assert_called_once() From ae8b49e8c5ba119989e6b2280bf9382a470db4b1 Mon Sep 17 00:00:00 2001 From: damithsenanayake Date: Thu, 2 Oct 2025 13:32:39 +1000 Subject: [PATCH 2/8] fix: default value fix --- src/google/adk/models/anthropic_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/google/adk/models/anthropic_llm.py b/src/google/adk/models/anthropic_llm.py index 2df020d0e8..43132f1fbd 100644 --- a/src/google/adk/models/anthropic_llm.py +++ b/src/google/adk/models/anthropic_llm.py @@ -368,7 +368,7 @@ async def generate_content_async( if budget and budget != 0: if budget == -1: # Automatic thinking budget - use recommended default of 10000 tokens - thinking = {"type": "enabled", "budget_tokens": 1024} + thinking = {"type": "enabled", "budget_tokens": 10000} logger.info( "Extended thinking enabled (automatic budget: 10000 tokens)" ) From 5b7b6ededd0f87f6b742aef54b3592301856b29a Mon Sep 17 00:00:00 2001 From: damithsenanayake Date: Thu, 2 Oct 2025 13:40:52 +1000 Subject: [PATCH 3/8] refactor: gemini code review --- src/google/adk/models/anthropic_llm.py | 28 ++++++++++++-------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/src/google/adk/models/anthropic_llm.py b/src/google/adk/models/anthropic_llm.py index 43132f1fbd..9f9bfb8bdd 100644 --- a/src/google/adk/models/anthropic_llm.py +++ b/src/google/adk/models/anthropic_llm.py @@ -209,7 +209,7 @@ def streaming_event_to_llm_response( delta = event.delta # Text delta - if hasattr(delta, "type") and delta.type == "text_delta": + if delta.type == "text_delta": return LlmResponse( content=types.Content( role="model", @@ -219,7 +219,7 @@ def streaming_event_to_llm_response( ) # Thinking delta - if hasattr(delta, "type") and delta.type == "thinking_delta": + elif delta.type == "thinking_delta": return LlmResponse( content=types.Content( role="model", @@ -231,14 +231,13 @@ def streaming_event_to_llm_response( # Handle message deltas (usage updates) elif event.type == "message_delta": if hasattr(event, "usage"): + input_tokens = getattr(event.usage, "input_tokens", 0) + output_tokens = getattr(event.usage, "output_tokens", 0) return 
LlmResponse( usage_metadata=types.GenerateContentResponseUsageMetadata( - prompt_token_count=getattr(event.usage, "input_tokens", 0), - candidates_token_count=getattr(event.usage, "output_tokens", 0), - total_token_count=( - getattr(event.usage, "input_tokens", 0) - + getattr(event.usage, "output_tokens", 0) - ), + prompt_token_count=input_tokens, + candidates_token_count=output_tokens, + total_token_count=input_tokens + output_tokens, ), ) @@ -365,7 +364,7 @@ async def generate_content_async( thinking = NOT_GIVEN if llm_request.config and llm_request.config.thinking_config: budget = llm_request.config.thinking_config.thinking_budget - if budget and budget != 0: + if budget: if budget == -1: # Automatic thinking budget - use recommended default of 10000 tokens thinking = {"type": "enabled", "budget_tokens": 10000} @@ -455,15 +454,14 @@ async def generate_content_async( # Only yield final aggregated response if we have content if parts: + input_tokens = final_message.usage.input_tokens + output_tokens = final_message.usage.output_tokens yield LlmResponse( content=types.Content(role="model", parts=parts), usage_metadata=types.GenerateContentResponseUsageMetadata( - prompt_token_count=final_message.usage.input_tokens, - candidates_token_count=final_message.usage.output_tokens, - total_token_count=( - final_message.usage.input_tokens - + final_message.usage.output_tokens - ), + prompt_token_count=input_tokens, + candidates_token_count=output_tokens, + total_token_count=input_tokens + output_tokens, ), ) From c77e5d5d7462fe9121b5da22029571b91d03d162 Mon Sep 17 00:00:00 2001 From: damithsenanayake Date: Thu, 2 Oct 2025 13:42:38 +1000 Subject: [PATCH 4/8] fix: test fix for default tokens --- tests/unittests/models/test_anthropic_thinking.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unittests/models/test_anthropic_thinking.py b/tests/unittests/models/test_anthropic_thinking.py index 19c3feedb5..fed8ae48df 100644 --- a/tests/unittests/models/test_anthropic_thinking.py +++ b/tests/unittests/models/test_anthropic_thinking.py @@ -47,7 +47,7 @@ def base_llm_request(): @pytest.mark.asyncio async def test_thinking_budget_automatic(claude_llm, base_llm_request): - """Test that thinking_budget=-1 uses automatic budget (1024 tokens).""" + """Test that thinking_budget=-1 uses automatic budget (10000 tokens).""" base_llm_request.config.thinking_config = types.ThinkingConfig( include_thoughts=True, thinking_budget=-1 ) @@ -76,10 +76,10 @@ async def test_thinking_budget_automatic(claude_llm, base_llm_request): ): responses.append(response) - # Verify thinking parameter was passed with 1024 tokens + # Verify thinking parameter was passed with 10000 tokens mock_client.messages.stream.assert_called_once() call_kwargs = mock_client.messages.stream.call_args.kwargs - assert call_kwargs["thinking"] == {"type": "enabled", "budget_tokens": 1024} + assert call_kwargs["thinking"] == {"type": "enabled", "budget_tokens": 10000} @pytest.mark.asyncio From 769e57f0b2261b15b58f72ebac2894ef437034ad Mon Sep 17 00:00:00 2001 From: damithsenanayake Date: Thu, 2 Oct 2025 22:14:51 +1000 Subject: [PATCH 5/8] wip: tool calling fixed --- src/google/adk/models/anthropic_llm.py | 97 +++++++++++++++----------- 1 file changed, 58 insertions(+), 39 deletions(-) diff --git a/src/google/adk/models/anthropic_llm.py b/src/google/adk/models/anthropic_llm.py index 9f9bfb8bdd..fd6e84303b 100644 --- a/src/google/adk/models/anthropic_llm.py +++ b/src/google/adk/models/anthropic_llm.py @@ -85,8 +85,20 @@ def 
part_to_message_block( anthropic_types.ImageBlockParam, anthropic_types.ToolUseBlockParam, anthropic_types.ToolResultBlockParam, + dict, # For thinking blocks ]: - if part.text: + # Handle thinking blocks (must check thought=True BEFORE text) + # Thinking is stored as Part(text=..., thought=True, thought_signature=...) + if part.text and hasattr(part, 'thought') and part.thought: + thinking_block = {"type": "thinking", "thinking": part.text} + if hasattr(part, 'thought_signature') and part.thought_signature: + # thought_signature is stored as bytes in Part, but API expects base64 string + thinking_block["signature"] = base64.b64encode(part.thought_signature).decode('utf-8') + logger.debug(f"Including signature with thinking block") + else: + logger.warning(f"No signature found for thinking block - this may cause API errors") + return thinking_block + elif part.text: return anthropic_types.TextBlockParam(text=part.text, type="text") elif part.function_call: assert part.function_call.name @@ -140,14 +152,26 @@ def part_to_message_block( def content_to_message_param( content: types.Content, ) -> anthropic_types.MessageParam: - message_block = [] + thinking_blocks = [] + other_blocks = [] + for part in content.parts or []: # Image data is not supported in Claude for model turns. if _is_image_part(part): logger.warning("Image data is not supported in Claude for model turns.") continue - message_block.append(part_to_message_block(part)) + block = part_to_message_block(part) + + # Separate thinking blocks from other blocks + # Anthropic requires thinking blocks to come FIRST in assistant messages + if isinstance(block, dict) and block.get("type") == "thinking": + thinking_blocks.append(block) + else: + other_blocks.append(block) + + # Thinking blocks MUST come first (Anthropic API requirement) + message_block = thinking_blocks + other_blocks return { "role": to_claude_role(content.role), @@ -172,9 +196,10 @@ def content_block_to_part( # Thinking blocks have a 'thinking' attribute containing the reasoning text if hasattr(content_block, "thinking"): thinking_text = content_block.thinking - logger.info(f"Received thinking block ({len(thinking_text)} chars)") - # Return as Part with thought=True (standard GenAI format) - return types.Part(text=thinking_text, thought=True) + signature = getattr(content_block, 'signature', None) + logger.info(f"Received thinking block ({len(thinking_text)} chars, signature={'present' if signature else 'missing'})") + # Return as Part with thought=True and preserve signature (standard GenAI format) + return types.Part(text=thinking_text, thought=True, thought_signature=signature) # Alternative check: some versions may use type attribute if ( @@ -182,11 +207,12 @@ def content_block_to_part( and getattr(content_block, "type", None) == "thinking" ): thinking_text = str(content_block) + signature = getattr(content_block, 'signature', None) logger.info( - f"Received thinking block via type check ({len(thinking_text)} chars)" + f"Received thinking block via type check ({len(thinking_text)} chars, signature={'present' if signature else 'missing'})" ) - # Return as Part with thought=True (standard GenAI format) - return types.Part(text=thinking_text, thought=True) + # Return as Part with thought=True and preserve signature (standard GenAI format) + return types.Part(text=thinking_text, thought=True, thought_signature=signature) raise NotImplementedError( f"Not supported yet: {type(content_block).__name__}" @@ -231,8 +257,8 @@ def streaming_event_to_llm_response( # Handle 
message deltas (usage updates) elif event.type == "message_delta": if hasattr(event, "usage"): - input_tokens = getattr(event.usage, "input_tokens", 0) - output_tokens = getattr(event.usage, "output_tokens", 0) + input_tokens = getattr(event.usage, "input_tokens", 0) or 0 + output_tokens = getattr(event.usage, "output_tokens", 0) or 0 return LlmResponse( usage_metadata=types.GenerateContentResponseUsageMetadata( prompt_token_count=input_tokens, @@ -362,33 +388,27 @@ async def generate_content_async( # Extract and convert thinking config from ADK to Anthropic format thinking = NOT_GIVEN + if llm_request.config and llm_request.config.thinking_config: budget = llm_request.config.thinking_config.thinking_budget if budget: if budget == -1: - # Automatic thinking budget - use recommended default of 10000 tokens - thinking = {"type": "enabled", "budget_tokens": 10000} - logger.info( - "Extended thinking enabled (automatic budget: 10000 tokens)" + raise ValueError( + "Unlimited thinking budget (-1) is not supported with Claude." ) elif budget > 0: - # Specific budget - enforce minimum 1024 tokens - actual_budget = max(budget, 1024) - thinking = {"type": "enabled", "budget_tokens": actual_budget} + + thinking = {"type": "enabled", "budget_tokens": budget} logger.info( - f"Extended thinking enabled (budget: {actual_budget} tokens)" + f"Extended thinking enabled (budget: {budget} tokens)" ) + else: + logger.warning(f"Budget not given! budget={budget}") + else: + logger.warning(f"No thinking_config found in llm_request.config") - # Determine if streaming should be used - use_streaming = ( - stream # From runtime context (streaming_mode == SSE) - or thinking - != NOT_GIVEN # Extended thinking requires streaming (Anthropic-specific) - or self.max_tokens - >= 8192 # Large max_tokens may exceed 10min timeout (Anthropic SDK requirement) - ) - if use_streaming: + if stream: # Use streaming mode logger.info( f"Using streaming mode (stream={stream}, " @@ -426,6 +446,8 @@ async def generate_content_async( # If we have accumulated thinking and now getting text, # yield the accumulated thinking first + # NOTE: This partial response is for UI display only + # The final response with signature will be yielded after the stream ends if accumulated_thinking and accumulated_text and not is_thought: yield LlmResponse( content=types.Content( @@ -442,18 +464,14 @@ async def generate_content_async( if not is_thought: yield llm_response - # Get final message to extract usage metadata + # Get final message with complete content blocks (includes signatures) final_message = await anthropic_stream.get_final_message() - # Build final aggregated response with complete content - parts = [] - if accumulated_thinking: - parts.append(types.Part(text=accumulated_thinking, thought=True)) - if accumulated_text: - parts.append(types.Part.from_text(text=accumulated_text)) - - # Only yield final aggregated response if we have content - if parts: + # Build final response from complete content blocks to preserve thinking signatures + # IMPORTANT: Use final_message.content instead of accumulated strings + # because accumulated strings don't have signatures + if final_message.content: + parts = [content_block_to_part(cb) for cb in final_message.content] input_tokens = final_message.usage.input_tokens output_tokens = final_message.usage.output_tokens yield LlmResponse( @@ -466,7 +484,7 @@ async def generate_content_async( ) else: - # Non-streaming mode (simple requests without thinking) + # Non-streaming mode logger.info("Using non-streaming 
mode") message = await self._anthropic_client.messages.create( model=llm_request.model, @@ -475,6 +493,7 @@ async def generate_content_async( tools=tools, tool_choice=tool_choice, max_tokens=self.max_tokens, + thinking=thinking, ) yield message_to_generate_content_response(message) From c949900e1e50d54480257a434965864573af853a Mon Sep 17 00:00:00 2001 From: damithsenanayake Date: Thu, 2 Oct 2025 22:27:12 +1000 Subject: [PATCH 6/8] tests: thinking config changes --- .../models/test_anthropic_max_tokens.py | 268 ------------------ .../models/test_anthropic_thinking.py | 142 ++++++---- 2 files changed, 81 insertions(+), 329 deletions(-) delete mode 100644 tests/unittests/models/test_anthropic_max_tokens.py diff --git a/tests/unittests/models/test_anthropic_max_tokens.py b/tests/unittests/models/test_anthropic_max_tokens.py deleted file mode 100644 index ccd3c66359..0000000000 --- a/tests/unittests/models/test_anthropic_max_tokens.py +++ /dev/null @@ -1,268 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tests for Anthropic max_tokens streaming threshold behavior.""" - -import os -from unittest import mock - -from anthropic import types as anthropic_types -from google.adk.models.anthropic_llm import Claude -from google.adk.models.llm_request import LlmRequest -from google.genai import types -import pytest - - -@pytest.fixture -def base_llm_request(): - return LlmRequest( - model="claude-opus-4-1@20250805", - contents=[ - types.Content( - role="user", parts=[types.Part.from_text(text="What is 2+2?")] - ) - ], - config=types.GenerateContentConfig( - system_instruction="You are a helpful assistant" - ), - ) - - -@pytest.mark.asyncio -async def test_max_tokens_below_threshold_uses_non_streaming(base_llm_request): - """Test that max_tokens < 8192 uses non-streaming mode.""" - claude_llm = Claude(model="claude-opus-4-1@20250805", max_tokens=4096) - - with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: - mock_client.messages.create = mock.AsyncMock( - return_value=anthropic_types.Message( - id="test", - content=[anthropic_types.TextBlock(text="4", type="text")], - model="claude-opus-4-1@20250805", - role="assistant", - stop_reason="end_turn", - type="message", - usage=anthropic_types.Usage(input_tokens=10, output_tokens=2), - ) - ) - - responses = [] - async for response in claude_llm.generate_content_async( - base_llm_request, stream=False - ): - responses.append(response) - - # Verify non-streaming mode was used - mock_client.messages.create.assert_called_once() - mock_client.messages.stream.assert_not_called() - - -@pytest.mark.asyncio -async def test_max_tokens_at_threshold_uses_streaming(base_llm_request): - """Test that max_tokens >= 8192 uses streaming mode.""" - claude_llm = Claude(model="claude-opus-4-1@20250805", max_tokens=8192) - - with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: - mock_stream = mock.AsyncMock() - mock_stream.__aenter__ = mock.AsyncMock(return_value=mock_stream) - mock_stream.__aexit__ = 
mock.AsyncMock() - mock_stream.__aiter__ = mock.AsyncMock(return_value=iter([])) - mock_stream.get_final_message = mock.AsyncMock( - return_value=anthropic_types.Message( - id="test", - content=[], - model="claude-opus-4-1@20250805", - role="assistant", - stop_reason="end_turn", - type="message", - usage=anthropic_types.Usage(input_tokens=10, output_tokens=2), - ) - ) - mock_client.messages.stream = mock.Mock(return_value=mock_stream) - - responses = [] - async for response in claude_llm.generate_content_async( - base_llm_request, stream=False - ): - responses.append(response) - - # Verify streaming mode was used - mock_client.messages.stream.assert_called_once() - assert not mock_client.messages.create.called - - -@pytest.mark.asyncio -async def test_max_tokens_above_threshold_uses_streaming(base_llm_request): - """Test that max_tokens > 8192 uses streaming mode.""" - claude_llm = Claude(model="claude-opus-4-1@20250805", max_tokens=16000) - - with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: - mock_stream = mock.AsyncMock() - mock_stream.__aenter__ = mock.AsyncMock(return_value=mock_stream) - mock_stream.__aexit__ = mock.AsyncMock() - mock_stream.__aiter__ = mock.AsyncMock(return_value=iter([])) - mock_stream.get_final_message = mock.AsyncMock( - return_value=anthropic_types.Message( - id="test", - content=[], - model="claude-opus-4-1@20250805", - role="assistant", - stop_reason="end_turn", - type="message", - usage=anthropic_types.Usage(input_tokens=10, output_tokens=2), - ) - ) - mock_client.messages.stream = mock.Mock(return_value=mock_stream) - - responses = [] - async for response in claude_llm.generate_content_async( - base_llm_request, stream=False - ): - responses.append(response) - - # Verify streaming mode was used - mock_client.messages.stream.assert_called_once() - assert not mock_client.messages.create.called - - -@pytest.mark.asyncio -async def test_stream_flag_overrides_max_tokens(base_llm_request): - """Test that stream=True forces streaming regardless of max_tokens.""" - claude_llm = Claude(model="claude-opus-4-1@20250805", max_tokens=1024) - - with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: - mock_stream = mock.AsyncMock() - mock_stream.__aenter__ = mock.AsyncMock(return_value=mock_stream) - mock_stream.__aexit__ = mock.AsyncMock() - mock_stream.__aiter__ = mock.AsyncMock(return_value=iter([])) - mock_stream.get_final_message = mock.AsyncMock( - return_value=anthropic_types.Message( - id="test", - content=[], - model="claude-opus-4-1@20250805", - role="assistant", - stop_reason="end_turn", - type="message", - usage=anthropic_types.Usage(input_tokens=10, output_tokens=2), - ) - ) - mock_client.messages.stream = mock.Mock(return_value=mock_stream) - - responses = [] - async for response in claude_llm.generate_content_async( - base_llm_request, stream=True - ): - responses.append(response) - - # Verify streaming mode was used even with low max_tokens - mock_client.messages.stream.assert_called_once() - assert not mock_client.messages.create.called - - -@pytest.mark.asyncio -async def test_thinking_enables_streaming_regardless_max_tokens( - base_llm_request, -): - """Test that thinking config enables streaming regardless of max_tokens.""" - claude_llm = Claude(model="claude-opus-4-1@20250805", max_tokens=1024) - base_llm_request.config.thinking_config = types.ThinkingConfig( - include_thoughts=True, thinking_budget=-1 - ) - - with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: - mock_stream = mock.AsyncMock() 
- mock_stream.__aenter__ = mock.AsyncMock(return_value=mock_stream) - mock_stream.__aexit__ = mock.AsyncMock() - mock_stream.__aiter__ = mock.AsyncMock(return_value=iter([])) - mock_stream.get_final_message = mock.AsyncMock( - return_value=anthropic_types.Message( - id="test", - content=[], - model="claude-opus-4-1@20250805", - role="assistant", - stop_reason="end_turn", - type="message", - usage=anthropic_types.Usage(input_tokens=10, output_tokens=2), - ) - ) - mock_client.messages.stream = mock.Mock(return_value=mock_stream) - - responses = [] - async for response in claude_llm.generate_content_async( - base_llm_request, stream=False - ): - responses.append(response) - - # Verify streaming mode was used due to thinking config - mock_client.messages.stream.assert_called_once() - assert not mock_client.messages.create.called - - -@pytest.mark.asyncio -async def test_streaming_decision_logic(base_llm_request): - """Test the complete streaming decision logic.""" - # Case 1: No streaming triggers - claude_llm_1 = Claude(model="claude-opus-4-1@20250805", max_tokens=4096) - - with mock.patch.object(claude_llm_1, "_anthropic_client") as mock_client: - mock_client.messages.create = mock.AsyncMock( - return_value=anthropic_types.Message( - id="test", - content=[anthropic_types.TextBlock(text="4", type="text")], - model="claude-opus-4-1@20250805", - role="assistant", - stop_reason="end_turn", - type="message", - usage=anthropic_types.Usage(input_tokens=10, output_tokens=2), - ) - ) - - responses = [] - async for response in claude_llm_1.generate_content_async( - base_llm_request, stream=False - ): - responses.append(response) - - # Should use non-streaming - mock_client.messages.create.assert_called_once() - - # Case 2: Large max_tokens trigger - claude_llm_2 = Claude(model="claude-opus-4-1@20250805", max_tokens=10000) - - with mock.patch.object(claude_llm_2, "_anthropic_client") as mock_client: - mock_stream = mock.AsyncMock() - mock_stream.__aenter__ = mock.AsyncMock(return_value=mock_stream) - mock_stream.__aexit__ = mock.AsyncMock() - mock_stream.__aiter__ = mock.AsyncMock(return_value=iter([])) - mock_stream.get_final_message = mock.AsyncMock( - return_value=anthropic_types.Message( - id="test", - content=[], - model="claude-opus-4-1@20250805", - role="assistant", - stop_reason="end_turn", - type="message", - usage=anthropic_types.Usage(input_tokens=10, output_tokens=2), - ) - ) - mock_client.messages.stream = mock.Mock(return_value=mock_stream) - - responses = [] - async for response in claude_llm_2.generate_content_async( - base_llm_request, stream=False - ): - responses.append(response) - - # Should use streaming - mock_client.messages.stream.assert_called_once() diff --git a/tests/unittests/models/test_anthropic_thinking.py b/tests/unittests/models/test_anthropic_thinking.py index fed8ae48df..ba839d9123 100644 --- a/tests/unittests/models/test_anthropic_thinking.py +++ b/tests/unittests/models/test_anthropic_thinking.py @@ -17,6 +17,7 @@ import os from unittest import mock +from anthropic import NOT_GIVEN from anthropic import types as anthropic_types from google.adk.models.anthropic_llm import Claude from google.adk.models.anthropic_llm import content_block_to_part @@ -46,40 +47,22 @@ def base_llm_request(): @pytest.mark.asyncio -async def test_thinking_budget_automatic(claude_llm, base_llm_request): - """Test that thinking_budget=-1 uses automatic budget (10000 tokens).""" +async def test_thinking_budget_automatic_raises_error( + claude_llm, base_llm_request +): + """Test that 
thinking_budget=-1 raises ValueError.""" base_llm_request.config.thinking_config = types.ThinkingConfig( include_thoughts=True, thinking_budget=-1 ) - with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: - mock_stream = mock.AsyncMock() - mock_stream.__aenter__ = mock.AsyncMock(return_value=mock_stream) - mock_stream.__aexit__ = mock.AsyncMock() - mock_stream.__aiter__ = mock.AsyncMock(return_value=iter([])) - mock_stream.get_final_message = mock.AsyncMock( - return_value=anthropic_types.Message( - id="test", - content=[], - model="claude-opus-4-1@20250805", - role="assistant", - stop_reason="end_turn", - type="message", - usage=anthropic_types.Usage(input_tokens=10, output_tokens=20), - ) - ) - mock_client.messages.stream = mock.Mock(return_value=mock_stream) - - responses = [] + with pytest.raises( + ValueError, + match="Unlimited thinking budget \\(-1\\) is not supported with Claude", + ): async for response in claude_llm.generate_content_async( base_llm_request, stream=False ): - responses.append(response) - - # Verify thinking parameter was passed with 10000 tokens - mock_client.messages.stream.assert_called_once() - call_kwargs = mock_client.messages.stream.call_args.kwargs - assert call_kwargs["thinking"] == {"type": "enabled", "budget_tokens": 10000} + pass @pytest.mark.asyncio @@ -110,8 +93,9 @@ async def test_thinking_budget_disabled(claude_llm, base_llm_request): # Verify non-streaming mode was used (thinking disabled) mock_client.messages.create.assert_called_once() - # Thinking should not be in the call - assert "thinking" not in mock_client.messages.create.call_args.kwargs + # Thinking should be NOT_GIVEN when disabled + call_kwargs = mock_client.messages.create.call_args.kwargs + assert call_kwargs.get("thinking") == NOT_GIVEN @pytest.mark.asyncio @@ -122,11 +106,7 @@ async def test_thinking_budget_specific_value(claude_llm, base_llm_request): ) with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: - mock_stream = mock.AsyncMock() - mock_stream.__aenter__ = mock.AsyncMock(return_value=mock_stream) - mock_stream.__aexit__ = mock.AsyncMock() - mock_stream.__aiter__ = mock.AsyncMock(return_value=iter([])) - mock_stream.get_final_message = mock.AsyncMock( + mock_client.messages.create = mock.AsyncMock( return_value=anthropic_types.Message( id="test", content=[], @@ -137,7 +117,6 @@ async def test_thinking_budget_specific_value(claude_llm, base_llm_request): usage=anthropic_types.Usage(input_tokens=10, output_tokens=20), ) ) - mock_client.messages.stream = mock.Mock(return_value=mock_stream) responses = [] async for response in claude_llm.generate_content_async( @@ -146,24 +125,20 @@ async def test_thinking_budget_specific_value(claude_llm, base_llm_request): responses.append(response) # Verify thinking parameter was passed with 5000 tokens - mock_client.messages.stream.assert_called_once() - call_kwargs = mock_client.messages.stream.call_args.kwargs + mock_client.messages.create.assert_called_once() + call_kwargs = mock_client.messages.create.call_args.kwargs assert call_kwargs["thinking"] == {"type": "enabled", "budget_tokens": 5000} @pytest.mark.asyncio -async def test_thinking_budget_minimum_enforced(claude_llm, base_llm_request): - """Test that thinking budget below 1024 is enforced to minimum 1024.""" +async def test_thinking_budget_minimum_value(claude_llm, base_llm_request): + """Test that thinking budget of 1024 tokens (minimum) works.""" base_llm_request.config.thinking_config = types.ThinkingConfig( - include_thoughts=True, 
thinking_budget=500 + include_thoughts=True, thinking_budget=1024 ) with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: - mock_stream = mock.AsyncMock() - mock_stream.__aenter__ = mock.AsyncMock(return_value=mock_stream) - mock_stream.__aexit__ = mock.AsyncMock() - mock_stream.__aiter__ = mock.AsyncMock(return_value=iter([])) - mock_stream.get_final_message = mock.AsyncMock( + mock_client.messages.create = mock.AsyncMock( return_value=anthropic_types.Message( id="test", content=[], @@ -174,7 +149,6 @@ async def test_thinking_budget_minimum_enforced(claude_llm, base_llm_request): usage=anthropic_types.Usage(input_tokens=10, output_tokens=20), ) ) - mock_client.messages.stream = mock.Mock(return_value=mock_stream) responses = [] async for response in claude_llm.generate_content_async( @@ -182,53 +156,60 @@ async def test_thinking_budget_minimum_enforced(claude_llm, base_llm_request): ): responses.append(response) - # Verify thinking parameter was enforced to minimum 1024 tokens - mock_client.messages.stream.assert_called_once() - call_kwargs = mock_client.messages.stream.call_args.kwargs + # Verify thinking parameter was passed with 1024 tokens + mock_client.messages.create.assert_called_once() + call_kwargs = mock_client.messages.create.call_args.kwargs assert call_kwargs["thinking"] == {"type": "enabled", "budget_tokens": 1024} def test_thinking_block_parsing(): """Test that thinking blocks are parsed as Part(thought=True).""" + import base64 - # Create a mock thinking block + # Create a mock thinking block with 'thinking' attribute class ThinkingBlock: def __init__(self): self.thinking = "Let me think about this step by step..." - self.type = "thinking" + # Signature must be base64 encoded + self.signature = base64.b64encode(b"mock_signature_123").decode("utf-8") thinking_block = ThinkingBlock() part = content_block_to_part(thinking_block) assert part.text == "Let me think about this step by step..." 
- assert hasattr(part, "thought") assert part.thought is True + # Signature is stored as bytes (decoded from base64 input) + assert part.thought_signature == b"mock_signature_123" def test_thinking_block_type_check(): - """Test alternative thinking block detection via type attribute.""" + """Test that thinking blocks with type attribute are parsed correctly.""" + import base64 - # Create a mock thinking block with only type attribute - class ThinkingBlockTypeOnly: + # Create a mock thinking block with 'type' attribute + class ThinkingBlockWithType: def __init__(self): self.type = "thinking" + # Signature must be base64 encoded + self.signature = base64.b64encode(b"mock_signature_456").decode("utf-8") def __str__(self): - return "Thinking content here" + return "Thinking content via type check" - thinking_block = ThinkingBlockTypeOnly() + thinking_block = ThinkingBlockWithType() part = content_block_to_part(thinking_block) - assert part.text == "Thinking content here" - assert hasattr(part, "thought") + assert part.text == "Thinking content via type check" assert part.thought is True + # Signature is stored as bytes (decoded from base64 input) + assert part.thought_signature == b"mock_signature_456" @pytest.mark.asyncio async def test_no_thinking_config(claude_llm, base_llm_request): - """Test that no thinking config results in normal operation.""" + """Test that requests without thinking_config work normally.""" # No thinking_config set with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: @@ -250,5 +231,44 @@ async def test_no_thinking_config(claude_llm, base_llm_request): ): responses.append(response) - # Verify non-streaming mode was used + # Verify non-streaming mode was used without thinking mock_client.messages.create.assert_called_once() + call_kwargs = mock_client.messages.create.call_args.kwargs + assert call_kwargs.get("thinking") == NOT_GIVEN + + +@pytest.mark.asyncio +async def test_thinking_with_streaming(claude_llm, base_llm_request): + """Test that thinking works in streaming mode.""" + base_llm_request.config.thinking_config = types.ThinkingConfig( + include_thoughts=True, thinking_budget=2048 + ) + + with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: + mock_stream = mock.AsyncMock() + mock_stream.__aenter__ = mock.AsyncMock(return_value=mock_stream) + mock_stream.__aexit__ = mock.AsyncMock() + mock_stream.__aiter__ = mock.AsyncMock(return_value=iter([])) + mock_stream.get_final_message = mock.AsyncMock( + return_value=anthropic_types.Message( + id="test", + content=[], + model="claude-opus-4-1@20250805", + role="assistant", + stop_reason="end_turn", + type="message", + usage=anthropic_types.Usage(input_tokens=10, output_tokens=20), + ) + ) + mock_client.messages.stream = mock.Mock(return_value=mock_stream) + + responses = [] + async for response in claude_llm.generate_content_async( + base_llm_request, stream=True + ): + responses.append(response) + + # Verify streaming mode was used with thinking + mock_client.messages.stream.assert_called_once() + call_kwargs = mock_client.messages.stream.call_args.kwargs + assert call_kwargs["thinking"] == {"type": "enabled", "budget_tokens": 2048} From 5f321fddefe5d2a549152a80cebfd84865acb11f Mon Sep 17 00:00:00 2001 From: damithsenanayake Date: Fri, 3 Oct 2025 06:37:27 +1000 Subject: [PATCH 7/8] feat: interleaved thinking with tools --- src/google/adk/models/anthropic_llm.py | 10 + .../models/test_anthropic_thinking.py | 197 ++++++++++++++++++ 2 files changed, 207 insertions(+) diff --git 
a/src/google/adk/models/anthropic_llm.py b/src/google/adk/models/anthropic_llm.py index fd6e84303b..06590fa8d7 100644 --- a/src/google/adk/models/anthropic_llm.py +++ b/src/google/adk/models/anthropic_llm.py @@ -352,10 +352,12 @@ class Claude(BaseLlm): Attributes: model: The name of the Claude model. max_tokens: The maximum number of tokens to generate. + enable_interleaved_thinking: Enable interleaved thinking for tool use (beta). """ model: str = "claude-3-5-sonnet-v2@20241022" max_tokens: int = 8192 + enable_interleaved_thinking: bool = False @classmethod @override @@ -407,6 +409,12 @@ async def generate_content_async( else: logger.warning(f"No thinking_config found in llm_request.config") + # Configure beta header for interleaved thinking + extra_headers = ( + {"anthropic-beta": "interleaved-thinking-2025-05-14"} + if self.enable_interleaved_thinking and thinking != NOT_GIVEN + else NOT_GIVEN + ) if stream: # Use streaming mode @@ -428,6 +436,7 @@ async def generate_content_async( tool_choice=tool_choice, max_tokens=self.max_tokens, thinking=thinking, + extra_headers=extra_headers, ) as anthropic_stream: # Process streaming events async for event in anthropic_stream: @@ -494,6 +503,7 @@ async def generate_content_async( tool_choice=tool_choice, max_tokens=self.max_tokens, thinking=thinking, + extra_headers=extra_headers, ) yield message_to_generate_content_response(message) diff --git a/tests/unittests/models/test_anthropic_thinking.py b/tests/unittests/models/test_anthropic_thinking.py index ba839d9123..6d3505d18c 100644 --- a/tests/unittests/models/test_anthropic_thinking.py +++ b/tests/unittests/models/test_anthropic_thinking.py @@ -272,3 +272,200 @@ async def test_thinking_with_streaming(claude_llm, base_llm_request): mock_client.messages.stream.assert_called_once() call_kwargs = mock_client.messages.stream.call_args.kwargs assert call_kwargs["thinking"] == {"type": "enabled", "budget_tokens": 2048} + + +@pytest.mark.asyncio +async def test_interleaved_thinking_disabled_by_default( + claude_llm, base_llm_request +): + """Test that beta header is NOT sent when interleaved thinking is disabled.""" + base_llm_request.config.thinking_config = types.ThinkingConfig( + include_thoughts=True, thinking_budget=2048 + ) + + with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: + mock_client.messages.create = mock.AsyncMock( + return_value=anthropic_types.Message( + id="test", + content=[], + model="claude-opus-4-1@20250805", + role="assistant", + stop_reason="end_turn", + type="message", + usage=anthropic_types.Usage(input_tokens=10, output_tokens=20), + ) + ) + + responses = [] + async for response in claude_llm.generate_content_async( + base_llm_request, stream=False + ): + responses.append(response) + + # Verify beta header is NOT_GIVEN (default behavior) + mock_client.messages.create.assert_called_once() + call_kwargs = mock_client.messages.create.call_args.kwargs + assert call_kwargs.get("extra_headers") == NOT_GIVEN + + +@pytest.mark.asyncio +async def test_interleaved_thinking_streaming(base_llm_request): + """Test that beta header is sent in streaming mode when enabled.""" + # Create Claude with interleaved thinking enabled + claude_llm = Claude( + model="claude-opus-4-1@20250805", + max_tokens=4096, + enable_interleaved_thinking=True, + ) + + base_llm_request.config.thinking_config = types.ThinkingConfig( + include_thoughts=True, thinking_budget=2048 + ) + + with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: + mock_stream = mock.AsyncMock() + 
mock_stream.__aenter__ = mock.AsyncMock(return_value=mock_stream) + mock_stream.__aexit__ = mock.AsyncMock() + mock_stream.__aiter__ = mock.AsyncMock(return_value=iter([])) + mock_stream.get_final_message = mock.AsyncMock( + return_value=anthropic_types.Message( + id="test", + content=[], + model="claude-opus-4-1@20250805", + role="assistant", + stop_reason="end_turn", + type="message", + usage=anthropic_types.Usage(input_tokens=10, output_tokens=20), + ) + ) + mock_client.messages.stream = mock.Mock(return_value=mock_stream) + + responses = [] + async for response in claude_llm.generate_content_async( + base_llm_request, stream=True + ): + responses.append(response) + + # Verify beta header was sent + mock_client.messages.stream.assert_called_once() + call_kwargs = mock_client.messages.stream.call_args.kwargs + assert call_kwargs["extra_headers"] == { + "anthropic-beta": "interleaved-thinking-2025-05-14" + } + + +@pytest.mark.asyncio +async def test_interleaved_thinking_non_streaming(base_llm_request): + """Test that beta header is sent in non-streaming mode when enabled.""" + # Create Claude with interleaved thinking enabled + claude_llm = Claude( + model="claude-opus-4-1@20250805", + max_tokens=4096, + enable_interleaved_thinking=True, + ) + + base_llm_request.config.thinking_config = types.ThinkingConfig( + include_thoughts=True, thinking_budget=2048 + ) + + with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: + mock_client.messages.create = mock.AsyncMock( + return_value=anthropic_types.Message( + id="test", + content=[], + model="claude-opus-4-1@20250805", + role="assistant", + stop_reason="end_turn", + type="message", + usage=anthropic_types.Usage(input_tokens=10, output_tokens=20), + ) + ) + + responses = [] + async for response in claude_llm.generate_content_async( + base_llm_request, stream=False + ): + responses.append(response) + + # Verify beta header was sent + mock_client.messages.create.assert_called_once() + call_kwargs = mock_client.messages.create.call_args.kwargs + assert call_kwargs["extra_headers"] == { + "anthropic-beta": "interleaved-thinking-2025-05-14" + } + + +@pytest.mark.asyncio +async def test_interleaved_thinking_requires_thinking_config(base_llm_request): + """Test that beta header is NOT sent when thinking is disabled.""" + # Create Claude with interleaved thinking enabled + claude_llm = Claude( + model="claude-opus-4-1@20250805", + max_tokens=4096, + enable_interleaved_thinking=True, + ) + + # No thinking_config set - thinking disabled + base_llm_request.config.thinking_config = types.ThinkingConfig( + include_thoughts=True, thinking_budget=0 # Disabled + ) + + with mock.patch.object(claude_llm, "_anthropic_client") as mock_client: + mock_client.messages.create = mock.AsyncMock( + return_value=anthropic_types.Message( + id="test", + content=[anthropic_types.TextBlock(text="4", type="text")], + model="claude-opus-4-1@20250805", + role="assistant", + stop_reason="end_turn", + type="message", + usage=anthropic_types.Usage(input_tokens=10, output_tokens=2), + ) + ) + + responses = [] + async for response in claude_llm.generate_content_async( + base_llm_request, stream=False + ): + responses.append(response) + + # Verify beta header is NOT_GIVEN when thinking is disabled + mock_client.messages.create.assert_called_once() + call_kwargs = mock_client.messages.create.call_args.kwargs + assert call_kwargs.get("extra_headers") == NOT_GIVEN + + +@pytest.mark.asyncio +async def test_thinking_blocks_preserved_in_assistant_messages(base_llm_request): 
+ """Test that thinking blocks from previous assistant turn are preserved.""" + from google.adk.models.anthropic_llm import content_to_message_param + + # Create content with thinking block and tool use + content = types.Content( + role="model", + parts=[ + types.Part( + text="Let me calculate this step by step...", thought=True + ), + types.Part.from_function_call( + name="calculator", args={"expression": "2+2"} + ), + ], + ) + + message_param = content_to_message_param(content) + + # Verify message structure + assert message_param["role"] == "assistant" + assert len(message_param["content"]) == 2 + + # Verify thinking block comes FIRST + assert message_param["content"][0]["type"] == "thinking" + assert ( + message_param["content"][0]["thinking"] + == "Let me calculate this step by step..." + ) + + # Verify tool use comes after + assert message_param["content"][1]["type"] == "tool_use" + assert message_param["content"][1]["name"] == "calculator" From bb1b86bd1417bfdd57bf3fede7fa80b1c9ea8243 Mon Sep 17 00:00:00 2001 From: damithsenanayake Date: Fri, 3 Oct 2025 08:58:32 +1000 Subject: [PATCH 8/8] refactor: pass header directly --- src/google/adk/models/anthropic_llm.py | 12 +++------ .../models/test_anthropic_thinking.py | 26 ++++++++++--------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/src/google/adk/models/anthropic_llm.py b/src/google/adk/models/anthropic_llm.py index 06590fa8d7..a598aa567c 100644 --- a/src/google/adk/models/anthropic_llm.py +++ b/src/google/adk/models/anthropic_llm.py @@ -352,12 +352,12 @@ class Claude(BaseLlm): Attributes: model: The name of the Claude model. max_tokens: The maximum number of tokens to generate. - enable_interleaved_thinking: Enable interleaved thinking for tool use (beta). + extra_headers: Optional extra headers to pass to the Anthropic API. 
""" model: str = "claude-3-5-sonnet-v2@20241022" max_tokens: int = 8192 - enable_interleaved_thinking: bool = False + extra_headers: Optional[dict[str, str]] = None @classmethod @override @@ -409,12 +409,8 @@ async def generate_content_async( else: logger.warning(f"No thinking_config found in llm_request.config") - # Configure beta header for interleaved thinking - extra_headers = ( - {"anthropic-beta": "interleaved-thinking-2025-05-14"} - if self.enable_interleaved_thinking and thinking != NOT_GIVEN - else NOT_GIVEN - ) + # Use extra headers if provided + extra_headers = self.extra_headers or NOT_GIVEN if stream: # Use streaming mode diff --git a/tests/unittests/models/test_anthropic_thinking.py b/tests/unittests/models/test_anthropic_thinking.py index 6d3505d18c..dbe3415f1b 100644 --- a/tests/unittests/models/test_anthropic_thinking.py +++ b/tests/unittests/models/test_anthropic_thinking.py @@ -310,12 +310,12 @@ async def test_interleaved_thinking_disabled_by_default( @pytest.mark.asyncio async def test_interleaved_thinking_streaming(base_llm_request): - """Test that beta header is sent in streaming mode when enabled.""" - # Create Claude with interleaved thinking enabled + """Test that beta header is sent in streaming mode when provided.""" + # Create Claude with beta headers claude_llm = Claude( model="claude-opus-4-1@20250805", max_tokens=4096, - enable_interleaved_thinking=True, + extra_headers={"anthropic-beta": "interleaved-thinking-2025-05-14"}, ) base_llm_request.config.thinking_config = types.ThinkingConfig( @@ -356,12 +356,12 @@ async def test_interleaved_thinking_streaming(base_llm_request): @pytest.mark.asyncio async def test_interleaved_thinking_non_streaming(base_llm_request): - """Test that beta header is sent in non-streaming mode when enabled.""" - # Create Claude with interleaved thinking enabled + """Test that beta header is sent in non-streaming mode when provided.""" + # Create Claude with beta headers claude_llm = Claude( model="claude-opus-4-1@20250805", max_tokens=4096, - enable_interleaved_thinking=True, + extra_headers={"anthropic-beta": "interleaved-thinking-2025-05-14"}, ) base_llm_request.config.thinking_config = types.ThinkingConfig( @@ -396,13 +396,13 @@ async def test_interleaved_thinking_non_streaming(base_llm_request): @pytest.mark.asyncio -async def test_interleaved_thinking_requires_thinking_config(base_llm_request): - """Test that beta header is NOT sent when thinking is disabled.""" - # Create Claude with interleaved thinking enabled +async def test_extra_headers_sent_regardless_of_thinking(base_llm_request): + """Test that extra headers are sent even when thinking is disabled.""" + # Create Claude with beta headers claude_llm = Claude( model="claude-opus-4-1@20250805", max_tokens=4096, - enable_interleaved_thinking=True, + extra_headers={"anthropic-beta": "interleaved-thinking-2025-05-14"}, ) # No thinking_config set - thinking disabled @@ -429,10 +429,12 @@ async def test_interleaved_thinking_requires_thinking_config(base_llm_request): ): responses.append(response) - # Verify beta header is NOT_GIVEN when thinking is disabled + # Verify beta header is sent even when thinking is disabled mock_client.messages.create.assert_called_once() call_kwargs = mock_client.messages.create.call_args.kwargs - assert call_kwargs.get("extra_headers") == NOT_GIVEN + assert call_kwargs.get("extra_headers") == { + "anthropic-beta": "interleaved-thinking-2025-05-14" + } @pytest.mark.asyncio