From 621e54282db7a0621c326e636c6fb70c32ac397e Mon Sep 17 00:00:00 2001
From: Sureel Bhurat
Date: Sun, 23 Nov 2025 09:21:37 -0500
Subject: [PATCH 1/4] Fix: Map finish_reason for LiteLLM streaming responses

Fixes #3665

Streaming responses from LiteLLM models (Claude, GPT, etc.) were not setting
finish_reason on aggregated LlmResponse objects, so agent runners could not
properly recognize completion states.

This fix mirrors the finish_reason mapping logic from the non-streaming path
(lines 776-784) and applies it to both streaming code paths:
- Tool call responses (lines 1340-1368)
- Text-only responses (lines 1369-1390)

Without this fix, agents using Claude or GPT via LiteLLM would encounter stop
conditions that couldn't be properly handled, leading to incomplete responses
or unexpected agent behavior.

Tested with Claude Sonnet 4.5 and GPT-5 via Azure OpenAI in a production
multi-agent system with MCP tools.
---
 src/google/adk/models/lite_llm.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/src/google/adk/models/lite_llm.py b/src/google/adk/models/lite_llm.py
index 9e3698b190..5a6b0b2e91 100644
--- a/src/google/adk/models/lite_llm.py
+++ b/src/google/adk/models/lite_llm.py
@@ -1348,6 +1348,20 @@ async def generate_content_async(
                 else None,
             )
         )
+        # FIX: Map finish_reason to FinishReason enum for streaming responses.
+        # Previously, streaming responses did not set finish_reason on aggregated
+        # LlmResponse objects, causing the ADK agent runner to not properly recognize
+        # completion states. This mirrors the logic from non-streaming path (lines 776-784)
+        # to ensure consistent behavior across both streaming and non-streaming modes.
+        # Without this, Claude and other models via LiteLLM would hit stop conditions
+        # that the agent couldn't properly handle.
+        if isinstance(finish_reason, types.FinishReason):
+          aggregated_llm_response_with_tool_call.finish_reason = finish_reason
+        else:
+          finish_reason_str = str(finish_reason).lower()
+          aggregated_llm_response_with_tool_call.finish_reason = _FINISH_REASON_MAPPING.get(
+              finish_reason_str, types.FinishReason.OTHER
+          )
         text = ""
         reasoning_parts = []
         function_calls.clear()
@@ -1362,6 +1376,20 @@ async def generate_content_async(
             if reasoning_parts
             else None,
         )
+        # FIX: Map finish_reason to FinishReason enum for streaming text-only responses.
+        # Previously, streaming responses did not set finish_reason on aggregated
+        # LlmResponse objects, causing the ADK agent runner to not properly recognize
+        # completion states. This mirrors the logic from non-streaming path (lines 776-784)
+        # to ensure consistent behavior across both streaming and non-streaming modes.
+        # Without this, Claude and other models via LiteLLM would hit stop conditions
+        # that the agent couldn't properly handle.
+        if isinstance(finish_reason, types.FinishReason):
+          aggregated_llm_response.finish_reason = finish_reason
+        else:
+          finish_reason_str = str(finish_reason).lower()
+          aggregated_llm_response.finish_reason = _FINISH_REASON_MAPPING.get(
+              finish_reason_str, types.FinishReason.OTHER
+          )
         text = ""
         reasoning_parts = []

From 40caeecab1c406ad77e093ef5307d9b3ed876ef6 Mon Sep 17 00:00:00 2001
From: Sureel Bhurat
Date: Sun, 23 Nov 2025 09:30:52 -0500
Subject: [PATCH 2/4] Fix: Prevent double JSON serialization of MCP tool
 responses

Fixes #3676

MCP tool responses arrive as JSON strings but were being double-serialized by
_safe_json_serialize(), creating triple-nested JSON that prevented Claude and
GPT from parsing tool results.
Example of the bug:
'{"content": [{"type": "text", "text": "{\n \"type\"..."}]}'

This fix adds an isinstance(str) check before serialization. If the response
is already a string (from MCP or other sources), it is used directly.
Otherwise, it is serialized normally.

Impact: Without this fix, agents using LiteLLM with MCP tools would
successfully call tools but fail to present the results to users, appearing
to hang or produce incomplete responses.

Tested with Claude Sonnet 4.5 and GPT-5 via Azure OpenAI with MCP tools
(Google Drive, HubSpot CRM) in a production multi-agent system.
---
 src/google/adk/models/lite_llm.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/google/adk/models/lite_llm.py b/src/google/adk/models/lite_llm.py
index 9e3698b190..f47485e3c2 100644
--- a/src/google/adk/models/lite_llm.py
+++ b/src/google/adk/models/lite_llm.py
@@ -367,11 +367,21 @@ def _content_to_message_param(
   tool_messages = []
   for part in content.parts:
     if part.function_response:
+      # FIX: Check if response is already a string before serializing.
+      # MCP tool responses come as JSON strings, but _safe_json_serialize was
+      # double-serializing them (json.dumps on already-JSON strings), causing
+      # triple-nested JSON like: '{"content": [{"type": "text", "text": "{\n \"type\"..."}]}'
+      # This prevented Claude/GPT from parsing tool results correctly.
+      response_content = (
+          part.function_response.response
+          if isinstance(part.function_response.response, str)
+          else _safe_json_serialize(part.function_response.response)
+      )
       tool_messages.append(
           ChatCompletionToolMessage(
               role="tool",
               tool_call_id=part.function_response.id,
-              content=_safe_json_serialize(part.function_response.response),
+              content=response_content,
           )
       )
   if tool_messages:

From 69bb67ff2d6911eb367907e274e88e5e0521aa18 Mon Sep 17 00:00:00 2001
From: Sureel Bhurat
Date: Mon, 24 Nov 2025 16:22:27 -0500
Subject: [PATCH 3/4] fix: Set content=None for tool-only messages in
 streaming to avoid duplication

- Fixes content duplication where planning/reasoning text appears twice
  (once during streaming, again in the aggregated tool-call message)
- Aligns with OpenAI/LiteLLM conventions for tool-call messages
- Planning text is preserved in thought_parts and already streamed to users
- Resolves semantic confusion where tool-call messages contained text content

Fixes #3697
---
 src/google/adk/models/lite_llm.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/google/adk/models/lite_llm.py b/src/google/adk/models/lite_llm.py
index d92007410e..c62f3d1fe1 100644
--- a/src/google/adk/models/lite_llm.py
+++ b/src/google/adk/models/lite_llm.py
@@ -1349,7 +1349,13 @@ async def generate_content_async(
           _message_to_generate_content_response(
              ChatCompletionAssistantMessage(
                  role="assistant",
-                 content=text,
+                 # FIX: Set content=None for tool-only messages to avoid duplication
+                 # and follow OpenAI/LiteLLM conventions. Planning/reasoning text is
+                 # already streamed (lines 1288-1296) and preserved in thought_parts
+                 # (line 1357). Including it again in content causes duplication and
+                 # violates API specifications for tool-call messages.
+                 # See: https://github.com/google/adk-python/issues/3697
+                 content=None,
                  tool_calls=tool_calls,
              ),
              model_version=part.model,

From 021378eca23f9eee7ad9ca4faf86c7939cf8019e Mon Sep 17 00:00:00 2001
From: Sureel Bhurat
Date: Tue, 25 Nov 2025 07:41:03 -0500
Subject: [PATCH 4/4] refactor: Extract _set_finish_reason helper to
 eliminate code duplication

Extract the finish_reason mapping logic into a reusable helper function to
address code-duplication feedback from the Gemini Code Assist review on PR
#3698.

Changes:
- Added a _set_finish_reason(response, finish_reason) helper function
- Replaced three duplicate mapping blocks with a single helper call:
  * Non-streaming path (line ~880)
  * Streaming tool-call path (line ~1387)
  * Streaming text-only path (line ~1409)
- Preserved all existing comments and behavior
- Improved maintainability: a single source of truth for the mapping logic

Addresses: https://github.com/google/adk-python/pull/3698#discussion_r18xxxxx
---
 src/google/adk/models/lite_llm.py | 43 ++++++++++++++++---------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/src/google/adk/models/lite_llm.py b/src/google/adk/models/lite_llm.py
index c62f3d1fe1..ded6f12aee 100644
--- a/src/google/adk/models/lite_llm.py
+++ b/src/google/adk/models/lite_llm.py
@@ -87,6 +87,25 @@
 )
 
 
+def _set_finish_reason(
+    response: LlmResponse, finish_reason: Any
+) -> None:
+  """Sets the finish reason on the LlmResponse, mapping from string if necessary.
+
+  Args:
+    response: The LlmResponse object to update.
+    finish_reason: The finish reason value, either a FinishReason enum or a
+      string that needs to be mapped.
+  """
+  if isinstance(finish_reason, types.FinishReason):
+    response.finish_reason = finish_reason
+  else:
+    finish_reason_str = str(finish_reason).lower()
+    response.finish_reason = _FINISH_REASON_MAPPING.get(
+        finish_reason_str, types.FinishReason.OTHER
+    )
+
+
 def _decode_inline_text_data(raw_bytes: bytes) -> str:
   """Decodes inline file bytes that represent textual content."""
   try:
@@ -861,13 +880,7 @@ def _model_response_to_generate_content_response(
   if finish_reason:
     # If LiteLLM already provides a FinishReason enum (e.g., for Gemini), use
     # it directly. Otherwise, map the finish_reason string to the enum.
-    if isinstance(finish_reason, types.FinishReason):
-      llm_response.finish_reason = finish_reason
-    else:
-      finish_reason_str = str(finish_reason).lower()
-      llm_response.finish_reason = _FINISH_REASON_MAPPING.get(
-          finish_reason_str, types.FinishReason.OTHER
-      )
+    _set_finish_reason(llm_response, finish_reason)
   if response.get("usage", None):
     llm_response.usage_metadata = types.GenerateContentResponseUsageMetadata(
         prompt_token_count=response["usage"].get("prompt_tokens", 0),
@@ -1371,13 +1384,7 @@ async def generate_content_async(
         # to ensure consistent behavior across both streaming and non-streaming modes.
         # Without this, Claude and other models via LiteLLM would hit stop conditions
         # that the agent couldn't properly handle.
-        if isinstance(finish_reason, types.FinishReason):
-          aggregated_llm_response_with_tool_call.finish_reason = finish_reason
-        else:
-          finish_reason_str = str(finish_reason).lower()
-          aggregated_llm_response_with_tool_call.finish_reason = _FINISH_REASON_MAPPING.get(
-              finish_reason_str, types.FinishReason.OTHER
-          )
+        _set_finish_reason(aggregated_llm_response_with_tool_call, finish_reason)
         text = ""
         reasoning_parts = []
         function_calls.clear()
@@ -1399,13 +1406,7 @@
         # FIX: Map finish_reason to FinishReason enum for streaming text-only responses.
         # Previously, streaming responses did not set finish_reason on aggregated
         # LlmResponse objects, causing the ADK agent runner to not properly recognize
         # completion states. This mirrors the logic from non-streaming path (lines 776-784)
         # to ensure consistent behavior across both streaming and non-streaming modes.
         # Without this, Claude and other models via LiteLLM would hit stop conditions
         # that the agent couldn't properly handle.
-        if isinstance(finish_reason, types.FinishReason):
-          aggregated_llm_response.finish_reason = finish_reason
-        else:
-          finish_reason_str = str(finish_reason).lower()
-          aggregated_llm_response.finish_reason = _FINISH_REASON_MAPPING.get(
-              finish_reason_str, types.FinishReason.OTHER
-          )
+        _set_finish_reason(aggregated_llm_response, finish_reason)
         text = ""
         reasoning_parts = []
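
Note (illustrative only, not part of the patches to apply): below is a minimal,
self-contained Python sketch of the two behaviors this series relies on — the
finish_reason mapping extracted in PATCH 4 and the string guard from PATCH 2.
The FinishReason enum values, the _FINISH_REASON_MAPPING entries, the
LlmResponse dataclass, the simplified _safe_json_serialize, and the helper
name tool_message_content are all stand-ins invented for this sketch; the real
definitions live in lite_llm.py and google.genai.types and differ in detail.

import enum
import json
from dataclasses import dataclass
from typing import Any, Optional


class FinishReason(enum.Enum):
  # Simplified stand-in for types.FinishReason (subset of values).
  STOP = "STOP"
  MAX_TOKENS = "MAX_TOKENS"
  OTHER = "OTHER"


# Simplified stand-in for the module-level mapping table in lite_llm.py;
# the real table covers more LiteLLM finish_reason strings.
_FINISH_REASON_MAPPING = {
    "stop": FinishReason.STOP,
    "tool_calls": FinishReason.STOP,
    "length": FinishReason.MAX_TOKENS,
}


@dataclass
class LlmResponse:
  # Stand-in for the ADK LlmResponse; only finish_reason matters here.
  finish_reason: Optional[FinishReason] = None


def _set_finish_reason(response: LlmResponse, finish_reason: Any) -> None:
  """Maps a LiteLLM finish_reason (enum or string) onto the response."""
  if isinstance(finish_reason, FinishReason):
    # Some providers already hand back the enum; use it directly.
    response.finish_reason = finish_reason
  else:
    # LiteLLM commonly returns strings such as "stop" or "length";
    # unrecognized values fall back to OTHER rather than staying unset.
    response.finish_reason = _FINISH_REASON_MAPPING.get(
        str(finish_reason).lower(), FinishReason.OTHER
    )


def _safe_json_serialize(obj: Any) -> str:
  # Simplified stand-in for the real helper in lite_llm.py.
  return json.dumps(obj)


def tool_message_content(response: Any) -> str:
  # The PATCH 2 guard: MCP tools already return JSON strings, and
  # serializing them again produces escaped, nested JSON that models
  # cannot parse; only non-string payloads get serialized.
  if isinstance(response, str):
    return response
  return _safe_json_serialize(response)


if __name__ == "__main__":
  resp = LlmResponse()
  _set_finish_reason(resp, "stop")
  assert resp.finish_reason is FinishReason.STOP
  _set_finish_reason(resp, "content_filter")  # not in the table
  assert resp.finish_reason is FinishReason.OTHER
  assert tool_message_content('{"ok": true}') == '{"ok": true}'
  assert tool_message_content({"ok": True}) == '{"ok": true}'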