From 50d7733705e3b276cce58721ecc122dd661fb1e0 Mon Sep 17 00:00:00 2001
From: Jinni Gu
Date: Wed, 29 Oct 2025 22:19:58 -0700
Subject: [PATCH] feat: Add transcription support for single-agent live
 scenarios

---
 src/google/adk/runners.py                     |   9 +-
 .../streaming/test_live_streaming_configs.py  | 100 ++++++++++++++++++
 2 files changed, 105 insertions(+), 4 deletions(-)

diff --git a/src/google/adk/runners.py b/src/google/adk/runners.py
index 50197dda2f..9db3f4e47c 100644
--- a/src/google/adk/runners.py
+++ b/src/google/adk/runners.py
@@ -1104,12 +1104,12 @@ def _new_invocation_context_for_live(
       live_request_queue: Optional[LiveRequestQueue] = None,
       run_config: Optional[RunConfig] = None,
   ) -> InvocationContext:
-    """Creates a new invocation context for live multi-agent."""
+    """Creates a new invocation context for live single- and multi-agent scenarios."""
     run_config = run_config or RunConfig()
 
     # For live multi-agent, we need model's text transcription as context for
-    # next agent.
-    if self.agent.sub_agents and live_request_queue:
+    # next agent. For single-agent, we need general transcription support.
+    if live_request_queue:
       if not run_config.response_modalities:
         # default
         run_config.response_modalities = ['AUDIO']
@@ -1123,7 +1123,8 @@ def _new_invocation_context_for_live(
             types.AudioTranscriptionConfig()
         )
       if not run_config.input_audio_transcription:
-        # need this input transcription for agent transferring in live mode.
+        # need this input transcription for agent transferring in multi-agent
+        # live mode and for transcription support in single-agent live mode.
         run_config.input_audio_transcription = types.AudioTranscriptionConfig()
     return self._new_invocation_context(
         session,
diff --git a/tests/unittests/streaming/test_live_streaming_configs.py b/tests/unittests/streaming/test_live_streaming_configs.py
index ecb253e09f..80560a3e78 100644
--- a/tests/unittests/streaming/test_live_streaming_configs.py
+++ b/tests/unittests/streaming/test_live_streaming_configs.py
@@ -642,3 +642,103 @@ def test_streaming_with_context_window_compression_config():
       llm_request_sent_to_mock.live_connect_config.context_window_compression.sliding_window.target_tokens
       == 500
   )
+
+
+def test_single_agent_live_streaming_with_transcription():
+  """Tests that single-agent live streaming adds transcription configs when not provided."""
+  response1 = LlmResponse(
+      turn_complete=True,
+  )
+
+  mock_model = testing_utils.MockModel.create([response1])
+
+  root_agent = Agent(
+      name='single_agent',
+      model=mock_model,
+      tools=[],
+  )
+
+  runner = testing_utils.InMemoryRunner(root_agent=root_agent)
+
+  # Run without passing any run_config to verify the default behavior: the
+  # logic in _new_invocation_context_for_live should automatically add
+  # transcription configs for live streaming.
+  live_request_queue = LiveRequestQueue()
+  live_request_queue.send_realtime(
+      blob=types.Blob(data=b'\x00\xFF', mime_type='audio/pcm')
+  )
+
+  res_events = runner.run_live(live_request_queue)
+
+  assert res_events is not None, 'Expected a list of events, got None.'
+  assert (
+      len(res_events) > 0
+  ), 'Expected at least one response, but got an empty list.'
+  assert len(mock_model.requests) == 1
+
+  # Get the request that was captured by the mock model.
+  llm_request_sent_to_mock = mock_model.requests[0]
+
+  # Assert that transcription configs were added by default.
+  assert llm_request_sent_to_mock.live_connect_config is not None
+  assert (
+      llm_request_sent_to_mock.live_connect_config.output_audio_transcription
+      is not None
+  )
+  assert (
+      llm_request_sent_to_mock.live_connect_config.input_audio_transcription
+      is not None
+  )
+
+
+def test_single_agent_live_streaming_respects_explicit_transcription():
+  """Tests that single-agent live streaming respects explicitly provided transcription configs."""
+  response1 = LlmResponse(
+      turn_complete=True,
+  )
+
+  mock_model = testing_utils.MockModel.create([response1])
+
+  # Create a single agent (no sub_agents).
+  root_agent = Agent(
+      name='single_agent',
+      model=mock_model,
+      tools=[],
+  )
+
+  runner = testing_utils.InMemoryRunner(root_agent=root_agent)
+
+  # Create a run config with explicit input and output audio transcription.
+  explicit_output_config = types.AudioTranscriptionConfig()
+  explicit_input_config = types.AudioTranscriptionConfig()
+  run_config = RunConfig(
+      output_audio_transcription=explicit_output_config,
+      input_audio_transcription=explicit_input_config,
+  )
+
+  live_request_queue = LiveRequestQueue()
+  live_request_queue.send_realtime(
+      blob=types.Blob(data=b'\x00\xFF', mime_type='audio/pcm')
+  )
+
+  res_events = runner.run_live(live_request_queue, run_config)
+
+  assert res_events is not None, 'Expected a list of events, got None.'
+  assert (
+      len(res_events) > 0
+  ), 'Expected at least one response, but got an empty list.'
+  assert len(mock_model.requests) == 1
+
+  # Get the request that was captured by the mock model.
+  llm_request_sent_to_mock = mock_model.requests[0]
+
+  # Assert that the explicit configs were used unchanged.
+  assert llm_request_sent_to_mock.live_connect_config is not None
+  assert (
+      llm_request_sent_to_mock.live_connect_config.output_audio_transcription
+      is explicit_output_config
+  )
+  assert (
+      llm_request_sent_to_mock.live_connect_config.input_audio_transcription
+      is explicit_input_config
+  )
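
Note for reviewers (illustrative, not part of the diff above): the sketch below mirrors the defaulting branch in _new_invocation_context_for_live so the single-agent behavior is visible in isolation. The helper name _apply_live_defaults is hypothetical, the import paths are assumed to follow the adk-python source layout, and only the RunConfig fields touched in the hunks above are used.

# Hypothetical helper for illustration only; it mirrors the defaulting branch
# in _new_invocation_context_for_live and is not part of the ADK API.
from typing import Optional

from google.adk.agents.run_config import RunConfig
from google.genai import types


def _apply_live_defaults(run_config: Optional[RunConfig]) -> RunConfig:
  run_config = run_config or RunConfig()
  if not run_config.response_modalities:
    # Live runs default to audio output.
    run_config.response_modalities = ['AUDIO']
  if not run_config.output_audio_transcription:
    # Model-side (output) audio is transcribed by default.
    run_config.output_audio_transcription = types.AudioTranscriptionConfig()
  if not run_config.input_audio_transcription:
    # User-side (input) audio is transcribed by default.
    run_config.input_audio_transcription = types.AudioTranscriptionConfig()
  return run_config


# After this patch these defaults apply to single-agent live runs as well,
# unless the caller passes explicit AudioTranscriptionConfig values.
cfg = _apply_live_defaults(None)
assert cfg.input_audio_transcription is not None
assert cfg.output_audio_transcription is not None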