Skip to content

Commit b8be90d

Browse files
Mateusz Krawiec and copybara-github
authored and committed
fix: preserve non-text output in streaming responses
The streaming response aggregator only accumulated text and function-call parts, so any other part (an image returned as inline data, server-side tool calls/responses, etc.) was dropped from the final aggregated response when text accompanied it. Mirror ADK Python and preserve any part that is not text or a function call, keeping the existing exclusion of standalone thought-signature parts.

PiperOrigin-RevId: 942147787
1 parent 6dd4594 commit b8be90d

2 files changed

Lines changed: 154 additions & 10 deletions

File tree

core/src/main/java/com/google/adk/models/Gemini.java

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -356,18 +356,24 @@ private static String generateClientFunctionCallId() {
356356
}
357357

358358
/**
359-
* Accumulates text and function calls from incoming parts. Function-call parts passed to this
360-
* method are expected to already have IDs (see {@link #ensureFunctionCallIds}).
359+
* Accumulates content from incoming parts: text, function calls, and any other content part
360+
* (inline image/audio data, file data, code execution, server-side tool calls/responses, and
361+
* future part types). Standalone thought-signature/thought parts are the one exception: their
362+
* signature is captured and re-attached to the last real part in {@link #processFinalResponse},
363+
* so they are not emitted on their own. Function-call parts passed to this method are expected
364+
* to already have IDs (see {@link #ensureFunctionCallIds}).
361365
*
362-
* @return true if any text or function call was present, false otherwise.
366+
* @return true if any content part was present, false otherwise.
363367
*/
364368
private boolean accumulateParts(List<Part> parts) {
365-
boolean hasTextOrFc = false;
369+
boolean hasContent = false;
366370
for (Part part : parts) {
367-
part.thoughtSignature().ifPresent(sig -> currentThoughtSignature = sig);
368371
String text = part.text().orElse("");
369372
if (!text.isEmpty()) {
370-
hasTextOrFc = true;
373+
hasContent = true;
374+
// The signature belongs to this text; capture it so flushTextBufferToSequence attaches
375+
// it.
376+
part.thoughtSignature().ifPresent(sig -> currentThoughtSignature = sig);
371377
boolean isThought = part.thought().orElse(false);
372378
// Immediately flush the active text buffer to preserve the exact interleaved blocks of
373379
// text/thoughts.
@@ -378,13 +384,28 @@ private boolean accumulateParts(List<Part> parts) {
378384
currentTextIsThought = isThought;
379385
}
380386
currentTextBuffer.append(text);
381-
}
382-
if (part.functionCall().isPresent()) {
383-
hasTextOrFc = true;
387+
} else if (part.functionCall().isPresent()) {
388+
hasContent = true;
384389
processFunctionCallPart(part);
390+
} else if (part.text().isEmpty() && !part.thought().orElse(false)) {
391+
// Mirror ADK Python's catch-all: preserve any part that is not text or a function call
392+
// (inline image/audio data, file data, code execution, server-side tool calls/responses,
393+
// future part types) rather than an allowlist that silently drops unlisted types. Flush
394+
// buffered text first so parts keep their order, then append the part verbatim keeping
395+
// any
396+
// thoughtSignature it carries. The signature is intentionally not captured into
397+
// currentThoughtSignature, which would leak it onto the preceding part.
398+
hasContent = true;
399+
flushTextBufferToSequence();
400+
accumulatedSequence.add(part);
401+
} else {
402+
// Standalone thought/thought-signature part with no renderable content: not emitted on
403+
// its
404+
// own; capture its signature to re-attach to the last real part in processFinalResponse.
405+
part.thoughtSignature().ifPresent(sig -> currentThoughtSignature = sig);
385406
}
386407
}
387-
return hasTextOrFc;
408+
return hasContent;
388409
}
389410

390411
/**

core/src/test/java/com/google/adk/models/GeminiTest.java

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import com.google.common.collect.ImmutableList;
2222
import com.google.common.collect.ImmutableMap;
2323
import com.google.common.collect.Iterables;
24+
import com.google.genai.types.Blob;
2425
import com.google.genai.types.Candidate;
2526
import com.google.genai.types.Content;
2627
import com.google.genai.types.FinishReason;
@@ -29,6 +30,8 @@
2930
import com.google.genai.types.GenerateContentResponseUsageMetadata;
3031
import com.google.genai.types.Part;
3132
import com.google.genai.types.PartialArg;
33+
import com.google.genai.types.ToolCall;
34+
import com.google.genai.types.ToolResponse;
3235
import io.reactivex.rxjava3.core.Flowable;
3336
import io.reactivex.rxjava3.functions.Predicate;
3437
import io.reactivex.rxjava3.subscribers.TestSubscriber;
@@ -368,6 +371,126 @@ public void processRawResponses_twoStreamingFunctionCalls_keepArgsSeparate() {
368371
assertThat(second.args().get()).containsExactly("b", "2");
369372
}
370373

374+
@Test
375+
public void processRawResponses_imageOnlyWithStop_emitsFinalImagePart() {
376+
Part imagePart = Part.fromBytes(new byte[] {1, 2, 3}, "image/png");
377+
GenerateContentResponse imageWithStop =
378+
toResponse(
379+
Candidate.builder()
380+
.content(Content.builder().role("model").parts(imagePart).build())
381+
.finishReason(new FinishReason(FinishReason.Known.STOP))
382+
.build());
383+
384+
ImmutableList<LlmResponse> responses =
385+
ImmutableList.copyOf(
386+
Gemini.processRawResponses(Flowable.just(imageWithStop)).blockingIterable());
387+
388+
LlmResponse finalResponse = Iterables.getLast(responses);
389+
assertThat(finalResponse.content().get().parts().get()).hasSize(1);
390+
assertThat(finalResponse.content().get().parts().get().get(0).inlineData()).isPresent();
391+
}
392+
393+
// Image-generation models return image bytes as an inline-data part, often alongside text.
394+
// Regression test: the aggregated response must retain the image, not just the text.
395+
@Test
396+
public void processRawResponses_textThenImageWithStop_finalKeepsTextAndImage() {
397+
Part imagePart = Part.fromBytes(new byte[] {1, 2, 3}, "image/png");
398+
GenerateContentResponse textChunk = toResponseWithText("Here is your image:");
399+
GenerateContentResponse imageWithStop =
400+
toResponse(
401+
Candidate.builder()
402+
.content(Content.builder().role("model").parts(imagePart).build())
403+
.finishReason(new FinishReason(FinishReason.Known.STOP))
404+
.build());
405+
406+
ImmutableList<LlmResponse> responses =
407+
ImmutableList.copyOf(
408+
Gemini.processRawResponses(Flowable.just(textChunk, imageWithStop)).blockingIterable());
409+
410+
LlmResponse finalResponse = Iterables.getLast(responses);
411+
assertThat(finalResponse.content().get().parts().get()).hasSize(2);
412+
assertThat(finalResponse.content().get().parts().get().get(0).text())
413+
.hasValue("Here is your image:");
414+
assertThat(finalResponse.content().get().parts().get().get(1).inlineData()).isPresent();
415+
}
416+
417+
// The aggregator must pass through any non-text, non-function-call part, not just an allowlist.
418+
// These guard the part types that were being silently dropped: server-side tool calls/responses
419+
// and function responses. Each uses a text-then-part sequence so the part must survive the final
420+
// aggregation (a lone part would otherwise slip through via the empty-sequence fallback).
421+
@Test
422+
public void processRawResponses_textThenToolCall_finalKeepsBoth() {
423+
Part toolCallPart =
424+
Part.builder()
425+
.toolCall(ToolCall.builder().id("tc-1").args(ImmutableMap.of("q", "weather")).build())
426+
.build();
427+
428+
LlmResponse finalResponse = aggregateTextThenPart(toolCallPart);
429+
430+
assertThat(finalResponse.content().get().parts().get()).hasSize(2);
431+
assertThat(finalResponse.content().get().parts().get().get(1).toolCall()).isPresent();
432+
}
433+
434+
@Test
435+
public void processRawResponses_textThenToolResponse_finalKeepsBoth() {
436+
Part toolResponsePart =
437+
Part.builder()
438+
.toolResponse(
439+
ToolResponse.builder().id("tc-1").response(ImmutableMap.of("ok", true)).build())
440+
.build();
441+
442+
LlmResponse finalResponse = aggregateTextThenPart(toolResponsePart);
443+
444+
assertThat(finalResponse.content().get().parts().get()).hasSize(2);
445+
assertThat(finalResponse.content().get().parts().get().get(1).toolResponse()).isPresent();
446+
}
447+
448+
@Test
449+
public void processRawResponses_textThenFunctionResponse_finalKeepsBoth() {
450+
Part functionResponsePart = Part.fromFunctionResponse("my_tool", ImmutableMap.of("result", 42));
451+
452+
LlmResponse finalResponse = aggregateTextThenPart(functionResponsePart);
453+
454+
assertThat(finalResponse.content().get().parts().get()).hasSize(2);
455+
assertThat(finalResponse.content().get().parts().get().get(1).functionResponse()).isPresent();
456+
}
457+
458+
// Per the Gemini docs, a data part (e.g. inlineData) can carry a thoughtSignature with the
459+
// thought
460+
// flag unset (multi-turn image editing). The part must be kept verbatim, and its signature must
461+
// not leak onto the preceding text part (the docs forbid putting a signature on a part that did
462+
// not originally carry one).
463+
@Test
464+
public void processRawResponses_textThenDataPartWithSignature_keepsSignatureOnDataPartOnly() {
465+
Part imageWithSignature =
466+
Part.builder()
467+
.inlineData(Blob.builder().mimeType("image/png").data(new byte[] {1, 2, 3}).build())
468+
.thoughtSignature("sig".getBytes(UTF_8))
469+
.build();
470+
471+
LlmResponse finalResponse = aggregateTextThenPart(imageWithSignature);
472+
473+
assertThat(finalResponse.content().get().parts().get()).hasSize(2);
474+
assertThat(finalResponse.content().get().parts().get().get(0).text())
475+
.hasValue("Working on it:");
476+
assertThat(finalResponse.content().get().parts().get().get(0).thoughtSignature()).isEmpty();
477+
assertThat(finalResponse.content().get().parts().get().get(1).inlineData()).isPresent();
478+
assertThat(finalResponse.content().get().parts().get().get(1).thoughtSignature()).isPresent();
479+
}
480+
481+
private LlmResponse aggregateTextThenPart(Part part) {
482+
GenerateContentResponse textChunk = toResponseWithText("Working on it:");
483+
GenerateContentResponse partWithStop =
484+
toResponse(
485+
Candidate.builder()
486+
.content(Content.builder().role("model").parts(part).build())
487+
.finishReason(new FinishReason(FinishReason.Known.STOP))
488+
.build());
489+
return Iterables.getLast(
490+
ImmutableList.copyOf(
491+
Gemini.processRawResponses(Flowable.just(textChunk, partWithStop)).blockingIterable()));
492+
}
493+
371494
@Test
372495
public void processRawResponses_textAndStopReason_emitsPartialThenFinalText() {
373496
Flowable<GenerateContentResponse> rawResponses =

0 commit comments

Comments
 (0)