From f72a400700e2b95d385f11c1e3432727811e5039 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Mon, 11 May 2026 11:18:29 +0200 Subject: [PATCH 1/4] highlight image support --- docs-website/docs/concepts/agents.mdx | 1 + .../pipeline-components/agents-1/agent.mdx | 102 ++++++++++++++++++ 2 files changed, 103 insertions(+) diff --git a/docs-website/docs/concepts/agents.mdx b/docs-website/docs/concepts/agents.mdx index 7e30a2bf1d..05adf2955d 100644 --- a/docs-website/docs/concepts/agents.mdx +++ b/docs-website/docs/concepts/agents.mdx @@ -45,6 +45,7 @@ Key capabilities include: - **Human-in-the-loop**: Intercept tool calls for human review before execution. See [Human in the Loop](../pipeline-components/agents-1/human-in-the-loop.mdx). - **Multi-agent systems**: Wrap an `Agent` as a `ComponentTool` to build coordinator/specialist architectures. See [Multi-Agent Systems](./agents/multi-agent-systems.mdx). - **MCP server exposure**: Expose your agent as an MCP server using [Hayhooks](../development/hayhooks.mdx), making it callable from any MCP-compatible client such as Claude Desktop or Cursor. +- **Multimodal inputs**: Pass images alongside text using `ImageContent` in `ChatMessage` content parts, or return `ImageContent` from tools for dynamic image analysis. Requires a vision-capable model such as `gpt-5` or `gemini-2.5-flash`. See [Multimodal Inputs](../pipeline-components/agents-1/agent.mdx#multimodal-inputs). Check out the [Agent](../pipeline-components/agents-1/agent.mdx) documentation, or the [example](#tool-calling-agent) below to get started. 
diff --git a/docs-website/docs/pipeline-components/agents-1/agent.mdx b/docs-website/docs/pipeline-components/agents-1/agent.mdx index 1e10283a88..90ccd4a4c5 100644 --- a/docs-website/docs/pipeline-components/agents-1/agent.mdx +++ b/docs-website/docs/pipeline-components/agents-1/agent.mdx @@ -334,6 +334,106 @@ See our [Streaming Support](../generators/guides-to-generators/choosing-the-righ Give preference to `print_streaming_chunk` by default. Write a custom callback only if you need a specific transport (for example, SSE/WebSocket) or custom UI formatting. +## Multimodal Inputs + +Agents support multimodal inputs when paired with a vision-capable model such as `gpt-5` (OpenAI) or `gemini-2.5-flash` (Google). +Pass images alongside text by including `ImageContent` objects in the `content_parts` of a `ChatMessage`: + +```python +from haystack.dataclasses import ChatMessage, ImageContent + +image = ImageContent.from_url("https://example.com/chart.png") +result = agent.run( + messages=[ + ChatMessage.from_user(content_parts=["What does this chart show?", image]), + ], +) +``` + +Tools can also return `ImageContent` directly, letting the agent fetch and reason about images dynamically during its loop. 
+Two things are required: set `outputs_to_string={"raw_result": True}` so the `ToolInvoker` skips string conversion, and return a `list[ImageContent]` (the tool result type is `str | Sequence[TextContent | ImageContent]`): + +```python +from typing import Annotated +from haystack.components.agents import Agent +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ChatMessage, ImageContent +from haystack.tools import tool + + +@tool(outputs_to_string={"raw_result": True}) +def fetch_image( + url: Annotated[str, "URL of the image to fetch and analyze"], +) -> list[ImageContent]: + """Fetch an image from a URL so the agent can analyze its contents.""" + return [ImageContent.from_url(url)] + + +agent = Agent( + chat_generator=OpenAIChatGenerator(model="gpt-5"), + tools=[fetch_image], + system_prompt="You are a helpful assistant that can fetch and analyze images from URLs.", +) + +result = agent.run( + messages=[ + ChatMessage.from_user( + "Fetch the image at https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Bikesgray.jpg/320px-Bikesgray.jpg and describe what you see.", + ), + ], +) +print(result["last_message"].text) +``` + +`ImageContent` can be created from a URL (`ImageContent.from_url`), from a local file path (`ImageContent.from_file_path`), or from a PDF page by using the `PDFToImageContent` converter. 
+ +### In a pipeline + +When an `Agent` sits inside a pipeline, use `ChatPromptBuilder` with its string template format and the `| templatize_part` filter to pass images as structured content parts: + +```python +from haystack import Pipeline +from haystack.components.agents import Agent +from haystack.components.builders import ChatPromptBuilder +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ImageContent + +template = """ +{% message role="user" %} +{{ question }} +{{ image | templatize_part }} +{% endmessage %} +""" + +agent = Agent( + chat_generator=OpenAIChatGenerator(model="gpt-5"), + system_prompt="You are a helpful assistant that can analyze images.", +) +prompt_builder = ChatPromptBuilder( + template=template, + required_variables=["question", "image"], +) + +pipeline = Pipeline() +pipeline.add_component("prompt_builder", prompt_builder) +pipeline.add_component("agent", agent) +pipeline.connect("prompt_builder.prompt", "agent.messages") + +image = ImageContent.from_file_path("chart.png") +result = pipeline.run( + { + "prompt_builder": {"question": "What does this chart show?", "image": image}, + }, +) +print(result["agent"]["last_message"].text) +``` + +:::tip +See these cookbooks for complete multimodal agent examples: +- [Multimodal Agents](https://haystack.deepset.ai/cookbook/multimodal_intro#multimodal-agent) — image inputs and tool use with agents +- [Gemma Chat RAG](https://haystack.deepset.ai/cookbook/gemma_chat_rag) — vision model in a RAG pipeline +::: + ## Multi-Agent Systems You can wrap an `Agent` as a tool to build multi-agent systems where specialist agents handle focused subtasks and a coordinator agent plans and delegates. 
@@ -363,3 +463,5 @@ Agents work with MCP in two directions: 🧑‍🍳 Cookbook: - [Build a GitHub Issue Resolver Agent](https://haystack.deepset.ai/cookbook/github_issue_resolver_agent) +- [Multimodal Agents](https://haystack.deepset.ai/cookbook/multimodal_intro#multimodal-agent) +- [Gemma Chat RAG](https://haystack.deepset.ai/cookbook/gemma_chat_rag) From df5041d46f415ef73106d794743305d4356ee779 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Mon, 11 May 2026 11:27:40 +0200 Subject: [PATCH 2/4] use responses api --- .../pipeline-components/agents-1/agent.mdx | 8 +- .../version-2.28/concepts/agents.mdx | 1 + .../pipeline-components/agents-1/agent.mdx | 104 ++++++++++++++++++ 3 files changed, 110 insertions(+), 3 deletions(-) diff --git a/docs-website/docs/pipeline-components/agents-1/agent.mdx b/docs-website/docs/pipeline-components/agents-1/agent.mdx index 90ccd4a4c5..f371b868ca 100644 --- a/docs-website/docs/pipeline-components/agents-1/agent.mdx +++ b/docs-website/docs/pipeline-components/agents-1/agent.mdx @@ -351,12 +351,14 @@ result = agent.run( ``` Tools can also return `ImageContent` directly, letting the agent fetch and reason about images dynamically during its loop. -Two things are required: set `outputs_to_string={"raw_result": True}` so the `ToolInvoker` skips string conversion, and return a `list[ImageContent]` (the tool result type is `str | Sequence[TextContent | ImageContent]`): +Two things are required: set `outputs_to_string={"raw_result": True}` so the `ToolInvoker` skips string conversion, and return a `list[ImageContent]` (the tool result type is `str | Sequence[TextContent | ImageContent]`). 
+ +The standard Chat Completions API doesn't support images in tool results — use `OpenAIResponsesChatGenerator` (OpenAI's Responses API) instead: ```python from typing import Annotated from haystack.components.agents import Agent -from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.components.generators.chat import OpenAIResponsesChatGenerator from haystack.dataclasses import ChatMessage, ImageContent from haystack.tools import tool @@ -370,7 +372,7 @@ def fetch_image( agent = Agent( - chat_generator=OpenAIChatGenerator(model="gpt-5"), + chat_generator=OpenAIResponsesChatGenerator(model="gpt-5"), tools=[fetch_image], system_prompt="You are a helpful assistant that can fetch and analyze images from URLs.", ) diff --git a/docs-website/versioned_docs/version-2.28/concepts/agents.mdx b/docs-website/versioned_docs/version-2.28/concepts/agents.mdx index 7e30a2bf1d..05adf2955d 100644 --- a/docs-website/versioned_docs/version-2.28/concepts/agents.mdx +++ b/docs-website/versioned_docs/version-2.28/concepts/agents.mdx @@ -45,6 +45,7 @@ Key capabilities include: - **Human-in-the-loop**: Intercept tool calls for human review before execution. See [Human in the Loop](../pipeline-components/agents-1/human-in-the-loop.mdx). - **Multi-agent systems**: Wrap an `Agent` as a `ComponentTool` to build coordinator/specialist architectures. See [Multi-Agent Systems](./agents/multi-agent-systems.mdx). - **MCP server exposure**: Expose your agent as an MCP server using [Hayhooks](../development/hayhooks.mdx), making it callable from any MCP-compatible client such as Claude Desktop or Cursor. +- **Multimodal inputs**: Pass images alongside text using `ImageContent` in `ChatMessage` content parts, or return `ImageContent` from tools for dynamic image analysis. Requires a vision-capable model such as `gpt-5` or `gemini-2.5-flash`. See [Multimodal Inputs](../pipeline-components/agents-1/agent.mdx#multimodal-inputs). 
Check out the [Agent](../pipeline-components/agents-1/agent.mdx) documentation, or the [example](#tool-calling-agent) below to get started. diff --git a/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx b/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx index 1e10283a88..f371b868ca 100644 --- a/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx +++ b/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx @@ -334,6 +334,108 @@ See our [Streaming Support](../generators/guides-to-generators/choosing-the-righ Give preference to `print_streaming_chunk` by default. Write a custom callback only if you need a specific transport (for example, SSE/WebSocket) or custom UI formatting. +## Multimodal Inputs + +Agents support multimodal inputs when paired with a vision-capable model such as `gpt-5` (OpenAI) or `gemini-2.5-flash` (Google). +Pass images alongside text by including `ImageContent` objects in the `content_parts` of a `ChatMessage`: + +```python +from haystack.dataclasses import ChatMessage, ImageContent + +image = ImageContent.from_url("https://example.com/chart.png") +result = agent.run( + messages=[ + ChatMessage.from_user(content_parts=["What does this chart show?", image]), + ], +) +``` + +Tools can also return `ImageContent` directly, letting the agent fetch and reason about images dynamically during its loop. +Two things are required: set `outputs_to_string={"raw_result": True}` so the `ToolInvoker` skips string conversion, and return a `list[ImageContent]` (the tool result type is `str | Sequence[TextContent | ImageContent]`). 
+ +The standard Chat Completions API doesn't support images in tool results — use `OpenAIResponsesChatGenerator` (OpenAI's Responses API) instead: + +```python +from typing import Annotated +from haystack.components.agents import Agent +from haystack.components.generators.chat import OpenAIResponsesChatGenerator +from haystack.dataclasses import ChatMessage, ImageContent +from haystack.tools import tool + + +@tool(outputs_to_string={"raw_result": True}) +def fetch_image( + url: Annotated[str, "URL of the image to fetch and analyze"], +) -> list[ImageContent]: + """Fetch an image from a URL so the agent can analyze its contents.""" + return [ImageContent.from_url(url)] + + +agent = Agent( + chat_generator=OpenAIResponsesChatGenerator(model="gpt-5"), + tools=[fetch_image], + system_prompt="You are a helpful assistant that can fetch and analyze images from URLs.", +) + +result = agent.run( + messages=[ + ChatMessage.from_user( + "Fetch the image at https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Bikesgray.jpg/320px-Bikesgray.jpg and describe what you see.", + ), + ], +) +print(result["last_message"].text) +``` + +`ImageContent` can be created from a URL (`ImageContent.from_url`), from a local file path (`ImageContent.from_file_path`), or from a PDF page by using the `PDFToImageContent` converter. 
+ +### In a pipeline + +When an `Agent` sits inside a pipeline, use `ChatPromptBuilder` with its string template format and the `| templatize_part` filter to pass images as structured content parts: + +```python +from haystack import Pipeline +from haystack.components.agents import Agent +from haystack.components.builders import ChatPromptBuilder +from haystack.components.generators.chat import OpenAIChatGenerator +from haystack.dataclasses import ImageContent + +template = """ +{% message role="user" %} +{{ question }} +{{ image | templatize_part }} +{% endmessage %} +""" + +agent = Agent( + chat_generator=OpenAIChatGenerator(model="gpt-5"), + system_prompt="You are a helpful assistant that can analyze images.", +) +prompt_builder = ChatPromptBuilder( + template=template, + required_variables=["question", "image"], +) + +pipeline = Pipeline() +pipeline.add_component("prompt_builder", prompt_builder) +pipeline.add_component("agent", agent) +pipeline.connect("prompt_builder.prompt", "agent.messages") + +image = ImageContent.from_file_path("chart.png") +result = pipeline.run( + { + "prompt_builder": {"question": "What does this chart show?", "image": image}, + }, +) +print(result["agent"]["last_message"].text) +``` + +:::tip +See these cookbooks for complete multimodal agent examples: +- [Multimodal Agents](https://haystack.deepset.ai/cookbook/multimodal_intro#multimodal-agent) — image inputs and tool use with agents +- [Gemma Chat RAG](https://haystack.deepset.ai/cookbook/gemma_chat_rag) — vision model in a RAG pipeline +::: + ## Multi-Agent Systems You can wrap an `Agent` as a tool to build multi-agent systems where specialist agents handle focused subtasks and a coordinator agent plans and delegates. 
@@ -363,3 +465,5 @@ Agents work with MCP in two directions: 🧑‍🍳 Cookbook: - [Build a GitHub Issue Resolver Agent](https://haystack.deepset.ai/cookbook/github_issue_resolver_agent) +- [Multimodal Agents](https://haystack.deepset.ai/cookbook/multimodal_intro#multimodal-agent) +- [Gemma Chat RAG](https://haystack.deepset.ai/cookbook/gemma_chat_rag) From 351466b67ad5ff770cb32fd9a7a9bd6c11aed7f0 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Mon, 11 May 2026 11:37:51 +0200 Subject: [PATCH 3/4] use valid image url --- docs-website/docs/pipeline-components/agents-1/agent.mdx | 2 +- .../version-2.28/pipeline-components/agents-1/agent.mdx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs-website/docs/pipeline-components/agents-1/agent.mdx b/docs-website/docs/pipeline-components/agents-1/agent.mdx index f371b868ca..d46376f88d 100644 --- a/docs-website/docs/pipeline-components/agents-1/agent.mdx +++ b/docs-website/docs/pipeline-components/agents-1/agent.mdx @@ -380,7 +380,7 @@ agent = Agent( result = agent.run( messages=[ ChatMessage.from_user( - "Fetch the image at https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Bikesgray.jpg/320px-Bikesgray.jpg and describe what you see.", + "Fetch the image at https://picsum.photos/seed/haystack/640/480 and describe what you see.", ), ], ) diff --git a/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx b/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx index f371b868ca..d46376f88d 100644 --- a/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx +++ b/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx @@ -380,7 +380,7 @@ agent = Agent( result = agent.run( messages=[ ChatMessage.from_user( - "Fetch the image at https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Bikesgray.jpg/320px-Bikesgray.jpg and describe what you see.", + "Fetch the image at 
https://picsum.photos/seed/haystack/640/480 and describe what you see.", ), ], ) From 85553621c781f0575e1137461561031b170ed3a8 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Mon, 11 May 2026 11:39:17 +0200 Subject: [PATCH 4/4] add dev comment --- docs-website/docs/pipeline-components/agents-1/agent.mdx | 1 + .../version-2.28/pipeline-components/agents-1/agent.mdx | 1 + 2 files changed, 2 insertions(+) diff --git a/docs-website/docs/pipeline-components/agents-1/agent.mdx b/docs-website/docs/pipeline-components/agents-1/agent.mdx index d46376f88d..fef0f17e64 100644 --- a/docs-website/docs/pipeline-components/agents-1/agent.mdx +++ b/docs-website/docs/pipeline-components/agents-1/agent.mdx @@ -421,6 +421,7 @@ pipeline.add_component("prompt_builder", prompt_builder) pipeline.add_component("agent", agent) pipeline.connect("prompt_builder.prompt", "agent.messages") +# Download or provide your own chart image as "chart.png" image = ImageContent.from_file_path("chart.png") result = pipeline.run( { diff --git a/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx b/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx index d46376f88d..fef0f17e64 100644 --- a/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx +++ b/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx @@ -421,6 +421,7 @@ pipeline.add_component("prompt_builder", prompt_builder) pipeline.add_component("agent", agent) pipeline.connect("prompt_builder.prompt", "agent.messages") +# Download or provide your own chart image as "chart.png" image = ImageContent.from_file_path("chart.png") result = pipeline.run( {