From f72a400700e2b95d385f11c1e3432727811e5039 Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee <sjrl423@gmail.com>
Date: Mon, 11 May 2026 11:18:29 +0200
Subject: [PATCH 1/4] highlight image support

---
 docs-website/docs/concepts/agents.mdx         |   1 +
 .../pipeline-components/agents-1/agent.mdx    | 102 ++++++++++++++++++
 2 files changed, 103 insertions(+)

diff --git a/docs-website/docs/concepts/agents.mdx b/docs-website/docs/concepts/agents.mdx
index 7e30a2bf1d..05adf2955d 100644
--- a/docs-website/docs/concepts/agents.mdx
+++ b/docs-website/docs/concepts/agents.mdx
@@ -45,6 +45,7 @@ Key capabilities include:
 - **Human-in-the-loop**: Intercept tool calls for human review before execution. See [Human in the Loop](../pipeline-components/agents-1/human-in-the-loop.mdx).
 - **Multi-agent systems**: Wrap an `Agent` as a `ComponentTool` to build coordinator/specialist architectures. See [Multi-Agent Systems](./agents/multi-agent-systems.mdx).
 - **MCP server exposure**: Expose your agent as an MCP server using [Hayhooks](../development/hayhooks.mdx), making it callable from any MCP-compatible client such as Claude Desktop or Cursor.
+- **Multimodal inputs**: Pass images alongside text using `ImageContent` in `ChatMessage` content parts, or return `ImageContent` from tools for dynamic image analysis. Requires a vision-capable model such as `gpt-5` or `gemini-2.5-flash`. See [Multimodal Inputs](../pipeline-components/agents-1/agent.mdx#multimodal-inputs).
 
 Check out the [Agent](../pipeline-components/agents-1/agent.mdx) documentation, or the [example](#tool-calling-agent) below to get started.
 
diff --git a/docs-website/docs/pipeline-components/agents-1/agent.mdx b/docs-website/docs/pipeline-components/agents-1/agent.mdx
index 1e10283a88..90ccd4a4c5 100644
--- a/docs-website/docs/pipeline-components/agents-1/agent.mdx
+++ b/docs-website/docs/pipeline-components/agents-1/agent.mdx
@@ -334,6 +334,106 @@ See our [Streaming Support](../generators/guides-to-generators/choosing-the-righ
 Give preference to `print_streaming_chunk` by default.
 Write a custom callback only if you need a specific transport (for example, SSE/WebSocket) or custom UI formatting.
 
+## Multimodal Inputs
+
+Agents support multimodal inputs when paired with a vision-capable model such as `gpt-5` (OpenAI) or `gemini-2.5-flash` (Google).
+Pass images alongside text by including `ImageContent` objects in the `content_parts` of a `ChatMessage`:
+
+```python
+from haystack.dataclasses import ChatMessage, ImageContent
+
+image = ImageContent.from_url("https://example.com/chart.png")
+result = agent.run(
+    messages=[
+        ChatMessage.from_user(content_parts=["What does this chart show?", image]),
+    ],
+)
+```
+
+Tools can also return `ImageContent` directly, letting the agent fetch and reason about images dynamically during its loop.
+Two things are required: set `outputs_to_string={"raw_result": True}` so the `ToolInvoker` skips string conversion, and return a `list[ImageContent]` (the tool result type is `str | Sequence[TextContent | ImageContent]`):
+
+```python
+from typing import Annotated
+from haystack.components.agents import Agent
+from haystack.components.generators.chat import OpenAIChatGenerator
+from haystack.dataclasses import ChatMessage, ImageContent
+from haystack.tools import tool
+
+
+@tool(outputs_to_string={"raw_result": True})
+def fetch_image(
+    url: Annotated[str, "URL of the image to fetch and analyze"],
+) -> list[ImageContent]:
+    """Fetch an image from a URL so the agent can analyze its contents."""
+    return [ImageContent.from_url(url)]
+
+
+agent = Agent(
+    chat_generator=OpenAIChatGenerator(model="gpt-5"),
+    tools=[fetch_image],
+    system_prompt="You are a helpful assistant that can fetch and analyze images from URLs.",
+)
+
+result = agent.run(
+    messages=[
+        ChatMessage.from_user(
+            "Fetch the image at https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Bikesgray.jpg/320px-Bikesgray.jpg and describe what you see.",
+        ),
+    ],
+)
+print(result["last_message"].text)
+```
+
+`ImageContent` can be created from a URL (`ImageContent.from_url`), from a local file path (`ImageContent.from_file_path`), or from a PDF page using the `PDFToImageContent` converter.
+
+### In a pipeline
+
+When an `Agent` sits inside a pipeline, use `ChatPromptBuilder` with a string template and apply the `templatize_part` filter to the image variable so that the image is passed as a structured content part:
+
+```python
+from haystack import Pipeline
+from haystack.components.agents import Agent
+from haystack.components.builders import ChatPromptBuilder
+from haystack.components.generators.chat import OpenAIChatGenerator
+from haystack.dataclasses import ImageContent
+
+template = """
+{% message role="user" %}
+{{ question }}
+{{ image | templatize_part }}
+{% endmessage %}
+"""
+
+agent = Agent(
+    chat_generator=OpenAIChatGenerator(model="gpt-5"),
+    system_prompt="You are a helpful assistant that can analyze images.",
+)
+prompt_builder = ChatPromptBuilder(
+    template=template,
+    required_variables=["question", "image"],
+)
+
+pipeline = Pipeline()
+pipeline.add_component("prompt_builder", prompt_builder)
+pipeline.add_component("agent", agent)
+pipeline.connect("prompt_builder.prompt", "agent.messages")
+
+image = ImageContent.from_file_path("chart.png")
+result = pipeline.run(
+    {
+        "prompt_builder": {"question": "What does this chart show?", "image": image},
+    },
+)
+print(result["agent"]["last_message"].text)
+```
+
+:::tip
+See these cookbooks for complete multimodal agent examples:
+- [Multimodal Agents](https://haystack.deepset.ai/cookbook/multimodal_intro#multimodal-agent) — image inputs and tool use with agents
+- [Gemma Chat RAG](https://haystack.deepset.ai/cookbook/gemma_chat_rag) — vision model in a RAG pipeline
+:::
+
 ## Multi-Agent Systems
 
 You can wrap an `Agent` as a tool to build multi-agent systems where specialist agents handle focused subtasks and a coordinator agent plans and delegates.
@@ -363,3 +463,5 @@ Agents work with MCP in two directions:
 🧑‍🍳 Cookbook:
 
 - [Build a GitHub Issue Resolver Agent](https://haystack.deepset.ai/cookbook/github_issue_resolver_agent)
+- [Multimodal Agents](https://haystack.deepset.ai/cookbook/multimodal_intro#multimodal-agent)
+- [Gemma Chat RAG](https://haystack.deepset.ai/cookbook/gemma_chat_rag)

From df5041d46f415ef73106d794743305d4356ee779 Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee <sjrl423@gmail.com>
Date: Mon, 11 May 2026 11:27:40 +0200
Subject: [PATCH 2/4] use responses api

---
 .../pipeline-components/agents-1/agent.mdx    |   8 +-
 .../version-2.28/concepts/agents.mdx          |   1 +
 .../pipeline-components/agents-1/agent.mdx    | 104 ++++++++++++++++++
 3 files changed, 110 insertions(+), 3 deletions(-)

diff --git a/docs-website/docs/pipeline-components/agents-1/agent.mdx b/docs-website/docs/pipeline-components/agents-1/agent.mdx
index 90ccd4a4c5..f371b868ca 100644
--- a/docs-website/docs/pipeline-components/agents-1/agent.mdx
+++ b/docs-website/docs/pipeline-components/agents-1/agent.mdx
@@ -351,12 +351,14 @@ result = agent.run(
 ```
 
 Tools can also return `ImageContent` directly, letting the agent fetch and reason about images dynamically during its loop.
-Two things are required: set `outputs_to_string={"raw_result": True}` so the `ToolInvoker` skips string conversion, and return a `list[ImageContent]` (the tool result type is `str | Sequence[TextContent | ImageContent]`):
+Two things are required: set `outputs_to_string={"raw_result": True}` so the `ToolInvoker` skips string conversion, and return a `list[ImageContent]` (the tool result type is `str | Sequence[TextContent | ImageContent]`).
+
+OpenAI's Chat Completions API, which `OpenAIChatGenerator` uses, doesn't support images in tool results — use `OpenAIResponsesChatGenerator` (OpenAI's Responses API) instead:
 
 ```python
 from typing import Annotated
 from haystack.components.agents import Agent
-from haystack.components.generators.chat import OpenAIChatGenerator
+from haystack.components.generators.chat import OpenAIResponsesChatGenerator
 from haystack.dataclasses import ChatMessage, ImageContent
 from haystack.tools import tool
 
@@ -370,7 +372,7 @@ def fetch_image(
 
 
 agent = Agent(
-    chat_generator=OpenAIChatGenerator(model="gpt-5"),
+    chat_generator=OpenAIResponsesChatGenerator(model="gpt-5"),
     tools=[fetch_image],
     system_prompt="You are a helpful assistant that can fetch and analyze images from URLs.",
 )
diff --git a/docs-website/versioned_docs/version-2.28/concepts/agents.mdx b/docs-website/versioned_docs/version-2.28/concepts/agents.mdx
index 7e30a2bf1d..05adf2955d 100644
--- a/docs-website/versioned_docs/version-2.28/concepts/agents.mdx
+++ b/docs-website/versioned_docs/version-2.28/concepts/agents.mdx
@@ -45,6 +45,7 @@ Key capabilities include:
 - **Human-in-the-loop**: Intercept tool calls for human review before execution. See [Human in the Loop](../pipeline-components/agents-1/human-in-the-loop.mdx).
 - **Multi-agent systems**: Wrap an `Agent` as a `ComponentTool` to build coordinator/specialist architectures. See [Multi-Agent Systems](./agents/multi-agent-systems.mdx).
 - **MCP server exposure**: Expose your agent as an MCP server using [Hayhooks](../development/hayhooks.mdx), making it callable from any MCP-compatible client such as Claude Desktop or Cursor.
+- **Multimodal inputs**: Pass images alongside text using `ImageContent` in `ChatMessage` content parts, or return `ImageContent` from tools for dynamic image analysis. Requires a vision-capable model such as `gpt-5` or `gemini-2.5-flash`. See [Multimodal Inputs](../pipeline-components/agents-1/agent.mdx#multimodal-inputs).
 
 Check out the [Agent](../pipeline-components/agents-1/agent.mdx) documentation, or the [example](#tool-calling-agent) below to get started.
 
diff --git a/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx b/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx
index 1e10283a88..f371b868ca 100644
--- a/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx
+++ b/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx
@@ -334,6 +334,108 @@ See our [Streaming Support](../generators/guides-to-generators/choosing-the-righ
 Give preference to `print_streaming_chunk` by default.
 Write a custom callback only if you need a specific transport (for example, SSE/WebSocket) or custom UI formatting.
 
+## Multimodal Inputs
+
+Agents support multimodal inputs when paired with a vision-capable model such as `gpt-5` (OpenAI) or `gemini-2.5-flash` (Google).
+Pass images alongside text by including `ImageContent` objects in the `content_parts` of a `ChatMessage`:
+
+```python
+from haystack.dataclasses import ChatMessage, ImageContent
+
+image = ImageContent.from_url("https://example.com/chart.png")
+result = agent.run(
+    messages=[
+        ChatMessage.from_user(content_parts=["What does this chart show?", image]),
+    ],
+)
+```
+
+Tools can also return `ImageContent` directly, letting the agent fetch and reason about images dynamically during its loop.
+Two things are required: set `outputs_to_string={"raw_result": True}` so the `ToolInvoker` skips string conversion, and return a `list[ImageContent]` (the tool result type is `str | Sequence[TextContent | ImageContent]`).
+
+OpenAI's Chat Completions API, which `OpenAIChatGenerator` uses, doesn't support images in tool results — use `OpenAIResponsesChatGenerator` (OpenAI's Responses API) instead:
+
+```python
+from typing import Annotated
+from haystack.components.agents import Agent
+from haystack.components.generators.chat import OpenAIResponsesChatGenerator
+from haystack.dataclasses import ChatMessage, ImageContent
+from haystack.tools import tool
+
+
+@tool(outputs_to_string={"raw_result": True})
+def fetch_image(
+    url: Annotated[str, "URL of the image to fetch and analyze"],
+) -> list[ImageContent]:
+    """Fetch an image from a URL so the agent can analyze its contents."""
+    return [ImageContent.from_url(url)]
+
+
+agent = Agent(
+    chat_generator=OpenAIResponsesChatGenerator(model="gpt-5"),
+    tools=[fetch_image],
+    system_prompt="You are a helpful assistant that can fetch and analyze images from URLs.",
+)
+
+result = agent.run(
+    messages=[
+        ChatMessage.from_user(
+            "Fetch the image at https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Bikesgray.jpg/320px-Bikesgray.jpg and describe what you see.",
+        ),
+    ],
+)
+print(result["last_message"].text)
+```
+
+`ImageContent` can be created from a URL (`ImageContent.from_url`), from a local file path (`ImageContent.from_file_path`), or from a PDF page using the `PDFToImageContent` converter.
+
+### In a pipeline
+
+When an `Agent` sits inside a pipeline, use `ChatPromptBuilder` with a string template and apply the `templatize_part` filter to the image variable so that the image is passed as a structured content part:
+
+```python
+from haystack import Pipeline
+from haystack.components.agents import Agent
+from haystack.components.builders import ChatPromptBuilder
+from haystack.components.generators.chat import OpenAIChatGenerator
+from haystack.dataclasses import ImageContent
+
+template = """
+{% message role="user" %}
+{{ question }}
+{{ image | templatize_part }}
+{% endmessage %}
+"""
+
+agent = Agent(
+    chat_generator=OpenAIChatGenerator(model="gpt-5"),
+    system_prompt="You are a helpful assistant that can analyze images.",
+)
+prompt_builder = ChatPromptBuilder(
+    template=template,
+    required_variables=["question", "image"],
+)
+
+pipeline = Pipeline()
+pipeline.add_component("prompt_builder", prompt_builder)
+pipeline.add_component("agent", agent)
+pipeline.connect("prompt_builder.prompt", "agent.messages")
+
+image = ImageContent.from_file_path("chart.png")
+result = pipeline.run(
+    {
+        "prompt_builder": {"question": "What does this chart show?", "image": image},
+    },
+)
+print(result["agent"]["last_message"].text)
+```
+
+:::tip
+See these cookbooks for complete multimodal agent examples:
+- [Multimodal Agents](https://haystack.deepset.ai/cookbook/multimodal_intro#multimodal-agent) — image inputs and tool use with agents
+- [Gemma Chat RAG](https://haystack.deepset.ai/cookbook/gemma_chat_rag) — vision model in a RAG pipeline
+:::
+
 ## Multi-Agent Systems
 
 You can wrap an `Agent` as a tool to build multi-agent systems where specialist agents handle focused subtasks and a coordinator agent plans and delegates.
@@ -363,3 +465,5 @@ Agents work with MCP in two directions:
 🧑‍🍳 Cookbook:
 
 - [Build a GitHub Issue Resolver Agent](https://haystack.deepset.ai/cookbook/github_issue_resolver_agent)
+- [Multimodal Agents](https://haystack.deepset.ai/cookbook/multimodal_intro#multimodal-agent)
+- [Gemma Chat RAG](https://haystack.deepset.ai/cookbook/gemma_chat_rag)

From 351466b67ad5ff770cb32fd9a7a9bd6c11aed7f0 Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee <sjrl423@gmail.com>
Date: Mon, 11 May 2026 11:37:51 +0200
Subject: [PATCH 3/4] use valid image url

---
 docs-website/docs/pipeline-components/agents-1/agent.mdx        | 2 +-
 .../version-2.28/pipeline-components/agents-1/agent.mdx         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs-website/docs/pipeline-components/agents-1/agent.mdx b/docs-website/docs/pipeline-components/agents-1/agent.mdx
index f371b868ca..d46376f88d 100644
--- a/docs-website/docs/pipeline-components/agents-1/agent.mdx
+++ b/docs-website/docs/pipeline-components/agents-1/agent.mdx
@@ -380,7 +380,7 @@ agent = Agent(
 result = agent.run(
     messages=[
         ChatMessage.from_user(
-            "Fetch the image at https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Bikesgray.jpg/320px-Bikesgray.jpg and describe what you see.",
+            "Fetch the image at https://picsum.photos/seed/haystack/640/480 and describe what you see.",
         ),
     ],
 )
diff --git a/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx b/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx
index f371b868ca..d46376f88d 100644
--- a/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx
+++ b/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx
@@ -380,7 +380,7 @@ agent = Agent(
 result = agent.run(
     messages=[
         ChatMessage.from_user(
-            "Fetch the image at https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Bikesgray.jpg/320px-Bikesgray.jpg and describe what you see.",
+            "Fetch the image at https://picsum.photos/seed/haystack/640/480 and describe what you see.",
         ),
     ],
 )

From 85553621c781f0575e1137461561031b170ed3a8 Mon Sep 17 00:00:00 2001
From: Sebastian Husch Lee <sjrl423@gmail.com>
Date: Mon, 11 May 2026 11:39:17 +0200
Subject: [PATCH 4/4] add dev comment

---
 docs-website/docs/pipeline-components/agents-1/agent.mdx         | 1 +
 .../version-2.28/pipeline-components/agents-1/agent.mdx          | 1 +
 2 files changed, 2 insertions(+)

diff --git a/docs-website/docs/pipeline-components/agents-1/agent.mdx b/docs-website/docs/pipeline-components/agents-1/agent.mdx
index d46376f88d..fef0f17e64 100644
--- a/docs-website/docs/pipeline-components/agents-1/agent.mdx
+++ b/docs-website/docs/pipeline-components/agents-1/agent.mdx
@@ -421,6 +421,7 @@ pipeline.add_component("prompt_builder", prompt_builder)
 pipeline.add_component("agent", agent)
 pipeline.connect("prompt_builder.prompt", "agent.messages")
 
+# Download or provide your own chart image as "chart.png"
 image = ImageContent.from_file_path("chart.png")
 result = pipeline.run(
     {
diff --git a/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx b/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx
index d46376f88d..fef0f17e64 100644
--- a/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx
+++ b/docs-website/versioned_docs/version-2.28/pipeline-components/agents-1/agent.mdx
@@ -421,6 +421,7 @@ pipeline.add_component("prompt_builder", prompt_builder)
 pipeline.add_component("agent", agent)
 pipeline.connect("prompt_builder.prompt", "agent.messages")
 
+# Download or provide your own chart image as "chart.png"
 image = ImageContent.from_file_path("chart.png")
 result = pipeline.run(
     {