From 791f1a224fbc96c2a674d63e4a103f6b2a4f86bf Mon Sep 17 00:00:00 2001 From: Ammar Date: Tue, 9 Dec 2025 09:10:47 -0600 Subject: [PATCH 1/3] =?UTF-8?q?=F0=9F=A4=96=20fix:=20raise=20MCP=20image?= =?UTF-8?q?=20guard=20to=208MB,=20add=20JPEG=20test=20matrix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Raise MAX_IMAGE_DATA_BYTES from 256KB to 8MB per image - AI SDK v5 properly converts media tool results to image blocks - Previous limit was unnecessarily restrictive for normal screenshots - New limit guards against pathological payloads while allowing typical images - Tighten IPC integration tests for MCP image handling: - Add JPEG format test alongside existing PNG test - Assert no text parts contain data URIs or large base64 blobs - Verify model response includes 'example domain' to prove image was read - Ensures regression detection if SDK ever serializes images as text _Generated with mux_ --- src/node/services/mcpResultTransform.test.ts | 34 ++-- src/node/services/mcpResultTransform.ts | 16 +- tests/ipc/mcpConfig.test.ts | 171 ++++++++++++++++++- 3 files changed, 192 insertions(+), 29 deletions(-) diff --git a/src/node/services/mcpResultTransform.test.ts b/src/node/services/mcpResultTransform.test.ts index f169843a42..1423325fa0 100644 --- a/src/node/services/mcpResultTransform.test.ts +++ b/src/node/services/mcpResultTransform.test.ts @@ -21,10 +21,10 @@ describe("transformMCPResult", () => { }); }); - it("should truncate large image data to prevent context overflow", () => { - // Create a large base64 string that simulates a big screenshot - // A typical screenshot could be 500KB-2MB of base64 data - const largeImageData = "x".repeat(MAX_IMAGE_DATA_BYTES + 100_000); + it("should omit large image data to prevent context overflow", () => { + // Create a large base64 string that simulates a screenshot + // Even 50KB of base64 would be ~12,500 tokens when treated as text + const largeImageData = 
"x".repeat(MAX_IMAGE_DATA_BYTES + 10_000); const result = transformMCPResult({ content: [ { type: "text", text: "Screenshot taken" }, @@ -41,16 +41,16 @@ describe("transformMCPResult", () => { expect(transformed.value).toHaveLength(2); expect(transformed.value[0]).toEqual({ type: "text", text: "Screenshot taken" }); - // The image should be replaced with a text message explaining the truncation + // The image should be replaced with a text message explaining why it was omitted const imageResult = transformed.value[1]; expect(imageResult.type).toBe("text"); - expect(imageResult.text).toContain("Image data too large"); - expect(imageResult.text).toContain(String(largeImageData.length)); + expect(imageResult.text).toContain("Image omitted"); + expect(imageResult.text).toContain("per-image guard"); }); - it("should handle multiple images, truncating only the oversized ones", () => { + it("should handle multiple images, omitting only the oversized ones", () => { const smallImageData = "small".repeat(100); - const largeImageData = "x".repeat(MAX_IMAGE_DATA_BYTES + 50_000); + const largeImageData = "x".repeat(MAX_IMAGE_DATA_BYTES + 5_000); const result = transformMCPResult({ content: [ @@ -71,14 +71,14 @@ describe("transformMCPResult", () => { data: smallImageData, mediaType: "image/png", }); - // Large image gets truncated with explanation + // Large image gets omitted with explanation expect(transformed.value[1].type).toBe("text"); - expect(transformed.value[1].text).toContain("Image data too large"); + expect(transformed.value[1].text).toContain("Image omitted"); }); - it("should report approximate file size in KB/MB in truncation message", () => { - // ~1.5MB of base64 data - const largeImageData = "y".repeat(1_500_000); + it("should mention size and guard limit in omission message", () => { + // 100KB of base64 data should trigger the guard if limit is smaller, but we keep it big here + const largeImageData = "y".repeat(MAX_IMAGE_DATA_BYTES + 1_000); const result = 
transformMCPResult({ content: [{ type: "image", data: largeImageData, mimeType: "image/png" }], }); @@ -89,8 +89,10 @@ describe("transformMCPResult", () => { }; expect(transformed.value[0].type).toBe("text"); - // Should mention MB since it's over 1MB - expect(transformed.value[0].text).toMatch(/\d+(\.\d+)?\s*MB/i); + // Should mention size and guard + expect(transformed.value[0].text).toMatch(/Image omitted/); + expect(transformed.value[0].text).toMatch(/per-image guard/i); + expect(transformed.value[0].text).toMatch(/MB|KB/); }); }); diff --git a/src/node/services/mcpResultTransform.ts b/src/node/services/mcpResultTransform.ts index 8afc044d80..a59ad045e6 100644 --- a/src/node/services/mcpResultTransform.ts +++ b/src/node/services/mcpResultTransform.ts @@ -1,12 +1,14 @@ import { log } from "@/node/services/log"; /** - * Maximum size of base64 image data in bytes before truncation. - * Large images can overflow the model's context window. 256KB of base64 - * represents roughly 192KB of actual image data, which is sufficient for - * screenshots while preventing context overflow. + * Maximum size of base64 image data in bytes before we drop it. + * + * Rationale: providers already accept multi‑megabyte images, but a single + * 20–30MB screenshot can still blow up request sizes or hit provider limits + * (e.g., Anthropic ~32MB total request). We keep a generous per‑image guard to + * pass normal screenshots while preventing pathological payloads. */ -export const MAX_IMAGE_DATA_BYTES = 256 * 1024; // 256KB of base64 data +export const MAX_IMAGE_DATA_BYTES = 8 * 1024 * 1024; // 8MB guard per image /** * MCP CallToolResult content types (from @ai-sdk/mcp) @@ -92,14 +94,14 @@ export function transformMCPResult(result: MCPCallToolResult): unknown { // Check if image data exceeds the limit const dataLength = imageItem.data?.length ?? 
0; if (dataLength > MAX_IMAGE_DATA_BYTES) { - log.warn("[MCP] Image data too large, truncating", { + log.warn("[MCP] Image data too large, omitting from context", { mimeType: imageItem.mimeType, dataLength, maxAllowed: MAX_IMAGE_DATA_BYTES, }); return { type: "text" as const, - text: `[Image data too large to include in context: ${formatBytes(dataLength)} (${dataLength} bytes). The image was captured but cannot be displayed inline. Consider using a smaller viewport or requesting a specific region.]`, + text: `[Image omitted: ${formatBytes(dataLength)} exceeds per-image guard of ${formatBytes(MAX_IMAGE_DATA_BYTES)}. Reduce resolution or quality and retry.]`, }; } // Ensure mediaType is present - default to image/png if missing diff --git a/tests/ipc/mcpConfig.test.ts b/tests/ipc/mcpConfig.test.ts index 94a3e245ac..4562063bd3 100644 --- a/tests/ipc/mcpConfig.test.ts +++ b/tests/ipc/mcpConfig.test.ts @@ -186,22 +186,49 @@ describeIntegration("MCP server integration with model", () => { (item as { type: string }).type === "media" ); + const textItems = typedResult.value.filter( + (item): item is { type: "text"; text: string } => + typeof item === "object" && + item !== null && + "type" in item && + (item as { type: string }).type === "text" + ); + + const hasOmittedImageText = textItems.some((t) => t.text.includes("Image omitted")); + + expect(hasOmittedImageText).toBe(false); expect(mediaItems.length).toBeGreaterThan(0); - // Verify mediaType is present and is a valid image type + + // Ensure no text part contains a data URI or a large base64 blob + const longBase64Pattern = /[A-Za-z0-9+/]{10000,}/; + textItems.forEach((t) => { + expect(t.text.startsWith("data:image")).toBe(false); + expect(longBase64Pattern.test(t.text)).toBe(false); + }); + + // Verify media format and size for (const media of mediaItems) { expect(media.mediaType).toBeDefined(); expect(media.mediaType).toMatch(/^image\//); expect(media.data).toBeDefined(); - 
expect(media.data.length).toBeGreaterThan(100); // Should have actual image data + expect(media.data.length).toBeGreaterThan(1000); // Should have actual image data } + + // Log which path we took for debugging + console.log("[MCP Image Test] Result:", { + mediaCount: mediaItems.length, + textCount: textItems.length, + }); } - // Verify model's response mentions seeing something (proves it understood the image) + // Verify model's response - should describe the screenshot content const deltas = collector.getDeltas(); const responseText = extractTextFromEvents(deltas).toLowerCase(); console.log("[MCP Image Test] Response text preview:", responseText.slice(0, 200)); - // Model should describe something it sees - domain name, content, or visual elements - expect(responseText).toMatch(/example|domain|website|page|text|heading|title/i); + const expectedKeywords = ["example domain"]; + const matchedKeyword = expectedKeywords.some((k) => responseText.includes(k)); + expect(matchedKeyword).toBe(true); + expect(responseText.length).toBeGreaterThan(20); // Model should say something non-trivial collector.stop(); } finally { @@ -210,7 +237,139 @@ describeIntegration("MCP server integration with model", () => { console.log("[MCP Image Test] Done"); } }, - 180000 // 3 minutes - Chrome operations can be slow + 180000 + ); + + test.concurrent( + "MCP JPEG image content is correctly transformed to AI SDK format", + async () => { + console.log("[MCP JPEG Image Test] Setting up workspace..."); + // Setup workspace with Anthropic provider + const { env, workspaceId, tempGitRepo, cleanup } = await setupWorkspace( + "anthropic", + "mcp-chrome-jpeg" + ); + const client = resolveOrpcClient(env); + console.log("[MCP JPEG Image Test] Workspace created:", { workspaceId, tempGitRepo }); + + try { + // Add the Chrome DevTools MCP server to the project with same settings + console.log("[MCP JPEG Image Test] Adding Chrome DevTools MCP server..."); + const addResult = await 
client.projects.mcp.add({ + projectPath: tempGitRepo, + name: "chrome", + command: + "npx -y chrome-devtools-mcp@latest --headless --isolated --chromeArg='--no-sandbox'", + }); + expect(addResult.success).toBe(true); + console.log("[MCP JPEG Image Test] MCP server added"); + + // Create stream collector to capture events + console.log("[MCP JPEG Image Test] Creating stream collector..."); + const collector = createStreamCollector(env.orpc, workspaceId); + collector.start(); + await collector.waitForSubscription(); + console.log("[MCP JPEG Image Test] Stream collector ready"); + + // Send a message that should trigger JPEG screenshot + console.log("[MCP JPEG Image Test] Sending message..."); + const result = await sendMessageWithModel( + env, + workspaceId, + "Navigate to https://example.com and take a screenshot in JPEG format (use format: \"jpeg\"). Describe what you see in the screenshot.", + HAIKU_MODEL + ); + console.log("[MCP JPEG Image Test] Message sent, result:", result.success); + + expect(result.success).toBe(true); + + // Wait for stream to complete (Chrome may take time) + console.log("[MCP JPEG Image Test] Waiting for stream-end..."); + await collector.waitForEvent("stream-end", 120000); + console.log("[MCP JPEG Image Test] Stream ended"); + assertStreamSuccess(collector); + + // Find the screenshot tool call and its result + const events = collector.getEvents(); + const toolCallEnds = events.filter( + (e): e is Extract => e.type === "tool-call-end" + ); + console.log( + "[MCP JPEG Image Test] Tool call ends:", + toolCallEnds.map((e) => ({ toolName: e.toolName, resultType: typeof e.result })) + ); + + const screenshotResult = toolCallEnds.find((e) => e.toolName === "chrome_take_screenshot"); + expect(screenshotResult).toBeDefined(); + + const result_output = screenshotResult!.result as { type: string; value: unknown[] } | unknown; + + if ( + typeof result_output === "object" && + result_output !== null && + "value" in result_output + ) { + const value 
= (result_output as { value: unknown[] }).value; + const mediaItems = value.filter( + (item): item is { type: "media"; data: string; mediaType: string } => + typeof item === "object" && + item !== null && + "type" in item && + (item as { type: string }).type === "media" + ); + + const textItems = value.filter( + (item): item is { type: "text"; text: string } => + typeof item === "object" && + item !== null && + "type" in item && + (item as { type: string }).type === "text" + ); + + const hasOmittedImageText = textItems.some((t) => t.text.includes("Image omitted")); + expect(hasOmittedImageText).toBe(false); + expect(mediaItems.length).toBeGreaterThan(0); + + // Ensure no text part contains a data URI or a large base64 blob + const longBase64Pattern = /[A-Za-z0-9+/]{10000,}/; + textItems.forEach((t) => { + expect(t.text.startsWith("data:image")).toBe(false); + expect(longBase64Pattern.test(t.text)).toBe(false); + }); + + for (const media of mediaItems) { + expect(media.mediaType).toBeDefined(); + expect(media.mediaType).toMatch(/^image\//); + // Prefer JPEG, but allow WebP if the server re-encodes; still verify presence + expect(["image/jpeg", "image/jpg", "image/webp"]).toContain(media.mediaType); + expect(media.data).toBeDefined(); + expect(media.data.length).toBeGreaterThan(1000); + } + + console.log("[MCP JPEG Image Test] Result:", { + mediaCount: mediaItems.length, + textCount: textItems.length, + mediaTypes: mediaItems.map((m) => m.mediaType), + }); + } + + // Verify model response is non-trivial and reflects the screenshot + const deltas = collector.getDeltas(); + const responseText = extractTextFromEvents(deltas).toLowerCase(); + console.log("[MCP JPEG Image Test] Response text preview:", responseText.slice(0, 200)); + const expectedKeywords = ["example domain"]; + const matchedKeyword = expectedKeywords.some((k) => responseText.includes(k)); + expect(matchedKeyword).toBe(true); + expect(responseText.length).toBeGreaterThan(20); + + collector.stop(); + } 
finally { + console.log("[MCP JPEG Image Test] Cleaning up..."); + await cleanup(); + console.log("[MCP JPEG Image Test] Done"); + } + }, + 180000 ); test.concurrent( From 6e7475a34a37a2f6479a79d453bc967a159589e2 Mon Sep 17 00:00:00 2001 From: Ammar Date: Tue, 9 Dec 2025 09:13:23 -0600 Subject: [PATCH 2/3] fmt --- tests/ipc/mcpConfig.test.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/ipc/mcpConfig.test.ts b/tests/ipc/mcpConfig.test.ts index 4562063bd3..5b13aa897e 100644 --- a/tests/ipc/mcpConfig.test.ts +++ b/tests/ipc/mcpConfig.test.ts @@ -276,7 +276,7 @@ describeIntegration("MCP server integration with model", () => { const result = await sendMessageWithModel( env, workspaceId, - "Navigate to https://example.com and take a screenshot in JPEG format (use format: \"jpeg\"). Describe what you see in the screenshot.", + 'Navigate to https://example.com and take a screenshot in JPEG format (use format: "jpeg"). Describe what you see in the screenshot.', HAIKU_MODEL ); console.log("[MCP JPEG Image Test] Message sent, result:", result.success); @@ -302,7 +302,9 @@ describeIntegration("MCP server integration with model", () => { const screenshotResult = toolCallEnds.find((e) => e.toolName === "chrome_take_screenshot"); expect(screenshotResult).toBeDefined(); - const result_output = screenshotResult!.result as { type: string; value: unknown[] } | unknown; + const result_output = screenshotResult!.result as + | { type: string; value: unknown[] } + | unknown; if ( typeof result_output === "object" && From 1534f3e0d3771e42f53ed275d3a476c834f3a94e Mon Sep 17 00:00:00 2001 From: Ammar Date: Tue, 9 Dec 2025 11:40:58 -0600 Subject: [PATCH 3/3] refactor: extract MCP image test helpers, use test.each --- tests/ipc/mcpConfig.test.ts | 359 ++++++++++-------------------------- 1 file changed, 101 insertions(+), 258 deletions(-) diff --git a/tests/ipc/mcpConfig.test.ts b/tests/ipc/mcpConfig.test.ts index 5b13aa897e..c73eba6681 100644 --- 
a/tests/ipc/mcpConfig.test.ts +++ b/tests/ipc/mcpConfig.test.ts @@ -17,6 +17,7 @@ import { extractTextFromEvents, HAIKU_MODEL, } from "./helpers"; +import type { StreamCollector } from "./streamCollector"; const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip; @@ -24,6 +25,83 @@ if (shouldRunIntegrationTests()) { validateApiKeys(["ANTHROPIC_API_KEY"]); } +// Shared types for MCP content parsing +type MediaItem = { type: "media"; data: string; mediaType: string }; +type TextItem = { type: "text"; text: string }; + +function isMediaItem(item: unknown): item is MediaItem { + return ( + typeof item === "object" && + item !== null && + "type" in item && + (item as { type: string }).type === "media" + ); +} + +function isTextItem(item: unknown): item is TextItem { + return ( + typeof item === "object" && + item !== null && + "type" in item && + (item as { type: string }).type === "text" + ); +} + +/** + * Assert that a screenshot result has valid media content. + * Verifies: proper structure, no omitted images, no base64 in text, valid mediaType. 
+ */ +function assertValidScreenshotResult( + result: unknown, + allowedMediaTypes?: RegExp +): { mediaItems: MediaItem[]; textItems: TextItem[] } { + expect(typeof result).toBe("object"); + expect(result).not.toBeNull(); + expect(result).toHaveProperty("type", "content"); + expect(result).toHaveProperty("value"); + + const value = (result as { value: unknown[] }).value; + expect(Array.isArray(value)).toBe(true); + + const mediaItems = value.filter(isMediaItem); + const textItems = value.filter(isTextItem); + + // No "Image omitted" text + const hasOmittedImageText = textItems.some((t) => t.text.includes("Image omitted")); + expect(hasOmittedImageText).toBe(false); + + // Must have at least one media item + expect(mediaItems.length).toBeGreaterThan(0); + + // Text parts must not contain base64 blobs (would indicate serialization as text) + const longBase64Pattern = /[A-Za-z0-9+/]{10000,}/; + for (const t of textItems) { + expect(t.text.startsWith("data:image")).toBe(false); + expect(longBase64Pattern.test(t.text)).toBe(false); + } + + // Validate media items + const typePattern = allowedMediaTypes ?? /^image\//; + for (const media of mediaItems) { + expect(media.mediaType).toBeDefined(); + expect(media.mediaType).toMatch(typePattern); + expect(media.data).toBeDefined(); + expect(media.data.length).toBeGreaterThan(1000); + } + + return { mediaItems, textItems }; +} + +/** + * Assert that the model response describes example.com content. 
+ */ +function assertModelDescribesScreenshot(collector: StreamCollector): void { + const deltas = collector.getDeltas(); + const responseText = extractTextFromEvents(deltas).toLowerCase(); + expect(responseText).toContain("example domain"); + expect(responseText.length).toBeGreaterThan(20); +} + describeIntegration("MCP project configuration", () => { test.concurrent("add, list, and remove MCP servers", async () => { const env = await createTestEnvironment(); @@ -75,186 +153,32 @@ describeIntegration("MCP project configuration", () => { }); describeIntegration("MCP server integration with model", () => { - test.concurrent( - "MCP image content is correctly transformed to AI SDK format", - async () => { - console.log("[MCP Image Test] Setting up workspace..."); - // Setup workspace with Anthropic provider - const { env, workspaceId, tempGitRepo, cleanup } = await setupWorkspace( - "anthropic", - "mcp-chrome" - ); - const client = resolveOrpcClient(env); - console.log("[MCP Image Test] Workspace created:", { workspaceId, tempGitRepo }); - - try { - // Add the Chrome DevTools MCP server to the project - // Use --headless and --no-sandbox for CI/root environments - console.log("[MCP Image Test] Adding Chrome DevTools MCP server..."); - const addResult = await client.projects.mcp.add({ - projectPath: tempGitRepo, - name: "chrome", - command: - "npx -y chrome-devtools-mcp@latest --headless --isolated --chromeArg='--no-sandbox'", - }); - expect(addResult.success).toBe(true); - console.log("[MCP Image Test] MCP server added"); - - // Create stream collector to capture events - console.log("[MCP Image Test] Creating stream collector..."); - const collector = createStreamCollector(env.orpc, workspaceId); - collector.start(); - await collector.waitForSubscription(); - console.log("[MCP Image Test] Stream collector ready"); - - // Send a message that should trigger screenshot - // First navigate to a simple page, then take a screenshot - console.log("[MCP Image Test] Sending 
message..."); - const result = await sendMessageWithModel( - env, - workspaceId, - "Navigate to https://example.com and take a screenshot. Describe what you see in the screenshot.", - HAIKU_MODEL - ); - console.log("[MCP Image Test] Message sent, result:", result.success); - - expect(result.success).toBe(true); - - // Wait for stream to complete (this may take a while with Chrome) - console.log("[MCP Image Test] Waiting for stream-end..."); - await collector.waitForEvent("stream-end", 120000); // 2 minutes for Chrome operations - console.log("[MCP Image Test] Stream ended"); - assertStreamSuccess(collector); - - // Find the screenshot tool call and its result - const events = collector.getEvents(); - const toolCallEnds = events.filter( - (e): e is Extract => e.type === "tool-call-end" - ); - console.log( - "[MCP Image Test] Tool call ends:", - toolCallEnds.map((e) => ({ toolName: e.toolName, resultType: typeof e.result })) - ); - - // Find the screenshot tool result (namespaced as chrome_take_screenshot) - const screenshotResult = toolCallEnds.find((e) => e.toolName === "chrome_take_screenshot"); - expect(screenshotResult).toBeDefined(); - - // Verify the result has correct AI SDK format with mediaType - const result_output = screenshotResult!.result as - | { type: string; value: unknown[] } - | unknown; - // Log media items to verify mediaType presence - if ( - typeof result_output === "object" && - result_output !== null && - "value" in result_output - ) { - const value = (result_output as { value: unknown[] }).value; - const mediaPreview = value - .filter( - (v): v is object => - typeof v === "object" && - v !== null && - "type" in v && - (v as { type: string }).type === "media" - ) - .map((m) => ({ - type: (m as { type: string }).type, - mediaType: (m as { mediaType?: string }).mediaType, - dataLen: ((m as { data?: string }).data || "").length, - })); - console.log("[MCP Image Test] Media items:", JSON.stringify(mediaPreview)); - } - - // If it's properly 
transformed, it should have { type: "content", value: [...] } - if ( - typeof result_output === "object" && - result_output !== null && - "type" in result_output - ) { - const typedResult = result_output as { type: string; value: unknown[] }; - expect(typedResult.type).toBe("content"); - expect(Array.isArray(typedResult.value)).toBe(true); - - // Check for media content with mediaType - const mediaItems = typedResult.value.filter( - (item): item is { type: "media"; data: string; mediaType: string } => - typeof item === "object" && - item !== null && - "type" in item && - (item as { type: string }).type === "media" - ); - - const textItems = typedResult.value.filter( - (item): item is { type: "text"; text: string } => - typeof item === "object" && - item !== null && - "type" in item && - (item as { type: string }).type === "text" - ); - - const hasOmittedImageText = textItems.some((t) => t.text.includes("Image omitted")); - - expect(hasOmittedImageText).toBe(false); - expect(mediaItems.length).toBeGreaterThan(0); - - // Ensure no text part contains a data URI or a large base64 blob - const longBase64Pattern = /[A-Za-z0-9+/]{10000,}/; - textItems.forEach((t) => { - expect(t.text.startsWith("data:image")).toBe(false); - expect(longBase64Pattern.test(t.text)).toBe(false); - }); - - // Verify media format and size - for (const media of mediaItems) { - expect(media.mediaType).toBeDefined(); - expect(media.mediaType).toMatch(/^image\//); - expect(media.data).toBeDefined(); - expect(media.data.length).toBeGreaterThan(1000); // Should have actual image data - } - - // Log which path we took for debugging - console.log("[MCP Image Test] Result:", { - mediaCount: mediaItems.length, - textCount: textItems.length, - }); - } - - // Verify model's response - should describe the screenshot content - const deltas = collector.getDeltas(); - const responseText = extractTextFromEvents(deltas).toLowerCase(); - console.log("[MCP Image Test] Response text preview:", responseText.slice(0, 
200)); - const expectedKeywords = ["example domain"]; - const matchedKeyword = expectedKeywords.some((k) => responseText.includes(k)); - expect(matchedKeyword).toBe(true); - expect(responseText.length).toBeGreaterThan(20); // Model should say something non-trivial - - collector.stop(); - } finally { - console.log("[MCP Image Test] Cleaning up..."); - await cleanup(); - console.log("[MCP Image Test] Done"); - } + // Test matrix for image format handling + const imageFormatCases = [ + { + name: "PNG", + prompt: "Navigate to https://example.com and take a screenshot. Describe what you see.", + mediaTypePattern: /^image\//, }, - 180000 - ); + { + name: "JPEG", + prompt: + 'Navigate to https://example.com and take a screenshot in JPEG format (use format: "jpeg"). Describe what you see.', + mediaTypePattern: /^image\/(jpeg|jpg|webp)$/, + }, + ] as const; - test.concurrent( - "MCP JPEG image content is correctly transformed to AI SDK format", - async () => { - console.log("[MCP JPEG Image Test] Setting up workspace..."); - // Setup workspace with Anthropic provider + test.concurrent.each(imageFormatCases)( + "MCP $name image content is correctly transformed to AI SDK format", + async ({ name, prompt, mediaTypePattern }) => { const { env, workspaceId, tempGitRepo, cleanup } = await setupWorkspace( "anthropic", - "mcp-chrome-jpeg" + `mcp-chrome-${name.toLowerCase()}` ); const client = resolveOrpcClient(env); - console.log("[MCP JPEG Image Test] Workspace created:", { workspaceId, tempGitRepo }); try { - // Add the Chrome DevTools MCP server to the project with same settings - console.log("[MCP JPEG Image Test] Adding Chrome DevTools MCP server..."); + // Add Chrome DevTools MCP server (headless + no-sandbox for CI) const addResult = await client.projects.mcp.add({ projectPath: tempGitRepo, name: "chrome", @@ -262,113 +186,32 @@ describeIntegration("MCP server integration with model", () => { "npx -y chrome-devtools-mcp@latest --headless --isolated 
--chromeArg='--no-sandbox'", }); expect(addResult.success).toBe(true); - console.log("[MCP JPEG Image Test] MCP server added"); - // Create stream collector to capture events - console.log("[MCP JPEG Image Test] Creating stream collector..."); const collector = createStreamCollector(env.orpc, workspaceId); collector.start(); await collector.waitForSubscription(); - console.log("[MCP JPEG Image Test] Stream collector ready"); - - // Send a message that should trigger JPEG screenshot - console.log("[MCP JPEG Image Test] Sending message..."); - const result = await sendMessageWithModel( - env, - workspaceId, - 'Navigate to https://example.com and take a screenshot in JPEG format (use format: "jpeg"). Describe what you see in the screenshot.', - HAIKU_MODEL - ); - console.log("[MCP JPEG Image Test] Message sent, result:", result.success); + const result = await sendMessageWithModel(env, workspaceId, prompt, HAIKU_MODEL); expect(result.success).toBe(true); - // Wait for stream to complete (Chrome may take time) - console.log("[MCP JPEG Image Test] Waiting for stream-end..."); await collector.waitForEvent("stream-end", 120000); - console.log("[MCP JPEG Image Test] Stream ended"); assertStreamSuccess(collector); - // Find the screenshot tool call and its result + // Find screenshot tool result const events = collector.getEvents(); const toolCallEnds = events.filter( (e): e is Extract => e.type === "tool-call-end" ); - console.log( - "[MCP JPEG Image Test] Tool call ends:", - toolCallEnds.map((e) => ({ toolName: e.toolName, resultType: typeof e.result })) - ); - const screenshotResult = toolCallEnds.find((e) => e.toolName === "chrome_take_screenshot"); expect(screenshotResult).toBeDefined(); - const result_output = screenshotResult!.result as - | { type: string; value: unknown[] } - | unknown; - - if ( - typeof result_output === "object" && - result_output !== null && - "value" in result_output - ) { - const value = (result_output as { value: unknown[] }).value; - const 
mediaItems = value.filter( - (item): item is { type: "media"; data: string; mediaType: string } => - typeof item === "object" && - item !== null && - "type" in item && - (item as { type: string }).type === "media" - ); - - const textItems = value.filter( - (item): item is { type: "text"; text: string } => - typeof item === "object" && - item !== null && - "type" in item && - (item as { type: string }).type === "text" - ); - - const hasOmittedImageText = textItems.some((t) => t.text.includes("Image omitted")); - expect(hasOmittedImageText).toBe(false); - expect(mediaItems.length).toBeGreaterThan(0); - - // Ensure no text part contains a data URI or a large base64 blob - const longBase64Pattern = /[A-Za-z0-9+/]{10000,}/; - textItems.forEach((t) => { - expect(t.text.startsWith("data:image")).toBe(false); - expect(longBase64Pattern.test(t.text)).toBe(false); - }); - - for (const media of mediaItems) { - expect(media.mediaType).toBeDefined(); - expect(media.mediaType).toMatch(/^image\//); - // Prefer JPEG, but allow WebP if the server re-encodes; still verify presence - expect(["image/jpeg", "image/jpg", "image/webp"]).toContain(media.mediaType); - expect(media.data).toBeDefined(); - expect(media.data.length).toBeGreaterThan(1000); - } - - console.log("[MCP JPEG Image Test] Result:", { - mediaCount: mediaItems.length, - textCount: textItems.length, - mediaTypes: mediaItems.map((m) => m.mediaType), - }); - } - - // Verify model response is non-trivial and reflects the screenshot - const deltas = collector.getDeltas(); - const responseText = extractTextFromEvents(deltas).toLowerCase(); - console.log("[MCP JPEG Image Test] Response text preview:", responseText.slice(0, 200)); - const expectedKeywords = ["example domain"]; - const matchedKeyword = expectedKeywords.some((k) => responseText.includes(k)); - expect(matchedKeyword).toBe(true); - expect(responseText.length).toBeGreaterThan(20); + // Validate result structure and media content + 
assertValidScreenshotResult(screenshotResult!.result, mediaTypePattern); + assertModelDescribesScreenshot(collector); collector.stop(); } finally { - console.log("[MCP JPEG Image Test] Cleaning up..."); await cleanup(); - console.log("[MCP JPEG Image Test] Done"); } }, 180000