diff --git a/src/node/services/mcpResultTransform.test.ts b/src/node/services/mcpResultTransform.test.ts index f169843a42..1423325fa0 100644 --- a/src/node/services/mcpResultTransform.test.ts +++ b/src/node/services/mcpResultTransform.test.ts @@ -21,10 +21,10 @@ describe("transformMCPResult", () => { }); }); - it("should truncate large image data to prevent context overflow", () => { - // Create a large base64 string that simulates a big screenshot - // A typical screenshot could be 500KB-2MB of base64 data - const largeImageData = "x".repeat(MAX_IMAGE_DATA_BYTES + 100_000); + it("should omit large image data to prevent context overflow", () => { + // Create a large base64 string that simulates a screenshot + // Even 50KB of base64 would be ~12,500 tokens when treated as text + const largeImageData = "x".repeat(MAX_IMAGE_DATA_BYTES + 10_000); const result = transformMCPResult({ content: [ { type: "text", text: "Screenshot taken" }, @@ -41,16 +41,16 @@ describe("transformMCPResult", () => { expect(transformed.value).toHaveLength(2); expect(transformed.value[0]).toEqual({ type: "text", text: "Screenshot taken" }); - // The image should be replaced with a text message explaining the truncation + // The image should be replaced with a text message explaining why it was omitted const imageResult = transformed.value[1]; expect(imageResult.type).toBe("text"); - expect(imageResult.text).toContain("Image data too large"); - expect(imageResult.text).toContain(String(largeImageData.length)); + expect(imageResult.text).toContain("Image omitted"); + expect(imageResult.text).toContain("per-image guard"); }); - it("should handle multiple images, truncating only the oversized ones", () => { + it("should handle multiple images, omitting only the oversized ones", () => { const smallImageData = "small".repeat(100); - const largeImageData = "x".repeat(MAX_IMAGE_DATA_BYTES + 50_000); + const largeImageData = "x".repeat(MAX_IMAGE_DATA_BYTES + 5_000); const result = transformMCPResult({ content: [ @@ -71,14 +71,14 @@ describe("transformMCPResult", () => { data: smallImageData, mediaType: "image/png", }); - // Large image gets truncated with explanation + // Large image gets omitted with explanation expect(transformed.value[1].type).toBe("text"); - expect(transformed.value[1].text).toContain("Image data too large"); + expect(transformed.value[1].text).toContain("Image omitted"); }); - it("should report approximate file size in KB/MB in truncation message", () => { - // ~1.5MB of base64 data - const largeImageData = "y".repeat(1_500_000); + it("should mention size and guard limit in omission message", () => { + // 100KB of base64 data should trigger the guard if limit is smaller, but we keep it big here + const largeImageData = "y".repeat(MAX_IMAGE_DATA_BYTES + 1_000); const result = transformMCPResult({ content: [{ type: "image", data: largeImageData, mimeType: "image/png" }], }); @@ -89,8 +89,10 @@ describe("transformMCPResult", () => { }; expect(transformed.value[0].type).toBe("text"); - // Should mention MB since it's over 1MB - expect(transformed.value[0].text).toMatch(/\d+(\.\d+)?\s*MB/i); + // Should mention size and guard + expect(transformed.value[0].text).toMatch(/Image omitted/); + expect(transformed.value[0].text).toMatch(/per-image guard/i); + expect(transformed.value[0].text).toMatch(/MB|KB/); }); }); diff --git a/src/node/services/mcpResultTransform.ts b/src/node/services/mcpResultTransform.ts index 8afc044d80..a59ad045e6 100644 --- a/src/node/services/mcpResultTransform.ts +++ b/src/node/services/mcpResultTransform.ts @@ -1,12 +1,14 @@ import { log } from "@/node/services/log"; /** - * Maximum size of base64 image data in bytes before truncation. - * Large images can overflow the model's context window. 256KB of base64 - * represents roughly 192KB of actual image data, which is sufficient for - * screenshots while preventing context overflow. + * Maximum size of base64 image data in bytes before we drop it. + * + * Rationale: providers already accept multi‑megabyte images, but a single + * 20–30MB screenshot can still blow up request sizes or hit provider limits + * (e.g., Anthropic ~32MB total request). We keep a generous per‑image guard to + * pass normal screenshots while preventing pathological payloads. */ -export const MAX_IMAGE_DATA_BYTES = 256 * 1024; // 256KB of base64 data +export const MAX_IMAGE_DATA_BYTES = 8 * 1024 * 1024; // 8MB guard per image /** * MCP CallToolResult content types (from @ai-sdk/mcp) @@ -92,14 +94,14 @@ export function transformMCPResult(result: MCPCallToolResult): unknown { // Check if image data exceeds the limit const dataLength = imageItem.data?.length ?? 0; if (dataLength > MAX_IMAGE_DATA_BYTES) { - log.warn("[MCP] Image data too large, truncating", { + log.warn("[MCP] Image data too large, omitting from context", { mimeType: imageItem.mimeType, dataLength, maxAllowed: MAX_IMAGE_DATA_BYTES, }); return { type: "text" as const, - text: `[Image data too large to include in context: ${formatBytes(dataLength)} (${dataLength} bytes). The image was captured but cannot be displayed inline. Consider using a smaller viewport or requesting a specific region.]`, + text: `[Image omitted: ${formatBytes(dataLength)} exceeds per-image guard of ${formatBytes(MAX_IMAGE_DATA_BYTES)}. Reduce resolution or quality and retry.]`, }; } // Ensure mediaType is present - default to image/png if missing diff --git a/tests/ipc/mcpConfig.test.ts b/tests/ipc/mcpConfig.test.ts index 94a3e245ac..c73eba6681 100644 --- a/tests/ipc/mcpConfig.test.ts +++ b/tests/ipc/mcpConfig.test.ts @@ -17,6 +17,7 @@ import { extractTextFromEvents, HAIKU_MODEL, } from "./helpers"; +import type { StreamCollector } from "./streamCollector"; const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip; @@ -24,6 +25,83 @@ if (shouldRunIntegrationTests()) { validateApiKeys(["ANTHROPIC_API_KEY"]); } +// Shared types for MCP content parsing +type MediaItem = { type: "media"; data: string; mediaType: string }; +type TextItem = { type: "text"; text: string }; + +function isMediaItem(item: unknown): item is MediaItem { + return ( + typeof item === "object" && + item !== null && + "type" in item && + (item as { type: string }).type === "media" + ); +} + +function isTextItem(item: unknown): item is TextItem { + return ( + typeof item === "object" && + item !== null && + "type" in item && + (item as { type: string }).type === "text" + ); +} + +/** + * Assert that a screenshot result has valid media content. + * Verifies: proper structure, no omitted images, no base64 in text, valid mediaType. + */ +function assertValidScreenshotResult( + result: unknown, + allowedMediaTypes?: RegExp +): { mediaItems: MediaItem[]; textItems: TextItem[] } { + expect(typeof result).toBe("object"); + expect(result).not.toBeNull(); + expect(result).toHaveProperty("type", "content"); + expect(result).toHaveProperty("value"); + + const value = (result as { value: unknown[] }).value; + expect(Array.isArray(value)).toBe(true); + + const mediaItems = value.filter(isMediaItem); + const textItems = value.filter(isTextItem); + + // No "Image omitted" text + const hasOmittedImageText = textItems.some((t) => t.text.includes("Image omitted")); + expect(hasOmittedImageText).toBe(false); + + // Must have at least one media item + expect(mediaItems.length).toBeGreaterThan(0); + + // Text parts must not contain base64 blobs (would indicate serialization as text) + const longBase64Pattern = /[A-Za-z0-9+/]{10000,}/; + for (const t of textItems) { + expect(t.text.startsWith("data:image")).toBe(false); + expect(longBase64Pattern.test(t.text)).toBe(false); + } + + // Validate media items + const typePattern = allowedMediaTypes ?? /^image\//; + for (const media of mediaItems) { + expect(media.mediaType).toBeDefined(); + expect(media.mediaType).toMatch(typePattern); + expect(media.data).toBeDefined(); + expect(media.data.length).toBeGreaterThan(1000); + } + + return { mediaItems, textItems }; +} + +/** + * Assert that the model response describes example.com content. + */ +function assertModelDescribesScreenshot(collector: StreamCollector): void { + const deltas = collector.getDeltas(); + const responseText = extractTextFromEvents(deltas).toLowerCase(); + expect(responseText).toContain("example domain"); + expect(responseText.length).toBeGreaterThan(20); +} + describeIntegration("MCP project configuration", () => { test.concurrent("add, list, and remove MCP servers", async () => { const env = await createTestEnvironment(); @@ -75,22 +153,32 @@ describeIntegration("MCP project configuration", () => { }); describeIntegration("MCP server integration with model", () => { - test.concurrent( - "MCP image content is correctly transformed to AI SDK format", - async () => { - console.log("[MCP Image Test] Setting up workspace..."); - // Setup workspace with Anthropic provider + // Test matrix for image format handling + const imageFormatCases = [ + { + name: "PNG", + prompt: "Navigate to https://example.com and take a screenshot. Describe what you see.", + mediaTypePattern: /^image\//, + }, + { + name: "JPEG", + prompt: + 'Navigate to https://example.com and take a screenshot in JPEG format (use format: "jpeg"). Describe what you see.', + mediaTypePattern: /^image\/(jpeg|jpg|webp)$/, + }, + ] as const; + + test.concurrent.each(imageFormatCases)( + "MCP $name image content is correctly transformed to AI SDK format", + async ({ name, prompt, mediaTypePattern }) => { const { env, workspaceId, tempGitRepo, cleanup } = await setupWorkspace( "anthropic", - "mcp-chrome" + `mcp-chrome-${name.toLowerCase()}` ); const client = resolveOrpcClient(env); - console.log("[MCP Image Test] Workspace created:", { workspaceId, tempGitRepo }); try { - // Add the Chrome DevTools MCP server to the project - // Use --headless and --no-sandbox for CI/root environments - console.log("[MCP Image Test] Adding Chrome DevTools MCP server..."); + // Add Chrome DevTools MCP server (headless + no-sandbox for CI) const addResult = await client.projects.mcp.add({ projectPath: tempGitRepo, name: "chrome", @@ -98,119 +186,35 @@ describeIntegration("MCP server integration with model", () => { "npx -y chrome-devtools-mcp@latest --headless --isolated --chromeArg='--no-sandbox'", }); expect(addResult.success).toBe(true); - console.log("[MCP Image Test] MCP server added"); - // Create stream collector to capture events - console.log("[MCP Image Test] Creating stream collector..."); const collector = createStreamCollector(env.orpc, workspaceId); collector.start(); await collector.waitForSubscription(); - console.log("[MCP Image Test] Stream collector ready"); - - // Send a message that should trigger screenshot - // First navigate to a simple page, then take a screenshot - console.log("[MCP Image Test] Sending message..."); - const result = await sendMessageWithModel( - env, - workspaceId, - "Navigate to https://example.com and take a screenshot. Describe what you see in the screenshot.", - HAIKU_MODEL - ); - console.log("[MCP Image Test] Message sent, result:", result.success); + const result = await sendMessageWithModel(env, workspaceId, prompt, HAIKU_MODEL); expect(result.success).toBe(true); - // Wait for stream to complete (this may take a while with Chrome) - console.log("[MCP Image Test] Waiting for stream-end..."); - await collector.waitForEvent("stream-end", 120000); // 2 minutes for Chrome operations - console.log("[MCP Image Test] Stream ended"); + await collector.waitForEvent("stream-end", 120000); assertStreamSuccess(collector); - // Find the screenshot tool call and its result + // Find screenshot tool result const events = collector.getEvents(); const toolCallEnds = events.filter( (e): e is Extract => e.type === "tool-call-end" ); - console.log( - "[MCP Image Test] Tool call ends:", - toolCallEnds.map((e) => ({ toolName: e.toolName, resultType: typeof e.result })) - ); - - // Find the screenshot tool result (namespaced as chrome_take_screenshot) const screenshotResult = toolCallEnds.find((e) => e.toolName === "chrome_take_screenshot"); expect(screenshotResult).toBeDefined(); - // Verify the result has correct AI SDK format with mediaType - const result_output = screenshotResult!.result as - | { type: string; value: unknown[] } - | unknown; - // Log media items to verify mediaType presence - if ( - typeof result_output === "object" && - result_output !== null && - "value" in result_output - ) { - const value = (result_output as { value: unknown[] }).value; - const mediaPreview = value - .filter( - (v): v is object => - typeof v === "object" && - v !== null && - "type" in v && - (v as { type: string }).type === "media" - ) - .map((m) => ({ - type: (m as { type: string }).type, - mediaType: (m as { mediaType?: string }).mediaType, - dataLen: ((m as { data?: string }).data || "").length, - })); - console.log("[MCP Image Test] Media items:", JSON.stringify(mediaPreview)); - } - - // If it's properly transformed, it should have { type: "content", value: [...] } - if ( - typeof result_output === "object" && - result_output !== null && - "type" in result_output - ) { - const typedResult = result_output as { type: string; value: unknown[] }; - expect(typedResult.type).toBe("content"); - expect(Array.isArray(typedResult.value)).toBe(true); - - // Check for media content with mediaType - const mediaItems = typedResult.value.filter( - (item): item is { type: "media"; data: string; mediaType: string } => - typeof item === "object" && - item !== null && - "type" in item && - (item as { type: string }).type === "media" - ); - - expect(mediaItems.length).toBeGreaterThan(0); - // Verify mediaType is present and is a valid image type - for (const media of mediaItems) { - expect(media.mediaType).toBeDefined(); - expect(media.mediaType).toMatch(/^image\//); - expect(media.data).toBeDefined(); - expect(media.data.length).toBeGreaterThan(100); // Should have actual image data - } - } - - // Verify model's response mentions seeing something (proves it understood the image) - const deltas = collector.getDeltas(); - const responseText = extractTextFromEvents(deltas).toLowerCase(); - console.log("[MCP Image Test] Response text preview:", responseText.slice(0, 200)); - // Model should describe something it sees - domain name, content, or visual elements - expect(responseText).toMatch(/example|domain|website|page|text|heading|title/i); + // Validate result structure and media content + assertValidScreenshotResult(screenshotResult!.result, mediaTypePattern); + assertModelDescribesScreenshot(collector); collector.stop(); } finally { - console.log("[MCP Image Test] Cleaning up..."); await cleanup(); - console.log("[MCP Image Test] Done"); } }, - 180000 // 3 minutes - Chrome operations can be slow + 180000 ); test.concurrent(