diff --git a/src/services/aiService.ts b/src/services/aiService.ts index ea90a4ab2e..1073cd4bf5 100644 --- a/src/services/aiService.ts +++ b/src/services/aiService.ts @@ -19,7 +19,7 @@ import { validateAnthropicCompliance, addInterruptedSentinel, filterEmptyAssistantMessages, - stripReasoningForOpenAI, + clearProviderMetadataForOpenAI, } from "@/utils/messages/modelMessageTransform"; import { applyCacheControl } from "@/utils/ai/cacheStrategy"; import type { HistoryService } from "./historyService"; @@ -283,31 +283,31 @@ export class AIService extends EventEmitter { const [providerName] = modelString.split(":"); // Filter out assistant messages with only reasoning (no text/tools) - let filteredMessages = filterEmptyAssistantMessages(messages); + const filteredMessages = filterEmptyAssistantMessages(messages); log.debug(`Filtered ${messages.length - filteredMessages.length} empty assistant messages`); log.debug_obj(`${workspaceId}/1a_filtered_messages.json`, filteredMessages); - // OpenAI-specific: Strip reasoning parts from history - // OpenAI manages reasoning via previousResponseId; sending Anthropic-style reasoning - // parts creates orphaned reasoning items that cause API errors - if (providerName === "openai") { - filteredMessages = stripReasoningForOpenAI(filteredMessages); - log.debug("Stripped reasoning parts for OpenAI"); - log.debug_obj(`${workspaceId}/1b_openai_stripped.json`, filteredMessages); - } - // Add [INTERRUPTED] sentinel to partial messages (for model context) const messagesWithSentinel = addInterruptedSentinel(filteredMessages); // Convert CmuxMessage to ModelMessage format using Vercel AI SDK utility // Type assertion needed because CmuxMessage has custom tool parts for interrupted tools // eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-argument - const modelMessages = convertToModelMessages(messagesWithSentinel as any); + let modelMessages = convertToModelMessages(messagesWithSentinel as any); log.debug_obj(`${workspaceId}/2_model_messages.json`, modelMessages); + // OpenAI-specific: Clear provider metadata to prevent reasoning/tool errors + // OpenAI manages reasoning via previousResponseId; sending stale provider metadata + // from history causes "reasoning without following item" and tool call errors + if (providerName === "openai") { + modelMessages = clearProviderMetadataForOpenAI(modelMessages); + log.debug("Cleared provider metadata for OpenAI"); + log.debug_obj(`${workspaceId}/2a_openai_cleaned.json`, modelMessages); + } + // Apply ModelMessage transforms based on provider requirements - const transformedMessages = transformModelMessages(modelMessages, providerName); + const transformedMessages = transformModelMessages(modelMessages); // Apply cache control for Anthropic models AFTER transformation const finalMessages = applyCacheControl(transformedMessages, modelString); @@ -387,8 +387,7 @@ export class AIService extends EventEmitter { timestamp: Date.now(), }, providerOptions, - maxOutputTokens, - toolPolicy + maxOutputTokens ); if (!streamResult.success) { diff --git a/src/utils/messages/modelMessageTransform.test.ts b/src/utils/messages/modelMessageTransform.test.ts index 0db2f0574d..c6af6d14b9 100644 --- a/src/utils/messages/modelMessageTransform.test.ts +++ b/src/utils/messages/modelMessageTransform.test.ts @@ -23,7 +23,7 @@ describe("modelMessageTransform", () => { assistantMsg, ]; - const result = transformModelMessages(messages, "anthropic"); + const result = transformModelMessages(messages); 
expect(result).toEqual(messages); }); @@ -38,7 +38,7 @@ describe("modelMessageTransform", () => { }; const messages: ModelMessage[] = [assistantMsg1, assistantMsg2]; - const result = transformModelMessages(messages, "anthropic"); + const result = transformModelMessages(messages); expect(result).toEqual(messages); }); @@ -52,7 +52,7 @@ describe("modelMessageTransform", () => { }; const messages: ModelMessage[] = [assistantMsg]; - const result = transformModelMessages(messages, "anthropic"); + const result = transformModelMessages(messages); // Should only keep text, strip interrupted tool calls expect(result).toHaveLength(1); @@ -71,7 +71,7 @@ describe("modelMessageTransform", () => { }; const messages: ModelMessage[] = [assistantMsg]; - const result = transformModelMessages(messages, "anthropic"); + const result = transformModelMessages(messages); // Should filter out the entire message since it only has orphaned tool calls expect(result).toHaveLength(0); @@ -108,7 +108,7 @@ describe("modelMessageTransform", () => { }; const messages: ModelMessage[] = [assistantMsg, toolMsg]; - const result = transformModelMessages(messages, "anthropic"); + const result = transformModelMessages(messages); // Should have: text message, tool calls (only call1 & call2), tool results expect(result).toHaveLength(3); @@ -198,7 +198,7 @@ describe("modelMessageTransform", () => { }; const messages: ModelMessage[] = [assistantMsg, toolMsg]; - const result = transformModelMessages(messages, "anthropic"); + const result = transformModelMessages(messages); // Should split into multiple messages with tool results properly placed expect(result.length).toBeGreaterThan(2); @@ -323,7 +323,7 @@ describe("modelMessageTransform", () => { }, ]; - const result = transformModelMessages(messages, "anthropic"); + const result = transformModelMessages(messages); expect(result).toHaveLength(1); expect(result[0].role).toBe("user"); expect((result[0].content as Array<{ type: string; text: string }>)[0].text).toBe("Hello"); @@ -341,7 +341,7 @@ describe("modelMessageTransform", () => { }, ]; - const result = transformModelMessages(messages, "anthropic"); + const result = transformModelMessages(messages); expect(result).toHaveLength(1); expect(result[0].role).toBe("user"); expect((result[0].content as Array<{ type: string; text: string }>)[0].text).toBe( @@ -365,7 +365,7 @@ describe("modelMessageTransform", () => { }, ]; - const result = transformModelMessages(messages, "anthropic"); + const result = transformModelMessages(messages); expect(result).toHaveLength(1); expect(result[0].role).toBe("user"); expect((result[0].content as Array<{ type: string; text: string }>)[0].text).toBe( @@ -389,7 +389,7 @@ describe("modelMessageTransform", () => { }, ]; - const result = transformModelMessages(messages, "anthropic"); + const result = transformModelMessages(messages); expect(result).toHaveLength(3); expect(result[0].role).toBe("user"); expect((result[0].content as Array<{ type: string; text: string }>)[0].text).toBe("Hello"); @@ -517,8 +517,8 @@ describe("modelMessageTransform", () => { }); }); - describe("reasoning part stripping for OpenAI", () => { - it("should strip reasoning parts for OpenAI provider", () => { + describe("reasoning part handling", () => { + it("should preserve reasoning parts for both OpenAI and Anthropic", () => { const messages: ModelMessage[] = [ { role: "user", @@ -533,46 +533,28 @@ describe("modelMessageTransform", () => { }, ]; - const result = transformModelMessages(messages, "openai"); + // Both providers should 
preserve reasoning parts + // OpenAI-specific metadata clearing is done in aiService.ts, not in transformModelMessages + const resultOpenAI = transformModelMessages(messages); + const resultAnthropic = transformModelMessages(messages); - // Should have 2 messages, assistant message should only have text - expect(result).toHaveLength(2); - expect(result[1].role).toBe("assistant"); - expect((result[1] as AssistantModelMessage).content).toEqual([ - { type: "text", text: "Here's the solution" }, - ]); - }); - - it("should preserve reasoning parts for Anthropic provider", () => { - const messages: ModelMessage[] = [ - { - role: "user", - content: [{ type: "text", text: "Solve this problem" }], - }, - { - role: "assistant", - content: [ - { type: "reasoning", text: "Let me think about this..." }, - { type: "text", text: "Here's the solution" }, - ], - }, - ]; + // Both should have 2 messages with reasoning and text preserved + expect(resultOpenAI).toHaveLength(2); + expect(resultAnthropic).toHaveLength(2); - const result = transformModelMessages(messages, "anthropic"); - - // Should have 2 messages, assistant message should have both reasoning and text - expect(result).toHaveLength(2); - expect(result[1].role).toBe("assistant"); - const content = (result[1] as AssistantModelMessage).content; - expect(Array.isArray(content)).toBe(true); - if (Array.isArray(content)) { - expect(content).toHaveLength(2); - expect(content[0]).toEqual({ type: "reasoning", text: "Let me think about this..." }); - expect(content[1]).toEqual({ type: "text", text: "Here's the solution" }); + for (const result of [resultOpenAI, resultAnthropic]) { + expect(result[1].role).toBe("assistant"); + const content = (result[1] as AssistantModelMessage).content; + expect(Array.isArray(content)).toBe(true); + if (Array.isArray(content)) { + expect(content).toHaveLength(2); + expect(content[0]).toEqual({ type: "reasoning", text: "Let me think about this..." 
}); + expect(content[1]).toEqual({ type: "text", text: "Here's the solution" }); + } } }); - it("should filter out reasoning-only messages for OpenAI", () => { + it("should filter out reasoning-only messages for all providers", () => { const messages: ModelMessage[] = [ { role: "user", @@ -584,14 +566,18 @@ describe("modelMessageTransform", () => { }, ]; - const result = transformModelMessages(messages, "openai"); + // Both providers should filter reasoning-only messages + const resultOpenAI = transformModelMessages(messages); + const resultAnthropic = transformModelMessages(messages); - // Should only have user message, reasoning-only assistant message should be filtered out - expect(result).toHaveLength(1); - expect(result[0].role).toBe("user"); + // Should only have user message for both providers + expect(resultOpenAI).toHaveLength(1); + expect(resultOpenAI[0].role).toBe("user"); + expect(resultAnthropic).toHaveLength(1); + expect(resultAnthropic[0].role).toBe("user"); }); - it("should preserve tool calls when stripping reasoning for OpenAI", () => { + it("should preserve reasoning and tool calls in messages", () => { const messages: ModelMessage[] = [ { role: "user", @@ -618,9 +604,9 @@ describe("modelMessageTransform", () => { }, ]; - const result = transformModelMessages(messages, "openai"); + const result = transformModelMessages(messages); - // Should have user, text, tool-call, tool-result (no reasoning) + // Should split into text message and tool-call/tool-result messages expect(result.length).toBeGreaterThan(2); // Find the assistant message with text @@ -633,8 +619,8 @@ describe("modelMessageTransform", () => { if (textMessage) { const content = (textMessage as AssistantModelMessage).content; if (Array.isArray(content)) { - // Should not have reasoning parts - expect(content.some((c) => c.type === "reasoning")).toBe(false); + // Should have reasoning parts preserved + expect(content.some((c) => c.type === "reasoning")).toBe(true); // Should have text expect(content.some((c) => c.type === "text")).toBe(true); } @@ -649,7 +635,7 @@ describe("modelMessageTransform", () => { expect(toolCallMessage).toBeDefined(); }); - it("should handle multiple reasoning parts for OpenAI", () => { + it("should coalesce multiple consecutive reasoning parts", () => { const messages: ModelMessage[] = [ { role: "user", @@ -665,14 +651,22 @@ describe("modelMessageTransform", () => { }, ]; - const result = transformModelMessages(messages, "openai"); + const result = transformModelMessages(messages); - // Should have 2 messages, assistant should only have text + // Should have 2 messages, assistant should have coalesced reasoning and text expect(result).toHaveLength(2); expect(result[1].role).toBe("assistant"); - expect((result[1] as AssistantModelMessage).content).toEqual([ - { type: "text", text: "Final answer" }, - ]); + const content = (result[1] as AssistantModelMessage).content; + expect(Array.isArray(content)).toBe(true); + if (Array.isArray(content)) { + // Should coalesce the two reasoning parts into one + expect(content).toHaveLength(2); + expect(content[0]).toEqual({ + type: "reasoning", + text: "First, I'll consider...Then, I'll analyze...", + }); + expect(content[1]).toEqual({ type: "text", text: "Final answer" }); + } }); }); }); diff --git a/src/utils/messages/modelMessageTransform.ts b/src/utils/messages/modelMessageTransform.ts index 6fdab7509d..9ec42553d8 100644 --- a/src/utils/messages/modelMessageTransform.ts +++ b/src/utils/messages/modelMessageTransform.ts @@ -32,32 +32,92 @@ 
export function filterEmptyAssistantMessages(messages: CmuxMessage[]): CmuxMessa } /** - * Strip reasoning parts from messages for OpenAI. + * Clear provider metadata from ModelMessages for OpenAI to prevent reasoning/tool errors. * - * OpenAI's Responses API uses encrypted reasoning items (with IDs like rs_*) that are - * managed automatically via previous_response_id. When reasoning parts from history - * (which are Anthropic-style text-based reasoning) are sent to OpenAI, they create - * orphaned reasoning items that cause "reasoning without following item" errors. + * OpenAI's Responses API uses encrypted reasoning items (IDs like rs_*) that are + * managed automatically via previous_response_id. When these provider metadata + * references are sent back to OpenAI from stored history, they can cause errors: + * - "Item 'rs_*' of type 'reasoning' was provided without its required following item" + * - "referenced reasoning on a function_call was not provided" * - * Anthropic's reasoning (text-based) is different and SHOULD be sent back via sendReasoning. + * The fix is to blank out providerMetadata on text, reasoning, and tool-call parts + * (and on tool-result parts in tool messages). This lets OpenAI manage conversation + * state via previousResponseId without conflicting with stale metadata. * - * @param messages - Messages that may contain reasoning parts - * @returns Messages with reasoning parts stripped (for OpenAI only) + * Reference: https://github.com/vercel/ai/issues/7099 + * User solution: https://github.com/gvkhna/vibescraper + * + * @param messages - ModelMessages after convertToModelMessages() + * @returns Messages with provider metadata cleared (for OpenAI only) */ -export function stripReasoningForOpenAI(messages: CmuxMessage[]): CmuxMessage[] { +export function clearProviderMetadataForOpenAI(messages: ModelMessage[]): ModelMessage[] { return messages.map((msg) => { - // Only process assistant messages - if (msg.role !== "assistant") { - return msg; + // Process assistant messages (which may have reasoning/text/tool-call parts) + if (msg.role === "assistant") { + const assistantMsg = msg; + + // Handle string content (no parts to process) + if (typeof assistantMsg.content === "string") { + return msg; + } + + // Process content array and clear provider metadata + const cleanedContent = assistantMsg.content.map((part) => { + // Clear providerMetadata for text and reasoning parts + if ((part.type === "text" || part.type === "reasoning") && "providerMetadata" in part) { + return { + ...part, + providerMetadata: {}, + }; + } + + // Clear providerMetadata for tool-call parts + if (part.type === "tool-call" && "providerMetadata" in part) { + return { + ...part, + providerMetadata: {}, + }; + } + + return part; + }); + + return { + ...assistantMsg, + content: cleanedContent, + }; } - // Strip reasoning parts - OpenAI manages reasoning via previousResponseId - const filteredParts = msg.parts.filter((part) => part.type !== "reasoning"); + // Process tool messages (which may have tool-result parts with stale metadata) + if (msg.role === "tool") { + const toolMsg = msg; + + // Handle string content (no parts to process) + if (typeof toolMsg.content === "string") { + return msg; + } - return { - ...msg, - parts: filteredParts, - }; + // Process content array and clear provider metadata + const cleanedContent = toolMsg.content.map((part) => { + // Clear providerMetadata for tool-result parts + if (part.type === "tool-result" && "providerMetadata" in part) { + return { + ...part, + providerMetadata:
{}, + }; + } + + return part; + }); + + return { + ...toolMsg, + content: cleanedContent, + }; + } + + // Other message types (user, system) pass through unchanged + return msg; }); } @@ -122,7 +182,10 @@ function splitMixedContentMessages(messages: ModelMessage[]): ModelMessage[] { } // Check if this assistant message has both text and tool calls - const textParts = assistantMsg.content.filter((c) => c.type === "text" && c.text.trim()); + // Note: Reasoning parts are treated like text parts (they stay together) + const textParts = assistantMsg.content.filter( + (c) => (c.type === "text" && c.text.trim()) || c.type === "reasoning" + ); const toolCallParts = assistantMsg.content.filter((c) => c.type === "tool-call"); // Check if the next message is a tool result message @@ -180,7 +243,9 @@ function splitMixedContentMessages(messages: ModelMessage[]): ModelMessage[] { let currentGroup: { type: "text" | "tool-call"; parts: ContentArray } | null = null; for (const item of contentWithPositions) { - const partType = item.content.type === "text" ? "text" : "tool-call"; + // Reasoning parts are treated as text (they go together with text) + const partType = + item.content.type === "text" || item.content.type === "reasoning" ? "text" : "tool-call"; if (!currentGroup || currentGroup.type !== partType) { if (currentGroup) groups.push(currentGroup); @@ -305,37 +370,6 @@ function filterReasoningOnlyMessages(messages: ModelMessage[]): ModelMessage[] { }); } -/** - * Strip reasoning parts from assistant messages. - * OpenAI's Responses API has its own reasoning format (encrypted reasoning items with IDs). - * Anthropic's text-based reasoning parts are incompatible and must be removed. - * This function removes reasoning parts while preserving text and tool-call parts. - */ -function stripReasoningParts(messages: ModelMessage[]): ModelMessage[] { - return messages.map((msg) => { - // Only process assistant messages with array content - if (msg.role !== "assistant") { - return msg; - } - - const assistantMsg = msg; - - // Skip string content (no reasoning parts to strip) - if (typeof assistantMsg.content === "string") { - return msg; - } - - // Filter out reasoning parts, keep everything else - const filteredContent = assistantMsg.content.filter((part) => part.type !== "reasoning"); - - // If all content was filtered out, this message will be caught by filterReasoningOnlyMessages - return { - ...assistantMsg, - content: filteredContent, - }; - }); -} - /** * Coalesce consecutive parts of the same type within each message. * Streaming creates many individual text/reasoning parts; merge them for easier debugging. @@ -429,44 +463,30 @@ function mergeConsecutiveUserMessages(messages: ModelMessage[]): ModelMessage[] /** * Transform messages to ensure provider API compliance. - * Applies multiple transformation passes based on provider requirements: + * Applies multiple transformation passes: * 0. Coalesce consecutive parts (text/reasoning) - all providers, reduces JSON overhead * 1. Split mixed content messages (text + tool calls) - all providers - * 2. Strip/filter reasoning parts: - * - OpenAI: Strip all Anthropic reasoning parts (incompatible format) - * - Anthropic: Filter out reasoning-only messages (API rejects them) + * 2. Filter out reasoning-only messages - all providers * 3. Merge consecutive user messages - all providers * - * Note: encryptedContent stripping happens earlier in streamManager when tool results - * are first stored, not during message transformation. 
+ * Note: Provider-specific handling (like clearing OpenAI metadata) happens in aiService.ts + * before this transformation (and Anthropic cache control is applied after it). * * @param messages The messages to transform - * @param provider The provider name (e.g., "anthropic", "openai") */ -export function transformModelMessages(messages: ModelMessage[], provider: string): ModelMessage[] { +export function transformModelMessages(messages: ModelMessage[]): ModelMessage[] { // Pass 0: Coalesce consecutive parts to reduce JSON overhead from streaming (applies to all providers) const coalesced = coalesceConsecutiveParts(messages); // Pass 1: Split mixed content messages (applies to all providers) const split = splitMixedContentMessages(coalesced); - // Pass 2: Provider-specific reasoning handling - let reasoningHandled: ModelMessage[]; - if (provider === "openai") { - // OpenAI: Strip all reasoning parts (Anthropic's text-based reasoning is incompatible with OpenAI's format) - reasoningHandled = stripReasoningParts(split); - // Then filter out any messages that became empty after stripping - reasoningHandled = filterReasoningOnlyMessages(reasoningHandled); - } else if (provider === "anthropic") { - // Anthropic: Filter out reasoning-only messages (API rejects messages with only reasoning) - reasoningHandled = filterReasoningOnlyMessages(split); - } else { - // Unknown provider: no reasoning handling - reasoningHandled = split; - } + // Pass 2: Filter out reasoning-only messages (applies to all providers) + // Both Anthropic and OpenAI reject messages that have only reasoning parts + const reasoningFiltered = filterReasoningOnlyMessages(split); // Pass 3: Merge consecutive user messages (applies to all providers) - const merged = mergeConsecutiveUserMessages(reasoningHandled); + const merged = mergeConsecutiveUserMessages(reasoningFiltered); return merged; } diff --git a/tests/ipcMain/README_OPENAI_REASONING.md b/tests/ipcMain/README_OPENAI_REASONING.md new file mode 100644 index 0000000000..43b53fc66a --- /dev/null +++ b/tests/ipcMain/README_OPENAI_REASONING.md @@ -0,0 +1,79 @@ +# OpenAI Reasoning Error Reproduction Test + +## Problem + +OpenAI reasoning models (gpt-5-codex, o3-mini, etc.) intermittently return this error: + +``` +Item 'rs_*' of type 'reasoning' was provided without its required following item. +``` + +This occurs in multi-turn conversations, especially when: +- Previous responses contained reasoning parts +- Tool calls are involved +- The `previous_response_id` parameter is used + +## Test + +`openaiReasoning.test.ts` - Attempts to reproduce the error by: +1. Sending a message that triggers reasoning + tool calls +2. Sending follow-up messages that reference the conversation history +3. Running multiple attempts since the error is intermittent + +## Running the Test + +```bash +# Run with default 10 attempts +TEST_INTEGRATION=1 bun x jest tests/ipcMain/openaiReasoning.test.ts + +# Run with more attempts to increase reproduction chance +OPENAI_REASONING_TEST_RUNS=20 TEST_INTEGRATION=1 bun x jest tests/ipcMain/openaiReasoning.test.ts + +# Run with fewer attempts for quick testing +OPENAI_REASONING_TEST_RUNS=3 TEST_INTEGRATION=1 bun x jest tests/ipcMain/openaiReasoning.test.ts +``` + +## Expected Behavior + +The test will: +- Run N attempts (default 10) +- For each attempt, create a fresh workspace +- Send 3 messages in sequence +- Check for the specific error in stream events +- Report if the error was reproduced + +## Output + +Success (error reproduced): +``` +🎯 [Run 5] REPRODUCED THE ERROR on second message!
+✅ Successfully reproduced the OpenAI reasoning error! +``` + +No reproduction: +``` +❌ Failed to reproduce the error after 10 attempts +Consider increasing OPENAI_REASONING_TEST_RUNS or modifying the test prompts +``` + +## Why Multiple Attempts? + +The error is intermittent and depends on: +- OpenAI's internal state management +- Timing of requests +- Specific conversation patterns +- Model behavior variations + +## Next Steps + +Once reproduced: +1. Examine the debug dumps in `~/.cmux/debug_obj//` +2. Check the conversation history in `~/.cmux/sessions//chat.jsonl` +3. Analyze the `providerMetadata` on reasoning parts +4. Test potential fixes (e.g., clearing `providerMetadata`, omitting `previous_response_id`) + +## Related + +- GitHub Issue: vercel/ai#7099 +- User's fix: @gvkhna's solution to a similar issue +- PR #61, PR #68: Previous fix attempts (reverted) diff --git a/tests/ipcMain/openaiReasoning.test.ts b/tests/ipcMain/openaiReasoning.test.ts new file mode 100644 index 0000000000..56d895dcb5 --- /dev/null +++ b/tests/ipcMain/openaiReasoning.test.ts @@ -0,0 +1,221 @@ +/** + * OpenAI Reasoning Error Reproduction Test + * + * This test attempts to reproduce the error: + * "Item 'rs_*' of type 'reasoning' was provided without its required following item" + * + * The error occurs when: + * 1. OpenAI reasoning model (gpt-5-codex, o3-mini, etc.) is used + * 2. First message triggers reasoning + tool calls + * 3. Follow-up message causes OpenAI to reference stale reasoning item IDs + * + * Run with: TEST_INTEGRATION=1 bun x jest tests/ipcMain/openaiReasoning.test.ts + * + * Set OPENAI_REASONING_TEST_RUNS= to control number of attempts (default: 10) + * The error is intermittent, so we retry multiple times to increase chances of reproduction. + */ + +import { + setupWorkspace, + shouldRunIntegrationTests, + validateApiKeys, +} from "./setup"; +import { + sendMessageWithModel, + createEventCollector, +} from "./helpers"; + +// Skip all tests if TEST_INTEGRATION is not set +const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip; + +// Validate API keys before running tests +if (shouldRunIntegrationTests()) { + validateApiKeys(["OPENAI_API_KEY"]); +} + +// Number of test runs to attempt (error is intermittent) +const TEST_RUNS = process.env.OPENAI_REASONING_TEST_RUNS + ?
parseInt(process.env.OPENAI_REASONING_TEST_RUNS, 10) + : 10; + +describeIntegration("OpenAI Reasoning Error Reproduction", () => { + // Use longer timeout since we're doing multiple runs + const TOTAL_TIMEOUT = TEST_RUNS * 60000; // 60s per run + + test.concurrent( + `should handle multi-turn reasoning conversations (${TEST_RUNS} attempts)`, + async () => { + const provider = "openai"; + const model = "gpt-5-codex"; // OpenAI reasoning model + + let reproduced = false; + let lastError: unknown = null; + + // Try multiple times to reproduce the error + for (let run = 1; run <= TEST_RUNS; run++) { + console.log(`\n[Run ${run}/${TEST_RUNS}] Starting OpenAI reasoning test...`); + + const { env, workspaceId, cleanup } = await setupWorkspace(provider, `reasoning-${run}`); + + try { + // TURN 1: Message that triggers reasoning + tool calls + console.log(`[Run ${run}] Sending first message (with reasoning)...`); + const firstMessage = + "Look at the files in this directory and write a hello.txt file with 'Hello from reasoning test'"; + + const result1 = await sendMessageWithModel( + env.mockIpcRenderer, + workspaceId, + firstMessage, + provider, + model + ); + + if (!result1.success) { + console.log(`[Run ${run}] First message failed:`, result1.error); + await cleanup(); + continue; + } + + // Wait for stream to complete (or error) + const collector1 = createEventCollector(env.sentEvents, workspaceId); + // Wait for stream-end, tolerating a timeout: an errored stream may never emit it + await collector1.waitForEvent("stream-end", 30000).catch(() => {/* Timeout is OK if error occurred */}); + + // Check if stream had an error + const streamError1 = collector1.getEvents().find((e) => "type" in e && e.type === "stream-error"); + if (streamError1) { + console.log(`[Run ${run}] First stream error:`, streamError1); + + // Check if this is the error we're looking for + if ("error" in streamError1 && typeof streamError1.error === "string") { + if (streamError1.error.includes("reasoning") && streamError1.error.includes("without its required following item")) { + console.log(`\n🎯 [Run ${run}] REPRODUCED THE ERROR on first message!`); + reproduced = true; + lastError = streamError1.error; + await cleanup(); + break; + } + } + + await cleanup(); + continue; + } + + console.log(`[Run ${run}] First message succeeded`); + + // Clear events for second message + env.sentEvents.length = 0; + + // TURN 2: Follow-up message (this is where the error often occurs) + console.log(`[Run ${run}] Sending second message (follow-up)...`); + const secondMessage = "Now read that file and tell me what it says"; + + const result2 = await sendMessageWithModel( + env.mockIpcRenderer, + workspaceId, + secondMessage, + provider, + model + ); + + if (!result2.success) { + console.log(`[Run ${run}] Second message failed:`, result2.error); + await cleanup(); + continue; + } + + // Wait for stream to complete (or error) + const collector2 = createEventCollector(env.sentEvents, workspaceId); + await collector2.waitForEvent("stream-end", 30000).catch(() => {/* Timeout is OK if error occurred */}); + + // Check if stream had the error we're looking for + const streamError2 = collector2.getEvents().find((e) => "type" in e && e.type === "stream-error"); + if (streamError2) { + console.log(`[Run ${run}] Second stream error:`, streamError2); + + // Check if this is the error we're looking for + if ("error" in streamError2 && typeof streamError2.error === "string") { + if (streamError2.error.includes("reasoning") &&
streamError2.error.includes("without its required following item")) { + console.log(`\n🎯 [Run ${run}] REPRODUCED THE ERROR on second message!`); + reproduced = true; + lastError = streamError2.error; + await cleanup(); + break; + } + } + + await cleanup(); + continue; + } + + console.log(`[Run ${run}] Second message succeeded`); + + // If we got here, both messages succeeded - try a third message + env.sentEvents.length = 0; + + // TURN 3: Another follow-up + console.log(`[Run ${run}] Sending third message (another follow-up)...`); + const thirdMessage = "What is the content of hello.txt?"; + + const result3 = await sendMessageWithModel( + env.mockIpcRenderer, + workspaceId, + thirdMessage, + provider, + model + ); + + if (!result3.success) { + console.log(`[Run ${run}] Third message failed:`, result3.error); + await cleanup(); + continue; + } + + // Wait for stream to complete (or error) + const collector3 = createEventCollector(env.sentEvents, workspaceId); + await collector3.waitForEvent("stream-end", 30000).catch(() => {/* Timeout is OK if error occurred */}); + + // Check if stream had the error + const streamError3 = collector3.getEvents().find((e) => "type" in e && e.type === "stream-error"); + if (streamError3) { + console.log(`[Run ${run}] Third stream error:`, streamError3); + + // Check if this is the error we're looking for + if ("error" in streamError3 && typeof streamError3.error === "string") { + if (streamError3.error.includes("reasoning") && streamError3.error.includes("without its required following item")) { + console.log(`\n🎯 [Run ${run}] REPRODUCED THE ERROR on third message!`); + reproduced = true; + lastError = streamError3.error; + await cleanup(); + break; + } + } + } + + console.log(`[Run ${run}] All three messages succeeded`); + await cleanup(); + } catch (error) { + console.log(`[Run ${run}] Exception:`, error); + await cleanup(); + } + } + + // Report results + if (reproduced) { + console.log(`\n✅ Successfully reproduced the OpenAI reasoning error!`); + console.log(`Error: ${lastError}`); + // Don't fail the test - we want to see the error in logs + expect(reproduced).toBe(true); + } else { + console.log(`\n❌ Failed to reproduce the error after ${TEST_RUNS} attempts`); + console.log(`Consider increasing OPENAI_REASONING_TEST_RUNS or modifying the test prompts`); + // Don't fail - the error is intermittent + expect(true).toBe(true); + } + }, + TOTAL_TIMEOUT + ); +});