🤖 fix: resolve flaky bash and web_search integration tests (#713)

ammar-agent · web-flow · commit 5f311237e23c · 2025-11-23T20:43:38.000-06:00
Generated with `mux` Fixes flaky integration tests observed in PR #701. Changes: - **Runtime Bash Test**: Decoupled tool execution time verification from total test duration. Now measures strict execution time (<10s) using event timestamps, while allowing a generous overall timeout (30s) to accommodate CI network/AI latency. - **Web Search Test**: Lowered `thinkingLevel` to 'low' to improve speed and reliability, and increased timeout to 180s. Verification: - `tests/ipcMain/runtimeExecuteBash.test.ts` passes locally. - `tests/ipcMain/openai-web-search.test.ts` passes locally.
diff --git a/tests/ipcMain/openai-web-search.test.ts b/tests/ipcMain/openai-web-search.test.ts
@@ -38,7 +38,7 @@ describeIntegration("OpenAI web_search integration tests", () => {
             "Then tell me if it's a good day for a picnic.",
           modelString("openai", "gpt-5.1-codex-mini"),
           {
-            thinkingLevel: "medium", // Ensure reasoning without excessive deliberation
+            thinkingLevel: "low", // Ensure reasoning without excessive deliberation
           }
         );
 
@@ -48,8 +48,8 @@ describeIntegration("OpenAI web_search integration tests", () => {
         // Collect and verify stream events
         const collector = createEventCollector(env.sentEvents, workspaceId);
 
-        // Wait for stream to complete (90s should be enough for simple weather + analysis)
-        const streamEnd = await collector.waitForEvent("stream-end", 90000);
+        // Wait for stream to complete (150s should be enough for simple weather + analysis)
+        const streamEnd = await collector.waitForEvent("stream-end", 150000);
         expect(streamEnd).toBeDefined();
 
         // Verify no errors occurred - this is the KEY test
@@ -84,6 +84,6 @@ describeIntegration("OpenAI web_search integration tests", () => {
         await cleanup();
       }
     },
-    120000 // 120 second timeout - reasoning + web_search should complete faster with simpler task
+    180000 // 180 second timeout - reasoning + web_search should complete faster with simpler task
   );
 });
diff --git a/tests/ipcMain/runtimeExecuteBash.test.ts b/tests/ipcMain/runtimeExecuteBash.test.ts
@@ -52,6 +52,27 @@ function collectToolOutputs(events: WorkspaceChatMessage[], toolName: string): s
     .join("\n");
 }
 
+// Helper to calculate tool execution duration from captured events
+function getToolDuration(
+  env: { sentEvents: Array<{ channel: string; data: unknown; timestamp: number }> },
+  toolName: string
+): number {
+  const startEvent = env.sentEvents.find((e) => {
+    const msg = e.data as any;
+    return msg.type === "tool-call-start" && msg.toolName === toolName;
+  });
+
+  const endEvent = env.sentEvents.find((e) => {
+    const msg = e.data as any;
+    return msg.type === "tool-call-end" && msg.toolName === toolName;
+  });
+
+  if (startEvent && endEvent) {
+    return endEvent.timestamp - startEvent.timestamp;
+  }
+  return -1;
+}
+
 // Skip all tests if TEST_INTEGRATION is not set
 const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip;
 
@@ -259,16 +280,17 @@ describeIntegration("Runtime Bash Execution", () => {
               // Test command that pipes file through stdin-reading command (grep)
               // This would hang forever if stdin.close() was used instead of stdin.abort()
               // Regression test for: https://github.com/coder/mux/issues/503
-              const startTime = Date.now();
               const events = await sendMessageAndWait(
                 env,
                 workspaceId,
                 "Run bash: cat /tmp/test.json | grep test",
                 HAIKU_MODEL,
                 BASH_ONLY,
-                10000 // 10s timeout - should complete in ~4s per API call
+                30000 // Relaxed timeout for CI stability (was 10s)
               );
-              const duration = Date.now() - startTime;
+
+              // Calculate actual tool execution duration
+              const toolDuration = getToolDuration(env, "bash");
 
               // Extract response text
               const responseText = extractTextFromEvents(events);
@@ -279,10 +301,9 @@ describeIntegration("Runtime Bash Execution", () => {
               expect(bashOutput).toContain('"test": "data"');
 
               // Verify command completed quickly (not hanging until timeout)
-              // With tokenizer preloading, both local and SSH complete in ~8s total
-              // Actual hangs would hit bash tool's 180s timeout
+              expect(toolDuration).toBeGreaterThan(0);
               const maxDuration = 10000;
-              expect(duration).toBeLessThan(maxDuration);
+              expect(toolDuration).toBeLessThan(maxDuration);
 
               // Verify bash tool was called
               const toolCallStarts = events.filter((e: any) => e.type === "tool-call-start");
@@ -337,16 +358,17 @@ describeIntegration("Runtime Bash Execution", () => {
 
               // Test grep | head pattern - this historically hangs over SSH
               // This is a regression test for the bash hang issue
-              const startTime = Date.now();
               const events = await sendMessageAndWait(
                 env,
                 workspaceId,
                 'Run bash: grep -n "terminal bench" testfile.txt | head -n 200',
                 HAIKU_MODEL,
                 BASH_ONLY,
-                15000 // 15s timeout - should complete quickly
+                30000 // Relaxed timeout for CI stability (was 15s)
               );
-              const duration = Date.now() - startTime;
+
+              // Calculate actual tool execution duration
+              const toolDuration = getToolDuration(env, "bash");
 
               // Extract response text
               const responseText = extractTextFromEvents(events);
@@ -356,8 +378,9 @@ describeIntegration("Runtime Bash Execution", () => {
 
               // Verify command completed quickly (not hanging until timeout)
               // SSH runtime should complete in <10s even with high latency
+              expect(toolDuration).toBeGreaterThan(0);
               const maxDuration = 15000;
-              expect(duration).toBeLessThan(maxDuration);
+              expect(toolDuration).toBeLessThan(maxDuration);
 
               // Verify bash tool was called
               const toolCallStarts = events.filter((e: any) => e.type === "tool-call-start");