Skip to content

Commit 5f31123

Browse files
authored
🤖 fix: resolve flaky bash and web_search integration tests (#713)
Generated with `mux`.

Fixes flaky integration tests observed in PR #701.

Changes:
- **Runtime Bash Test**: Decoupled tool execution time verification from total test duration. Now measures strict execution time (<10s) using event timestamps, while allowing a generous overall timeout (30s) to accommodate CI network/AI latency.
- **Web Search Test**: Lowered `thinkingLevel` to 'low' to improve speed and reliability, and increased timeout to 180s.

Verification:
- `tests/ipcMain/runtimeExecuteBash.test.ts` passes locally.
- `tests/ipcMain/openai-web-search.test.ts` passes locally.
1 parent d0d8e49 commit 5f31123

File tree

2 files changed

+37
-14
lines changed

2 files changed

+37
-14
lines changed

tests/ipcMain/openai-web-search.test.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ describeIntegration("OpenAI web_search integration tests", () => {
3838
"Then tell me if it's a good day for a picnic.",
3939
modelString("openai", "gpt-5.1-codex-mini"),
4040
{
41-
thinkingLevel: "medium", // Ensure reasoning without excessive deliberation
41+
thinkingLevel: "low", // Ensure reasoning without excessive deliberation
4242
}
4343
);
4444

@@ -48,8 +48,8 @@ describeIntegration("OpenAI web_search integration tests", () => {
4848
// Collect and verify stream events
4949
const collector = createEventCollector(env.sentEvents, workspaceId);
5050

51-
// Wait for stream to complete (90s should be enough for simple weather + analysis)
52-
const streamEnd = await collector.waitForEvent("stream-end", 90000);
51+
// Wait for stream to complete (150s should be enough for simple weather + analysis)
52+
const streamEnd = await collector.waitForEvent("stream-end", 150000);
5353
expect(streamEnd).toBeDefined();
5454

5555
// Verify no errors occurred - this is the KEY test
@@ -84,6 +84,6 @@ describeIntegration("OpenAI web_search integration tests", () => {
8484
await cleanup();
8585
}
8686
},
87-
120000 // 120 second timeout - reasoning + web_search should complete faster with simpler task
87+
180000 // 180 second timeout - reasoning + web_search should complete faster with simpler task
8888
);
8989
});

tests/ipcMain/runtimeExecuteBash.test.ts

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,27 @@ function collectToolOutputs(events: WorkspaceChatMessage[], toolName: string): s
5252
.join("\n");
5353
}
5454

55+
// Helper to calculate tool execution duration from captured events
56+
function getToolDuration(
57+
env: { sentEvents: Array<{ channel: string; data: unknown; timestamp: number }> },
58+
toolName: string
59+
): number {
60+
const startEvent = env.sentEvents.find((e) => {
61+
const msg = e.data as any;
62+
return msg.type === "tool-call-start" && msg.toolName === toolName;
63+
});
64+
65+
const endEvent = env.sentEvents.find((e) => {
66+
const msg = e.data as any;
67+
return msg.type === "tool-call-end" && msg.toolName === toolName;
68+
});
69+
70+
if (startEvent && endEvent) {
71+
return endEvent.timestamp - startEvent.timestamp;
72+
}
73+
return -1;
74+
}
75+
5576
// Skip all tests if TEST_INTEGRATION is not set
5677
const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip;
5778

@@ -259,16 +280,17 @@ describeIntegration("Runtime Bash Execution", () => {
259280
// Test command that pipes file through stdin-reading command (grep)
260281
// This would hang forever if stdin.close() was used instead of stdin.abort()
261282
// Regression test for: https://github.com/coder/mux/issues/503
262-
const startTime = Date.now();
263283
const events = await sendMessageAndWait(
264284
env,
265285
workspaceId,
266286
"Run bash: cat /tmp/test.json | grep test",
267287
HAIKU_MODEL,
268288
BASH_ONLY,
269-
10000 // 10s timeout - should complete in ~4s per API call
289+
30000 // Relaxed timeout for CI stability (was 10s)
270290
);
271-
const duration = Date.now() - startTime;
291+
292+
// Calculate actual tool execution duration
293+
const toolDuration = getToolDuration(env, "bash");
272294

273295
// Extract response text
274296
const responseText = extractTextFromEvents(events);
@@ -279,10 +301,9 @@ describeIntegration("Runtime Bash Execution", () => {
279301
expect(bashOutput).toContain('"test": "data"');
280302

281303
// Verify command completed quickly (not hanging until timeout)
282-
// With tokenizer preloading, both local and SSH complete in ~8s total
283-
// Actual hangs would hit bash tool's 180s timeout
304+
expect(toolDuration).toBeGreaterThan(0);
284305
const maxDuration = 10000;
285-
expect(duration).toBeLessThan(maxDuration);
306+
expect(toolDuration).toBeLessThan(maxDuration);
286307

287308
// Verify bash tool was called
288309
const toolCallStarts = events.filter((e: any) => e.type === "tool-call-start");
@@ -337,16 +358,17 @@ describeIntegration("Runtime Bash Execution", () => {
337358

338359
// Test grep | head pattern - this historically hangs over SSH
339360
// This is a regression test for the bash hang issue
340-
const startTime = Date.now();
341361
const events = await sendMessageAndWait(
342362
env,
343363
workspaceId,
344364
'Run bash: grep -n "terminal bench" testfile.txt | head -n 200',
345365
HAIKU_MODEL,
346366
BASH_ONLY,
347-
15000 // 15s timeout - should complete quickly
367+
30000 // Relaxed timeout for CI stability (was 15s)
348368
);
349-
const duration = Date.now() - startTime;
369+
370+
// Calculate actual tool execution duration
371+
const toolDuration = getToolDuration(env, "bash");
350372

351373
// Extract response text
352374
const responseText = extractTextFromEvents(events);
@@ -356,8 +378,9 @@ describeIntegration("Runtime Bash Execution", () => {
356378

357379
// Verify command completed quickly (not hanging until timeout)
358380
// SSH runtime should complete in <10s even with high latency
381+
expect(toolDuration).toBeGreaterThan(0);
359382
const maxDuration = 15000;
360-
expect(duration).toBeLessThan(maxDuration);
383+
expect(toolDuration).toBeLessThan(maxDuration);
361384

362385
// Verify bash tool was called
363386
const toolCallStarts = events.filter((e: any) => e.type === "tool-call-start");

0 commit comments

Comments
 (0)