coder · ammario · Nov 24, 2025 · Nov 24, 2025 · Nov 24, 2025 · Nov 24, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -133,7 +133,7 @@ jobs:
       - name: Run all integration tests with coverage
-        # TEST_OLLAMA=1 enables Ollama-specific tests (now included with all integration tests)
+        # TEST_OLLAMA=1 (which enables Ollama-specific tests) is no longer set for this step
         # --silent suppresses per-test output (17+ test files × workers = overwhelming logs)
-        run: TEST_INTEGRATION=1 TEST_OLLAMA=1 bun x jest --coverage --maxWorkers=100% --silent ${{ github.event.inputs.test_filter || 'tests' }}
+        run: TEST_INTEGRATION=1 bun x jest --coverage --maxWorkers=100% --silent ${{ github.event.inputs.test_filter || 'tests' }}
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}

diff --git a/tests/ipcMain/anthropic1MContext.test.ts b/tests/ipcMain/anthropic1MContext.test.ts
@@ -16,11 +16,6 @@ if (shouldRunIntegrationTests()) {
 }
 
 describeIntegration("IpcMain anthropic 1M context integration tests", () => {
-  // Enable retries in CI for flaky API tests
-  if (process.env.CI && typeof jest !== "undefined" && jest.retryTimes) {
-    jest.retryTimes(3, { logErrorsBeforeRetry: true });
-  }
-
   test.concurrent(
     "should handle larger context with 1M flag enabled vs standard limits",
     async () => {

diff --git a/tests/ipcMain/anthropicCacheStrategy.test.ts b/tests/ipcMain/anthropicCacheStrategy.test.ts
@@ -13,11 +13,6 @@ if (shouldRunIntegrationTests() && !shouldRunSuite) {
 }
 
 describeIntegration("Anthropic cache strategy integration", () => {
-  // Enable retries in CI for flaky API tests
-  if (process.env.CI && typeof jest !== "undefined" && jest.retryTimes) {
-    jest.retryTimes(2, { logErrorsBeforeRetry: true });
-  }
-
   test(
     "should apply cache control to messages, system prompt, and tools for Anthropic models",
     async () => {

diff --git a/tests/ipcMain/forkWorkspace.test.ts b/tests/ipcMain/forkWorkspace.test.ts
@@ -28,11 +28,6 @@ if (shouldRunIntegrationTests()) {
 }
 
 describeIntegration("IpcMain fork workspace integration tests", () => {
-  // Enable retries in CI for flaky API tests
-  if (process.env.CI && typeof jest !== "undefined" && jest.retryTimes) {
-    jest.retryTimes(3, { logErrorsBeforeRetry: true });
-  }
-
   test.concurrent(
     "should fail to fork workspace with invalid name",
     async () => {

diff --git a/tests/ipcMain/helpers.ts b/tests/ipcMain/helpers.ts
@@ -793,3 +793,13 @@ export async function buildLargeHistory(
     }
   }
 }
+
+/**
+ * Enables Jest retries for flaky tests: when `process.env.CI` is set and `jest.retryTimes`
+ * exists, calls it with `retries` (default 3) and `logErrorsBeforeRetry: true`; otherwise a no-op.
+ */
+export function configureTestRetries(retries = 3): void {
+  if (process.env.CI && typeof jest !== "undefined" && jest.retryTimes) {
+    jest.retryTimes(retries, { logErrorsBeforeRetry: true });
+  }
+}
diff --git a/tests/ipcMain/modelNotFound.test.ts b/tests/ipcMain/modelNotFound.test.ts
@@ -14,11 +14,6 @@ if (shouldRunIntegrationTests()) {
 }
 
 describeIntegration("IpcMain model_not_found error handling", () => {
-  // Enable retries in CI for flaky API tests
-  if (process.env.CI && typeof jest !== "undefined" && jest.retryTimes) {
-    jest.retryTimes(3, { logErrorsBeforeRetry: true });
-  }
-
   test.concurrent(
     "should classify Anthropic 404 as model_not_found (not retryable)",
     async () => {

diff --git a/tests/ipcMain/ollama.test.ts b/tests/ipcMain/ollama.test.ts
@@ -5,6 +5,7 @@ import {
   assertStreamSuccess,
   extractTextFromEvents,
   modelString,
+  configureTestRetries,
 } from "./helpers";
 import { spawn } from "child_process";
 
@@ -83,9 +84,7 @@ async function ensureOllamaModel(model: string): Promise<void> {
 
 describeOllama("IpcMain Ollama integration tests", () => {
   // Enable retries in CI for potential network flakiness with Ollama
-  if (process.env.CI && typeof jest !== "undefined" && jest.retryTimes) {
-    jest.retryTimes(3, { logErrorsBeforeRetry: true });
-  }
+  configureTestRetries(3);
 
   // Load tokenizer modules and ensure model is available before all tests
   beforeAll(async () => {
@@ -184,7 +183,7 @@ describeOllama("IpcMain Ollama integration tests", () => {
 
       // Wait for stream to complete
       const collector = createEventCollector(env.sentEvents, workspaceId);
-      await collector.waitForEvent("stream-end", 60000);
+      await collector.waitForEvent("stream-end", 90000);
 
       assertStreamSuccess(collector);
 

diff --git a/tests/ipcMain/openai-web-search.test.ts b/tests/ipcMain/openai-web-search.test.ts
@@ -4,6 +4,7 @@ import {
   createEventCollector,
   assertStreamSuccess,
   modelString,
+  configureTestRetries,
 } from "./helpers";
 
 // Skip all tests if TEST_INTEGRATION is not set
@@ -16,9 +17,7 @@ if (shouldRunIntegrationTests()) {
 
 describeIntegration("OpenAI web_search integration tests", () => {
   // Enable retries in CI for flaky API tests
-  if (process.env.CI && typeof jest !== "undefined" && jest.retryTimes) {
-    jest.retryTimes(3, { logErrorsBeforeRetry: true });
-  }
+  configureTestRetries(3);
 
   test.concurrent(
     "should handle reasoning + web_search without itemId errors",

diff --git a/tests/ipcMain/queuedMessages.test.ts b/tests/ipcMain/queuedMessages.test.ts
@@ -66,11 +66,6 @@ async function waitForRestoreToInputEvent(
 }
 
 describeIntegration("IpcMain queuedMessages integration tests", () => {
-  // Enable retries in CI for flaky API tests
-  if (process.env.CI && typeof jest !== "undefined" && jest.retryTimes) {
-    jest.retryTimes(3, { logErrorsBeforeRetry: true });
-  }
-
   test.concurrent(
     "should queue message during streaming and auto-send on stream end",
     async () => {

diff --git a/tests/ipcMain/resumeStream.test.ts b/tests/ipcMain/resumeStream.test.ts
@@ -15,11 +15,6 @@ if (shouldRunIntegrationTests()) {
 }
 
 describeIntegration("IpcMain resumeStream integration tests", () => {
-  // Enable retries in CI for flaky API tests
-  if (process.env.CI && typeof jest !== "undefined" && jest.retryTimes) {
-    jest.retryTimes(3, { logErrorsBeforeRetry: true });
-  }
-
   test.concurrent(
     "should resume interrupted stream without new user message",
     async () => {

diff --git a/tests/ipcMain/sendMessage.test.ts b/tests/ipcMain/sendMessage.test.ts
@@ -18,6 +18,7 @@ import {
   readChatHistory,
   TEST_IMAGES,
   modelString,
+  configureTestRetries,
 } from "./helpers";
 import type { StreamDeltaEvent } from "../../src/common/types/stream";
 import { IPC_CHANNELS } from "../../src/common/constants/ipc-constants";
@@ -45,11 +46,6 @@ const PROVIDER_CONFIGS: Array<[string, string]> = [
 // - Test timeout values (in describe/test) should be 2-3x the expected duration
 
 describeIntegration("IpcMain sendMessage integration tests", () => {
-  // Enable retries in CI for flaky API tests (only works with Jest, not Bun test runner)
-  if (process.env.CI && typeof jest !== "undefined" && jest.retryTimes) {
-    jest.retryTimes(3, { logErrorsBeforeRetry: true });
-  }
-
   // Run tests for each provider concurrently
   describe.each(PROVIDER_CONFIGS)("%s:%s provider tests", (provider, model) => {
     test.concurrent(
@@ -1078,11 +1074,6 @@ These are general instructions that apply to all modes.
 
   // Tool policy tests
   describe("tool policy", () => {
-    // Retry tool policy tests in CI (they depend on external API behavior)
-    if (process.env.CI && typeof jest !== "undefined" && jest.retryTimes) {
-      jest.retryTimes(2, { logErrorsBeforeRetry: true });
-    }
-
     test.each(PROVIDER_CONFIGS)(
       "%s should respect tool policy that disables bash",
       async (provider, model) => {
@@ -1504,6 +1495,9 @@ These are general instructions that apply to all modes.
 
 // Test image support across providers
 describe.each(PROVIDER_CONFIGS)("%s:%s image support", (provider, model) => {
+  // Retry image tests in CI as they can be flaky with some providers
+  configureTestRetries(3);
+
   test.concurrent(
     "should send images to AI model and get response",
     async () => {

diff --git a/tests/ipcMain/streamErrorRecovery.test.ts b/tests/ipcMain/streamErrorRecovery.test.ts
@@ -220,11 +220,6 @@ async function collectStreamUntil(
 }
 
 describeIntegration("Stream Error Recovery (No Amnesia)", () => {
-  // Enable retries in CI for flaky API tests
-  if (process.env.CI && typeof jest !== "undefined" && jest.retryTimes) {
-    jest.retryTimes(3, { logErrorsBeforeRetry: true });
-  }
-
   test.concurrent(
     "should preserve exact prefix and continue from exact point after stream error",
     async () => {

diff --git a/tests/ipcMain/truncate.test.ts b/tests/ipcMain/truncate.test.ts
@@ -19,11 +19,6 @@ if (shouldRunIntegrationTests()) {
 }
 
 describeIntegration("IpcMain truncate integration tests", () => {
-  // Enable retries in CI for flaky API tests
-  if (process.env.CI && typeof jest !== "undefined" && jest.retryTimes) {
-    jest.retryTimes(3, { logErrorsBeforeRetry: true });
-  }
-
   test.concurrent(
     "should truncate 50% of chat history and verify context is updated",
     async () => {

diff --git a/tests/runtime/ssh-fixture.ts b/tests/runtime/ssh-fixture.ts
@@ -49,6 +49,7 @@ export async function isDockerAvailable(): Promise<boolean> {
 export async function startSSHServer(): Promise<SSHServerConfig> {
   // Create temp directory for SSH keys
   const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "mux-ssh-test-"));
+  let containerId: string | undefined;
 
   try {
     // Generate ephemeral SSH key pair
@@ -93,7 +94,7 @@ export async function startSSHServer(): Promise<SSHServerConfig> {
       "mux-ssh-test",
     ]);
 
-    const containerId = runResult.stdout.trim();
+    containerId = runResult.stdout.trim();
 
     // Wait for container to be ready
     await waitForContainer(containerId);
@@ -121,6 +122,14 @@ export async function startSSHServer(): Promise<SSHServerConfig> {
       tempDir,
     };
   } catch (error) {
+    // Cleanup container on failure if it was started
+    if (containerId) {
+      try {
+        await execCommand("docker", ["stop", containerId], { timeout: 10000 });
+      } catch (cleanupError) {
+        console.error("Error stopping container during cleanup:", cleanupError);
+      }
+    }
     // Cleanup temp directory on failure
     await fs.rm(tempDir, { recursive: true, force: true });
     throw error;