From 118452d989f22e1e1c4225be6f72ab5c414d402b Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Sat, 8 Nov 2025 16:44:10 +0000
Subject: [PATCH 1/3] =?UTF-8?q?=F0=9F=A4=96=20refactor:=20consolidate=20mo?=
 =?UTF-8?q?del=5Fnot=5Ffound=20error=20classification?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Moved model_not_found detection logic from error handling into
the categorizeError method to eliminate code duplication. This ensures
consistent error classification across all error handling paths.

Previously, the logic existed in two places:
1. In categorizeError (but returned 'api' for 404s)
2. In error handling (as a workaround to override 'api' to 'model_not_found')

Now categorizeError directly returns 'model_not_found' for both:
- OpenAI: 400 with error.code === 'model_not_found'
- Anthropic: 404 with error.type === 'not_found_error'

This prevents retry spam because model_not_found is in the
NON_RETRYABLE_STREAM_ERRORS list in retryEligibility.ts.

## Test Coverage

**Backend (IPC layer integration tests)**:
- ✅ Anthropic 404 errors are classified as model_not_found
- ✅ OpenAI 400 errors are classified as model_not_found

**Frontend (unit tests in retryEligibility.test.ts)**:
- ✅ model_not_found is in NON_RETRYABLE_STREAM_ERRORS list
- ✅ isEligibleForAutoRetry returns false for model_not_found errors

Generated with `cmux`
---
 src/services/streamManager.ts       |  70 +++++++++----------
 tests/ipcMain/modelNotFound.test.ts | 105 ++++++++++++++++++++++++++++
 2 files changed, 140 insertions(+), 35 deletions(-)
 create mode 100644 tests/ipcMain/modelNotFound.test.ts

diff --git a/src/services/streamManager.ts b/src/services/streamManager.ts
index 56668342d..1fe97dc93 100644
--- a/src/services/streamManager.ts
+++ b/src/services/streamManager.ts
@@ -899,41 +899,11 @@ export class StreamManager extends EventEmitter {
 
       let errorType = this.categorizeError(actualError);
 
-      // Detect and enhance model-not-found errors
-      if (APICallError.isInstance(actualError)) {
-        const apiError = actualError;
-
-        // Type guard for error data structure
-        const hasErrorProperty = (
-          data: unknown
-        ): data is { error: { code?: string; type?: string } } => {
-          return (
-            typeof data === "object" &&
-            data !== null &&
-            "error" in data &&
-            typeof data.error === "object" &&
-            data.error !== null
-          );
-        };
-
-        // OpenAI: 400 with error.code === 'model_not_found'
-        const isOpenAIModelError =
-          apiError.statusCode === 400 &&
-          hasErrorProperty(apiError.data) &&
-          apiError.data.error.code === "model_not_found";
-
-        // Anthropic: 404 with error.type === 'not_found_error'
-        const isAnthropicModelError =
-          apiError.statusCode === 404 &&
-          hasErrorProperty(apiError.data) &&
-          apiError.data.error.type === "not_found_error";
-
-        if (isOpenAIModelError || isAnthropicModelError) {
-          errorType = "model_not_found";
-          // Extract model name from model string (e.g., "anthropic:sonnet-1m" -> "sonnet-1m")
-          const [, modelName] = streamInfo.model.split(":");
-          errorMessage = `Model '${modelName || streamInfo.model}' does not exist or is not available. Please check your model selection.`;
-        }
+      // Enhance model-not-found error messages
+      if (errorType === "model_not_found") {
+        // Extract model name from model string (e.g., "anthropic:sonnet-1m" -> "sonnet-1m")
+        const [, modelName] = streamInfo.model.split(":");
+        errorMessage = `Model '${modelName || streamInfo.model}' does not exist or is not available. Please check your model selection.`;
       }
 
       // If we detect API key issues in the error message, override the type
@@ -1044,6 +1014,36 @@ export class StreamManager extends EventEmitter {
       if (error.statusCode === 429) return "rate_limit";
       if (error.statusCode && error.statusCode >= 500) return "server_error";
 
+      // Check for model_not_found errors (OpenAI and Anthropic)
+      // Type guard for error data structure
+      const hasErrorProperty = (
+        data: unknown
+      ): data is { error: { code?: string; type?: string } } => {
+        return (
+          typeof data === "object" &&
+          data !== null &&
+          "error" in data &&
+          typeof data.error === "object" &&
+          data.error !== null
+        );
+      };
+
+      // OpenAI: 400 with error.code === 'model_not_found'
+      const isOpenAIModelError =
+        error.statusCode === 400 &&
+        hasErrorProperty(error.data) &&
+        error.data.error.code === "model_not_found";
+
+      // Anthropic: 404 with error.type === 'not_found_error'
+      const isAnthropicModelError =
+        error.statusCode === 404 &&
+        hasErrorProperty(error.data) &&
+        error.data.error.type === "not_found_error";
+
+      if (isOpenAIModelError || isAnthropicModelError) {
+        return "model_not_found";
+      }
+
       // Check for Anthropic context exceeded errors
       if (error.message.includes("prompt is too long:")) {
         return "context_exceeded";
diff --git a/tests/ipcMain/modelNotFound.test.ts b/tests/ipcMain/modelNotFound.test.ts
new file mode 100644
index 000000000..9183cfec9
--- /dev/null
+++ b/tests/ipcMain/modelNotFound.test.ts
@@ -0,0 +1,105 @@
+import { setupWorkspace, shouldRunIntegrationTests, validateApiKeys } from "./setup";
+import { sendMessageWithModel, createEventCollector, waitFor } from "./helpers";
+import { IPC_CHANNELS } from "../../src/constants/ipc-constants";
+import type { Result } from "../../src/types/result";
+import type { SendMessageError } from "../../src/types/errors";
+import type { StreamErrorMessage } from "../../src/types/ipc";
+
+// Skip all tests if TEST_INTEGRATION is not set
+const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip;
+
+// Validate API keys before running tests
+if (shouldRunIntegrationTests()) {
+  validateApiKeys(["ANTHROPIC_API_KEY", "OPENAI_API_KEY"]);
+}
+
+describeIntegration("IpcMain model_not_found error handling", () => {
+  // Enable retries in CI for flaky API tests
+  if (process.env.CI && typeof jest !== "undefined" && jest.retryTimes) {
+    jest.retryTimes(3, { logErrorsBeforeRetry: true });
+  }
+
+  // Load tokenizer modules once before all tests
+  beforeAll(async () => {
+    const { loadTokenizerModules } = await import("../../src/utils/main/tokenizer");
+    await loadTokenizerModules();
+  }, 30000); // 30s timeout for tokenizer loading
+
+  test.concurrent(
+    "should classify Anthropic 404 as model_not_found (not retryable)",
+    async () => {
+      const { env, workspaceId, cleanup } = await setupWorkspace("anthropic");
+      try {
+        // Send a message with a non-existent model
+        // Anthropic returns 404 with error.type === 'not_found_error'
+        void sendMessageWithModel(
+          env.mockIpcRenderer,
+          workspaceId,
+          "Hello",
+          "anthropic",
+          "invalid-model-that-does-not-exist-xyz123"
+        );
+
+        // Collect events to verify error classification
+        const collector = createEventCollector(env.sentEvents, workspaceId);
+        await waitFor(() => {
+          collector.collect();
+          return collector.getEvents().some((e) => "type" in e && e.type === "stream-error");
+        }, 10000);
+
+        const events = collector.getEvents();
+        const errorEvent = events.find((e) => "type" in e && e.type === "stream-error") as
+          | StreamErrorMessage
+          | undefined;
+
+        expect(errorEvent).toBeDefined();
+        
+        // Bug: Error should be classified as 'model_not_found', not 'api' or 'unknown'
+        // This ensures it's marked as non-retryable in retryEligibility.ts
+        expect(errorEvent?.errorType).toBe("model_not_found");
+      } finally {
+        await cleanup();
+      }
+    },
+    30000 // 30s timeout
+  );
+
+  test.concurrent(
+    "should classify OpenAI 400 model_not_found as model_not_found (not retryable)",
+    async () => {
+      const { env, workspaceId, cleanup } = await setupWorkspace("openai");
+      try {
+        // Send a message with a non-existent model
+        // OpenAI returns 400 with error.code === 'model_not_found'
+        void sendMessageWithModel(
+          env.mockIpcRenderer,
+          workspaceId,
+          "Hello",
+          "openai",
+          "gpt-nonexistent-model-xyz123"
+        );
+
+        // Collect events to verify error classification
+        const collector = createEventCollector(env.sentEvents, workspaceId);
+        await waitFor(() => {
+          collector.collect();
+          return collector.getEvents().some((e) => "type" in e && e.type === "stream-error");
+        }, 10000);
+
+        const events = collector.getEvents();
+        const errorEvent = events.find((e) => "type" in e && e.type === "stream-error") as
+          | StreamErrorMessage
+          | undefined;
+
+        expect(errorEvent).toBeDefined();
+        
+        // Bug: Error should be classified as 'model_not_found', not 'api' or 'unknown'
+        expect(errorEvent?.errorType).toBe("model_not_found");
+      } finally {
+        await cleanup();
+      }
+    },
+    30000 // 30s timeout
+  );
+
+});

From dc772dd2fc47255fcddcc80c3478cdce6d19c784 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Sat, 8 Nov 2025 16:52:42 +0000
Subject: [PATCH 2/3] =?UTF-8?q?=F0=9F=A4=96=20fix:=20prettier=20formatting?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/ipcMain/modelNotFound.test.ts | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/ipcMain/modelNotFound.test.ts b/tests/ipcMain/modelNotFound.test.ts
index 9183cfec9..3521b604c 100644
--- a/tests/ipcMain/modelNotFound.test.ts
+++ b/tests/ipcMain/modelNotFound.test.ts
@@ -53,7 +53,7 @@ describeIntegration("IpcMain model_not_found error handling", () => {
           | undefined;
 
         expect(errorEvent).toBeDefined();
-        
+
         // Bug: Error should be classified as 'model_not_found', not 'api' or 'unknown'
         // This ensures it's marked as non-retryable in retryEligibility.ts
         expect(errorEvent?.errorType).toBe("model_not_found");
@@ -92,7 +92,7 @@ describeIntegration("IpcMain model_not_found error handling", () => {
           | undefined;
 
         expect(errorEvent).toBeDefined();
-        
+
         // Bug: Error should be classified as 'model_not_found', not 'api' or 'unknown'
         expect(errorEvent?.errorType).toBe("model_not_found");
       } finally {
@@ -101,5 +101,4 @@ describeIntegration("IpcMain model_not_found error handling", () => {
     },
     30000 // 30s timeout
   );
-
 });

From f1f0648f962ca1d98ab67318ed809d339eaddfac Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Sat, 8 Nov 2025 16:59:24 +0000
Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=A4=96=20refactor:=20remove=20duplica?=
 =?UTF-8?q?te=20tokenizer=20preloading=20from=20integration=20tests?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Removed manual loadTokenizerModules() calls from 8 integration test files.
The tokenizer is already preloaded globally in tests/setup.ts via the
preloadTestModules() function, which runs once per Jest worker.

This eliminates duplication and ensures consistent test initialization
across all integration tests.

Files cleaned up:
- anthropic1MContext.test.ts
- forkWorkspace.test.ts
- modelNotFound.test.ts
- openai-web-search.test.ts
- resumeStream.test.ts
- sendMessage.test.ts
- streamErrorRecovery.test.ts
- truncate.test.ts

Generated with `cmux`
---
 tests/ipcMain/anthropic1MContext.test.ts  | 7 -------
 tests/ipcMain/forkWorkspace.test.ts       | 7 -------
 tests/ipcMain/modelNotFound.test.ts       | 6 ------
 tests/ipcMain/openai-web-search.test.ts   | 7 -------
 tests/ipcMain/resumeStream.test.ts        | 7 -------
 tests/ipcMain/sendMessage.test.ts         | 6 ------
 tests/ipcMain/streamErrorRecovery.test.ts | 7 -------
 tests/ipcMain/truncate.test.ts            | 7 -------
 8 files changed, 54 deletions(-)

diff --git a/tests/ipcMain/anthropic1MContext.test.ts b/tests/ipcMain/anthropic1MContext.test.ts
index 34c60b27b..f3c0d6fcd 100644
--- a/tests/ipcMain/anthropic1MContext.test.ts
+++ b/tests/ipcMain/anthropic1MContext.test.ts
@@ -20,13 +20,6 @@ describeIntegration("IpcMain anthropic 1M context integration tests", () => {
     jest.retryTimes(3, { logErrorsBeforeRetry: true });
   }
 
-  // Load tokenizer modules once before all tests (takes ~14s)
-  // This ensures accurate token counts for API calls without timing out individual tests
-  beforeAll(async () => {
-    const { loadTokenizerModules } = await import("../../src/utils/main/tokenizer");
-    await loadTokenizerModules();
-  }, 30000); // 30s timeout for tokenizer loading
-
   test.concurrent(
     "should handle larger context with 1M flag enabled vs standard limits",
     async () => {
diff --git a/tests/ipcMain/forkWorkspace.test.ts b/tests/ipcMain/forkWorkspace.test.ts
index efe2280bf..c818ba482 100644
--- a/tests/ipcMain/forkWorkspace.test.ts
+++ b/tests/ipcMain/forkWorkspace.test.ts
@@ -32,13 +32,6 @@ describeIntegration("IpcMain fork workspace integration tests", () => {
     jest.retryTimes(3, { logErrorsBeforeRetry: true });
   }
 
-  // Load tokenizer modules once before all tests (takes ~14s)
-  // This ensures accurate token counts for API calls without timing out individual tests
-  beforeAll(async () => {
-    const { loadTokenizerModules } = await import("../../src/utils/main/tokenizer");
-    await loadTokenizerModules();
-  }, 30000); // 30s timeout for tokenizer loading
-
   test.concurrent(
     "should fail to fork workspace with invalid name",
     async () => {
diff --git a/tests/ipcMain/modelNotFound.test.ts b/tests/ipcMain/modelNotFound.test.ts
index 3521b604c..131a9c02a 100644
--- a/tests/ipcMain/modelNotFound.test.ts
+++ b/tests/ipcMain/modelNotFound.test.ts
@@ -19,12 +19,6 @@ describeIntegration("IpcMain model_not_found error handling", () => {
     jest.retryTimes(3, { logErrorsBeforeRetry: true });
   }
 
-  // Load tokenizer modules once before all tests
-  beforeAll(async () => {
-    const { loadTokenizerModules } = await import("../../src/utils/main/tokenizer");
-    await loadTokenizerModules();
-  }, 30000); // 30s timeout for tokenizer loading
-
   test.concurrent(
     "should classify Anthropic 404 as model_not_found (not retryable)",
     async () => {
diff --git a/tests/ipcMain/openai-web-search.test.ts b/tests/ipcMain/openai-web-search.test.ts
index 441692766..5fd5e4174 100644
--- a/tests/ipcMain/openai-web-search.test.ts
+++ b/tests/ipcMain/openai-web-search.test.ts
@@ -15,13 +15,6 @@ describeIntegration("OpenAI web_search integration tests", () => {
     jest.retryTimes(3, { logErrorsBeforeRetry: true });
   }
 
-  // Load tokenizer modules once before all tests (takes ~14s)
-  // This ensures accurate token counts for API calls without timing out individual tests
-  beforeAll(async () => {
-    const { loadTokenizerModules } = await import("../../src/utils/main/tokenizer");
-    await loadTokenizerModules();
-  }, 30000); // 30s timeout for tokenizer loading
-
   test.concurrent(
     "should handle reasoning + web_search without itemId errors",
     async () => {
diff --git a/tests/ipcMain/resumeStream.test.ts b/tests/ipcMain/resumeStream.test.ts
index 9e03af9a4..dc0b79e18 100644
--- a/tests/ipcMain/resumeStream.test.ts
+++ b/tests/ipcMain/resumeStream.test.ts
@@ -20,13 +20,6 @@ describeIntegration("IpcMain resumeStream integration tests", () => {
     jest.retryTimes(3, { logErrorsBeforeRetry: true });
   }
 
-  // Load tokenizer modules once before all tests (takes ~14s)
-  // This ensures accurate token counts for API calls without timing out individual tests
-  beforeAll(async () => {
-    const { loadTokenizerModules } = await import("../../src/utils/main/tokenizer");
-    await loadTokenizerModules();
-  }, 30000); // 30s timeout for tokenizer loading
-
   test.concurrent(
     "should resume interrupted stream without new user message",
     async () => {
diff --git a/tests/ipcMain/sendMessage.test.ts b/tests/ipcMain/sendMessage.test.ts
index 2363c4bc9..544ec8cda 100644
--- a/tests/ipcMain/sendMessage.test.ts
+++ b/tests/ipcMain/sendMessage.test.ts
@@ -48,12 +48,6 @@ describeIntegration("IpcMain sendMessage integration tests", () => {
     jest.retryTimes(3, { logErrorsBeforeRetry: true });
   }
 
-  // Load tokenizer modules once before all tests (takes ~14s)
-  // This ensures accurate token counts for API calls without timing out individual tests
-  beforeAll(async () => {
-    const { loadTokenizerModules } = await import("../../src/utils/main/tokenizer");
-    await loadTokenizerModules();
-  }, 30000); // 30s timeout for tokenizer loading
   // Run tests for each provider concurrently
   describe.each(PROVIDER_CONFIGS)("%s:%s provider tests", (provider, model) => {
     test.concurrent(
diff --git a/tests/ipcMain/streamErrorRecovery.test.ts b/tests/ipcMain/streamErrorRecovery.test.ts
index 5b4e8e3ce..658704ff5 100644
--- a/tests/ipcMain/streamErrorRecovery.test.ts
+++ b/tests/ipcMain/streamErrorRecovery.test.ts
@@ -220,13 +220,6 @@ describeIntegration("Stream Error Recovery (No Amnesia)", () => {
     jest.retryTimes(3, { logErrorsBeforeRetry: true });
   }
 
-  // Load tokenizer modules once before all tests (takes ~14s)
-  // This ensures accurate token counts for API calls without timing out individual tests
-  beforeAll(async () => {
-    const { loadTokenizerModules } = await import("../../src/utils/main/tokenizer");
-    await loadTokenizerModules();
-  }, 30000); // 30s timeout for tokenizer loading
-
   test.concurrent(
     "should preserve exact prefix and continue from exact point after stream error",
     async () => {
diff --git a/tests/ipcMain/truncate.test.ts b/tests/ipcMain/truncate.test.ts
index d60a837e1..312631c95 100644
--- a/tests/ipcMain/truncate.test.ts
+++ b/tests/ipcMain/truncate.test.ts
@@ -24,13 +24,6 @@ describeIntegration("IpcMain truncate integration tests", () => {
     jest.retryTimes(3, { logErrorsBeforeRetry: true });
   }
 
-  // Load tokenizer modules once before all tests (takes ~14s)
-  // This ensures accurate token counts for API calls without timing out individual tests
-  beforeAll(async () => {
-    const { loadTokenizerModules } = await import("../../src/utils/main/tokenizer");
-    await loadTokenizerModules();
-  }, 30000); // 30s timeout for tokenizer loading
-
   test.concurrent(
     "should truncate 50% of chat history and verify context is updated",
     async () => {