diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml index 0c75a4c3d..6db5b58fb 100644 --- a/.github/workflows/terminal-bench.yml +++ b/.github/workflows/terminal-bench.yml @@ -4,34 +4,34 @@ on: workflow_call: inputs: model_name: - description: "Model to use (e.g., anthropic:claude-sonnet-4-5)" + description: 'Model to use (e.g., anthropic:claude-sonnet-4-5)' required: false type: string thinking_level: - description: "Thinking level (off, low, medium, high)" + description: 'Thinking level (off, low, medium, high)' required: false type: string dataset: - description: "Terminal-Bench dataset to use" + description: 'Terminal-Bench dataset to use' required: false type: string - default: "terminal-bench-core==0.1.1" + default: 'terminal-bench-core==0.1.1' concurrency: - description: "Number of concurrent tasks (--n-concurrent)" + description: 'Number of concurrent tasks (--n-concurrent)' required: false type: string - default: "4" + default: '4' livestream: - description: "Enable livestream mode" + description: 'Enable livestream mode' required: false type: boolean default: true sample_size: - description: "Number of random tasks to run (empty = all tasks)" + description: 'Number of random tasks to run (empty = all tasks)' required: false type: string extra_args: - description: "Additional arguments to pass to terminal-bench" + description: 'Additional arguments to pass to terminal-bench' required: false type: string secrets: @@ -42,34 +42,34 @@ on: workflow_dispatch: inputs: dataset: - description: "Terminal-Bench dataset to use" + description: 'Terminal-Bench dataset to use' required: false - default: "terminal-bench-core==0.1.1" + default: 'terminal-bench-core==0.1.1' type: string concurrency: - description: "Number of concurrent tasks (--n-concurrent)" + description: 'Number of concurrent tasks (--n-concurrent)' required: false - default: "4" + default: '4' type: string livestream: - description: "Enable livestream mode" + description: 'Enable livestream mode' required: false default: true type: boolean sample_size: - description: "Number of random tasks to run (empty = all tasks)" + description: 'Number of random tasks to run (empty = all tasks)' required: false type: string model_name: - description: "Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)" + description: 'Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)' required: false type: string thinking_level: - description: "Thinking level (off, low, medium, high)" + description: 'Thinking level (off, low, medium, high)' required: false type: string extra_args: - description: "Additional arguments to pass to terminal-bench" + description: 'Additional arguments to pass to terminal-bench' required: false type: string @@ -97,6 +97,9 @@ jobs: - name: Generate version file run: ./scripts/generate-version.sh + - name: Build dist/ (skip icons - not needed for benchmark) + run: make build-main build-preload + - name: Run Terminal-Bench run: make benchmark-terminal env: @@ -120,7 +123,7 @@ jobs: cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE" echo "" echo "Per-task summary:" - cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .is_resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details" + cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details" else echo "No results.json found in runs/" ls -la runs/ @@ -148,3 +151,4 @@ jobs: runs/ if-no-files-found: warn retention-days: 30 + diff --git a/tests/ipcMain/initWorkspace.test.ts b/tests/ipcMain/initWorkspace.test.ts index 82a59090b..a9f542957 100644 --- a/tests/ipcMain/initWorkspace.test.ts +++ b/tests/ipcMain/initWorkspace.test.ts @@ -701,8 +701,9 @@ exit 1 // ASSERTION 7: Second message should be MUCH faster than first // First message had to wait ~5 seconds for init. Second should be instant. const secondMessageDuration = Date.now() - startSecondMessage; - // Allow 10 seconds for API round-trip but should be way less than first message - expect(secondMessageDuration).toBeLessThan(10000); + // Allow 15 seconds for API round-trip but should be way less than first message + // Increased timeout to account for CI runner variability + expect(secondMessageDuration).toBeLessThan(15000); // Log timing for debugging console.log(`Second message completed in ${secondMessageDuration}ms (no init wait)`); diff --git a/tests/ipcMain/runtimeExecuteBash.test.ts b/tests/ipcMain/runtimeExecuteBash.test.ts index 4f7d5288b..4861bcced 100644 --- a/tests/ipcMain/runtimeExecuteBash.test.ts +++ b/tests/ipcMain/runtimeExecuteBash.test.ts @@ -325,8 +325,10 @@ describeIntegration("Runtime Bash Execution", () => { expect(responseText).toContain("data"); // Verify command completed quickly (not hanging until timeout) - // Should complete in under 5 seconds for SSH, 3 seconds for local - const maxDuration = type === "ssh" ? 8000 : 5000; + // Should complete in under 15 seconds for SSH, 10 seconds for local + // Generous timeouts to account for CI runner variability + // (actual hangs would hit bash tool's 180s timeout) + const maxDuration = type === "ssh" ? 15000 : 10000; expect(duration).toBeLessThan(maxDuration); // Verify bash tool was called