Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 23 additions & 19 deletions .github/workflows/terminal-bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,34 +4,34 @@ on:
workflow_call:
inputs:
model_name:
description: "Model to use (e.g., anthropic:claude-sonnet-4-5)"
description: 'Model to use (e.g., anthropic:claude-sonnet-4-5)'
required: false
type: string
thinking_level:
description: "Thinking level (off, low, medium, high)"
description: 'Thinking level (off, low, medium, high)'
required: false
type: string
dataset:
description: "Terminal-Bench dataset to use"
description: 'Terminal-Bench dataset to use'
required: false
type: string
default: "terminal-bench-core==0.1.1"
default: 'terminal-bench-core==0.1.1'
concurrency:
description: "Number of concurrent tasks (--n-concurrent)"
description: 'Number of concurrent tasks (--n-concurrent)'
required: false
type: string
default: "4"
default: '4'
livestream:
description: "Enable livestream mode"
description: 'Enable livestream mode'
required: false
type: boolean
default: true
sample_size:
description: "Number of random tasks to run (empty = all tasks)"
description: 'Number of random tasks to run (empty = all tasks)'
required: false
type: string
extra_args:
description: "Additional arguments to pass to terminal-bench"
description: 'Additional arguments to pass to terminal-bench'
required: false
type: string
secrets:
Expand All @@ -42,34 +42,34 @@ on:
workflow_dispatch:
inputs:
dataset:
description: "Terminal-Bench dataset to use"
description: 'Terminal-Bench dataset to use'
required: false
default: "terminal-bench-core==0.1.1"
default: 'terminal-bench-core==0.1.1'
type: string
concurrency:
description: "Number of concurrent tasks (--n-concurrent)"
description: 'Number of concurrent tasks (--n-concurrent)'
required: false
default: "4"
default: '4'
type: string
livestream:
description: "Enable livestream mode"
description: 'Enable livestream mode'
required: false
default: true
type: boolean
sample_size:
description: "Number of random tasks to run (empty = all tasks)"
description: 'Number of random tasks to run (empty = all tasks)'
required: false
type: string
model_name:
description: "Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)"
description: 'Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)'
required: false
type: string
thinking_level:
description: "Thinking level (off, low, medium, high)"
description: 'Thinking level (off, low, medium, high)'
required: false
type: string
extra_args:
description: "Additional arguments to pass to terminal-bench"
description: 'Additional arguments to pass to terminal-bench'
required: false
type: string

Expand Down Expand Up @@ -97,6 +97,9 @@ jobs:
- name: Generate version file
run: ./scripts/generate-version.sh

- name: Build dist/ (skip icons - not needed for benchmark)
run: make build-main build-preload

- name: Run Terminal-Bench
run: make benchmark-terminal
env:
Expand All @@ -120,7 +123,7 @@ jobs:
cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE"
echo ""
echo "Per-task summary:"
cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .is_resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
else
echo "No results.json found in runs/"
ls -la runs/
Expand Down Expand Up @@ -148,3 +151,4 @@ jobs:
runs/
if-no-files-found: warn
retention-days: 30

5 changes: 3 additions & 2 deletions tests/ipcMain/initWorkspace.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -701,8 +701,9 @@ exit 1
// ASSERTION 7: Second message should be MUCH faster than first
// First message had to wait ~5 seconds for init. Second should be instant.
const secondMessageDuration = Date.now() - startSecondMessage;
// Allow 10 seconds for API round-trip but should be way less than first message
expect(secondMessageDuration).toBeLessThan(10000);
// Allow up to 15 seconds for the API round-trip; in practice this should take far less time than the first message.
// The limit was raised to account for CI runner variability.
expect(secondMessageDuration).toBeLessThan(15000);

// Log timing for debugging
console.log(`Second message completed in ${secondMessageDuration}ms (no init wait)`);
Expand Down
6 changes: 4 additions & 2 deletions tests/ipcMain/runtimeExecuteBash.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -325,8 +325,10 @@ describeIntegration("Runtime Bash Execution", () => {
expect(responseText).toContain("data");

// Verify command completed quickly (not hanging until timeout)
// Should complete in under 5 seconds for SSH, 3 seconds for local
const maxDuration = type === "ssh" ? 8000 : 5000;
// Should complete in under 15 seconds for SSH and under 10 seconds for local.
// These limits are deliberately generous to absorb CI runner variability;
// an actual hang would instead run into the bash tool's 180s timeout.
const maxDuration = type === "ssh" ? 15000 : 10000;
expect(duration).toBeLessThan(maxDuration);

// Verify bash tool was called
Expand Down
Loading