Skip to content

Commit 8542a87

Browse files
committed
🤖 fix: build dist/ before running terminal-bench
PR #507 added `dist/` to the terminal-bench archive include paths to fix worker crashes. However, the workflow wasn't building `dist/` before running the benchmark, causing all tasks to fail immediately with:

```
Error running agent for task <name>: Required file /home/runner/work/cmux/cmux/dist missing
```

The workflow now runs `make build` before `make benchmark-terminal` to ensure `dist/` exists and contains the compiled worker files.

Verified with workflow run #19140594821, which successfully completed the modernize-fortran-build task.
1 parent e99516e commit 8542a87

File tree

1 file changed

+23
-19
lines changed

1 file changed

+23
-19
lines changed

.github/workflows/terminal-bench.yml

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,34 +4,34 @@ on:
44
workflow_call:
55
inputs:
66
model_name:
7-
description: "Model to use (e.g., anthropic:claude-sonnet-4-5)"
7+
description: 'Model to use (e.g., anthropic:claude-sonnet-4-5)'
88
required: false
99
type: string
1010
thinking_level:
11-
description: "Thinking level (off, low, medium, high)"
11+
description: 'Thinking level (off, low, medium, high)'
1212
required: false
1313
type: string
1414
dataset:
15-
description: "Terminal-Bench dataset to use"
15+
description: 'Terminal-Bench dataset to use'
1616
required: false
1717
type: string
18-
default: "terminal-bench-core==0.1.1"
18+
default: 'terminal-bench-core==0.1.1'
1919
concurrency:
20-
description: "Number of concurrent tasks (--n-concurrent)"
20+
description: 'Number of concurrent tasks (--n-concurrent)'
2121
required: false
2222
type: string
23-
default: "4"
23+
default: '4'
2424
livestream:
25-
description: "Enable livestream mode"
25+
description: 'Enable livestream mode'
2626
required: false
2727
type: boolean
2828
default: true
2929
sample_size:
30-
description: "Number of random tasks to run (empty = all tasks)"
30+
description: 'Number of random tasks to run (empty = all tasks)'
3131
required: false
3232
type: string
3333
extra_args:
34-
description: "Additional arguments to pass to terminal-bench"
34+
description: 'Additional arguments to pass to terminal-bench'
3535
required: false
3636
type: string
3737
secrets:
@@ -42,34 +42,34 @@ on:
4242
workflow_dispatch:
4343
inputs:
4444
dataset:
45-
description: "Terminal-Bench dataset to use"
45+
description: 'Terminal-Bench dataset to use'
4646
required: false
47-
default: "terminal-bench-core==0.1.1"
47+
default: 'terminal-bench-core==0.1.1'
4848
type: string
4949
concurrency:
50-
description: "Number of concurrent tasks (--n-concurrent)"
50+
description: 'Number of concurrent tasks (--n-concurrent)'
5151
required: false
52-
default: "4"
52+
default: '4'
5353
type: string
5454
livestream:
55-
description: "Enable livestream mode"
55+
description: 'Enable livestream mode'
5656
required: false
5757
default: true
5858
type: boolean
5959
sample_size:
60-
description: "Number of random tasks to run (empty = all tasks)"
60+
description: 'Number of random tasks to run (empty = all tasks)'
6161
required: false
6262
type: string
6363
model_name:
64-
description: "Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)"
64+
description: 'Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)'
6565
required: false
6666
type: string
6767
thinking_level:
68-
description: "Thinking level (off, low, medium, high)"
68+
description: 'Thinking level (off, low, medium, high)'
6969
required: false
7070
type: string
7171
extra_args:
72-
description: "Additional arguments to pass to terminal-bench"
72+
description: 'Additional arguments to pass to terminal-bench'
7373
required: false
7474
type: string
7575

@@ -97,6 +97,9 @@ jobs:
9797
- name: Generate version file
9898
run: ./scripts/generate-version.sh
9999

100+
- name: Build dist/
101+
run: make build
102+
100103
- name: Run Terminal-Bench
101104
run: make benchmark-terminal
102105
env:
@@ -120,7 +123,7 @@ jobs:
120123
cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE"
121124
echo ""
122125
echo "Per-task summary:"
123-
cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .is_resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
126+
cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
124127
else
125128
echo "No results.json found in runs/"
126129
ls -la runs/
@@ -148,3 +151,4 @@ jobs:
148151
runs/
149152
if-no-files-found: warn
150153
retention-days: 30
154+

0 commit comments

Comments
 (0)