From 8542a872c67652a4b68419fa11d2d7656ed3dc5f Mon Sep 17 00:00:00 2001 From: Ammar Date: Thu, 6 Nov 2025 15:27:32 +0000 Subject: [PATCH] =?UTF-8?q?=F0=9F=A4=96=20fix:=20build=20dist/=20before=20?= =?UTF-8?q?running=20terminal-bench?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #507 added `dist/` to the terminal-bench archive include paths to fix worker crashes. However, the workflow wasn't building `dist/` before running the benchmark, causing all tasks to fail immediately with: ``` Error running agent for task : Required file /home/runner/work/cmux/cmux/dist missing ``` Now runs `make build` before `make benchmark-terminal` to ensure dist/ exists and contains the compiled worker files. Verified with workflow run #19140594821 which successfully completed the modernize-fortran-build task. --- .github/workflows/terminal-bench.yml | 42 +++++++++++++++------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml index 0c75a4c3d..e573f6781 100644 --- a/.github/workflows/terminal-bench.yml +++ b/.github/workflows/terminal-bench.yml @@ -4,34 +4,34 @@ on: workflow_call: inputs: model_name: - description: "Model to use (e.g., anthropic:claude-sonnet-4-5)" + description: 'Model to use (e.g., anthropic:claude-sonnet-4-5)' required: false type: string thinking_level: - description: "Thinking level (off, low, medium, high)" + description: 'Thinking level (off, low, medium, high)' required: false type: string dataset: - description: "Terminal-Bench dataset to use" + description: 'Terminal-Bench dataset to use' required: false type: string - default: "terminal-bench-core==0.1.1" + default: 'terminal-bench-core==0.1.1' concurrency: - description: "Number of concurrent tasks (--n-concurrent)" + description: 'Number of concurrent tasks (--n-concurrent)' required: false type: string - default: "4" + default: '4' livestream: - description: "Enable livestream mode" + description: 'Enable livestream mode' required: false type: boolean default: true sample_size: - description: "Number of random tasks to run (empty = all tasks)" + description: 'Number of random tasks to run (empty = all tasks)' required: false type: string extra_args: - description: "Additional arguments to pass to terminal-bench" + description: 'Additional arguments to pass to terminal-bench' required: false type: string secrets: @@ -42,34 +42,34 @@ on: workflow_dispatch: inputs: dataset: - description: "Terminal-Bench dataset to use" + description: 'Terminal-Bench dataset to use' required: false - default: "terminal-bench-core==0.1.1" + default: 'terminal-bench-core==0.1.1' type: string concurrency: - description: "Number of concurrent tasks (--n-concurrent)" + description: 'Number of concurrent tasks (--n-concurrent)' required: false - default: "4" + default: '4' type: string livestream: - description: "Enable livestream mode" + description: 'Enable livestream mode' required: false default: true type: boolean sample_size: - description: "Number of random tasks to run (empty = all tasks)" + description: 'Number of random tasks to run (empty = all tasks)' required: false type: string model_name: - description: "Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)" + description: 'Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)' required: false type: string thinking_level: - description: "Thinking level (off, low, medium, high)" + description: 'Thinking level (off, low, medium, high)' required: false type: string extra_args: - description: "Additional arguments to pass to terminal-bench" + description: 'Additional arguments to pass to terminal-bench' required: false type: string @@ -97,6 +97,9 @@ jobs: - name: Generate version file run: ./scripts/generate-version.sh + - name: Build dist/ + run: make build + - name: Run Terminal-Bench run: make benchmark-terminal env: @@ -120,7 +123,7 @@ jobs: cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE" echo "" echo "Per-task summary:" - cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .is_resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details" + cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details" else echo "No results.json found in runs/" ls -la runs/ @@ -148,3 +151,4 @@ jobs: runs/ if-no-files-found: warn retention-days: 30 +