From 8542a872c67652a4b68419fa11d2d7656ed3dc5f Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Thu, 6 Nov 2025 15:27:32 +0000
Subject: [PATCH] =?UTF-8?q?=F0=9F=A4=96=20fix:=20build=20dist/=20before=20?=
 =?UTF-8?q?running=20terminal-bench?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #507 added `dist/` to the terminal-bench archive include paths to fix worker crashes. However, the workflow wasn't building `dist/` before running the benchmark, causing all tasks to fail immediately with:

```
Error running agent for task <name>: Required file /home/runner/work/cmux/cmux/dist missing
```

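The workflow now runs `make build` before `make benchmark-terminal` so that `dist/` exists and contains the compiled worker files.

This patch also switches the workflow inputs' `description`/`default` strings from double to single quotes and changes the per-task summary `jq` filter to read `.resolved` instead of `.is_resolved`.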

Verified with workflow run #19140594821, which successfully completed the `modernize-fortran-build` task.
---
 .github/workflows/terminal-bench.yml | 42 +++++++++++++++-------------
 1 file changed, 23 insertions(+), 19 deletions(-)
diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index 0c75a4c3d..e573f6781 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -4,34 +4,34 @@ on:
   workflow_call:
     inputs:
       model_name:
-        description: "Model to use (e.g., anthropic:claude-sonnet-4-5)"
+        description: 'Model to use (e.g., anthropic:claude-sonnet-4-5)'
         required: false
         type: string
       thinking_level:
-        description: "Thinking level (off, low, medium, high)"
+        description: 'Thinking level (off, low, medium, high)'
         required: false
         type: string
       dataset:
-        description: "Terminal-Bench dataset to use"
+        description: 'Terminal-Bench dataset to use'
         required: false
         type: string
-        default: "terminal-bench-core==0.1.1"
+        default: 'terminal-bench-core==0.1.1'
       concurrency:
-        description: "Number of concurrent tasks (--n-concurrent)"
+        description: 'Number of concurrent tasks (--n-concurrent)'
         required: false
         type: string
-        default: "4"
+        default: '4'
       livestream:
-        description: "Enable livestream mode"
+        description: 'Enable livestream mode'
         required: false
         type: boolean
         default: true
       sample_size:
-        description: "Number of random tasks to run (empty = all tasks)"
+        description: 'Number of random tasks to run (empty = all tasks)'
         required: false
         type: string
       extra_args:
-        description: "Additional arguments to pass to terminal-bench"
+        description: 'Additional arguments to pass to terminal-bench'
         required: false
         type: string
     secrets:
@@ -42,34 +42,34 @@ on:
   workflow_dispatch:
     inputs:
       dataset:
-        description: "Terminal-Bench dataset to use"
+        description: 'Terminal-Bench dataset to use'
         required: false
-        default: "terminal-bench-core==0.1.1"
+        default: 'terminal-bench-core==0.1.1'
         type: string
       concurrency:
-        description: "Number of concurrent tasks (--n-concurrent)"
+        description: 'Number of concurrent tasks (--n-concurrent)'
         required: false
-        default: "4"
+        default: '4'
         type: string
       livestream:
-        description: "Enable livestream mode"
+        description: 'Enable livestream mode'
         required: false
         default: true
         type: boolean
       sample_size:
-        description: "Number of random tasks to run (empty = all tasks)"
+        description: 'Number of random tasks to run (empty = all tasks)'
         required: false
         type: string
       model_name:
-        description: "Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)"
+        description: 'Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)'
         required: false
         type: string
       thinking_level:
-        description: "Thinking level (off, low, medium, high)"
+        description: 'Thinking level (off, low, medium, high)'
         required: false
         type: string
       extra_args:
-        description: "Additional arguments to pass to terminal-bench"
+        description: 'Additional arguments to pass to terminal-bench'
         required: false
         type: string
 
@@ -97,6 +97,9 @@ jobs:
       - name: Generate version file
         run: ./scripts/generate-version.sh
 
+      - name: Build dist/
+        run: make build
+
       - name: Run Terminal-Bench
         run: make benchmark-terminal
         env:
@@ -120,7 +123,7 @@ jobs:
             cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE"
             echo ""
             echo "Per-task summary:"
-            cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .is_resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
+            cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
           else
             echo "No results.json found in runs/"
             ls -la runs/
@@ -148,3 +151,4 @@ jobs:
             runs/
           if-no-files-found: warn
           retention-days: 30
+