coder · ammar-agent · Nov 6, 2025
diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
@@ -4,34 +4,34 @@ on:
   workflow_call:
     inputs:
       model_name:
-        description: "Model to use (e.g., anthropic:claude-sonnet-4-5)"
+        description: 'Model to use (e.g., anthropic:claude-sonnet-4-5)'
         required: false
         type: string
       thinking_level:
-        description: "Thinking level (off, low, medium, high)"
+        description: 'Thinking level (off, low, medium, high)'
         required: false
         type: string
       dataset:
-        description: "Terminal-Bench dataset to use"
+        description: 'Terminal-Bench dataset to use'
         required: false
         type: string
-        default: "terminal-bench-core==0.1.1"
+        default: 'terminal-bench-core==0.1.1'
       concurrency:
-        description: "Number of concurrent tasks (--n-concurrent)"
+        description: 'Number of concurrent tasks (--n-concurrent)'
         required: false
         type: string
-        default: "4"
+        default: '4'
       livestream:
-        description: "Enable livestream mode"
+        description: 'Enable livestream mode'
         required: false
         type: boolean
         default: true
       sample_size:
-        description: "Number of random tasks to run (empty = all tasks)"
+        description: 'Number of random tasks to run (empty = all tasks)'
         required: false
         type: string
       extra_args:
-        description: "Additional arguments to pass to terminal-bench"
+        description: 'Additional arguments to pass to terminal-bench'
         required: false
         type: string
     secrets:
@@ -42,34 +42,34 @@ on:
   workflow_dispatch:
     inputs:
       dataset:
-        description: "Terminal-Bench dataset to use"
+        description: 'Terminal-Bench dataset to use'
         required: false
-        default: "terminal-bench-core==0.1.1"
+        default: 'terminal-bench-core==0.1.1'
         type: string
       concurrency:
-        description: "Number of concurrent tasks (--n-concurrent)"
+        description: 'Number of concurrent tasks (--n-concurrent)'
         required: false
-        default: "4"
+        default: '4'
         type: string
       livestream:
-        description: "Enable livestream mode"
+        description: 'Enable livestream mode'
         required: false
         default: true
         type: boolean
       sample_size:
-        description: "Number of random tasks to run (empty = all tasks)"
+        description: 'Number of random tasks to run (empty = all tasks)'
         required: false
         type: string
       model_name:
-        description: "Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)"
+        description: 'Model to use (e.g., anthropic:claude-sonnet-4-5, openai:gpt-5-codex)'
         required: false
         type: string
       thinking_level:
-        description: "Thinking level (off, low, medium, high)"
+        description: 'Thinking level (off, low, medium, high)'
         required: false
         type: string
       extra_args:
-        description: "Additional arguments to pass to terminal-bench"
+        description: 'Additional arguments to pass to terminal-bench'
         required: false
         type: string
 
@@ -97,6 +97,9 @@ jobs:
       - name: Generate version file
         run: ./scripts/generate-version.sh
 
+      - name: Build dist/
+        run: make build
+
       - name: Run Terminal-Bench
         run: make benchmark-terminal
         env:
@@ -120,7 +123,7 @@ jobs:
             cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE"
             echo ""
             echo "Per-task summary:"
-            cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .is_resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
+            cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
           else
             echo "No results.json found in runs/"
             ls -la runs/
@@ -148,3 +151,4 @@ jobs:
             runs/
           if-no-files-found: warn
           retention-days: 30
+