From b623bae979ecdccad4a3a62c7080fce1346f9c38 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Sat, 8 Nov 2025 16:29:35 +0000
Subject: [PATCH 1/7] =?UTF-8?q?=F0=9F=A4=96=20fix:=20prevent=20terminal-be?=
 =?UTF-8?q?nch=20timeout=20bypass=20and=20add=20robust=20timeout=20handlin?=
 =?UTF-8?q?g?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

**Root Cause:**
The agent was setting max_timeout_sec=float('inf'), which bypassed terminal-bench's
timeout enforcement and allowed tasks to hang indefinitely. The Nov 8 nightly run
hit the 3-hour workflow timeout when tasks hung.

**Changes:**

1. **Critical Fix (cmux_agent.py):**
   - Change max_timeout_sec from float('inf') to None
   - Allows terminal-bench to properly enforce timeouts
   - Prevents infinite task execution

2. **Defense-in-Depth Timeout Layers:**
   - Terminal-bench: --global-agent-timeout-sec (configurable)
   - Cmux agent: --timeout via CMUX_TIMEOUT_MS (Makefile exports)
   - Shell: timeout command with 60s buffer (cmux-run.sh)
   - Workflow: 240 min total timeout (up from 180 min)

3. **Nightly Configuration:**
   - Aggressive 15-min per-task timeout (down from 30 min default)
   - Faster detection of hung tasks
   - Added task_timeout parameter to workflows

4. **Improved Monitoring:**
   - Results summary shows pass rate percentage
   - Detects and reports timeout-related failures
   - Better error messages when results missing

**Testing:**
- Syntax validated (Python, Bash, YAML)
- Typecheck passes
- Formatting applied

Successful runs typically complete in ~60-90 minutes. The 4-hour workflow
timeout provides headroom for API slowdowns while preventing infinite hangs.

_Generated with `cmux`_
---
 .github/workflows/nightly-terminal-bench.yml |  3 ++
 .github/workflows/terminal-bench.yml         | 40 +++++++++++++++++---
 Makefile                                     |  1 +
 benchmarks/terminal_bench/cmux-run.sh        | 18 ++++++++-
 benchmarks/terminal_bench/cmux_agent.py      |  4 +-
 5 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/nightly-terminal-bench.yml b/.github/workflows/nightly-terminal-bench.yml
index 119c20dcc..5bb10a8e8 100644
--- a/.github/workflows/nightly-terminal-bench.yml
+++ b/.github/workflows/nightly-terminal-bench.yml
@@ -45,6 +45,9 @@ jobs:
       dataset: "terminal-bench-core==0.1.1"
       concurrency: "4"
       livestream: true
+      # Set aggressive per-task timeout (15 min) to catch hung tasks faster
+      # Tasks that consistently hit this timeout indicate bugs in task handling
+      task_timeout: "900"
     secrets:
       ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index 6db5b58fb..78d7db57d 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -34,6 +34,11 @@ on:
         description: 'Additional arguments to pass to terminal-bench'
         required: false
         type: string
+      task_timeout:
+        description: 'Per-task timeout in seconds (default: 1800 = 30 min)'
+        required: false
+        type: string
+        default: '1800'
     secrets:
       ANTHROPIC_API_KEY:
         required: true
@@ -72,14 +77,20 @@ on:
         description: 'Additional arguments to pass to terminal-bench'
         required: false
         type: string
+      task_timeout:
+        description: 'Per-task timeout in seconds (default: 1800 = 30 min)'
+        required: false
+        type: string
+        default: '1800'
 
 jobs:
   benchmark:
     name: Run Terminal-Bench${{ inputs.model_name && format(' ({0})', inputs.model_name) || '' }}
     runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }}
-    # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes
-    # Allow 3 hours for safety margin and slower tasks
-    timeout-minutes: 180
+    # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes typically
+    # Set 4-hour timeout to handle occasional API slowdowns while preventing infinite hangs
+    # If consistently hitting this timeout, investigate task-level issues
+    timeout-minutes: 240
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -107,6 +118,7 @@ jobs:
           TB_CONCURRENCY: ${{ inputs.concurrency }}
           TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }}
           TB_SAMPLE_SIZE: ${{ inputs.sample_size }}
+          TB_TIMEOUT: ${{ inputs.task_timeout }}
           TB_ARGS: ${{ inputs.model_name && format('--agent-kwarg model_name={0} --agent-kwarg thinking_level={1} {2}', inputs.model_name, inputs.thinking_level, inputs.extra_args) || inputs.extra_args }}
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
@@ -115,7 +127,7 @@ jobs:
         if: always()
         run: |
           echo "=== Terminal-Bench Results Summary ==="
-          if [ -f "$(find runs -name 'results.json' | head -1)" ]; then
+          if [ -f "$(find runs -name 'results.json' 2>/dev/null | head -1)" ]; then
             RESULTS_FILE=$(find runs -name 'results.json' | head -1)
             echo "Results file: $RESULTS_FILE"
             echo ""
@@ -124,9 +136,25 @@ jobs:
             echo ""
             echo "Per-task summary:"
             cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
+            echo ""
+            echo "Pass rate:"
+            TOTAL=$(cat "$RESULTS_FILE" | jq '.trials | length')
+            PASSED=$(cat "$RESULTS_FILE" | jq '[.trials[] | select(.resolved)] | length')
+            echo "$PASSED/$TOTAL tasks passed ($(echo "scale=1; $PASSED * 100 / $TOTAL" | bc)%)"
+            
+            # Check for timeout indicators
+            TIMED_OUT=$(cat "$RESULTS_FILE" | jq '[.trials[] | select(.resolved == false and (.error // "" | contains("timeout") or contains("Timed out")))] | length')
+            if [ "$TIMED_OUT" -gt 0 ]; then
+              echo "⚠️  WARNING: $TIMED_OUT tasks failed due to timeout"
+              echo "Consider investigating task performance or increasing task_timeout"
+            fi
           else
-            echo "No results.json found in runs/"
-            ls -la runs/
+            echo "❌ No results.json found in runs/"
+            if [ -d "runs" ]; then
+              ls -laR runs/ || echo "Failed to list runs directory"
+            else
+              echo "runs/ directory does not exist"
+            fi
           fi
 
       - name: Set artifact name
diff --git a/Makefile b/Makefile
index 12f66deb4..8392bd149 100644
--- a/Makefile
+++ b/Makefile
@@ -320,6 +320,7 @@ benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB
 	fi; \
 	echo "Using timeout: $$TB_TIMEOUT seconds"; \
 	echo "Running Terminal-Bench with dataset $$TB_DATASET"; \
+	export CMUX_TIMEOUT_MS=$$((TB_TIMEOUT * 1000)); \
 	uvx terminal-bench run \
 		--dataset "$$TB_DATASET" \
 		--agent-import-path benchmarks.terminal_bench.cmux_agent:CmuxAgent \
diff --git a/benchmarks/terminal_bench/cmux-run.sh b/benchmarks/terminal_bench/cmux-run.sh
index 82d126c6d..de0e941ce 100644
--- a/benchmarks/terminal_bench/cmux-run.sh
+++ b/benchmarks/terminal_bench/cmux-run.sh
@@ -94,6 +94,20 @@ if [[ -n "${CMUX_THINKING_LEVEL}" ]]; then
   cmd+=(--thinking-level "${CMUX_THINKING_LEVEL}")
 fi
 
-if ! printf '%s' "${instruction}" | "${cmd[@]}"; then
-  fatal "cmux agent session failed"
+# Run with timeout if available (fallback to running without timeout on older systems)
+if command -v timeout >/dev/null 2>&1 && [[ -n "${CMUX_TIMEOUT_MS}" ]]; then
+  # Add 60s buffer to allow cmux's internal timeout to trigger first
+  SHELL_TIMEOUT_SEC=$((CMUX_TIMEOUT_MS / 1000 + 60))
+  log "enforcing shell-level timeout of ${SHELL_TIMEOUT_SEC}s (cmux timeout: ${CMUX_TIMEOUT_MS}ms)"
+  if ! printf '%s' "${instruction}" | timeout "${SHELL_TIMEOUT_SEC}s" "${cmd[@]}"; then
+    EXIT_CODE=$?
+    if [ $EXIT_CODE -eq 124 ]; then
+      fatal "shell timeout reached (${SHELL_TIMEOUT_SEC}s) - task exceeded maximum duration"
+    fi
+    fatal "cmux agent session failed (exit code: $EXIT_CODE)"
+  fi
+else
+  if ! printf '%s' "${instruction}" | "${cmd[@]}"; then
+    fatal "cmux agent session failed"
+  fi
 fi
diff --git a/benchmarks/terminal_bench/cmux_agent.py b/benchmarks/terminal_bench/cmux_agent.py
index 9bb9d93f9..c909b551f 100644
--- a/benchmarks/terminal_bench/cmux_agent.py
+++ b/benchmarks/terminal_bench/cmux_agent.py
@@ -193,11 +193,13 @@ def _ensure_payload_staged(self, session: TmuxSession) -> None:
     def _run_agent_commands(self, instruction: str) -> list[TerminalCommand]:
         escaped = shlex.quote(instruction)
         command = f"bash /installed-agent/{self._RUNNER_NAME} {escaped}"
+        # Use terminal-bench's global timeout instead of bypassing it
+        # This allows terminal-bench to properly enforce timeouts and prevent hung tasks
         return [
             TerminalCommand(
                 command=command,
                 min_timeout_sec=0.0,
-                max_timeout_sec=float("inf"),
+                max_timeout_sec=None,  # Use global timeout from --global-agent-timeout-sec
                 block=True,
                 append_enter=True,
             )

From 9b645b94bca1c9fb14f2a4497b15c304f8b1ab6f Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Sat, 8 Nov 2025 16:31:34 +0000
Subject: [PATCH 2/7] fix: use python3 instead of python in benchmark-terminal
 target

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 8392bd149..a27559132 100644
--- a/Makefile
+++ b/Makefile
@@ -305,7 +305,7 @@ benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB
 		echo "Ensuring dataset $$TB_DATASET is downloaded..."; \
 		uvx terminal-bench datasets download --dataset "$$TB_DATASET" 2>&1 | grep -v "already exists" || true; \
 		echo "Sampling $$TB_SAMPLE_SIZE tasks from $$TB_DATASET..."; \
-		TASK_IDS=$$(python benchmarks/terminal_bench/sample_tasks.py --dataset "$$TB_DATASET" --sample-size "$$TB_SAMPLE_SIZE" --format space) || { \
+		TASK_IDS=$$(python3 benchmarks/terminal_bench/sample_tasks.py --dataset "$$TB_DATASET" --sample-size "$$TB_SAMPLE_SIZE" --format space) || { \
 			echo "Error: Failed to sample tasks" >&2; \
 			exit 1; \
 		}; \

From d3df7974048e87bcef702e563b6f153e70c9c640 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Sat, 8 Nov 2025 16:33:47 +0000
Subject: [PATCH 3/7] fix: remove max_timeout_sec override to use
 terminal-bench defaults

TerminalCommand requires max_timeout_sec to be a float, not None.
By not setting it at all, we use the Pydantic default (180s) while
still allowing terminal-bench's --global-agent-timeout-sec to take
precedence.
---
 benchmarks/terminal_bench/cmux_agent.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/terminal_bench/cmux_agent.py b/benchmarks/terminal_bench/cmux_agent.py
index c909b551f..c1fc7f4d8 100644
--- a/benchmarks/terminal_bench/cmux_agent.py
+++ b/benchmarks/terminal_bench/cmux_agent.py
@@ -193,13 +193,13 @@ def _ensure_payload_staged(self, session: TmuxSession) -> None:
     def _run_agent_commands(self, instruction: str) -> list[TerminalCommand]:
         escaped = shlex.quote(instruction)
         command = f"bash /installed-agent/{self._RUNNER_NAME} {escaped}"
-        # Use terminal-bench's global timeout instead of bypassing it
-        # This allows terminal-bench to properly enforce timeouts and prevent hung tasks
+        # Don't override max_timeout_sec - let terminal-bench enforce its global timeout
+        # Previously this was set to float("inf") which bypassed timeout controls
         return [
             TerminalCommand(
                 command=command,
                 min_timeout_sec=0.0,
-                max_timeout_sec=None,  # Use global timeout from --global-agent-timeout-sec
+                # max_timeout_sec uses default (180s) but global timeout takes precedence
                 block=True,
                 append_enter=True,
             )

From 3e639dc37de5827bdf8ff39a4157fbc403294f84 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Sat, 8 Nov 2025 16:47:46 +0000
Subject: [PATCH 4/7] fix: correct results.json parsing in workflow summary

Terminal-bench's results.json uses .results[] (not .trials[]) and .is_resolved (not .resolved).
Also improve the output format to show failure_mode and avoid the bc dependency.
---
 .github/workflows/terminal-bench.yml | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index 78d7db57d..1fa95a193 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -131,23 +131,25 @@ jobs:
             RESULTS_FILE=$(find runs -name 'results.json' | head -1)
             echo "Results file: $RESULTS_FILE"
             echo ""
-            echo "Full results.json:"
-            cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE"
+            echo "Summary statistics:"
+            cat "$RESULTS_FILE" | jq '{n_resolved, n_unresolved, accuracy}' || echo "Failed to parse summary"
             echo ""
-            echo "Per-task summary:"
-            cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
+            echo "Per-task results:"
+            cat "$RESULTS_FILE" | jq -r '.results[] | "\(.task_id): \(if .is_resolved then "✓ PASS" else "✗ FAIL" end) (\(.failure_mode // "unknown"))"' 2>/dev/null || echo "Failed to parse task details"
             echo ""
-            echo "Pass rate:"
-            TOTAL=$(cat "$RESULTS_FILE" | jq '.trials | length')
-            PASSED=$(cat "$RESULTS_FILE" | jq '[.trials[] | select(.resolved)] | length')
-            echo "$PASSED/$TOTAL tasks passed ($(echo "scale=1; $PASSED * 100 / $TOTAL" | bc)%)"
             
-            # Check for timeout indicators
-            TIMED_OUT=$(cat "$RESULTS_FILE" | jq '[.trials[] | select(.resolved == false and (.error // "" | contains("timeout") or contains("Timed out")))] | length')
-            if [ "$TIMED_OUT" -gt 0 ]; then
-              echo "⚠️  WARNING: $TIMED_OUT tasks failed due to timeout"
-              echo "Consider investigating task performance or increasing task_timeout"
+            # Check for timeout-related failures
+            TOTAL=$(cat "$RESULTS_FILE" | jq '.results | length' 2>/dev/null || echo "0")
+            TIMED_OUT=$(cat "$RESULTS_FILE" | jq '[.results[] | select(.failure_mode == "agent_timeout")] | length' 2>/dev/null || echo "0")
+            if [ "$TIMED_OUT" -gt 0 ] && [ "$TOTAL" -gt 0 ]; then
+              echo "⚠️  WARNING: $TIMED_OUT/$TOTAL tasks hit agent_timeout"
+              echo "This may indicate tasks need more time or are genuinely stuck"
             fi
+            
+            # Full results for debugging
+            echo ""
+            echo "Full results.json:"
+            cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE"
           else
             echo "❌ No results.json found in runs/"
             if [ -d "runs" ]; then

From a22638f12f883b71071dc4ae10283670977b06c8 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Sat, 8 Nov 2025 17:24:07 +0000
Subject: [PATCH 5/7] feat: reduce GitHub Actions log verbosity for
 terminal-bench
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Agent logs are saved as artifacts, so there is no need to spam the console output.

Changes:
- Disable livestream by default (was true, now false)
- Filter output to show only key info: task selection, timeouts, results
- Save full benchmark.log as artifact for debugging
- Show last 50 lines on error for quick diagnosis

Console now shows:
✓ Configuration (dataset, timeout, sample size)
✓ Selected task IDs
✓ Results summary (resolved/unresolved/accuracy)
✗ Not shown: verbose agent logs, docker output, intermediate steps

Full logs still available in benchmark.log artifact.
---
 .github/workflows/nightly-terminal-bench.yml |  2 +-
 .github/workflows/terminal-bench.yml         | 28 ++++++++++++++++----
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/nightly-terminal-bench.yml b/.github/workflows/nightly-terminal-bench.yml
index 5bb10a8e8..c4758e00a 100644
--- a/.github/workflows/nightly-terminal-bench.yml
+++ b/.github/workflows/nightly-terminal-bench.yml
@@ -44,7 +44,7 @@ jobs:
       thinking_level: "high"
       dataset: "terminal-bench-core==0.1.1"
       concurrency: "4"
-      livestream: true
+      livestream: false
       # Set aggressive per-task timeout (15 min) to catch hung tasks faster
       # Tasks that consistently hit this timeout indicate bugs in task handling
       task_timeout: "900"
diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index 1fa95a193..2b0e0765e 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -22,10 +22,10 @@ on:
         type: string
         default: '4'
       livestream:
-        description: 'Enable livestream mode'
+        description: 'Enable livestream mode (verbose output to console)'
         required: false
         type: boolean
-        default: true
+        default: false
       sample_size:
         description: 'Number of random tasks to run (empty = all tasks)'
         required: false
@@ -57,9 +57,9 @@ on:
         default: '4'
         type: string
       livestream:
-        description: 'Enable livestream mode'
+        description: 'Enable livestream mode (verbose output to console)'
         required: false
-        default: true
+        default: false
         type: boolean
       sample_size:
         description: 'Number of random tasks to run (empty = all tasks)'
@@ -112,7 +112,24 @@ jobs:
         run: make build-main build-preload
 
       - name: Run Terminal-Bench
-        run: make benchmark-terminal
+        run: |
+          echo "Starting Terminal-Bench run..."
+          echo "Dataset: $TB_DATASET"
+          echo "Concurrency: $TB_CONCURRENCY"
+          echo "Task timeout: $TB_TIMEOUT seconds"
+          echo "Sample size: ${TB_SAMPLE_SIZE:-all tasks}"
+          echo ""
+          
+          # Run benchmark with output redirected to file (logs saved as artifact)
+          # Only show progress indicator and final summary in console
+          make benchmark-terminal 2>&1 | tee benchmark.log | grep -E "(Running Terminal-Bench|Using timeout|Selected task IDs|Results Summary|Resolved Trials|Unresolved Trials|Accuracy)" || true
+          
+          # Show last 50 lines if there was an error
+          if [ ${PIPESTATUS[0]} -ne 0 ]; then
+            echo ""
+            echo "=== Benchmark failed, showing last 50 lines of output ==="
+            tail -50 benchmark.log
+          fi
         env:
           TB_DATASET: ${{ inputs.dataset }}
           TB_CONCURRENCY: ${{ inputs.concurrency }}
@@ -179,6 +196,7 @@ jobs:
           name: ${{ steps.artifact-name.outputs.name }}
           path: |
             runs/
+            benchmark.log
           if-no-files-found: warn
           retention-days: 30
 

From 9d0b7f14afb06c15103c4080cf278bf4da8e6e00 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Sat, 8 Nov 2025 17:36:45 +0000
Subject: [PATCH 6/7] refactor: simplify TB timeout handling and reduce
 complexity

- Remove task_timeout input parameter (use TB default: 1800s)
- Remove redundant shell-level timeout in cmux-run.sh
- Simplify workflow output (full logs saved in artifacts)
- Simplify results summary display
- Change nightly livestream default: true -> false

Net result for the series so far (patches 1-6): -2 LoC (108 insertions,
110 deletions; this commit alone is +8/-78) while maintaining functionality
---
 .github/workflows/nightly-terminal-bench.yml |  3 -
 .github/workflows/terminal-bench.yml         | 60 ++------------------
 benchmarks/terminal_bench/cmux-run.sh        | 19 +------
 benchmarks/terminal_bench/cmux_agent.py      |  4 +-
 4 files changed, 8 insertions(+), 78 deletions(-)

diff --git a/.github/workflows/nightly-terminal-bench.yml b/.github/workflows/nightly-terminal-bench.yml
index c4758e00a..70226826b 100644
--- a/.github/workflows/nightly-terminal-bench.yml
+++ b/.github/workflows/nightly-terminal-bench.yml
@@ -45,9 +45,6 @@ jobs:
       dataset: "terminal-bench-core==0.1.1"
       concurrency: "4"
       livestream: false
-      # Set aggressive per-task timeout (15 min) to catch hung tasks faster
-      # Tasks that consistently hit this timeout indicate bugs in task handling
-      task_timeout: "900"
     secrets:
       ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index 2b0e0765e..50cb87418 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -34,11 +34,6 @@ on:
         description: 'Additional arguments to pass to terminal-bench'
         required: false
         type: string
-      task_timeout:
-        description: 'Per-task timeout in seconds (default: 1800 = 30 min)'
-        required: false
-        type: string
-        default: '1800'
     secrets:
       ANTHROPIC_API_KEY:
         required: true
@@ -77,11 +72,6 @@ on:
         description: 'Additional arguments to pass to terminal-bench'
         required: false
         type: string
-      task_timeout:
-        description: 'Per-task timeout in seconds (default: 1800 = 30 min)'
-        required: false
-        type: string
-        default: '1800'
 
 jobs:
   benchmark:
@@ -112,30 +102,12 @@ jobs:
         run: make build-main build-preload
 
       - name: Run Terminal-Bench
-        run: |
-          echo "Starting Terminal-Bench run..."
-          echo "Dataset: $TB_DATASET"
-          echo "Concurrency: $TB_CONCURRENCY"
-          echo "Task timeout: $TB_TIMEOUT seconds"
-          echo "Sample size: ${TB_SAMPLE_SIZE:-all tasks}"
-          echo ""
-          
-          # Run benchmark with output redirected to file (logs saved as artifact)
-          # Only show progress indicator and final summary in console
-          make benchmark-terminal 2>&1 | tee benchmark.log | grep -E "(Running Terminal-Bench|Using timeout|Selected task IDs|Results Summary|Resolved Trials|Unresolved Trials|Accuracy)" || true
-          
-          # Show last 50 lines if there was an error
-          if [ ${PIPESTATUS[0]} -ne 0 ]; then
-            echo ""
-            echo "=== Benchmark failed, showing last 50 lines of output ==="
-            tail -50 benchmark.log
-          fi
+        run: make benchmark-terminal 2>&1 | tee benchmark.log
         env:
           TB_DATASET: ${{ inputs.dataset }}
           TB_CONCURRENCY: ${{ inputs.concurrency }}
           TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }}
           TB_SAMPLE_SIZE: ${{ inputs.sample_size }}
-          TB_TIMEOUT: ${{ inputs.task_timeout }}
           TB_ARGS: ${{ inputs.model_name && format('--agent-kwarg model_name={0} --agent-kwarg thinking_level={1} {2}', inputs.model_name, inputs.thinking_level, inputs.extra_args) || inputs.extra_args }}
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
@@ -146,34 +118,10 @@ jobs:
           echo "=== Terminal-Bench Results Summary ==="
           if [ -f "$(find runs -name 'results.json' 2>/dev/null | head -1)" ]; then
             RESULTS_FILE=$(find runs -name 'results.json' | head -1)
-            echo "Results file: $RESULTS_FILE"
-            echo ""
-            echo "Summary statistics:"
-            cat "$RESULTS_FILE" | jq '{n_resolved, n_unresolved, accuracy}' || echo "Failed to parse summary"
-            echo ""
-            echo "Per-task results:"
-            cat "$RESULTS_FILE" | jq -r '.results[] | "\(.task_id): \(if .is_resolved then "✓ PASS" else "✗ FAIL" end) (\(.failure_mode // "unknown"))"' 2>/dev/null || echo "Failed to parse task details"
-            echo ""
-            
-            # Check for timeout-related failures
-            TOTAL=$(cat "$RESULTS_FILE" | jq '.results | length' 2>/dev/null || echo "0")
-            TIMED_OUT=$(cat "$RESULTS_FILE" | jq '[.results[] | select(.failure_mode == "agent_timeout")] | length' 2>/dev/null || echo "0")
-            if [ "$TIMED_OUT" -gt 0 ] && [ "$TOTAL" -gt 0 ]; then
-              echo "⚠️  WARNING: $TIMED_OUT/$TOTAL tasks hit agent_timeout"
-              echo "This may indicate tasks need more time or are genuinely stuck"
-            fi
-            
-            # Full results for debugging
-            echo ""
-            echo "Full results.json:"
-            cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE"
+            cat "$RESULTS_FILE" | jq '{n_resolved, n_unresolved, accuracy}' 2>/dev/null || cat "$RESULTS_FILE"
           else
-            echo "❌ No results.json found in runs/"
-            if [ -d "runs" ]; then
-              ls -laR runs/ || echo "Failed to list runs directory"
-            else
-              echo "runs/ directory does not exist"
-            fi
+            echo "❌ No results.json found"
+            ls -laR runs/ 2>/dev/null || echo "runs/ directory missing"
           fi
 
       - name: Set artifact name
diff --git a/benchmarks/terminal_bench/cmux-run.sh b/benchmarks/terminal_bench/cmux-run.sh
index de0e941ce..379256e6a 100644
--- a/benchmarks/terminal_bench/cmux-run.sh
+++ b/benchmarks/terminal_bench/cmux-run.sh
@@ -94,20 +94,7 @@ if [[ -n "${CMUX_THINKING_LEVEL}" ]]; then
   cmd+=(--thinking-level "${CMUX_THINKING_LEVEL}")
 fi
 
-# Run with timeout if available (fallback to running without timeout on older systems)
-if command -v timeout >/dev/null 2>&1 && [[ -n "${CMUX_TIMEOUT_MS}" ]]; then
-  # Add 60s buffer to allow cmux's internal timeout to trigger first
-  SHELL_TIMEOUT_SEC=$((CMUX_TIMEOUT_MS / 1000 + 60))
-  log "enforcing shell-level timeout of ${SHELL_TIMEOUT_SEC}s (cmux timeout: ${CMUX_TIMEOUT_MS}ms)"
-  if ! printf '%s' "${instruction}" | timeout "${SHELL_TIMEOUT_SEC}s" "${cmd[@]}"; then
-    EXIT_CODE=$?
-    if [ $EXIT_CODE -eq 124 ]; then
-      fatal "shell timeout reached (${SHELL_TIMEOUT_SEC}s) - task exceeded maximum duration"
-    fi
-    fatal "cmux agent session failed (exit code: $EXIT_CODE)"
-  fi
-else
-  if ! printf '%s' "${instruction}" | "${cmd[@]}"; then
-    fatal "cmux agent session failed"
-  fi
+# Terminal-bench enforces timeouts via --global-agent-timeout-sec
+if ! printf '%s' "${instruction}" | "${cmd[@]}"; then
+  fatal "cmux agent session failed"
 fi
diff --git a/benchmarks/terminal_bench/cmux_agent.py b/benchmarks/terminal_bench/cmux_agent.py
index c1fc7f4d8..2e9afe251 100644
--- a/benchmarks/terminal_bench/cmux_agent.py
+++ b/benchmarks/terminal_bench/cmux_agent.py
@@ -193,13 +193,11 @@ def _ensure_payload_staged(self, session: TmuxSession) -> None:
     def _run_agent_commands(self, instruction: str) -> list[TerminalCommand]:
         escaped = shlex.quote(instruction)
         command = f"bash /installed-agent/{self._RUNNER_NAME} {escaped}"
-        # Don't override max_timeout_sec - let terminal-bench enforce its global timeout
-        # Previously this was set to float("inf") which bypassed timeout controls
+        # Don't set max_timeout_sec - terminal-bench enforces global timeout
         return [
             TerminalCommand(
                 command=command,
                 min_timeout_sec=0.0,
-                # max_timeout_sec uses default (180s) but global timeout takes precedence
                 block=True,
                 append_enter=True,
             )

From 077828b636e4ebf6cbd5e59e2fcf234351218bdd Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Sat, 8 Nov 2025 20:38:33 +0000
Subject: [PATCH 7/7] docs: add bench: prefix for benchmarking PRs

---
 docs/AGENTS.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/AGENTS.md b/docs/AGENTS.md
index fc4c68c35..33444092d 100644
--- a/docs/AGENTS.md
+++ b/docs/AGENTS.md
@@ -107,6 +107,7 @@ Use these prefixes based on what best describes the PR:
 - **fix:** (conforming behavior to user expectations)
 - **feat:** (net new functionality)
 - **ci:** (concerned with build process or CI)
+- **bench:** (benchmarking infrastructure or Terminal-Bench integration)
 
 Examples:
 
@@ -115,6 +116,7 @@ Examples:
 - `🤖 fix: handle workspace rename edge cases`
 - `🤖 feat: add keyboard shortcuts for workspace navigation`
 - `🤖 ci: update wait_pr_checks script timeout`
+- `🤖 bench: simplify timeout handling in terminal-bench integration`
 
 ## Project Structure