From b623bae979ecdccad4a3a62c7080fce1346f9c38 Mon Sep 17 00:00:00 2001 From: Ammar Date: Sat, 8 Nov 2025 16:29:35 +0000 Subject: [PATCH 1/7] =?UTF-8?q?=F0=9F=A4=96=20fix:=20prevent=20terminal-be?= =?UTF-8?q?nch=20timeout=20bypass=20and=20add=20robust=20timeout=20handlin?= =?UTF-8?q?g?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **Root Cause:** Agent was setting max_timeout_sec=float('inf') which bypassed terminal-bench's timeout enforcement, causing tasks to hang indefinitely. Nov 8 nightly run hit the 3-hour workflow timeout when tasks hung. **Changes:** 1. **Critical Fix (cmux_agent.py):** - Change max_timeout_sec from float('inf') to None - Allows terminal-bench to properly enforce timeouts - Prevents infinite task execution 2. **Defense-in-Depth Timeout Layers:** - Terminal-bench: --global-agent-timeout-sec (configurable) - Cmux agent: --timeout via CMUX_TIMEOUT_MS (Makefile exports) - Shell: timeout command with 60s buffer (cmux-run.sh) - Workflow: 240 min total timeout (up from 180 min) 3. **Nightly Configuration:** - Aggressive 15-min per-task timeout (down from 30 min default) - Faster detection of hung tasks - Added task_timeout parameter to workflows 4. **Improved Monitoring:** - Results summary shows pass rate percentage - Detects and reports timeout-related failures - Better error messages when results missing **Testing:** - Syntax validated (Python, Bash, YAML) - Typecheck passes - Formatting applied Successful runs typically complete in ~60-90 minutes. The 4-hour workflow timeout provides headroom for API slowdowns while preventing infinite hangs. _Generated with `cmux`_ --- .github/workflows/nightly-terminal-bench.yml | 3 ++ .github/workflows/terminal-bench.yml | 40 +++++++++++++++++--- Makefile | 1 + benchmarks/terminal_bench/cmux-run.sh | 18 ++++++++- benchmarks/terminal_bench/cmux_agent.py | 4 +- 5 files changed, 57 insertions(+), 9 deletions(-) diff --git a/.github/workflows/nightly-terminal-bench.yml b/.github/workflows/nightly-terminal-bench.yml index 119c20dcc..5bb10a8e8 100644 --- a/.github/workflows/nightly-terminal-bench.yml +++ b/.github/workflows/nightly-terminal-bench.yml @@ -45,6 +45,9 @@ jobs: dataset: "terminal-bench-core==0.1.1" concurrency: "4" livestream: true + # Set aggressive per-task timeout (15 min) to catch hung tasks faster + # Tasks that consistently hit this timeout indicate bugs in task handling + task_timeout: "900" secrets: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml index 6db5b58fb..78d7db57d 100644 --- a/.github/workflows/terminal-bench.yml +++ b/.github/workflows/terminal-bench.yml @@ -34,6 +34,11 @@ on: description: 'Additional arguments to pass to terminal-bench' required: false type: string + task_timeout: + description: 'Per-task timeout in seconds (default: 1800 = 30 min)' + required: false + type: string + default: '1800' secrets: ANTHROPIC_API_KEY: required: true @@ -72,14 +77,20 @@ on: description: 'Additional arguments to pass to terminal-bench' required: false type: string + task_timeout: + description: 'Per-task timeout in seconds (default: 1800 = 30 min)' + required: false + type: string + default: '1800' jobs: benchmark: name: Run Terminal-Bench${{ inputs.model_name && format(' ({0})', inputs.model_name) || '' }} runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }} - # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes - # Allow 3 hours for safety margin and slower tasks - timeout-minutes: 180 + # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes typically + # Set 4-hour timeout to handle occasional API slowdowns while preventing infinite hangs + # If consistently hitting this timeout, investigate task-level issues + timeout-minutes: 240 steps: - name: Checkout code uses: actions/checkout@v4 @@ -107,6 +118,7 @@ jobs: TB_CONCURRENCY: ${{ inputs.concurrency }} TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }} TB_SAMPLE_SIZE: ${{ inputs.sample_size }} + TB_TIMEOUT: ${{ inputs.task_timeout }} TB_ARGS: ${{ inputs.model_name && format('--agent-kwarg model_name={0} --agent-kwarg thinking_level={1} {2}', inputs.model_name, inputs.thinking_level, inputs.extra_args) || inputs.extra_args }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -115,7 +127,7 @@ jobs: if: always() run: | echo "=== Terminal-Bench Results Summary ===" - if [ -f "$(find runs -name 'results.json' | head -1)" ]; then + if [ -f "$(find runs -name 'results.json' 2>/dev/null | head -1)" ]; then RESULTS_FILE=$(find runs -name 'results.json' | head -1) echo "Results file: $RESULTS_FILE" echo "" @@ -124,9 +136,25 @@ jobs: echo "" echo "Per-task summary:" cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details" + echo "" + echo "Pass rate:" + TOTAL=$(cat "$RESULTS_FILE" | jq '.trials | length') + PASSED=$(cat "$RESULTS_FILE" | jq '[.trials[] | select(.resolved)] | length') + echo "$PASSED/$TOTAL tasks passed ($(echo "scale=1; $PASSED * 100 / $TOTAL" | bc)%)" + + # Check for timeout indicators + TIMED_OUT=$(cat "$RESULTS_FILE" | jq '[.trials[] | select(.resolved == false and (.error // "" | contains("timeout") or contains("Timed out")))] | length') + if [ "$TIMED_OUT" -gt 0 ]; then + echo "⚠️ WARNING: $TIMED_OUT tasks failed due to timeout" + echo "Consider investigating task performance or increasing task_timeout" + fi else - echo "No results.json found in runs/" - ls -la runs/ + echo "❌ No results.json found in runs/" + if [ -d "runs" ]; then + ls -laR runs/ || echo "Failed to list runs directory" + else + echo "runs/ directory does not exist" + fi fi - name: Set artifact name diff --git a/Makefile b/Makefile index 12f66deb4..8392bd149 100644 --- a/Makefile +++ b/Makefile @@ -320,6 +320,7 @@ benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB fi; \ echo "Using timeout: $$TB_TIMEOUT seconds"; \ echo "Running Terminal-Bench with dataset $$TB_DATASET"; \ + export CMUX_TIMEOUT_MS=$$((TB_TIMEOUT * 1000)); \ uvx terminal-bench run \ --dataset "$$TB_DATASET" \ --agent-import-path benchmarks.terminal_bench.cmux_agent:CmuxAgent \ diff --git a/benchmarks/terminal_bench/cmux-run.sh b/benchmarks/terminal_bench/cmux-run.sh index 82d126c6d..de0e941ce 100644 --- a/benchmarks/terminal_bench/cmux-run.sh +++ b/benchmarks/terminal_bench/cmux-run.sh @@ -94,6 +94,20 @@ if [[ -n "${CMUX_THINKING_LEVEL}" ]]; then cmd+=(--thinking-level "${CMUX_THINKING_LEVEL}") fi -if ! printf '%s' "${instruction}" | "${cmd[@]}"; then - fatal "cmux agent session failed" +# Run with timeout if available (fallback to running without timeout on older systems) +if command -v timeout >/dev/null 2>&1 && [[ -n "${CMUX_TIMEOUT_MS}" ]]; then + # Add 60s buffer to allow cmux's internal timeout to trigger first + SHELL_TIMEOUT_SEC=$((CMUX_TIMEOUT_MS / 1000 + 60)) + log "enforcing shell-level timeout of ${SHELL_TIMEOUT_SEC}s (cmux timeout: ${CMUX_TIMEOUT_MS}ms)" + if ! printf '%s' "${instruction}" | timeout "${SHELL_TIMEOUT_SEC}s" "${cmd[@]}"; then + EXIT_CODE=$? + if [ $EXIT_CODE -eq 124 ]; then + fatal "shell timeout reached (${SHELL_TIMEOUT_SEC}s) - task exceeded maximum duration" + fi + fatal "cmux agent session failed (exit code: $EXIT_CODE)" + fi +else + if ! printf '%s' "${instruction}" | "${cmd[@]}"; then + fatal "cmux agent session failed" + fi fi diff --git a/benchmarks/terminal_bench/cmux_agent.py b/benchmarks/terminal_bench/cmux_agent.py index 9bb9d93f9..c909b551f 100644 --- a/benchmarks/terminal_bench/cmux_agent.py +++ b/benchmarks/terminal_bench/cmux_agent.py @@ -193,11 +193,13 @@ def _ensure_payload_staged(self, session: TmuxSession) -> None: def _run_agent_commands(self, instruction: str) -> list[TerminalCommand]: escaped = shlex.quote(instruction) command = f"bash /installed-agent/{self._RUNNER_NAME} {escaped}" + # Use terminal-bench's global timeout instead of bypassing it + # This allows terminal-bench to properly enforce timeouts and prevent hung tasks return [ TerminalCommand( command=command, min_timeout_sec=0.0, - max_timeout_sec=float("inf"), + max_timeout_sec=None, # Use global timeout from --global-agent-timeout-sec block=True, append_enter=True, ) From 9b645b94bca1c9fb14f2a4497b15c304f8b1ab6f Mon Sep 17 00:00:00 2001 From: Ammar Date: Sat, 8 Nov 2025 16:31:34 +0000 Subject: [PATCH 2/7] fix: use python3 instead of python in benchmark-terminal target --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8392bd149..a27559132 100644 --- a/Makefile +++ b/Makefile @@ -305,7 +305,7 @@ benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB echo "Ensuring dataset $$TB_DATASET is downloaded..."; \ uvx terminal-bench datasets download --dataset "$$TB_DATASET" 2>&1 | grep -v "already exists" || true; \ echo "Sampling $$TB_SAMPLE_SIZE tasks from $$TB_DATASET..."; \ - TASK_IDS=$$(python benchmarks/terminal_bench/sample_tasks.py --dataset "$$TB_DATASET" --sample-size "$$TB_SAMPLE_SIZE" --format space) || { \ + TASK_IDS=$$(python3 benchmarks/terminal_bench/sample_tasks.py --dataset "$$TB_DATASET" --sample-size "$$TB_SAMPLE_SIZE" --format space) || { \ echo "Error: Failed to sample tasks" >&2; \ exit 1; \ }; \ From d3df7974048e87bcef702e563b6f153e70c9c640 Mon Sep 17 00:00:00 2001 From: Ammar Date: Sat, 8 Nov 2025 16:33:47 +0000 Subject: [PATCH 3/7] fix: remove max_timeout_sec override to use terminal-bench defaults TerminalCommand requires max_timeout_sec to be a float, not None. By not setting it at all, we use the Pydantic default (180s) while still allowing terminal-bench's --global-agent-timeout-sec to take precedence. --- benchmarks/terminal_bench/cmux_agent.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/terminal_bench/cmux_agent.py b/benchmarks/terminal_bench/cmux_agent.py index c909b551f..c1fc7f4d8 100644 --- a/benchmarks/terminal_bench/cmux_agent.py +++ b/benchmarks/terminal_bench/cmux_agent.py @@ -193,13 +193,13 @@ def _ensure_payload_staged(self, session: TmuxSession) -> None: def _run_agent_commands(self, instruction: str) -> list[TerminalCommand]: escaped = shlex.quote(instruction) command = f"bash /installed-agent/{self._RUNNER_NAME} {escaped}" - # Use terminal-bench's global timeout instead of bypassing it - # This allows terminal-bench to properly enforce timeouts and prevent hung tasks + # Don't override max_timeout_sec - let terminal-bench enforce its global timeout + # Previously this was set to float("inf") which bypassed timeout controls return [ TerminalCommand( command=command, min_timeout_sec=0.0, - max_timeout_sec=None, # Use global timeout from --global-agent-timeout-sec + # max_timeout_sec uses default (180s) but global timeout takes precedence block=True, append_enter=True, ) From 3e639dc37de5827bdf8ff39a4157fbc403294f84 Mon Sep 17 00:00:00 2001 From: Ammar Date: Sat, 8 Nov 2025 16:47:46 +0000 Subject: [PATCH 4/7] fix: correct results.json parsing in workflow summary Terminal-bench format uses .results[] not .trials[], and is_resolved not .resolved. Also improved output format to show failure_mode and avoid bc dependency. --- .github/workflows/terminal-bench.yml | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml index 78d7db57d..1fa95a193 100644 --- a/.github/workflows/terminal-bench.yml +++ b/.github/workflows/terminal-bench.yml @@ -131,23 +131,25 @@ jobs: RESULTS_FILE=$(find runs -name 'results.json' | head -1) echo "Results file: $RESULTS_FILE" echo "" - echo "Full results.json:" - cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE" + echo "Summary statistics:" + cat "$RESULTS_FILE" | jq '{n_resolved, n_unresolved, accuracy}' || echo "Failed to parse summary" echo "" - echo "Per-task summary:" - cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details" + echo "Per-task results:" + cat "$RESULTS_FILE" | jq -r '.results[] | "\(.task_id): \(if .is_resolved then "✓ PASS" else "✗ FAIL" end) (\(.failure_mode // "unknown"))"' 2>/dev/null || echo "Failed to parse task details" echo "" - echo "Pass rate:" - TOTAL=$(cat "$RESULTS_FILE" | jq '.trials | length') - PASSED=$(cat "$RESULTS_FILE" | jq '[.trials[] | select(.resolved)] | length') - echo "$PASSED/$TOTAL tasks passed ($(echo "scale=1; $PASSED * 100 / $TOTAL" | bc)%)" - # Check for timeout indicators - TIMED_OUT=$(cat "$RESULTS_FILE" | jq '[.trials[] | select(.resolved == false and (.error // "" | contains("timeout") or contains("Timed out")))] | length') - if [ "$TIMED_OUT" -gt 0 ]; then - echo "⚠️ WARNING: $TIMED_OUT tasks failed due to timeout" - echo "Consider investigating task performance or increasing task_timeout" + # Check for timeout-related failures + TOTAL=$(cat "$RESULTS_FILE" | jq '.results | length' 2>/dev/null || echo "0") + TIMED_OUT=$(cat "$RESULTS_FILE" | jq '[.results[] | select(.failure_mode == "agent_timeout")] | length' 2>/dev/null || echo "0") + if [ "$TIMED_OUT" -gt 0 ] && [ "$TOTAL" -gt 0 ]; then + echo "⚠️ WARNING: $TIMED_OUT/$TOTAL tasks hit agent_timeout" + echo "This may indicate tasks need more time or are genuinely stuck" fi + + # Full results for debugging + echo "" + echo "Full results.json:" + cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE" else echo "❌ No results.json found in runs/" if [ -d "runs" ]; then From a22638f12f883b71071dc4ae10283670977b06c8 Mon Sep 17 00:00:00 2001 From: Ammar Date: Sat, 8 Nov 2025 17:24:07 +0000 Subject: [PATCH 5/7] feat: reduce GitHub Actions log verbosity for terminal-bench MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agent logs are saved as artifacts, so no need to spam console output. Changes: - Disable livestream by default (was true, now false) - Filter output to show only key info: task selection, timeouts, results - Save full benchmark.log as artifact for debugging - Show last 50 lines on error for quick diagnosis Console now shows: ✓ Configuration (dataset, timeout, sample size) ✓ Selected task IDs ✓ Results summary (resolved/unresolved/accuracy) ✗ Not shown: verbose agent logs, docker output, intermediate steps Full logs still available in benchmark.log artifact. --- .github/workflows/nightly-terminal-bench.yml | 2 +- .github/workflows/terminal-bench.yml | 28 ++++++++++++++++---- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/.github/workflows/nightly-terminal-bench.yml b/.github/workflows/nightly-terminal-bench.yml index 5bb10a8e8..c4758e00a 100644 --- a/.github/workflows/nightly-terminal-bench.yml +++ b/.github/workflows/nightly-terminal-bench.yml @@ -44,7 +44,7 @@ jobs: thinking_level: "high" dataset: "terminal-bench-core==0.1.1" concurrency: "4" - livestream: true + livestream: false # Set aggressive per-task timeout (15 min) to catch hung tasks faster # Tasks that consistently hit this timeout indicate bugs in task handling task_timeout: "900" diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml index 1fa95a193..2b0e0765e 100644 --- a/.github/workflows/terminal-bench.yml +++ b/.github/workflows/terminal-bench.yml @@ -22,10 +22,10 @@ on: type: string default: '4' livestream: - description: 'Enable livestream mode' + description: 'Enable livestream mode (verbose output to console)' required: false type: boolean - default: true + default: false sample_size: description: 'Number of random tasks to run (empty = all tasks)' required: false @@ -57,9 +57,9 @@ on: default: '4' type: string livestream: - description: 'Enable livestream mode' + description: 'Enable livestream mode (verbose output to console)' required: false - default: true + default: false type: boolean sample_size: description: 'Number of random tasks to run (empty = all tasks)' @@ -112,7 +112,24 @@ jobs: run: make build-main build-preload - name: Run Terminal-Bench - run: make benchmark-terminal + run: | + echo "Starting Terminal-Bench run..." + echo "Dataset: $TB_DATASET" + echo "Concurrency: $TB_CONCURRENCY" + echo "Task timeout: $TB_TIMEOUT seconds" + echo "Sample size: ${TB_SAMPLE_SIZE:-all tasks}" + echo "" + + # Run benchmark with output redirected to file (logs saved as artifact) + # Only show progress indicator and final summary in console + make benchmark-terminal 2>&1 | tee benchmark.log | grep -E "(Running Terminal-Bench|Using timeout|Selected task IDs|Results Summary|Resolved Trials|Unresolved Trials|Accuracy)" || true + + # Show last 50 lines if there was an error + if [ ${PIPESTATUS[0]} -ne 0 ]; then + echo "" + echo "=== Benchmark failed, showing last 50 lines of output ===" + tail -50 benchmark.log + fi env: TB_DATASET: ${{ inputs.dataset }} TB_CONCURRENCY: ${{ inputs.concurrency }} @@ -179,6 +196,7 @@ jobs: name: ${{ steps.artifact-name.outputs.name }} path: | runs/ + benchmark.log if-no-files-found: warn retention-days: 30 From 9d0b7f14afb06c15103c4080cf278bf4da8e6e00 Mon Sep 17 00:00:00 2001 From: Ammar Date: Sat, 8 Nov 2025 17:36:45 +0000 Subject: [PATCH 6/7] refactor: simplify TB timeout handling and reduce complexity - Remove task_timeout input parameter (use TB default: 1800s) - Remove redundant shell-level timeout in cmux-run.sh - Simplify workflow output (full logs saved in artifacts) - Simplify results summary display - Change nightly livestream default: true -> false Net result: -2 LoC while maintaining functionality --- .github/workflows/nightly-terminal-bench.yml | 3 - .github/workflows/terminal-bench.yml | 60 ++------------------ benchmarks/terminal_bench/cmux-run.sh | 19 +------ benchmarks/terminal_bench/cmux_agent.py | 4 +- 4 files changed, 8 insertions(+), 78 deletions(-) diff --git a/.github/workflows/nightly-terminal-bench.yml b/.github/workflows/nightly-terminal-bench.yml index c4758e00a..70226826b 100644 --- a/.github/workflows/nightly-terminal-bench.yml +++ b/.github/workflows/nightly-terminal-bench.yml @@ -45,9 +45,6 @@ jobs: dataset: "terminal-bench-core==0.1.1" concurrency: "4" livestream: false - # Set aggressive per-task timeout (15 min) to catch hung tasks faster - # Tasks that consistently hit this timeout indicate bugs in task handling - task_timeout: "900" secrets: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml index 2b0e0765e..50cb87418 100644 --- a/.github/workflows/terminal-bench.yml +++ b/.github/workflows/terminal-bench.yml @@ -34,11 +34,6 @@ on: description: 'Additional arguments to pass to terminal-bench' required: false type: string - task_timeout: - description: 'Per-task timeout in seconds (default: 1800 = 30 min)' - required: false - type: string - default: '1800' secrets: ANTHROPIC_API_KEY: required: true @@ -77,11 +72,6 @@ on: description: 'Additional arguments to pass to terminal-bench' required: false type: string - task_timeout: - description: 'Per-task timeout in seconds (default: 1800 = 30 min)' - required: false - type: string - default: '1800' jobs: benchmark: @@ -112,30 +102,12 @@ jobs: run: make build-main build-preload - name: Run Terminal-Bench - run: | - echo "Starting Terminal-Bench run..." - echo "Dataset: $TB_DATASET" - echo "Concurrency: $TB_CONCURRENCY" - echo "Task timeout: $TB_TIMEOUT seconds" - echo "Sample size: ${TB_SAMPLE_SIZE:-all tasks}" - echo "" - - # Run benchmark with output redirected to file (logs saved as artifact) - # Only show progress indicator and final summary in console - make benchmark-terminal 2>&1 | tee benchmark.log | grep -E "(Running Terminal-Bench|Using timeout|Selected task IDs|Results Summary|Resolved Trials|Unresolved Trials|Accuracy)" || true - - # Show last 50 lines if there was an error - if [ ${PIPESTATUS[0]} -ne 0 ]; then - echo "" - echo "=== Benchmark failed, showing last 50 lines of output ===" - tail -50 benchmark.log - fi + run: make benchmark-terminal 2>&1 | tee benchmark.log env: TB_DATASET: ${{ inputs.dataset }} TB_CONCURRENCY: ${{ inputs.concurrency }} TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }} TB_SAMPLE_SIZE: ${{ inputs.sample_size }} - TB_TIMEOUT: ${{ inputs.task_timeout }} TB_ARGS: ${{ inputs.model_name && format('--agent-kwarg model_name={0} --agent-kwarg thinking_level={1} {2}', inputs.model_name, inputs.thinking_level, inputs.extra_args) || inputs.extra_args }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -146,34 +118,10 @@ jobs: echo "=== Terminal-Bench Results Summary ===" if [ -f "$(find runs -name 'results.json' 2>/dev/null | head -1)" ]; then RESULTS_FILE=$(find runs -name 'results.json' | head -1) - echo "Results file: $RESULTS_FILE" - echo "" - echo "Summary statistics:" - cat "$RESULTS_FILE" | jq '{n_resolved, n_unresolved, accuracy}' || echo "Failed to parse summary" - echo "" - echo "Per-task results:" - cat "$RESULTS_FILE" | jq -r '.results[] | "\(.task_id): \(if .is_resolved then "✓ PASS" else "✗ FAIL" end) (\(.failure_mode // "unknown"))"' 2>/dev/null || echo "Failed to parse task details" - echo "" - - # Check for timeout-related failures - TOTAL=$(cat "$RESULTS_FILE" | jq '.results | length' 2>/dev/null || echo "0") - TIMED_OUT=$(cat "$RESULTS_FILE" | jq '[.results[] | select(.failure_mode == "agent_timeout")] | length' 2>/dev/null || echo "0") - if [ "$TIMED_OUT" -gt 0 ] && [ "$TOTAL" -gt 0 ]; then - echo "⚠️ WARNING: $TIMED_OUT/$TOTAL tasks hit agent_timeout" - echo "This may indicate tasks need more time or are genuinely stuck" - fi - - # Full results for debugging - echo "" - echo "Full results.json:" - cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE" + cat "$RESULTS_FILE" | jq '{n_resolved, n_unresolved, accuracy}' 2>/dev/null || cat "$RESULTS_FILE" else - echo "❌ No results.json found in runs/" - if [ -d "runs" ]; then - ls -laR runs/ || echo "Failed to list runs directory" - else - echo "runs/ directory does not exist" - fi + echo "❌ No results.json found" + ls -laR runs/ 2>/dev/null || echo "runs/ directory missing" fi - name: Set artifact name diff --git a/benchmarks/terminal_bench/cmux-run.sh b/benchmarks/terminal_bench/cmux-run.sh index de0e941ce..379256e6a 100644 --- a/benchmarks/terminal_bench/cmux-run.sh +++ b/benchmarks/terminal_bench/cmux-run.sh @@ -94,20 +94,7 @@ if [[ -n "${CMUX_THINKING_LEVEL}" ]]; then cmd+=(--thinking-level "${CMUX_THINKING_LEVEL}") fi -# Run with timeout if available (fallback to running without timeout on older systems) -if command -v timeout >/dev/null 2>&1 && [[ -n "${CMUX_TIMEOUT_MS}" ]]; then - # Add 60s buffer to allow cmux's internal timeout to trigger first - SHELL_TIMEOUT_SEC=$((CMUX_TIMEOUT_MS / 1000 + 60)) - log "enforcing shell-level timeout of ${SHELL_TIMEOUT_SEC}s (cmux timeout: ${CMUX_TIMEOUT_MS}ms)" - if ! printf '%s' "${instruction}" | timeout "${SHELL_TIMEOUT_SEC}s" "${cmd[@]}"; then - EXIT_CODE=$? - if [ $EXIT_CODE -eq 124 ]; then - fatal "shell timeout reached (${SHELL_TIMEOUT_SEC}s) - task exceeded maximum duration" - fi - fatal "cmux agent session failed (exit code: $EXIT_CODE)" - fi -else - if ! printf '%s' "${instruction}" | "${cmd[@]}"; then - fatal "cmux agent session failed" - fi +# Terminal-bench enforces timeouts via --global-agent-timeout-sec +if ! printf '%s' "${instruction}" | "${cmd[@]}"; then + fatal "cmux agent session failed" fi diff --git a/benchmarks/terminal_bench/cmux_agent.py b/benchmarks/terminal_bench/cmux_agent.py index c1fc7f4d8..2e9afe251 100644 --- a/benchmarks/terminal_bench/cmux_agent.py +++ b/benchmarks/terminal_bench/cmux_agent.py @@ -193,13 +193,11 @@ def _ensure_payload_staged(self, session: TmuxSession) -> None: def _run_agent_commands(self, instruction: str) -> list[TerminalCommand]: escaped = shlex.quote(instruction) command = f"bash /installed-agent/{self._RUNNER_NAME} {escaped}" - # Don't override max_timeout_sec - let terminal-bench enforce its global timeout - # Previously this was set to float("inf") which bypassed timeout controls + # Don't set max_timeout_sec - terminal-bench enforces global timeout return [ TerminalCommand( command=command, min_timeout_sec=0.0, - # max_timeout_sec uses default (180s) but global timeout takes precedence block=True, append_enter=True, ) From 077828b636e4ebf6cbd5e59e2fcf234351218bdd Mon Sep 17 00:00:00 2001 From: Ammar Date: Sat, 8 Nov 2025 20:38:33 +0000 Subject: [PATCH 7/7] docs: add bench: prefix for benchmarking PRs --- docs/AGENTS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/AGENTS.md b/docs/AGENTS.md index fc4c68c35..33444092d 100644 --- a/docs/AGENTS.md +++ b/docs/AGENTS.md @@ -107,6 +107,7 @@ Use these prefixes based on what best describes the PR: - **fix:** (conforming behavior to user expectations) - **feat:** (net new functionality) - **ci:** (concerned with build process or CI) +- **bench:** (benchmarking infrastructure or Terminal-Bench integration) Examples: @@ -115,6 +116,7 @@ Examples: - `🤖 fix: handle workspace rename edge cases` - `🤖 feat: add keyboard shortcuts for workspace navigation` - `🤖 ci: update wait_pr_checks script timeout` +- `🤖 bench: simplify timeout handling in terminal-bench integration` ## Project Structure