diff --git a/.github/workflows/nightly-terminal-bench.yml b/.github/workflows/nightly-terminal-bench.yml
index 119c20dcc..70226826b 100644
--- a/.github/workflows/nightly-terminal-bench.yml
+++ b/.github/workflows/nightly-terminal-bench.yml
@@ -44,7 +44,7 @@ jobs:
       thinking_level: "high"
       dataset: "terminal-bench-core==0.1.1"
       concurrency: "4"
-      livestream: true
+      livestream: false
     secrets:
       ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index 6db5b58fb..50cb87418 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -22,10 +22,10 @@ on:
         type: string
         default: '4'
       livestream:
-        description: 'Enable livestream mode'
+        description: 'Enable livestream mode (verbose output to console)'
         required: false
         type: boolean
-        default: true
+        default: false
       sample_size:
         description: 'Number of random tasks to run (empty = all tasks)'
         required: false
@@ -52,9 +52,9 @@ on:
         default: '4'
         type: string
       livestream:
-        description: 'Enable livestream mode'
+        description: 'Enable livestream mode (verbose output to console)'
         required: false
-        default: true
+        default: false
         type: boolean
       sample_size:
         description: 'Number of random tasks to run (empty = all tasks)'
@@ -77,9 +77,10 @@ jobs:
   benchmark:
     name: Run Terminal-Bench${{ inputs.model_name && format(' ({0})', inputs.model_name) || '' }}
     runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }}
-    # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes
-    # Allow 3 hours for safety margin and slower tasks
-    timeout-minutes: 180
+    # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes typically
+    # Set 4-hour timeout to handle occasional API slowdowns while preventing infinite hangs
+    # If consistently hitting this timeout, investigate task-level issues
+    timeout-minutes: 240
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -101,7 +102,11 @@ jobs:
         run: make build-main build-preload
 
       - name: Run Terminal-Bench
-        run: make benchmark-terminal
+        run: |
+          # Without pipefail the step would report tee's exit status and a
+          # failed benchmark run would show up green.
+          set -o pipefail
+          make benchmark-terminal 2>&1 | tee benchmark.log
         env:
           TB_DATASET: ${{ inputs.dataset }}
           TB_CONCURRENCY: ${{ inputs.concurrency }}
@@ -115,18 +120,12 @@ jobs:
         if: always()
         run: |
           echo "=== Terminal-Bench Results Summary ==="
-          if [ -f "$(find runs -name 'results.json' | head -1)" ]; then
+          if [ -f "$(find runs -name 'results.json' 2>/dev/null | head -1)" ]; then
             RESULTS_FILE=$(find runs -name 'results.json' | head -1)
-            echo "Results file: $RESULTS_FILE"
-            echo ""
-            echo "Full results.json:"
-            cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE"
-            echo ""
-            echo "Per-task summary:"
-            cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
+            cat "$RESULTS_FILE" | jq '{n_resolved, n_unresolved, accuracy}' 2>/dev/null || cat "$RESULTS_FILE"
           else
-            echo "No results.json found in runs/"
-            ls -la runs/
+            echo "❌ No results.json found"
+            ls -laR runs/ 2>/dev/null || echo "runs/ directory missing"
           fi
 
       - name: Set artifact name
@@ -149,6 +148,7 @@ jobs:
           name: ${{ steps.artifact-name.outputs.name }}
           path: |
             runs/
+            benchmark.log
           if-no-files-found: warn
           retention-days: 30
 
diff --git a/Makefile b/Makefile
index 12f66deb4..a27559132 100644
--- a/Makefile
+++ b/Makefile
@@ -305,7 +305,7 @@ benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB
 		echo "Ensuring dataset $$TB_DATASET is downloaded..."; \
 		uvx terminal-bench datasets download --dataset "$$TB_DATASET" 2>&1 | grep -v "already exists" || true; \
 		echo "Sampling $$TB_SAMPLE_SIZE tasks from $$TB_DATASET..."; \
-		TASK_IDS=$$(python benchmarks/terminal_bench/sample_tasks.py --dataset "$$TB_DATASET" --sample-size "$$TB_SAMPLE_SIZE" --format space) || { \
+		TASK_IDS=$$(python3 benchmarks/terminal_bench/sample_tasks.py --dataset "$$TB_DATASET" --sample-size "$$TB_SAMPLE_SIZE" --format space) || { \
 			echo "Error: Failed to sample tasks" >&2; \
 			exit 1; \
 		}; \
@@ -320,6 +320,7 @@ benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB
 	fi; \
 	echo "Using timeout: $$TB_TIMEOUT seconds"; \
 	echo "Running Terminal-Bench with dataset $$TB_DATASET"; \
+	export CMUX_TIMEOUT_MS=$$((TB_TIMEOUT * 1000)); \
 	uvx terminal-bench run \
 		--dataset "$$TB_DATASET" \
 		--agent-import-path benchmarks.terminal_bench.cmux_agent:CmuxAgent \
diff --git a/benchmarks/terminal_bench/cmux-run.sh b/benchmarks/terminal_bench/cmux-run.sh
index 82d126c6d..379256e6a 100644
--- a/benchmarks/terminal_bench/cmux-run.sh
+++ b/benchmarks/terminal_bench/cmux-run.sh
@@ -94,6 +94,7 @@ if [[ -n "${CMUX_THINKING_LEVEL}" ]]; then
   cmd+=(--thinking-level "${CMUX_THINKING_LEVEL}")
 fi
 
+# Terminal-bench enforces timeouts via --global-agent-timeout-sec
 if ! printf '%s' "${instruction}" | "${cmd[@]}"; then
   fatal "cmux agent session failed"
 fi
diff --git a/benchmarks/terminal_bench/cmux_agent.py b/benchmarks/terminal_bench/cmux_agent.py
index 9bb9d93f9..2e9afe251 100644
--- a/benchmarks/terminal_bench/cmux_agent.py
+++ b/benchmarks/terminal_bench/cmux_agent.py
@@ -193,11 +193,14 @@ def _ensure_payload_staged(self, session: TmuxSession) -> None:
     def _run_agent_commands(self, instruction: str) -> list[TerminalCommand]:
         escaped = shlex.quote(instruction)
         command = f"bash /installed-agent/{self._RUNNER_NAME} {escaped}"
+        # Keep max_timeout_sec explicit: omitting it falls back to TerminalCommand's
+        # default, which is believed to be a finite per-command cap
+        # (NOTE(review): confirm against the pinned terminal-bench version).
         return [
             TerminalCommand(
                 command=command,
                 min_timeout_sec=0.0,
                 max_timeout_sec=float("inf"),
                 block=True,
                 append_enter=True,
             )
diff --git a/docs/AGENTS.md b/docs/AGENTS.md
index fc4c68c35..33444092d 100644
--- a/docs/AGENTS.md
+++ b/docs/AGENTS.md
@@ -107,6 +107,7 @@ Use these prefixes based on what best describes the PR:
 - **fix:** (conforming behavior to user expectations)
 - **feat:** (net new functionality)
 - **ci:** (concerned with build process or CI)
+- **bench:** (benchmarking infrastructure or Terminal-Bench integration)
 
 Examples:
 
@@ -115,6 +116,7 @@ Examples:
 - `🤖 fix: handle workspace rename edge cases`
 - `🤖 feat: add keyboard shortcuts for workspace navigation`
 - `🤖 ci: update wait_pr_checks script timeout`
+- `🤖 bench: simplify timeout handling in terminal-bench integration`
 
 ## Project Structure
 