Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/nightly-terminal-bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ jobs:
thinking_level: "high"
dataset: "terminal-bench-core==0.1.1"
concurrency: "4"
livestream: true
livestream: false
secrets:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
32 changes: 14 additions & 18 deletions .github/workflows/terminal-bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ on:
type: string
default: '4'
livestream:
description: 'Enable livestream mode'
description: 'Enable livestream mode (verbose output to console)'
required: false
type: boolean
default: true
default: false
sample_size:
description: 'Number of random tasks to run (empty = all tasks)'
required: false
Expand All @@ -52,9 +52,9 @@ on:
default: '4'
type: string
livestream:
description: 'Enable livestream mode'
description: 'Enable livestream mode (verbose output to console)'
required: false
default: true
default: false
type: boolean
sample_size:
description: 'Number of random tasks to run (empty = all tasks)'
Expand All @@ -77,9 +77,10 @@ jobs:
benchmark:
name: Run Terminal-Bench${{ inputs.model_name && format(' ({0})', inputs.model_name) || '' }}
runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }}
# Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes
# Allow 3 hours for safety margin and slower tasks
timeout-minutes: 180
# Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes typically
# Set 4-hour timeout to handle occasional API slowdowns while preventing infinite hangs
# If consistently hitting this timeout, investigate task-level issues
timeout-minutes: 240
steps:
- name: Checkout code
uses: actions/checkout@v4
Expand All @@ -101,7 +102,7 @@ jobs:
run: make build-main build-preload

- name: Run Terminal-Bench
run: make benchmark-terminal
run: make benchmark-terminal 2>&1 | tee benchmark.log
env:
TB_DATASET: ${{ inputs.dataset }}
TB_CONCURRENCY: ${{ inputs.concurrency }}
Expand All @@ -115,18 +116,12 @@ jobs:
if: always()
run: |
echo "=== Terminal-Bench Results Summary ==="
if [ -f "$(find runs -name 'results.json' | head -1)" ]; then
if [ -f "$(find runs -name 'results.json' 2>/dev/null | head -1)" ]; then
RESULTS_FILE=$(find runs -name 'results.json' | head -1)
echo "Results file: $RESULTS_FILE"
echo ""
echo "Full results.json:"
cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE"
echo ""
echo "Per-task summary:"
cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
cat "$RESULTS_FILE" | jq '{n_resolved, n_unresolved, accuracy}' 2>/dev/null || cat "$RESULTS_FILE"
else
echo "No results.json found in runs/"
ls -la runs/
echo "No results.json found"
ls -laR runs/ 2>/dev/null || echo "runs/ directory missing"
fi

- name: Set artifact name
Expand All @@ -149,6 +144,7 @@ jobs:
name: ${{ steps.artifact-name.outputs.name }}
path: |
runs/
benchmark.log
if-no-files-found: warn
retention-days: 30

3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB
echo "Ensuring dataset $$TB_DATASET is downloaded..."; \
uvx terminal-bench datasets download --dataset "$$TB_DATASET" 2>&1 | grep -v "already exists" || true; \
echo "Sampling $$TB_SAMPLE_SIZE tasks from $$TB_DATASET..."; \
TASK_IDS=$$(python benchmarks/terminal_bench/sample_tasks.py --dataset "$$TB_DATASET" --sample-size "$$TB_SAMPLE_SIZE" --format space) || { \
TASK_IDS=$$(python3 benchmarks/terminal_bench/sample_tasks.py --dataset "$$TB_DATASET" --sample-size "$$TB_SAMPLE_SIZE" --format space) || { \
echo "Error: Failed to sample tasks" >&2; \
exit 1; \
}; \
Expand All @@ -320,6 +320,7 @@ benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB
fi; \
echo "Using timeout: $$TB_TIMEOUT seconds"; \
echo "Running Terminal-Bench with dataset $$TB_DATASET"; \
export CMUX_TIMEOUT_MS=$$((TB_TIMEOUT * 1000)); \
uvx terminal-bench run \
--dataset "$$TB_DATASET" \
--agent-import-path benchmarks.terminal_bench.cmux_agent:CmuxAgent \
Expand Down
1 change: 1 addition & 0 deletions benchmarks/terminal_bench/cmux-run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ if [[ -n "${CMUX_THINKING_LEVEL}" ]]; then
cmd+=(--thinking-level "${CMUX_THINKING_LEVEL}")
fi

# Terminal-bench enforces timeouts via --global-agent-timeout-sec
if ! printf '%s' "${instruction}" | "${cmd[@]}"; then
fatal "cmux agent session failed"
fi
2 changes: 1 addition & 1 deletion benchmarks/terminal_bench/cmux_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,11 +193,11 @@ def _ensure_payload_staged(self, session: TmuxSession) -> None:
def _run_agent_commands(self, instruction: str) -> list[TerminalCommand]:
escaped = shlex.quote(instruction)
command = f"bash /installed-agent/{self._RUNNER_NAME} {escaped}"
# Don't set max_timeout_sec - terminal-bench enforces global timeout
return [
TerminalCommand(
command=command,
min_timeout_sec=0.0,
max_timeout_sec=float("inf"),
block=True,
append_enter=True,
)
Expand Down
2 changes: 2 additions & 0 deletions docs/AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ Use these prefixes based on what best describes the PR:
- **fix:** (conforming behavior to user expectations)
- **feat:** (net new functionality)
- **ci:** (concerned with build process or CI)
- **bench:** (benchmarking infrastructure or Terminal-Bench integration)

Examples:

Expand All @@ -115,6 +116,7 @@ Examples:
- `🤖 fix: handle workspace rename edge cases`
- `🤖 feat: add keyboard shortcuts for workspace navigation`
- `🤖 ci: update wait_pr_checks script timeout`
- `🤖 bench: simplify timeout handling in terminal-bench integration`

## Project Structure

Expand Down