Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions .github/workflows/nightly-terminal-bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,19 @@ on:
# Manual trigger. Every input is optional so a dispatch with no arguments
# behaves like the default run.
workflow_dispatch:
  inputs:
    models:
      # "all" (or an empty value) is expanded to the default model list by the
      # model-resolution step; keep this text in sync with that list.
      description: 'Models to test (comma-separated, or "all" for opus-4-7 + gpt-5.5 + google/gemini-3-pro-preview + google/gemini-3-flash-preview + google/gemini-3.5-flash)'
      required: false
      default: "all"
      type: string
    experiments:
      description: "Experiments to enable (comma-separated)"
      required: false
      type: string
    mux_run_as_goal:
      # Real boolean, off by default; forwarded to each reusable-workflow call.
      description: "Run nightly smoke/matrix tasks as strict mux CLI Goal Runs"
      required: false
      default: false
      type: boolean

jobs:
# Smoke test: run chess-best-move task first to catch broken agent setup
Expand All @@ -33,6 +38,7 @@ jobs:
env: "daytona"
task_names: "chess-best-move"
experiments: ${{ inputs.experiments }}
mux_run_as_goal: ${{ github.event_name == 'workflow_dispatch' && inputs.mux_run_as_goal || false }}
# Keep least-privilege secret scope for reusable workflow calls.
secrets:
TERMINAL_BENCH_ANTHROPIC_API_KEY: ${{ secrets.TERMINAL_BENCH_ANTHROPIC_API_KEY }}
Expand All @@ -58,6 +64,7 @@ jobs:
mux_project_path: "/testbed"
timeout: "3000"
experiments: ${{ inputs.experiments }}
mux_run_as_goal: ${{ github.event_name == 'workflow_dispatch' && inputs.mux_run_as_goal || false }}
secrets:
TERMINAL_BENCH_ANTHROPIC_API_KEY: ${{ secrets.TERMINAL_BENCH_ANTHROPIC_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
Expand All @@ -79,6 +86,7 @@ jobs:
mux_project_path: "/app/src"
timeout: "600"
experiments: ${{ inputs.experiments }}
mux_run_as_goal: ${{ github.event_name == 'workflow_dispatch' && inputs.mux_run_as_goal || false }}
secrets:
TERMINAL_BENCH_ANTHROPIC_API_KEY: ${{ secrets.TERMINAL_BENCH_ANTHROPIC_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
Expand All @@ -100,6 +108,7 @@ jobs:
mux_project_path: "/app"
timeout: "1800"
experiments: ${{ inputs.experiments }}
mux_run_as_goal: ${{ github.event_name == 'workflow_dispatch' && inputs.mux_run_as_goal || false }}
secrets:
TERMINAL_BENCH_ANTHROPIC_API_KEY: ${{ secrets.TERMINAL_BENCH_ANTHROPIC_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
Expand All @@ -120,7 +129,7 @@ jobs:
INPUT_MODELS: ${{ inputs.models }}
run: |
if [ "$INPUT_MODELS" = "all" ] || [ -z "$INPUT_MODELS" ]; then
echo 'models=["anthropic/claude-opus-4-7","openai/gpt-5.5","google/gemini-3-pro-preview","google/gemini-3-flash-preview"]' >> "$GITHUB_OUTPUT"
echo 'models=["anthropic/claude-opus-4-7","openai/gpt-5.5","google/gemini-3-pro-preview","google/gemini-3-flash-preview","google/gemini-3.5-flash"]' >> "$GITHUB_OUTPUT"
else
# Convert comma-separated to JSON array
models_json=$(echo "$INPUT_MODELS" | jq -R -s -c 'split(",") | map(gsub("^\\s+|\\s+$"; ""))')
Expand All @@ -134,7 +143,7 @@ jobs:
matrix:
model: ${{ fromJSON(needs.determine-models.outputs.models) }}
fail-fast: false
max-parallel: 1 # Run models sequentially to stay within Daytona's 25-sandbox limit
max-parallel: 1 # Run models sequentially to stay within Daytona's 25-sandbox limit
uses: ./.github/workflows/terminal-bench.yml
with:
model_name: ${{ matrix.model }}
Expand All @@ -144,6 +153,7 @@ jobs:
concurrency: "48"
env: "daytona"
experiments: ${{ inputs.experiments }}
mux_run_as_goal: ${{ github.event_name == 'workflow_dispatch' && inputs.mux_run_as_goal || false }}
secrets:
TERMINAL_BENCH_ANTHROPIC_API_KEY: ${{ secrets.TERMINAL_BENCH_ANTHROPIC_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
Expand Down
25 changes: 17 additions & 8 deletions .github/workflows/terminal-bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ on:
required: false
type: string
# Free-form pass-through for harbor CLI arguments.
extra_args:
  description: "Additional arguments to pass to harbor (e.g., --n-tasks 5 for quick dispatch runs)"
  required: false
  type: string
experiments:
Expand All @@ -50,10 +50,15 @@ on:
type: string
default: ""
# Free-form pass-through for `mux run` flags. Goal limits (--goal-turns,
# --goal-budget) travel here rather than through dedicated inputs.
mux_run_args:
  description: "Additional CLI flags passed to mux run (e.g., --thinking high --use-1m --budget 5.00; with goal mode, add --goal-turns/--goal-budget)"
  required: false
  type: string
  default: ""
# Boolean switch for goal mode; off unless the caller opts in.
mux_run_as_goal:
  description: "Run each task instruction as a mux CLI Goal Run"
  required: false
  type: boolean
  default: false
secrets:
# Keep the runtime env name stable while routing benchmark spend to its own key.
TERMINAL_BENCH_ANTHROPIC_API_KEY:
Expand Down Expand Up @@ -92,11 +97,16 @@ on:
required: false
type: string
# Free-form pass-through for `mux run` flags. Goal limits (--goal-turns,
# --goal-budget) travel here rather than through dedicated inputs.
mux_run_args:
  description: "Additional CLI flags passed to mux run (e.g., --thinking high --use-1m; with goal mode, add --goal-turns/--goal-budget)"
  required: false
  type: string
# Boolean switch for goal mode; off unless the caller opts in.
mux_run_as_goal:
  description: "Run each task instruction as a mux CLI Goal Run"
  required: false
  default: false
  type: boolean
# Free-form pass-through for harbor CLI arguments.
extra_args:
  description: "Additional arguments to pass to harbor (e.g., --n-tasks 5 for quick dispatch runs)"
  required: false
  type: string
experiments:
Expand All @@ -107,10 +117,6 @@ on:
description: "Agent timeout in seconds (default: 1800 = 30 min)"
required: false
type: string
max_tasks:
description: "Maximum number of tasks to run (for faster iteration)"
required: false
type: string

jobs:
benchmark:
Expand Down Expand Up @@ -206,6 +212,7 @@ jobs:
${{ inputs.extra_args || '' }}
MUX_EXPERIMENTS: ${{ inputs.experiments }}
MUX_RUN_ARGS: ${{ inputs.mux_run_args }}
MUX_RUN_AS_GOAL: ${{ inputs.mux_run_as_goal && '1' || '' }}
ANTHROPIC_API_KEY: ${{ secrets.TERMINAL_BENCH_ANTHROPIC_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
Expand Down Expand Up @@ -358,6 +365,7 @@ jobs:
GCP_PROJECT_ID: mux-benchmarks
BQ_DATASET: benchmarks
MUX_EXPERIMENTS: ${{ inputs.experiments }}
MUX_RUN_AS_GOAL: ${{ inputs.mux_run_as_goal && '1' || '' }}
run: |
if [ -z "$GCP_SA_KEY" ]; then
echo "GCP_SA_KEY not set, skipping BigQuery upload"
Expand All @@ -376,6 +384,7 @@ jobs:
GCP_PROJECT_ID: mux-benchmarks
BQ_DATASET: benchmarks
MUX_EXPERIMENTS: ${{ inputs.experiments }}
MUX_RUN_AS_GOAL: ${{ inputs.mux_run_as_goal && '1' || '' }}
run: |
if [ -z "$GCP_SA_KEY" ]; then
echo "GCP_SA_KEY not set, skipping BigQuery upload"
Expand Down
17 changes: 17 additions & 0 deletions .mux/skills/tbench/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ make benchmark-terminal TB_ENV=daytona TB_CONCURRENCY=48 TB_TASK_NAMES="chess-be
- `TB_TASK_NAMES`: Space-separated task names to run (default: all tasks)
- `TB_ARGS`: Additional arguments passed to harbor
- `MUX_RUN_ARGS`: CLI flags passed directly to `mux run` inside the container (e.g., `--thinking high --use-1m --budget 5.00`). This is the primary mechanism for all `mux run` flags — avoids per-flag plumbing.
- `MUX_RUN_AS_GOAL`: When set to `1`, runs each task instruction as a strict `mux run --goal` objective while still piping the instruction to stdin. Use `MUX_RUN_ARGS` for goal limits such as `--goal-turns` and `--goal-budget`.

### Timeout Handling

Expand Down Expand Up @@ -109,6 +110,22 @@ gh workflow run terminal-bench.yml \
-f mux_run_args="--thinking high --budget 5.00"
```

**Strict goal-mode runs:**

```bash
# Run a single task as a strict CLI Goal Run
MUX_RUN_AS_GOAL=1 \
MUX_RUN_ARGS="--thinking high --goal-turns 30 --goal-budget 10.00" \
make benchmark-terminal TB_TASK_NAMES="chess-best-move"

# CI dispatch
gh workflow run terminal-bench.yml \
-f model_name=anthropic/claude-sonnet-4-5 \
-f task_names=chess-best-move \
-f mux_run_as_goal=true \
-f mux_run_args="--thinking high --goal-turns 30 --goal-budget 10.00"
```

**Local runs:**

```bash
Expand Down
63 changes: 55 additions & 8 deletions benchmarks/terminal_bench/mux-run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,16 @@ MUX_MODEL="${MUX_MODEL:-anthropic:claude-sonnet-4-5}"
MUX_TIMEOUT_MS="${MUX_TIMEOUT_MS:-}"
MUX_WORKSPACE_ID="${MUX_WORKSPACE_ID:-mux-bench}"
MUX_EXPERIMENTS="${MUX_EXPERIMENTS:-}"
# Optional goal-mode toggle; an unset variable is treated like an empty string.
MUX_RUN_AS_GOAL="${MUX_RUN_AS_GOAL:-}"

# Normalize the toggle before matching: lowercase it first, then trim
# whitespace from both ends using parameter expansion only (no subshell).
mux_run_as_goal_normalized="${MUX_RUN_AS_GOAL,,}"
# The inner expansion keeps only the leading whitespace run (everything from
# the first non-space character onward is removed); the outer `#` strips that
# run from the front of the value.
mux_run_as_goal_normalized="${mux_run_as_goal_normalized#"${mux_run_as_goal_normalized%%[![:space:]]*}"}"
# Mirror image for the tail: the inner expansion keeps only the trailing
# whitespace run, and the outer `%` strips it from the end.
mux_run_as_goal_normalized="${mux_run_as_goal_normalized%"${mux_run_as_goal_normalized##*[![:space:]]}"}"
# Map the normalized text onto a 0/1 flag; any other value aborts via fatal
# so a typo cannot silently disable goal mode.
case "${mux_run_as_goal_normalized}" in
"" | "0" | "false") mux_run_as_goal_enabled=0 ;;
"1" | "true") mux_run_as_goal_enabled=1 ;;
*) fatal "MUX_RUN_AS_GOAL must be one of: 1, true, 0, false" ;;
esac

resolve_project_path() {
if [[ -n "${MUX_PROJECT_PATH}" ]]; then
Expand Down Expand Up @@ -80,11 +90,27 @@ if [[ -n "${MUX_EXPERIMENTS}" ]]; then
done
fi

# Log the goal-mode state either way; when it is on, pass the task
# instruction to mux as the --goal objective.
case "${mux_run_as_goal_enabled}" in
  1)
    log "strict mux goal mode enabled"
    cmd+=(--goal "${instruction}")
    ;;
  *)
    log "strict mux goal mode disabled"
    ;;
esac

# Append arbitrary mux run flags (e.g., --thinking high --mode exec --use-1m --budget 5.00).
# The flags are split into an array first so they can be validated before
# being appended to the command exactly once.
mux_run_args=()
if [[ -n "${MUX_RUN_ARGS:-}" ]]; then
  # Word-split intentional: MUX_RUN_ARGS contains space-separated CLI flags.
  # shellcheck disable=SC2206
  mux_run_args=(${MUX_RUN_ARGS})
  if [[ "${mux_run_as_goal_enabled}" == "1" ]]; then
    # Goal mode already supplies --goal from the task instruction; reject a
    # second one (either "--goal value" or "--goal=value") instead of letting
    # the two objectives compete.
    for arg in "${mux_run_args[@]}"; do
      if [[ "${arg}" == "--goal" || "${arg}" == --goal=* ]]; then
        fatal "MUX_RUN_ARGS must not include --goal when MUX_RUN_AS_GOAL is enabled"
      fi
    done
  fi
  cmd+=("${mux_run_args[@]}")
fi

# NOTE: Harbor only automatically collects /logs/agent on timeouts.
Expand All @@ -103,13 +129,19 @@ if [[ -n "${MUX_TIMEOUT_MS}" ]]; then
fi

# Capture output to file while streaming to terminal for token extraction.
# Keep stderr separate so the stdout log stays valid JSONL. Temporarily disable
# errexit so token extraction still runs after mux returns a meaningful nonzero
# code such as strict goal-mode exit 3.
set +e
printf '%s' "${instruction}" \
  | "${cmd[@]}" \
    2> >(tee "${MUX_STDERR_FILE}" >&2) \
  | tee "${MUX_OUTPUT_FILE}"
# PIPESTATUS is replaced by the next command that runs, so snapshot it on the
# line immediately after the pipeline.
pipeline_status=("${PIPESTATUS[@]}")
set -e
# One status per pipeline stage, in order: printf (stdin feed), mux, tee.
stdin_status="${pipeline_status[0]}"
mux_status="${pipeline_status[1]}"
tee_status="${pipeline_status[2]}"

# Extract usage and cost from the JSONL output.
# Prefer the run-complete event (emitted at end of --json run) which has aggregated
Expand Down Expand Up @@ -159,4 +191,19 @@ for usage in cumulative_by_msg.values():
result["input"] += subagent_input
result["output"] += subagent_output
print(json.dumps(result))
' "${MUX_OUTPUT_FILE}" > "${MUX_TOKEN_FILE}" 2>/dev/null || true
' "${MUX_OUTPUT_FILE}" >"${MUX_TOKEN_FILE}" 2>/dev/null || true

# Report the first failing pipeline stage and exit with that stage's own code.
# mux is checked first so its exit code wins over capture or stdin failures.
[[ "${mux_status}" -eq 0 ]] || {
  printf '[mux-run] ERROR: mux agent session failed (exit %s)\n' "${mux_status}" >&2
  exit "${mux_status}"
}

[[ "${tee_status}" -eq 0 ]] || {
  printf '[mux-run] ERROR: failed to capture mux stdout (exit %s)\n' "${tee_status}" >&2
  exit "${tee_status}"
}

[[ "${stdin_status}" -eq 0 ]] || {
  printf '[mux-run] ERROR: failed to send instruction to mux (exit %s)\n' "${stdin_status}" >&2
  exit "${stdin_status}"
}
30 changes: 30 additions & 0 deletions benchmarks/terminal_bench/mux_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ class MuxAgent(BaseInstalledAgent):
# Generic pass-through for arbitrary mux run CLI flags (e.g., --thinking
# high --use-1m --budget 5.00). Avoids per-flag plumbing.
"MUX_RUN_ARGS",
"MUX_RUN_AS_GOAL",
)

def __init__(
Expand Down Expand Up @@ -167,12 +168,31 @@ def _env(self) -> dict[str, str]:
if not project_path.strip():
raise ValueError("MUX_PROJECT_PATH must be non-empty when provided")

mux_run_as_goal = self._normalize_mux_run_as_goal(env.get("MUX_RUN_AS_GOAL"))
if mux_run_as_goal is None:
env.pop("MUX_RUN_AS_GOAL", None)
else:
env["MUX_RUN_AS_GOAL"] = mux_run_as_goal

# Set experiments from kwarg (takes precedence over env var)
if self._experiments:
env["MUX_EXPERIMENTS"] = self._experiments

return env

@staticmethod
def _normalize_mux_run_as_goal(value: str | None) -> str | None:
if value is None:
return None

normalized = value.strip().lower()
if normalized in ("", "0", "false"):
return None
if normalized in ("1", "true"):
return "1"

raise ValueError("MUX_RUN_AS_GOAL must be one of: 1, true, 0, false")

@property
def _install_agent_template_path(self) -> Path:
return Path(__file__).with_name("mux_setup.sh.j2")
Expand Down Expand Up @@ -288,6 +308,7 @@ async def run(
) -> None:
"""Run agent commands, download token file, then populate context."""
# Execute commands (from base class logic, but without calling populate_context)
failed_command: tuple[int, int] | None = None
for i, exec_input in enumerate(self.create_run_agent_commands(instruction)):
command_dir = self.logs_dir / f"command-{i}"
command_dir.mkdir(parents=True, exist_ok=True)
Expand All @@ -305,6 +326,9 @@ async def run(
(command_dir / "stdout.txt").write_text(result.stdout)
if result.stderr:
(command_dir / "stderr.txt").write_text(result.stderr)
if result.return_code != 0:
failed_command = (i, result.return_code)
break

# Download token file from container BEFORE populating context
# Clear any stale token file first to avoid reading outdated data if download fails
Expand All @@ -317,6 +341,12 @@ async def run(

self.populate_context_post_run(context)

if failed_command is not None:
command_index, return_code = failed_command
raise RuntimeError(
f"mux agent command failed (command {command_index}, exit {return_code})"
)

def populate_context_post_run(self, context: AgentContext) -> None:
"""Extract token usage and cost from the token file written by mux-run.sh."""
token_file = self.logs_dir / "mux-tokens.json"
Expand Down
Loading
Loading