coder · ThomasK33 · May 21, 2026 · May 21, 2026
diff --git a/.github/workflows/nightly-terminal-bench.yml b/.github/workflows/nightly-terminal-bench.yml
@@ -10,14 +10,19 @@ on:
   workflow_dispatch:
     inputs:
       models:
-        description: 'Models to test (comma-separated, or "all" for opus-4-7 + gpt-5.5 + google/gemini-3-pro-preview + google/gemini-3-flash-preview)'
+        description: 'Models to test (comma-separated, or "all" for opus-4-7 + gpt-5.5 + google/gemini-3-pro-preview + google/gemini-3-flash-preview + google/gemini-3.5-flash)'
         required: false
         default: "all"
         type: string
       experiments:
         description: "Experiments to enable (comma-separated)"
         required: false
         type: string
+      mux_run_as_goal:
+        description: "Run nightly smoke/matrix tasks as strict mux CLI Goal Runs"
+        required: false
+        default: false
+        type: boolean
 
 jobs:
   # Smoke test: run chess-best-move task first to catch broken agent setup
@@ -33,6 +38,7 @@ jobs:
       env: "daytona"
       task_names: "chess-best-move"
       experiments: ${{ inputs.experiments }}
+      mux_run_as_goal: ${{ github.event_name == 'workflow_dispatch' && inputs.mux_run_as_goal || false }}
     # Keep least-privilege secret scope for reusable workflow calls.
     secrets:
       TERMINAL_BENCH_ANTHROPIC_API_KEY: ${{ secrets.TERMINAL_BENCH_ANTHROPIC_API_KEY }}
@@ -58,6 +64,7 @@ jobs:
       mux_project_path: "/testbed"
       timeout: "3000"
       experiments: ${{ inputs.experiments }}
+      mux_run_as_goal: ${{ github.event_name == 'workflow_dispatch' && inputs.mux_run_as_goal || false }}
     secrets:
       TERMINAL_BENCH_ANTHROPIC_API_KEY: ${{ secrets.TERMINAL_BENCH_ANTHROPIC_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
@@ -79,6 +86,7 @@ jobs:
       mux_project_path: "/app/src"
       timeout: "600"
       experiments: ${{ inputs.experiments }}
+      mux_run_as_goal: ${{ github.event_name == 'workflow_dispatch' && inputs.mux_run_as_goal || false }}
     secrets:
       TERMINAL_BENCH_ANTHROPIC_API_KEY: ${{ secrets.TERMINAL_BENCH_ANTHROPIC_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
@@ -100,6 +108,7 @@ jobs:
       mux_project_path: "/app"
       timeout: "1800"
       experiments: ${{ inputs.experiments }}
+      mux_run_as_goal: ${{ github.event_name == 'workflow_dispatch' && inputs.mux_run_as_goal || false }}
     secrets:
       TERMINAL_BENCH_ANTHROPIC_API_KEY: ${{ secrets.TERMINAL_BENCH_ANTHROPIC_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
@@ -120,7 +129,7 @@ jobs:
           INPUT_MODELS: ${{ inputs.models }}
         run: |
           if [ "$INPUT_MODELS" = "all" ] || [ -z "$INPUT_MODELS" ]; then
-            echo 'models=["anthropic/claude-opus-4-7","openai/gpt-5.5","google/gemini-3-pro-preview","google/gemini-3-flash-preview"]' >> "$GITHUB_OUTPUT"
+            echo 'models=["anthropic/claude-opus-4-7","openai/gpt-5.5","google/gemini-3-pro-preview","google/gemini-3-flash-preview","google/gemini-3.5-flash"]' >> "$GITHUB_OUTPUT"
           else
             # Convert comma-separated to JSON array
             models_json=$(echo "$INPUT_MODELS" | jq -R -s -c 'split(",") | map(gsub("^\\s+|\\s+$"; ""))')
@@ -134,7 +143,7 @@ jobs:
       matrix:
         model: ${{ fromJSON(needs.determine-models.outputs.models) }}
       fail-fast: false
-      max-parallel: 1  # Run models sequentially to stay within Daytona's 25-sandbox limit
+      max-parallel: 1 # Run models sequentially to stay within Daytona's 25-sandbox limit
     uses: ./.github/workflows/terminal-bench.yml
     with:
       model_name: ${{ matrix.model }}
@@ -144,6 +153,7 @@ jobs:
       concurrency: "48"
       env: "daytona"
       experiments: ${{ inputs.experiments }}
+      mux_run_as_goal: ${{ github.event_name == 'workflow_dispatch' && inputs.mux_run_as_goal || false }}
     secrets:
       TERMINAL_BENCH_ANTHROPIC_API_KEY: ${{ secrets.TERMINAL_BENCH_ANTHROPIC_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
@@ -27,7 +27,7 @@ on:
         required: false
         type: string
       extra_args:
-        description: "Additional arguments to pass to harbor"
+        description: "Additional arguments to pass to harbor (e.g., --n-tasks 5 for quick dispatch runs)"
         required: false
         type: string
       experiments:
@@ -50,10 +50,15 @@ on:
         type: string
         default: ""
       mux_run_args:
-        description: "Additional CLI flags passed to mux run (e.g., --thinking high --use-1m --budget 5.00)"
+        description: "Additional CLI flags passed to mux run (e.g., --thinking high --use-1m --budget 5.00; with goal mode, add --goal-turns/--goal-budget)"
         required: false
         type: string
         default: ""
+      mux_run_as_goal:
+        description: "Run each task instruction as a mux CLI Goal Run"
+        required: false
+        type: boolean
+        default: false
     secrets:
       # Keep the runtime env name stable while routing benchmark spend to its own key.
       TERMINAL_BENCH_ANTHROPIC_API_KEY:
@@ -92,11 +97,16 @@ on:
         required: false
         type: string
       mux_run_args:
-        description: "Additional CLI flags passed to mux run (e.g., --thinking high --use-1m)"
+        description: "Additional CLI flags passed to mux run (e.g., --thinking high --use-1m; with goal mode, add --goal-turns/--goal-budget)"
         required: false
         type: string
+      mux_run_as_goal:
+        description: "Run each task instruction as a mux CLI Goal Run"
+        required: false
+        default: false
+        type: boolean
       extra_args:
-        description: "Additional arguments to pass to harbor"
+        description: "Additional arguments to pass to harbor (e.g., --n-tasks 5 for quick dispatch runs)"
         required: false
         type: string
       experiments:
@@ -107,10 +117,6 @@ on:
         description: "Agent timeout in seconds (default: 1800 = 30 min)"
         required: false
         type: string
-      max_tasks:
-        description: "Maximum number of tasks to run (for faster iteration)"
-        required: false
-        type: string
 
 jobs:
   benchmark:
@@ -206,6 +212,7 @@ jobs:
             ${{ inputs.extra_args || '' }}
           MUX_EXPERIMENTS: ${{ inputs.experiments }}
           MUX_RUN_ARGS: ${{ inputs.mux_run_args }}
+          MUX_RUN_AS_GOAL: ${{ inputs.mux_run_as_goal && '1' || '' }}
           ANTHROPIC_API_KEY: ${{ secrets.TERMINAL_BENCH_ANTHROPIC_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
@@ -358,6 +365,7 @@ jobs:
           GCP_PROJECT_ID: mux-benchmarks
           BQ_DATASET: benchmarks
           MUX_EXPERIMENTS: ${{ inputs.experiments }}
+          MUX_RUN_AS_GOAL: ${{ inputs.mux_run_as_goal && '1' || '' }}
         run: |
           if [ -z "$GCP_SA_KEY" ]; then
             echo "GCP_SA_KEY not set, skipping BigQuery upload"
@@ -376,6 +384,7 @@ jobs:
           GCP_PROJECT_ID: mux-benchmarks
           BQ_DATASET: benchmarks
           MUX_EXPERIMENTS: ${{ inputs.experiments }}
+          MUX_RUN_AS_GOAL: ${{ inputs.mux_run_as_goal && '1' || '' }}
         run: |
           if [ -z "$GCP_SA_KEY" ]; then
             echo "GCP_SA_KEY not set, skipping BigQuery upload"

diff --git a/.mux/skills/tbench/SKILL.md b/.mux/skills/tbench/SKILL.md
@@ -59,6 +59,7 @@ make benchmark-terminal TB_ENV=daytona TB_CONCURRENCY=48 TB_TASK_NAMES="chess-be
 - `TB_TASK_NAMES`: Space-separated task names to run (default: all tasks)
 - `TB_ARGS`: Additional arguments passed to harbor
 - `MUX_RUN_ARGS`: CLI flags passed directly to `mux run` inside the container (e.g., `--thinking high --use-1m --budget 5.00`). This is the primary mechanism for all `mux run` flags — avoids per-flag plumbing.
+- `MUX_RUN_AS_GOAL`: When set to `1`, runs each task instruction as a strict `mux run --goal` objective while still piping the instruction to stdin. Use `MUX_RUN_ARGS` for goal limits such as `--goal-turns` and `--goal-budget`.
 
 ### Timeout Handling
 
@@ -109,6 +110,22 @@ gh workflow run terminal-bench.yml \
   -f mux_run_args="--thinking high --budget 5.00"
 ```
 
+**Strict goal-mode runs:**
+
+```bash
+# Run a single task as a strict CLI Goal Run
+MUX_RUN_AS_GOAL=1 \
+MUX_RUN_ARGS="--thinking high --goal-turns 30 --goal-budget 10.00" \
+make benchmark-terminal TB_TASK_NAMES="chess-best-move"
+
+# CI dispatch
+gh workflow run terminal-bench.yml \
+  -f model_name=anthropic/claude-sonnet-4-5 \
+  -f task_names=chess-best-move \
+  -f mux_run_as_goal=true \
+  -f mux_run_args="--thinking high --goal-turns 30 --goal-budget 10.00"
+```
+
 **Local runs:**
 
 ```bash

diff --git a/benchmarks/terminal_bench/mux-run.sh b/benchmarks/terminal_bench/mux-run.sh
@@ -34,6 +34,16 @@ MUX_MODEL="${MUX_MODEL:-anthropic:claude-sonnet-4-5}"
 MUX_TIMEOUT_MS="${MUX_TIMEOUT_MS:-}"
 MUX_WORKSPACE_ID="${MUX_WORKSPACE_ID:-mux-bench}"
 MUX_EXPERIMENTS="${MUX_EXPERIMENTS:-}"
+MUX_RUN_AS_GOAL="${MUX_RUN_AS_GOAL:-}"
+# Normalize MUX_RUN_AS_GOAL (lowercase, trim both ends), then map it to mux_run_as_goal_enabled=0|1.
+mux_run_as_goal_normalized="${MUX_RUN_AS_GOAL,,}" # ,, lowercases the value
+mux_run_as_goal_normalized="${mux_run_as_goal_normalized#"${mux_run_as_goal_normalized%%[![:space:]]*}"}" # ltrim
+mux_run_as_goal_normalized="${mux_run_as_goal_normalized%"${mux_run_as_goal_normalized##*[![:space:]]}"}" # rtrim
+case "${mux_run_as_goal_normalized}" in
+  "" | "0" | "false") mux_run_as_goal_enabled=0 ;; # unset/empty counts as disabled
+  "1" | "true") mux_run_as_goal_enabled=1 ;;
+  *) fatal "MUX_RUN_AS_GOAL must be one of: 1, true, 0, false" ;;
+esac
 
 resolve_project_path() {
   if [[ -n "${MUX_PROJECT_PATH}" ]]; then
@@ -80,11 +90,27 @@ if [[ -n "${MUX_EXPERIMENTS}" ]]; then
   done
 fi
 
+if [[ "${mux_run_as_goal_enabled}" == "1" ]]; then
+  log "strict mux goal mode enabled"
+  cmd+=(--goal "${instruction}") # goal text is the task instruction, which is also piped to stdin further down
+else
+  log "strict mux goal mode disabled"
+fi
+
+mux_run_args=() # stays empty when MUX_RUN_ARGS is unset or empty
 # Append arbitrary mux run flags (e.g., --thinking high --mode exec --use-1m --budget 5.00)
 if [[ -n "${MUX_RUN_ARGS:-}" ]]; then
-  # Word-split intentional: MUX_RUN_ARGS contains space-separated CLI flags
+  # Word-split intentional: MUX_RUN_ARGS contains space-separated CLI flags.
   # shellcheck disable=SC2206
-  cmd+=(${MUX_RUN_ARGS})
+  mux_run_args=(${MUX_RUN_ARGS}) # NOTE(review): unquoted, so glob characters would also expand unless noglob is set
+  if [[ "${mux_run_as_goal_enabled}" == "1" ]]; then
+    for arg in "${mux_run_args[@]}"; do
+      if [[ "${arg}" == "--goal" || "${arg}" == --goal=* ]]; then # rejects both "--goal TEXT" and "--goal=TEXT"
+        fatal "MUX_RUN_ARGS must not include --goal when MUX_RUN_AS_GOAL is enabled"
+      fi
+    done
+  fi
+  cmd+=("${mux_run_args[@]}")
 fi
 
 # NOTE: Harbor only automatically collects /logs/agent on timeouts.
@@ -103,13 +129,19 @@ if [[ -n "${MUX_TIMEOUT_MS}" ]]; then
 fi
 
 # Capture output to file while streaming to terminal for token extraction.
-# Keep stderr separate so the stdout log stays valid JSONL.
-if ! printf '%s' "${instruction}" \
+# Keep stderr separate so the stdout log stays valid JSONL. Temporarily disable
+# errexit so token extraction still runs after mux returns a meaningful nonzero
+# code such as strict goal-mode exit 3.
+set +e
+printf '%s' "${instruction}" \
   | "${cmd[@]}" \
     2> >(tee "${MUX_STDERR_FILE}" >&2) \
-  | tee "${MUX_OUTPUT_FILE}"; then
-  fatal "mux agent session failed"
-fi
+  | tee "${MUX_OUTPUT_FILE}"
+pipeline_status=("${PIPESTATUS[@]}") # snapshot immediately: the next command overwrites PIPESTATUS
+set -e
+stdin_status="${pipeline_status[0]}" # stage 0: printf writing the instruction
+mux_status="${pipeline_status[1]}" # stage 1: the mux command itself
+tee_status="${pipeline_status[2]}" # stage 2: tee capturing stdout to MUX_OUTPUT_FILE
 
 # Extract usage and cost from the JSONL output.
 # Prefer the run-complete event (emitted at end of --json run) which has aggregated
@@ -159,4 +191,19 @@ for usage in cumulative_by_msg.values():
 result["input"] += subagent_input
 result["output"] += subagent_output
 print(json.dumps(result))
-' "${MUX_OUTPUT_FILE}" > "${MUX_TOKEN_FILE}" 2>/dev/null || true
+' "${MUX_OUTPUT_FILE}" >"${MUX_TOKEN_FILE}" 2>/dev/null || true
+
+# Report the first failing pipeline stage and exit with that stage's own
+# status. Stages are checked in priority order: the mux command, then the
+# tee that captures stdout, then the printf that feeds the instruction.
+# When every stage succeeded, fall through and finish normally.
+if ((mux_status != 0)); then
+  printf '[mux-run] ERROR: mux agent session failed (exit %s)\n' "${mux_status}" >&2
+  exit "${mux_status}"
+elif ((tee_status != 0)); then
+  printf '[mux-run] ERROR: failed to capture mux stdout (exit %s)\n' "${tee_status}" >&2
+  exit "${tee_status}"
+elif ((stdin_status != 0)); then
+  printf '[mux-run] ERROR: failed to send instruction to mux (exit %s)\n' "${stdin_status}" >&2
+  exit "${stdin_status}"
+fi
diff --git a/benchmarks/terminal_bench/mux_agent.py b/benchmarks/terminal_bench/mux_agent.py
@@ -78,6 +78,7 @@ class MuxAgent(BaseInstalledAgent):
         # Generic pass-through for arbitrary mux run CLI flags (e.g., --thinking
         # high --use-1m --budget 5.00). Avoids per-flag plumbing.
         "MUX_RUN_ARGS",
+        "MUX_RUN_AS_GOAL",
     )
 
     def __init__(
@@ -167,12 +168,31 @@ def _env(self) -> dict[str, str]:
             if not project_path.strip():
                 raise ValueError("MUX_PROJECT_PATH must be non-empty when provided")
 
+        mux_run_as_goal = self._normalize_mux_run_as_goal(env.get("MUX_RUN_AS_GOAL"))
+        if mux_run_as_goal is None:
+            env.pop("MUX_RUN_AS_GOAL", None)
+        else:
+            env["MUX_RUN_AS_GOAL"] = mux_run_as_goal
+
         # Set experiments from kwarg (takes precedence over env var)
         if self._experiments:
             env["MUX_EXPERIMENTS"] = self._experiments
 
         return env
 
+    @staticmethod
+    def _normalize_mux_run_as_goal(value: str | None) -> str | None:
+        """Return "1" when goal mode is requested, None when it is off or unset.
+
+        Case and surrounding whitespace are ignored; other values raise ValueError.
+        """
+        token = "" if value is None else value.strip().lower()
+        if token in ("1", "true"):
+            return "1"
+        if token in ("", "0", "false"):
+            return None
+        raise ValueError("MUX_RUN_AS_GOAL must be one of: 1, true, 0, false")
+
     @property
     def _install_agent_template_path(self) -> Path:
         return Path(__file__).with_name("mux_setup.sh.j2")
@@ -288,6 +308,7 @@ async def run(
     ) -> None:
         """Run agent commands, download token file, then populate context."""
         # Execute commands (from base class logic, but without calling populate_context)
+        failed_command: tuple[int, int] | None = None
         for i, exec_input in enumerate(self.create_run_agent_commands(instruction)):
             command_dir = self.logs_dir / f"command-{i}"
             command_dir.mkdir(parents=True, exist_ok=True)
@@ -305,6 +326,9 @@ async def run(
                 (command_dir / "stdout.txt").write_text(result.stdout)
             if result.stderr:
                 (command_dir / "stderr.txt").write_text(result.stderr)
+            if result.return_code != 0:
+                failed_command = (i, result.return_code)
+                break
 
         # Download token file from container BEFORE populating context
         # Clear any stale token file first to avoid reading outdated data if download fails
@@ -317,6 +341,12 @@ async def run(
 
         self.populate_context_post_run(context)
 
+        if failed_command is not None:
+            command_index, return_code = failed_command
+            raise RuntimeError(
+                f"mux agent command failed (command {command_index}, exit {return_code})"
+            )
+
     def populate_context_post_run(self, context: AgentContext) -> None:
         """Extract token usage and cost from the token file written by mux-run.sh."""
         token_file = self.logs_dir / "mux-tokens.json"