diff --git a/.github/workflows/nightly-terminal-bench.yml b/.github/workflows/nightly-terminal-bench.yml index 1d3c145f0f..4c58964c73 100644 --- a/.github/workflows/nightly-terminal-bench.yml +++ b/.github/workflows/nightly-terminal-bench.yml @@ -10,7 +10,7 @@ on: workflow_dispatch: inputs: models: - description: 'Models to test (comma-separated, or "all" for opus-4-7 + gpt-5.5 + google/gemini-3-pro-preview + google/gemini-3-flash-preview)' + description: 'Models to test (comma-separated, or "all" for opus-4-7 + gpt-5.5 + google/gemini-3-pro-preview + google/gemini-3-flash-preview + google/gemini-3.5-flash)' required: false default: "all" type: string @@ -18,6 +18,11 @@ on: description: "Experiments to enable (comma-separated)" required: false type: string + mux_run_as_goal: + description: "Run nightly smoke/matrix tasks as strict mux CLI Goal Runs" + required: false + default: false + type: boolean jobs: # Smoke test: run chess-best-move task first to catch broken agent setup @@ -33,6 +38,7 @@ jobs: env: "daytona" task_names: "chess-best-move" experiments: ${{ inputs.experiments }} + mux_run_as_goal: ${{ github.event_name == 'workflow_dispatch' && inputs.mux_run_as_goal || false }} # Keep least-privilege secret scope for reusable workflow calls. 
secrets: TERMINAL_BENCH_ANTHROPIC_API_KEY: ${{ secrets.TERMINAL_BENCH_ANTHROPIC_API_KEY }} @@ -58,6 +64,7 @@ jobs: mux_project_path: "/testbed" timeout: "3000" experiments: ${{ inputs.experiments }} + mux_run_as_goal: ${{ github.event_name == 'workflow_dispatch' && inputs.mux_run_as_goal || false }} secrets: TERMINAL_BENCH_ANTHROPIC_API_KEY: ${{ secrets.TERMINAL_BENCH_ANTHROPIC_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -79,6 +86,7 @@ jobs: mux_project_path: "/app/src" timeout: "600" experiments: ${{ inputs.experiments }} + mux_run_as_goal: ${{ github.event_name == 'workflow_dispatch' && inputs.mux_run_as_goal || false }} secrets: TERMINAL_BENCH_ANTHROPIC_API_KEY: ${{ secrets.TERMINAL_BENCH_ANTHROPIC_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -100,6 +108,7 @@ jobs: mux_project_path: "/app" timeout: "1800" experiments: ${{ inputs.experiments }} + mux_run_as_goal: ${{ github.event_name == 'workflow_dispatch' && inputs.mux_run_as_goal || false }} secrets: TERMINAL_BENCH_ANTHROPIC_API_KEY: ${{ secrets.TERMINAL_BENCH_ANTHROPIC_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -120,7 +129,7 @@ jobs: INPUT_MODELS: ${{ inputs.models }} run: | if [ "$INPUT_MODELS" = "all" ] || [ -z "$INPUT_MODELS" ]; then - echo 'models=["anthropic/claude-opus-4-7","openai/gpt-5.5","google/gemini-3-pro-preview","google/gemini-3-flash-preview"]' >> "$GITHUB_OUTPUT" + echo 'models=["anthropic/claude-opus-4-7","openai/gpt-5.5","google/gemini-3-pro-preview","google/gemini-3-flash-preview","google/gemini-3.5-flash"]' >> "$GITHUB_OUTPUT" else # Convert comma-separated to JSON array models_json=$(echo "$INPUT_MODELS" | jq -R -s -c 'split(",") | map(gsub("^\\s+|\\s+$"; ""))') @@ -134,7 +143,7 @@ jobs: matrix: model: ${{ fromJSON(needs.determine-models.outputs.models) }} fail-fast: false - max-parallel: 1 # Run models sequentially to stay within Daytona's 25-sandbox limit + max-parallel: 1 # Run models sequentially to stay within Daytona's 25-sandbox limit 
uses: ./.github/workflows/terminal-bench.yml with: model_name: ${{ matrix.model }} @@ -144,6 +153,7 @@ jobs: concurrency: "48" env: "daytona" experiments: ${{ inputs.experiments }} + mux_run_as_goal: ${{ github.event_name == 'workflow_dispatch' && inputs.mux_run_as_goal || false }} secrets: TERMINAL_BENCH_ANTHROPIC_API_KEY: ${{ secrets.TERMINAL_BENCH_ANTHROPIC_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml index c79e0ea23e..7d316dec9e 100644 --- a/.github/workflows/terminal-bench.yml +++ b/.github/workflows/terminal-bench.yml @@ -27,7 +27,7 @@ on: required: false type: string extra_args: - description: "Additional arguments to pass to harbor" + description: "Additional arguments to pass to harbor (e.g., --n-tasks 5 for quick dispatch runs)" required: false type: string experiments: @@ -50,10 +50,15 @@ on: type: string default: "" mux_run_args: - description: "Additional CLI flags passed to mux run (e.g., --thinking high --use-1m --budget 5.00)" + description: "Additional CLI flags passed to mux run (e.g., --thinking high --use-1m --budget 5.00; with goal mode, add --goal-turns/--goal-budget)" required: false type: string default: "" + mux_run_as_goal: + description: "Run each task instruction as a mux CLI Goal Run" + required: false + type: boolean + default: false secrets: # Keep the runtime env name stable while routing benchmark spend to its own key. 
TERMINAL_BENCH_ANTHROPIC_API_KEY: @@ -92,11 +97,16 @@ on: required: false type: string mux_run_args: - description: "Additional CLI flags passed to mux run (e.g., --thinking high --use-1m)" + description: "Additional CLI flags passed to mux run (e.g., --thinking high --use-1m; with goal mode, add --goal-turns/--goal-budget)" required: false type: string + mux_run_as_goal: + description: "Run each task instruction as a mux CLI Goal Run" + required: false + default: false + type: boolean extra_args: - description: "Additional arguments to pass to harbor" + description: "Additional arguments to pass to harbor (e.g., --n-tasks 5 for quick dispatch runs)" required: false type: string experiments: @@ -107,10 +117,6 @@ on: description: "Agent timeout in seconds (default: 1800 = 30 min)" required: false type: string - max_tasks: - description: "Maximum number of tasks to run (for faster iteration)" - required: false - type: string jobs: benchmark: @@ -206,6 +212,7 @@ jobs: ${{ inputs.extra_args || '' }} MUX_EXPERIMENTS: ${{ inputs.experiments }} MUX_RUN_ARGS: ${{ inputs.mux_run_args }} + MUX_RUN_AS_GOAL: ${{ inputs.mux_run_as_goal && '1' || '' }} ANTHROPIC_API_KEY: ${{ secrets.TERMINAL_BENCH_ANTHROPIC_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} @@ -358,6 +365,7 @@ jobs: GCP_PROJECT_ID: mux-benchmarks BQ_DATASET: benchmarks MUX_EXPERIMENTS: ${{ inputs.experiments }} + MUX_RUN_AS_GOAL: ${{ inputs.mux_run_as_goal && '1' || '' }} run: | if [ -z "$GCP_SA_KEY" ]; then echo "GCP_SA_KEY not set, skipping BigQuery upload" @@ -376,6 +384,7 @@ jobs: GCP_PROJECT_ID: mux-benchmarks BQ_DATASET: benchmarks MUX_EXPERIMENTS: ${{ inputs.experiments }} + MUX_RUN_AS_GOAL: ${{ inputs.mux_run_as_goal && '1' || '' }} run: | if [ -z "$GCP_SA_KEY" ]; then echo "GCP_SA_KEY not set, skipping BigQuery upload" diff --git a/.mux/skills/tbench/SKILL.md b/.mux/skills/tbench/SKILL.md index 9c7d14f7ef..6db67bb6ba 100644 --- 
a/.mux/skills/tbench/SKILL.md +++ b/.mux/skills/tbench/SKILL.md @@ -59,6 +59,7 @@ make benchmark-terminal TB_ENV=daytona TB_CONCURRENCY=48 TB_TASK_NAMES="chess-be - `TB_TASK_NAMES`: Space-separated task names to run (default: all tasks) - `TB_ARGS`: Additional arguments passed to harbor - `MUX_RUN_ARGS`: CLI flags passed directly to `mux run` inside the container (e.g., `--thinking high --use-1m --budget 5.00`). This is the primary mechanism for all `mux run` flags — avoids per-flag plumbing. +- `MUX_RUN_AS_GOAL`: When set to `1`, runs each task instruction as a strict `mux run --goal` objective while still piping the instruction to stdin. Use `MUX_RUN_ARGS` for goal limits such as `--goal-turns` and `--goal-budget`. ### Timeout Handling @@ -109,6 +110,22 @@ gh workflow run terminal-bench.yml \ -f mux_run_args="--thinking high --budget 5.00" ``` +**Strict goal-mode runs:** + +```bash +# Run a single task as a strict CLI Goal Run +MUX_RUN_AS_GOAL=1 \ +MUX_RUN_ARGS="--thinking high --goal-turns 30 --goal-budget 10.00" \ +make benchmark-terminal TB_TASK_NAMES="chess-best-move" + +# CI dispatch +gh workflow run terminal-bench.yml \ + -f model_name=anthropic/claude-sonnet-4-5 \ + -f task_names=chess-best-move \ + -f mux_run_as_goal=true \ + -f mux_run_args="--thinking high --goal-turns 30 --goal-budget 10.00" +``` + **Local runs:** ```bash diff --git a/benchmarks/terminal_bench/mux-run.sh b/benchmarks/terminal_bench/mux-run.sh index 006628681c..9d87610348 100644 --- a/benchmarks/terminal_bench/mux-run.sh +++ b/benchmarks/terminal_bench/mux-run.sh @@ -34,6 +34,16 @@ MUX_MODEL="${MUX_MODEL:-anthropic:claude-sonnet-4-5}" MUX_TIMEOUT_MS="${MUX_TIMEOUT_MS:-}" MUX_WORKSPACE_ID="${MUX_WORKSPACE_ID:-mux-bench}" MUX_EXPERIMENTS="${MUX_EXPERIMENTS:-}" +MUX_RUN_AS_GOAL="${MUX_RUN_AS_GOAL:-}" + +mux_run_as_goal_normalized="${MUX_RUN_AS_GOAL,,}" +mux_run_as_goal_normalized="${mux_run_as_goal_normalized#"${mux_run_as_goal_normalized%%[![:space:]]*}"}" 
+mux_run_as_goal_normalized="${mux_run_as_goal_normalized%"${mux_run_as_goal_normalized##*[![:space:]]}"}" +case "${mux_run_as_goal_normalized}" in + "" | "0" | "false") mux_run_as_goal_enabled=0 ;; + "1" | "true") mux_run_as_goal_enabled=1 ;; + *) fatal "MUX_RUN_AS_GOAL must be one of: 1, true, 0, false" ;; +esac resolve_project_path() { if [[ -n "${MUX_PROJECT_PATH}" ]]; then @@ -80,11 +90,27 @@ if [[ -n "${MUX_EXPERIMENTS}" ]]; then done fi +if [[ "${mux_run_as_goal_enabled}" == "1" ]]; then + log "strict mux goal mode enabled" + cmd+=(--goal "${instruction}") +else + log "strict mux goal mode disabled" +fi + +mux_run_args=() # Append arbitrary mux run flags (e.g., --thinking high --mode exec --use-1m --budget 5.00) if [[ -n "${MUX_RUN_ARGS:-}" ]]; then - # Word-split intentional: MUX_RUN_ARGS contains space-separated CLI flags + # Word-split intentional: MUX_RUN_ARGS contains space-separated CLI flags. # shellcheck disable=SC2206 - cmd+=(${MUX_RUN_ARGS}) + mux_run_args=(${MUX_RUN_ARGS}) + if [[ "${mux_run_as_goal_enabled}" == "1" ]]; then + for arg in "${mux_run_args[@]}"; do + if [[ "${arg}" == "--goal" || "${arg}" == --goal=* ]]; then + fatal "MUX_RUN_ARGS must not include --goal when MUX_RUN_AS_GOAL is enabled" + fi + done + fi + cmd+=("${mux_run_args[@]}") fi # NOTE: Harbor only automatically collects /logs/agent on timeouts. @@ -103,13 +129,19 @@ if [[ -n "${MUX_TIMEOUT_MS}" ]]; then fi # Capture output to file while streaming to terminal for token extraction. -# Keep stderr separate so the stdout log stays valid JSONL. -if ! printf '%s' "${instruction}" \ +# Keep stderr separate so the stdout log stays valid JSONL. Temporarily disable +# errexit so token extraction still runs after mux returns a meaningful nonzero +# code such as strict goal-mode exit 3. 
+set +e +printf '%s' "${instruction}" \ | "${cmd[@]}" \ 2> >(tee "${MUX_STDERR_FILE}" >&2) \ - | tee "${MUX_OUTPUT_FILE}"; then - fatal "mux agent session failed" -fi + | tee "${MUX_OUTPUT_FILE}" +pipeline_status=("${PIPESTATUS[@]}") +set -e +stdin_status="${pipeline_status[0]}" +mux_status="${pipeline_status[1]}" +tee_status="${pipeline_status[2]}" # Extract usage and cost from the JSONL output. # Prefer the run-complete event (emitted at end of --json run) which has aggregated @@ -159,4 +191,19 @@ for usage in cumulative_by_msg.values(): result["input"] += subagent_input result["output"] += subagent_output print(json.dumps(result)) -' "${MUX_OUTPUT_FILE}" > "${MUX_TOKEN_FILE}" 2>/dev/null || true +' "${MUX_OUTPUT_FILE}" >"${MUX_TOKEN_FILE}" 2>/dev/null || true + +if [[ "${mux_status}" -ne 0 ]]; then + printf '[mux-run] ERROR: mux agent session failed (exit %s)\n' "${mux_status}" >&2 + exit "${mux_status}" +fi + +if [[ "${tee_status}" -ne 0 ]]; then + printf '[mux-run] ERROR: failed to capture mux stdout (exit %s)\n' "${tee_status}" >&2 + exit "${tee_status}" +fi + +if [[ "${stdin_status}" -ne 0 ]]; then + printf '[mux-run] ERROR: failed to send instruction to mux (exit %s)\n' "${stdin_status}" >&2 + exit "${stdin_status}" +fi diff --git a/benchmarks/terminal_bench/mux_agent.py b/benchmarks/terminal_bench/mux_agent.py index ba5ffe7278..530687f729 100644 --- a/benchmarks/terminal_bench/mux_agent.py +++ b/benchmarks/terminal_bench/mux_agent.py @@ -78,6 +78,7 @@ class MuxAgent(BaseInstalledAgent): # Generic pass-through for arbitrary mux run CLI flags (e.g., --thinking # high --use-1m --budget 5.00). Avoids per-flag plumbing. 
"MUX_RUN_ARGS", + "MUX_RUN_AS_GOAL", ) def __init__( @@ -167,12 +168,31 @@ def _env(self) -> dict[str, str]: if not project_path.strip(): raise ValueError("MUX_PROJECT_PATH must be non-empty when provided") + mux_run_as_goal = self._normalize_mux_run_as_goal(env.get("MUX_RUN_AS_GOAL")) + if mux_run_as_goal is None: + env.pop("MUX_RUN_AS_GOAL", None) + else: + env["MUX_RUN_AS_GOAL"] = mux_run_as_goal + # Set experiments from kwarg (takes precedence over env var) if self._experiments: env["MUX_EXPERIMENTS"] = self._experiments return env + @staticmethod + def _normalize_mux_run_as_goal(value: str | None) -> str | None: + if value is None: + return None + + normalized = value.strip().lower() + if normalized in ("", "0", "false"): + return None + if normalized in ("1", "true"): + return "1" + + raise ValueError("MUX_RUN_AS_GOAL must be one of: 1, true, 0, false") + @property def _install_agent_template_path(self) -> Path: return Path(__file__).with_name("mux_setup.sh.j2") @@ -288,6 +308,7 @@ async def run( ) -> None: """Run agent commands, download token file, then populate context.""" # Execute commands (from base class logic, but without calling populate_context) + failed_command: tuple[int, int] | None = None for i, exec_input in enumerate(self.create_run_agent_commands(instruction)): command_dir = self.logs_dir / f"command-{i}" command_dir.mkdir(parents=True, exist_ok=True) @@ -305,6 +326,9 @@ async def run( (command_dir / "stdout.txt").write_text(result.stdout) if result.stderr: (command_dir / "stderr.txt").write_text(result.stderr) + if result.return_code != 0: + failed_command = (i, result.return_code) + break # Download token file from container BEFORE populating context # Clear any stale token file first to avoid reading outdated data if download fails @@ -317,6 +341,12 @@ async def run( self.populate_context_post_run(context) + if failed_command is not None: + command_index, return_code = failed_command + raise RuntimeError( + f"mux agent command failed 
(command {command_index}, exit {return_code})" + ) + def populate_context_post_run(self, context: AgentContext) -> None: """Extract token usage and cost from the token file written by mux-run.sh.""" token_file = self.logs_dir / "mux-tokens.json" diff --git a/benchmarks/terminal_bench/mux_agent_test.py b/benchmarks/terminal_bench/mux_agent_test.py index a3524b840d..469e167d07 100644 --- a/benchmarks/terminal_bench/mux_agent_test.py +++ b/benchmarks/terminal_bench/mux_agent_test.py @@ -1,8 +1,11 @@ from __future__ import annotations +import asyncio import io import tarfile +from dataclasses import dataclass from pathlib import Path +from types import SimpleNamespace import pytest @@ -21,27 +24,127 @@ def _repo_root() -> Path: return Path(__file__).resolve().parents[2] -def test_env_defaults_are_normalized(monkeypatch: pytest.MonkeyPatch) -> None: +def test_env_defaults_are_normalized( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: monkeypatch.setenv("MUX_AGENT_REPO_ROOT", str(_repo_root())) - agent = MuxAgent(model_name="anthropic/claude-sonnet-4-5") + agent = MuxAgent(logs_dir=tmp_path, model_name="anthropic/claude-sonnet-4-5") env = agent._env assert env["MUX_MODEL"] == "anthropic:claude-sonnet-4-5" - assert env["MUX_THINKING_LEVEL"] == "high" - assert env["MUX_MODE"] == "exec" assert env["MUX_PROJECT_CANDIDATES"] == agent._DEFAULT_PROJECT_CANDIDATES -def test_timeout_must_be_numeric(monkeypatch: pytest.MonkeyPatch) -> None: +def test_goal_mode_env_is_forwarded( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + monkeypatch.setenv("MUX_AGENT_REPO_ROOT", str(_repo_root())) + monkeypatch.setenv("MUX_RUN_AS_GOAL", "true") + + agent = MuxAgent(logs_dir=tmp_path) + + assert agent._env["MUX_RUN_AS_GOAL"] == "1" + + +def test_goal_mode_defaults_to_disabled( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + monkeypatch.setenv("MUX_AGENT_REPO_ROOT", str(_repo_root())) + + agent = MuxAgent(logs_dir=tmp_path) + + assert 
"MUX_RUN_AS_GOAL" not in agent._env + + +def test_goal_mode_rejects_invalid_values( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + monkeypatch.setenv("MUX_AGENT_REPO_ROOT", str(_repo_root())) + monkeypatch.setenv("MUX_RUN_AS_GOAL", "yes") + + agent = MuxAgent(logs_dir=tmp_path) + with pytest.raises(ValueError, match="MUX_RUN_AS_GOAL"): + _ = agent._env + + +def test_timeout_must_be_numeric( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: monkeypatch.setenv("MUX_AGENT_REPO_ROOT", str(_repo_root())) monkeypatch.setenv("MUX_TIMEOUT_MS", "not-a-number") - agent = MuxAgent() + agent = MuxAgent(logs_dir=tmp_path) with pytest.raises(ValueError): _ = agent._env +@dataclass +class _ExecResult: + return_code: int + stdout: str = "" + stderr: str = "" + + +class _FakeEnvironment: + def __init__(self, result: _ExecResult) -> None: + self.result = result + self.download_attempts: list[tuple[str, Path]] = [] + + async def exec(self, **_kwargs: object) -> _ExecResult: + return self.result + + async def download_file(self, source_path: str, target_path: Path) -> None: + self.download_attempts.append((source_path, target_path)) + target_path.write_text('{"input": 7, "output": 11, "cost_usd": 0.42}') + + +def test_run_raises_after_preserving_logs_for_nonzero_exit( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + monkeypatch.setenv("MUX_AGENT_REPO_ROOT", str(_repo_root())) + agent = MuxAgent(logs_dir=tmp_path) + environment = _FakeEnvironment( + _ExecResult(return_code=3, stdout="out", stderr="err") + ) + context = SimpleNamespace() + + with pytest.raises(RuntimeError, match="mux agent command failed"): + asyncio.run(agent.run("do the task", environment, context)) + + command_dir = tmp_path / "command-0" + assert (command_dir / "return-code.txt").read_text() == "3" + assert (command_dir / "stdout.txt").read_text() == "out" + assert (command_dir / "stderr.txt").read_text() == "err" + assert environment.download_attempts == [ + 
(agent._TOKEN_FILE_PATH, tmp_path / "mux-tokens.json") + ] + assert getattr(context, "n_input_tokens") == 7 + assert getattr(context, "n_output_tokens") == 11 + assert getattr(context, "cost_usd") == 0.42 + + +def test_run_populates_context_for_successful_exit( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + monkeypatch.setenv("MUX_AGENT_REPO_ROOT", str(_repo_root())) + agent = MuxAgent(logs_dir=tmp_path) + environment = _FakeEnvironment( + _ExecResult(return_code=0, stdout="out", stderr="err") + ) + context = SimpleNamespace() + + asyncio.run(agent.run("do the task", environment, context)) + + command_dir = tmp_path / "command-0" + assert (command_dir / "return-code.txt").read_text() == "0" + assert (command_dir / "stdout.txt").read_text() == "out" + assert (command_dir / "stderr.txt").read_text() == "err" + assert getattr(context, "n_input_tokens") == 7 + assert getattr(context, "n_output_tokens") == 11 + assert getattr(context, "cost_usd") == 0.42 + + def test_app_archive_includes_postinstall_script() -> None: assert "scripts/postinstall.sh" in MuxAgent._INCLUDE_PATHS diff --git a/docs/adr/0004-cli-goal-runs-are-not-strict-goal-aliases.md b/docs/adr/0004-cli-goal-runs-are-not-strict-goal-aliases.md new file mode 100644 index 0000000000..41e2216045 --- /dev/null +++ b/docs/adr/0004-cli-goal-runs-are-not-strict-goal-aliases.md @@ -0,0 +1,31 @@ +--- +title: CLI Goal Runs are not strict /goal aliases +description: Architecture decision for giving mux run --goal CLI-specific completion and limit semantics +--- + +# 0004. CLI Goal Runs are not strict /goal aliases + +## Status + +Accepted + +## Context + +`mux run` is designed for automation: it normally sends one request, streams the result, and exits. Interactive `/goal` is a workspace lifecycle command with defaults, controls, and cooldown behavior that assume a user can intervene from the UI. + +Adding `mux run --goal` creates a different automation need. 
A script needs one process to keep driving an objective until there is an authoritative completion signal, while still preserving goal accounting and model-facing goal tools. + +## Decision + +Mux will model `mux run --goal` as a CLI Goal Run, not as a strict alias for interactive `/goal`. + +A CLI Goal Run creates an ephemeral goal for the `mux run` process, sends either the provided message/stdin or the goal text as the kickoff message, and continues in exec mode until the persisted goal status is `complete` or a stop condition is reached. Interactive goal defaults are not applied; omitted `--goal-budget` and `--goal-turns` mean no goal-specific limit. The existing session `--budget` remains a separate hard stop. + +CLI Goal Runs bypass the interactive goal continuation cooldown because the process itself is the automation boundary. They still use the shared goal service for prompts, accounting, tool availability, budget-limited wrap-up, and persisted completion state. + +## Consequences + +- `mux run` remains single-request by default, with `--goal` documented as the explicit multi-continuation exception. +- Scripts can trust exit code `0` only when the persisted goal is complete; free-text completion claims are not enough unless the existing goal-completion fallback has persisted them. +- Goal and session budgets can stop the same process for different reasons, so CLI output and JSON events must identify which limit won. +- CLI-specific continuation behavior is parameterized in the shared goal service instead of duplicating goal prompt/accounting logic in the CLI.
diff --git a/docs/docs.json b/docs/docs.json index 217683d273..6fadfcdd6b 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -123,6 +123,7 @@ "adr/0001-experimental-image-generation-tool", "adr/0002-image-editing-visual-mockups", "adr/0003-context-boundaries-for-compaction-and-reset", + "adr/0004-cli-goal-runs-are-not-strict-goal-aliases", "AGENTS" ] } diff --git a/docs/reference/cli.mdx b/docs/reference/cli.mdx index eac228cb42..5b7c6264f5 100644 --- a/docs/reference/cli.mdx +++ b/docs/reference/cli.mdx @@ -10,7 +10,7 @@ description: Run one-off agent tasks from the command line with `mux run` Code](https://docs.anthropic.com/en/docs/agents-and-tools/claude-code/overview) or similar TUIs. -Mux provides a CLI for running one-off agent tasks without the desktop app. Unlike the interactive desktop experience, `mux run` executes a single request to completion and exits. +Mux provides a CLI for running one-off agent tasks without the desktop app. Unlike the interactive desktop experience, `mux run` normally executes a single request to completion and exits. The `--goal` option is an explicit exception: it starts a CLI Goal Run that may perform automatic continuations until the goal is complete or a limit is reached. 
Learn how to use `mux run` in CI/CD pipelines @@ -62,10 +62,42 @@ mux run --json "List all TypeScript files" | jq '.type' | `--mode <mode>` | | Agent mode: `plan` or `exec` | `exec` | | `--thinking <level>` | `-t` | Thinking level: `OFF`, `LOW`, `MED`, `HIGH`, `MAX`, or `0`–`9` (model-relative, see [Models](/config/models#thinking-levels)) | `MED` | | `--budget <usd>` | `-b` | Stop when session cost exceeds budget (USD) | No limit | +| `--goal <objective>` | | Start a CLI Goal Run and continue until the persisted goal is complete or a limit stops it | Off | +| `--goal-budget <amount>` | | Goal budget (`$5`, `5.00`, or `500c`); separate from `--budget` | No limit | +| `--goal-turns <n>` | | Maximum automatic goal continuation turns | No limit | | `--experiment <id>` | `-e` | Enable experiment (repeatable) | None | | `--json` | | Output NDJSON for programmatic use | Off | | `--quiet` | `-q` | Only output final result | Off | +### CLI Goal Runs + +Use `--goal` when a task should keep going across automatic continuations until the agent marks the persisted goal complete: + +```bash +# Goal text is also used as the initial message when no message/stdin is provided +mux run --goal "Fix the failing tests and verify the suite passes" + +# Provide separate kickoff instructions while keeping the objective active +mux run --goal "Ship the migration safely" "Start by inspecting the schema and propose a plan" + +# Bound automatic continuations with a goal-specific budget and turn cap +mux run --goal "Complete the refactor" --goal-budget 5.00 --goal-turns 10 +``` + +A CLI Goal Run is intentionally not a strict alias for interactive `/goal`. It is ephemeral to the `mux run` process, does not apply interactive goal defaults, bypasses the interactive continuation cooldown, and exits successfully only when the persisted goal status is `complete`. If neither `--goal-budget` nor `--goal-turns` is provided, Mux warns that the goal is uncapped. + +`--budget` remains the hard session spending limit in USD.
`--goal-budget` is goal accounting, accepts forms like `$5`, `5.00`, and `500c`, and may allow a final budget-limit wrap-up turn. If the session `--budget` is exceeded, the run stops immediately. + +Exit codes for CLI Goal Runs: + +| Code | Meaning | +| ----- | ---------------------------------------------------------- | +| `0` | Goal completed (unless the agent set a nonzero exit code) | +| `1` | Operational, model, or tool error | +| `2` | Session `--budget` exceeded | +| `3` | Goal stopped incomplete, including goal budget/turn limits | +| `130` | User interrupt | + ### Runtimes - **`local`** (default): Runs directly in the specified directory. Best for one-off tasks. @@ -82,6 +114,9 @@ mux run --json "List all TypeScript files" | jq '.type' ### Examples ```bash +# Goal run with automatic continuations +mux run --goal "Update dependencies, fix resulting tests, and verify the suite passes" + # Quick fix in current directory mux run "Fix the TypeScript errors" diff --git a/scripts/upload-harbor-results.py b/scripts/upload-harbor-results.py index a820b6788c..878d465171 100644 --- a/scripts/upload-harbor-results.py +++ b/scripts/upload-harbor-results.py @@ -44,6 +44,11 @@ def load_json(path: Path) -> dict | None: return None +def env_flag(name: str) -> bool: + """Return True for the env boolean spellings emitted by workflows.""" + return (os.environ.get(name) or "").strip().lower() in {"1", "true"} + + def extract_trial_score(trial_result: dict) -> float | None: """Extract score from trial result, supporting multiple Harbor formats.""" score = trial_result.get("score") @@ -185,6 +190,7 @@ def build_rows(job_folder: Path) -> list[dict]: dataset = job_config.get("dataset") experiments = os.environ.get("MUX_EXPERIMENTS") + mux_run_as_goal = env_flag("MUX_RUN_AS_GOAL") # Raw JSON for future-proofing run_result_json = json.dumps(job_result) if job_result else None @@ -233,6 +239,7 @@ def build_rows(job_folder: Path) -> list[dict]: "mode": mode, "dataset": dataset, 
"experiments": experiments, + "mux_run_as_goal": mux_run_as_goal, "run_started_at": None, # Not available in Harbor format "run_completed_at": None, "n_resolved": None, # Will be set after counting all trials diff --git a/scripts/upload-tbench-results.py b/scripts/upload-tbench-results.py index 809ade170f..549a95f82b 100755 --- a/scripts/upload-tbench-results.py +++ b/scripts/upload-tbench-results.py @@ -42,6 +42,9 @@ def load_json(path: Path) -> dict | None: return None +def env_flag(name: str) -> bool: + """Return True for the env boolean spellings emitted by workflows.""" + return (os.environ.get(name) or "").strip().lower() in {"1", "true"} def extract_thinking_from_config(config: dict) -> str | None: @@ -198,6 +201,7 @@ def build_rows(job_folder: Path) -> list[dict]: dataset = job_config.get("dataset") experiments = os.environ.get("MUX_EXPERIMENTS") + mux_run_as_goal = env_flag("MUX_RUN_AS_GOAL") # Raw JSON for future-proofing run_result_json = json.dumps(job_result) if job_result else None @@ -250,6 +254,7 @@ def build_rows(job_folder: Path) -> list[dict]: "mode": mode, "dataset": dataset, "experiments": experiments, + "mux_run_as_goal": mux_run_as_goal, "run_started_at": None, # Not available in Harbor format "run_completed_at": None, "n_resolved": None, # Will be set after counting all trials diff --git a/src/cli/goalRunDriver.test.ts b/src/cli/goalRunDriver.test.ts new file mode 100644 index 0000000000..7f3f513445 --- /dev/null +++ b/src/cli/goalRunDriver.test.ts @@ -0,0 +1,278 @@ +import { describe, expect, test } from "bun:test"; +import type { GoalRecordV1 } from "@/common/types/goal"; +import type { SendMessageOptions } from "@/common/orpc/types"; +import { + describeCliGoalStop, + driveCliGoalUntilTerminal, + type DriveCliGoalUntilTerminalOptions, +} from "./goalRunDriver"; + +function goal(overrides: Partial = {}): GoalRecordV1 { + return { + version: 1, + goalId: "goal-1", + objective: "finish", + status: "active", + budgetCents: null, + costCents: 
0, + costMicroCents: 0, + turnCap: null, + turnsUsed: 0, + attributedChildren: [], + budgetLimitInjectedForGoalId: null, + requireUserAcknowledgmentSinceMs: null, + lastContinuationFiredAtMs: null, + createdAtMs: 1, + updatedAtMs: 1, + ...overrides, + }; +} + +function sendOptions(): SendMessageOptions { + return { model: "openai:gpt-4o", agentId: "exec" }; +} + +function options( + overrides: Partial = {} +): DriveCliGoalUntilTerminalOptions { + return { + workspaceId: "workspace-1", + getGoal: () => Promise.resolve(goal()), + buildExecSendOptions: sendOptions, + requestContinuationAfterStreamEnd: () => Promise.resolve(), + requestDispatch: () => Promise.resolve(), + checkGoalContinuationEligibility: () => Promise.resolve({}), + prepareForContinuation: () => undefined, + waitForStreamStarted: () => Promise.resolve(), + waitForCompletion: () => Promise.resolve(), + isSessionBudgetExceeded: () => false, + nowMs: () => 123, + emitJsonLine: () => undefined, + writeHumanLineClosed: () => undefined, + setGoalStopReason: () => undefined, + describeError: String, + ...overrides, + }; +} + +describe("driveCliGoalUntilTerminal", () => { + test("continues an active goal until completion", async () => { + const goals = [goal(), goal({ status: "complete", completionSummary: "done" })]; + const events: unknown[] = []; + const lines: string[] = []; + const reasons: string[] = []; + const continuations: Array<{ streamEndedAtMs: number; sendOptions: SendMessageOptions }> = []; + let dispatches = 0; + let waitStarts = 0; + let waitCompletions = 0; + + const result = await driveCliGoalUntilTerminal( + options({ + getGoal: () => Promise.resolve(goals.shift() ?? goals[goals.length - 1] ?? 
null), + requestContinuationAfterStreamEnd: (input) => { + continuations.push(input); + return Promise.resolve(); + }, + requestDispatch: () => { + dispatches += 1; + return Promise.resolve(); + }, + waitForStreamStarted: () => { + waitStarts += 1; + return Promise.resolve(); + }, + waitForCompletion: () => { + waitCompletions += 1; + return Promise.resolve(); + }, + emitJsonLine: (event) => events.push(event), + writeHumanLineClosed: (line = "") => lines.push(line), + setGoalStopReason: (reason) => reasons.push(reason), + }) + ); + + expect(result?.status).toBe("complete"); + expect(continuations).toHaveLength(1); + expect(continuations[0]?.streamEndedAtMs).toBe(123); + expect(dispatches).toBe(1); + expect(waitStarts).toBe(1); + expect(waitCompletions).toBe(1); + expect(events).toMatchObject([{ type: "goal-continuing" }, { type: "goal-completed" }]); + expect(lines).toEqual(["[goal] continuing...", "[goal] completed: done"]); + expect(reasons).toEqual(["complete"]); + }); + + test("passes the stream-start timeout to continuation waits", async () => { + const goals = [goal(), goal({ status: "complete" })]; + const timeouts: Array = []; + + await driveCliGoalUntilTerminal( + options({ + getGoal: () => Promise.resolve(goals.shift() ?? goals[goals.length - 1] ?? null), + streamStartTimeoutMs: 123, + waitForStreamStarted: (timeoutMs) => { + timeouts.push(timeoutMs); + return Promise.resolve(); + }, + }) + ); + + expect(timeouts).toEqual([123]); + }); + + test("drives a budget-limited goal through its wrap-up", async () => { + const goals = [ + goal({ status: "budget_limited", budgetCents: 100, costCents: 100 }), + goal({ status: "complete", completionSummary: "wrapped" }), + ]; + const lines: string[] = []; + const result = await driveCliGoalUntilTerminal( + options({ + getGoal: () => Promise.resolve(goals.shift() ?? goals[goals.length - 1] ?? 
null), + writeHumanLineClosed: (line = "") => lines.push(line), + }) + ); + + expect(result?.status).toBe("complete"); + expect(lines).toEqual(["[goal] budget wrap-up...", "[goal] completed: wrapped"]); + }); + + test("stops when a budget wrap-up already fired", async () => { + const reasons: string[] = []; + const result = await driveCliGoalUntilTerminal( + options({ + getGoal: () => + Promise.resolve( + goal({ + status: "budget_limited", + budgetCents: 100, + costCents: 100, + budgetLimitInjectedForGoalId: "goal-1", + }) + ), + requestContinuationAfterStreamEnd: () => Promise.reject(new Error("should not continue")), + requestDispatch: () => Promise.reject(new Error("should not dispatch")), + prepareForContinuation: () => { + throw new Error("should not prepare"); + }, + setGoalStopReason: (reason) => reasons.push(reason), + }) + ); + + expect(result?.status).toBe("budget_limited"); + expect(reasons).toEqual(["goal budget reached"]); + }); + + test("returns the latest goal when session budget stops after a continuation", async () => { + const goals = [goal(), goal({ turnsUsed: 1 })]; + const reasons: string[] = []; + const result = await driveCliGoalUntilTerminal( + options({ + getGoal: () => Promise.resolve(goals.shift() ?? null), + isSessionBudgetExceeded: () => true, + setGoalStopReason: (reason) => reasons.push(reason), + }) + ); + + expect(result?.turnsUsed).toBe(1); + expect(reasons).toEqual(["session budget exceeded"]); + }); + + test("reports completion when the goal completes during a session-budgeted continuation", async () => { + const goals = [goal(), goal({ status: "complete", completionSummary: "finished" })]; + const events: unknown[] = []; + const lines: string[] = []; + const reasons: string[] = []; + + const result = await driveCliGoalUntilTerminal( + options({ + getGoal: () => Promise.resolve(goals.shift() ?? goals[goals.length - 1] ?? 
null), + isSessionBudgetExceeded: () => true, + emitJsonLine: (event) => events.push(event), + writeHumanLineClosed: (line = "") => lines.push(line), + setGoalStopReason: (reason) => reasons.push(reason), + }) + ); + + expect(result?.status).toBe("complete"); + expect(reasons).toEqual(["complete"]); + expect(events).toMatchObject([{ type: "goal-continuing" }, { type: "goal-completed" }]); + expect(lines).toEqual(["[goal] continuing...", "[goal] completed: finished"]); + }); + + test("throws when the continuation safety limit is reached", () => + expect(driveCliGoalUntilTerminal(options({ continuationSafetyLimit: 1 }))).rejects.toThrow( + "continuation safety guard" + )); + + test("returns null when the goal disappears", async () => { + const reasons: string[] = []; + const result = await driveCliGoalUntilTerminal( + options({ + getGoal: () => Promise.resolve(null), + setGoalStopReason: (reason) => reasons.push(reason), + }) + ); + + expect(result).toBeNull(); + expect(reasons).toEqual(["goal missing"]); + }); + + test("returns paused goals without requesting another continuation", async () => { + const reasons: string[] = []; + const result = await driveCliGoalUntilTerminal( + options({ + getGoal: () => Promise.resolve(goal({ status: "paused" })), + requestContinuationAfterStreamEnd: () => Promise.reject(new Error("should not continue")), + setGoalStopReason: (reason) => reasons.push(reason), + }) + ); + + expect(result?.status).toBe("paused"); + expect(reasons).toEqual(["goal paused"]); + }); + + test("reports continuation eligibility when no stream starts", () => + expect( + driveCliGoalUntilTerminal( + options({ + checkGoalContinuationEligibility: () => Promise.resolve({ reason: "cooldown" }), + waitForStreamStarted: () => Promise.reject(new Error("timeout")), + waitForCompletion: () => Promise.reject(new Error("should not wait for completion")), + }) + ) + ).rejects.toThrow("CLI Goal Run made no progress (cooldown)")); +}); + +describe("describeCliGoalStop", () 
=> { + const cases: Array<[string, GoalRecordV1 | null, string]> = [ + ["missing goal", null, "goal missing"], + [ + "budget and turn caps reached", + goal({ + status: "budget_limited", + budgetCents: 100, + costCents: 100, + turnCap: 2, + turnsUsed: 2, + }), + "goal budget and turn caps reached", + ], + [ + "budget cap reached", + goal({ status: "budget_limited", budgetCents: 100, costCents: 100 }), + "goal budget reached", + ], + [ + "turn cap reached", + goal({ status: "budget_limited", turnCap: 2, turnsUsed: 2 }), + "goal turn cap reached", + ], + ["generic limit reached", goal({ status: "budget_limited" }), "goal limit reached"], + ["paused goal", goal({ status: "paused" }), "goal paused"], + ]; + + test.each(cases)("describes %s", (_name, input, expected) => { + expect(describeCliGoalStop(input)).toBe(expected); + }); +}); diff --git a/src/cli/goalRunDriver.ts b/src/cli/goalRunDriver.ts new file mode 100644 index 0000000000..feab26ad41 --- /dev/null +++ b/src/cli/goalRunDriver.ts @@ -0,0 +1,130 @@ +import assert from "@/common/utils/assert"; +import type { GoalRecordV1 } from "@/common/types/goal"; +import type { SendMessageOptions } from "@/common/orpc/types"; +import { CLI_GOAL_CONTINUATION_SAFETY_LIMIT } from "@/constants/goals"; + +interface EligibilityHint { + reason?: string | null; +} + +interface GoalContinuationRequest { + sendOptions: SendMessageOptions; + streamEndedAtMs: number; +} + +export interface DriveCliGoalUntilTerminalOptions { + workspaceId: string; + getGoal: () => Promise; + buildExecSendOptions: () => SendMessageOptions; + requestContinuationAfterStreamEnd: (input: GoalContinuationRequest) => Promise; + requestDispatch: () => Promise; + checkGoalContinuationEligibility: (nowMs: number) => Promise; + prepareForContinuation: () => void; + waitForStreamStarted: (timeoutMs?: number) => Promise; + waitForCompletion: () => Promise; + isSessionBudgetExceeded: () => boolean; + nowMs: () => number; + emitJsonLine: (payload: unknown) => void; + 
writeHumanLineClosed: (text?: string) => void; + setGoalStopReason: (reason: string) => void; + describeError: (error: unknown) => string; + continuationSafetyLimit?: number; + streamStartTimeoutMs?: number; +} + +/** Records the same terminal completion event regardless of where the loop observes it. */ +function recordCliGoalCompleted( + opts: DriveCliGoalUntilTerminalOptions, + goal: GoalRecordV1 +): GoalRecordV1 { + opts.setGoalStopReason("complete"); + opts.emitJsonLine({ + type: "goal-completed", + workspaceId: opts.workspaceId, + goalId: goal.goalId, + completionSummary: goal.completionSummary ?? null, + }); + opts.writeHumanLineClosed(`[goal] completed: ${goal.completionSummary ?? "complete"}`); + return goal; +} + +/** Returns the stable stop-reason string surfaced in CLI JSON and human output. */ +export function describeCliGoalStop(goal: GoalRecordV1 | null): string { + if (!goal) return "goal missing"; + if (goal.status === "budget_limited") { + const hitTurnCap = goal.turnCap != null && goal.turnsUsed >= goal.turnCap; + const hitBudget = goal.budgetCents != null && goal.costCents >= goal.budgetCents; + if (hitBudget && hitTurnCap) return "goal budget and turn caps reached"; + if (hitBudget) return "goal budget reached"; + if (hitTurnCap) return "goal turn cap reached"; + return "goal limit reached"; + } + return `goal ${goal.status}`; +} + +/** + * Drives a CLI goal by requesting continuations until the persisted goal reaches + * a terminal state. Returns the last goal record, or null if the goal disappears; + * throws only when continuation dispatch fails before a terminal goal state exists. + */ +export async function driveCliGoalUntilTerminal( + opts: DriveCliGoalUntilTerminalOptions +): Promise { + const continuationSafetyLimit = + opts.continuationSafetyLimit ?? 
CLI_GOAL_CONTINUATION_SAFETY_LIMIT; + const streamStartTimeoutMs = opts.streamStartTimeoutMs; + let continuationCount = 0; + + while (true) { + const goal = await opts.getGoal(); + if (goal?.status === "complete") { + return recordCliGoalCompleted(opts, goal); + } + if (!goal || goal.status === "paused") { + opts.setGoalStopReason(describeCliGoalStop(goal)); + return goal; + } + if (goal.status === "budget_limited" && goal.budgetLimitInjectedForGoalId === goal.goalId) { + opts.setGoalStopReason(describeCliGoalStop(goal)); + return goal; + } + + continuationCount += 1; + assert( + continuationCount < continuationSafetyLimit, + "CLI Goal Run exceeded the continuation safety guard" + ); + opts.prepareForContinuation(); + const phase = goal.status === "budget_limited" ? "budget wrap-up" : "continuing"; + opts.emitJsonLine({ + type: "goal-continuing", + workspaceId: opts.workspaceId, + goalId: goal.goalId, + status: goal.status, + continuation: continuationCount, + }); + opts.writeHumanLineClosed(`[goal] ${phase}...`); + await opts.requestContinuationAfterStreamEnd({ + sendOptions: opts.buildExecSendOptions(), + streamEndedAtMs: opts.nowMs(), + }); + await opts.requestDispatch(); + try { + await opts.waitForStreamStarted(streamStartTimeoutMs); + } catch (error) { + const eligibility = await opts.checkGoalContinuationEligibility(opts.nowMs()); + throw new Error( + `CLI Goal Run made no progress (${eligibility.reason ?? 
opts.describeError(error)})` + ); + } + await opts.waitForCompletion(); + if (opts.isSessionBudgetExceeded()) { + const latestGoal = await opts.getGoal(); + if (latestGoal?.status === "complete") { + return recordCliGoalCompleted(opts, latestGoal); + } + opts.setGoalStopReason("session budget exceeded"); + return latestGoal; + } + } +} diff --git a/src/cli/run.test.ts b/src/cli/run.test.ts index 876081af49..f2f6a1d14d 100644 --- a/src/cli/run.test.ts +++ b/src/cli/run.test.ts @@ -157,6 +157,9 @@ describe("mux CLI", () => { expect(result.stdout).toContain("--mode"); expect(result.stdout).toContain("--thinking"); expect(result.stdout).toContain("--hide-costs"); + expect(result.stdout).toContain("--goal"); + expect(result.stdout).toContain("--goal-budget"); + expect(result.stdout).toContain("--goal-turns"); expect(result.stdout).toContain("--json"); expect(result.stdout).toContain("--quiet"); }); @@ -181,6 +184,42 @@ describe("mux CLI", () => { expect(result.output).toContain("No message provided"); }); + test("empty --goal shows a goal-specific error", async () => { + const result = await runRunDirect(["--goal", ""]); + expect(result.exitCode).toBe(1); + expect(result.output).toContain("--goal requires a non-empty objective"); + }); + + test("--goal supplies the initial message when no message or stdin is provided", async () => { + const result = await runRunDirect([ + "--goal", + "finish the objective", + "--dir", + "/nonexistent/path/for/goal/test", + ]); + expect(result.output).not.toContain("No message provided"); + expect(result.output).not.toContain("--goal requires a non-empty objective"); + expect(result.exitCode).toBe(1); + }); + + test("--goal-budget and --goal-turns require --goal", async () => { + const result = await runRunDirect(["--goal-budget", "5", "test message"]); + expect(result.exitCode).toBe(1); + expect(result.output).toContain("--goal-budget and --goal-turns require --goal"); + }); + + test("invalid --goal-budget shows error", async () => { + 
const result = await runRunDirect(["--goal", "ship", "--goal-budget", "five"]); + expect(result.exitCode).toBe(1); + expect(result.output).toContain("Invalid --goal-budget"); + }); + + test("invalid --goal-turns shows error", async () => { + const result = await runRunDirect(["--goal", "ship", "--goal-turns", "0"]); + expect(result.exitCode).toBe(1); + expect(result.output).toContain("Invalid --goal-turns"); + }); + test("xhigh thinking level is accepted", async () => { const result = await runRunDirect([ "--thinking", diff --git a/src/cli/run.ts b/src/cli/run.ts index 7596cdab67..e4371d0354 100644 --- a/src/cli/run.ts +++ b/src/cli/run.ts @@ -77,6 +77,16 @@ import { execSync } from "child_process"; import { getParseOptions } from "./argv"; import { EXPERIMENT_IDS } from "../common/constants/experiments"; import { getErrorMessage } from "@/common/utils/errors"; +import { describeCliGoalStop, driveCliGoalUntilTerminal } from "./goalRunDriver"; +import { + parseGoalBudgetInputCents, + parseGoalTurnCapInput, +} from "@/common/utils/goals/budgetParser"; +import { + CLI_GOAL_STREAM_START_TIMEOUT_MS, + GOAL_CONTINUATION_IDLE_CONSUMER_NAME, +} from "@/constants/goals"; +import type { GoalRecordV1 } from "@/common/types/goal"; // Display labels for CLI help (OFF, LOW, MED, HIGH, MAX). // Deduplicate because xhigh and max both display as "MAX" for default/Anthropic @@ -150,6 +160,26 @@ function parseMode(value: string | undefined): CLIMode { throw new Error(`Invalid mode "${value}". Expected: plan, exec`); } +function parseGoalBudgetFlag(value: string | undefined): number | null | undefined { + if (value == null) return undefined; + const parsed = parseGoalBudgetInputCents(value); + if (parsed === undefined) { + throw new Error( + 'Invalid --goal-budget "' + value + '". 
Expected dollars like 5, $5.00, or cents like 500c' + ); + } + return parsed; +} + +function parseGoalTurnsFlag(value: string | undefined): number | undefined { + if (value == null) return undefined; + const parsed = parseGoalTurnCapInput(value); + if (parsed == null) { + throw new Error('Invalid --goal-turns "' + value + '". Expected a positive integer'); + } + return parsed; +} + function generateWorkspaceId(): string { const timestamp = Date.now(); const random = Math.random().toString(36).substring(2, 8); @@ -305,6 +335,9 @@ program .option("--no-mcp-config", "ignore global + repo MCP config files (use only --mcp servers)") .option("-e, --experiment ", "enable experiment (can be repeated)", collectExperiments, []) .option("-b, --budget ", "stop when session cost exceeds budget (USD)", parseFloat) + .option("--goal ", "drive an ephemeral CLI Goal Run until complete") + .option("--goal-budget ", "goal budget, e.g. $5, 5.00, or 500c") + .option("--goal-turns ", "maximum automatic goal continuation turns") .option("--service-tier ", "OpenAI service tier: auto, default, flex, priority") .option("--use-1m", "enable 1M context window for supported Anthropic models") .option( @@ -318,6 +351,8 @@ Examples: $ mux run "Fix the failing tests" $ mux run --dir /path/to/project "Add authentication" $ mux run --runtime "ssh user@host" "Deploy changes" + $ mux run --goal "Fix tests and verify they pass" + $ mux run --goal "Ship the refactor" --goal-budget 5.00 --goal-turns 10 $ mux run --mode plan "Refactor the auth module" $ mux run --budget 1.50 "Quick code review" $ echo "Add logging" | mux run @@ -344,6 +379,9 @@ interface CLIOptions { mcpConfig: boolean; experiment: string[]; budget?: number; + goal?: string; + goalBudget?: string; + goalTurns?: string; serviceTier?: ServiceTier; use1m?: boolean; keepBackgroundProcesses?: boolean; @@ -371,10 +409,17 @@ async function main(): Promise { // Get message from arg or stdin const stdinMessage = await gatherMessageFromStdin(); - 
const message = messageArg?.trim() || stdinMessage.trim(); + const goalObjective = opts.goal?.trim() ?? ""; + const hasGoal = opts.goal !== undefined; + if (hasGoal && goalObjective.length === 0) { + console.error("Error: --goal requires a non-empty objective"); + process.exit(1); + } + + const message = messageArg?.trim() || stdinMessage.trim() || goalObjective; if (!message) { - console.error("Error: No message provided. Pass as argument or pipe via stdin."); + console.error("Error: No message provided. Pass as argument, pipe via stdin, or use --goal."); console.error('Usage: mux run "Your instruction here"'); process.exit(1); } @@ -453,6 +498,12 @@ async function main(): Promise { } } + const goalBudgetCents = parseGoalBudgetFlag(opts.goalBudget); + const goalTurnCap = parseGoalTurnsFlag(opts.goalTurns); + if (!hasGoal && (goalBudgetCents !== undefined || goalTurnCap !== undefined)) { + console.error("Error: --goal-budget and --goal-turns require --goal"); + process.exit(1); + } const suppressHumanOutput = emitJson || quiet; const stdoutIsTTY = process.stdout.isTTY === true; const stderrIsTTY = process.stderr.isTTY === true; @@ -508,6 +559,8 @@ async function main(): Promise { mcpServerManager, providerService, workspaceService, + workspaceGoalService, + idleDispatcher, } = createCoreServices({ config, extensionMetadataPath: path.join(tempDir.path, "extensionMetadata.json"), @@ -516,6 +569,13 @@ async function main(): Promise { inlineServers, ignoreConfigFile: !opts.mcpConfig, }, + goalServiceOptions: hasGoal + ? 
{ + continuationCooldownMs: 0, + allowUserOriginBudgetWrapup: true, + suppressKickoffContinuation: true, + } + : undefined, }); // `mux run` uses createCoreServices directly (without ServiceContainer), so wire @@ -556,6 +616,7 @@ async function main(): Promise { aiService, initStateManager, backgroundProcessManager, + workspaceGoalService, keepBackgroundProcesses, }); // Register with WorkspaceService so TaskService operations that target the parent @@ -718,6 +779,37 @@ async function main(): Promise { // Plan agent instructions are handled by the backend (has access to plan file path) }); + let goalStopReason: string | null = null; + if (hasGoal) { + const setGoalResult = await workspaceGoalService.setGoal({ + workspaceId, + objective: goalObjective, + budgetCents: goalBudgetCents ?? null, + turnCap: goalTurnCap ?? null, + initiator: "user", + }); + if (!setGoalResult.success) { + throw new Error(`Failed to set CLI goal: ${setGoalResult.error.type}`); + } + const warning = + goalBudgetCents == null && goalTurnCap == null + ? "CLI Goal Run has no --goal-budget or --goal-turns limit. It will continue until the goal is complete or another stop condition occurs." 
+ : null; + if (warning) { + emitJsonLine({ type: "goal-warning", workspaceId, warning }); + writeHumanLine(`[goal] warning: ${warning}`); + } + emitJsonLine({ + type: "goal-started", + workspaceId, + goalId: setGoalResult.data.goalId, + objective: goalObjective, + budgetCents: setGoalResult.data.budgetCents, + turnCap: setGoalResult.data.turnCap, + }); + writeHumanLine(`[goal] started: ${goalObjective}`); + } + const liveEvents: WorkspaceChatMessage[] = []; let readyForLive = false; @@ -795,8 +887,14 @@ async function main(): Promise { let rejectCompletion: ((reason?: unknown) => void) | null = null; let completionPromise: Promise = Promise.resolve(); + let resolveStreamStarted: (() => void) | null = null; + let streamStartedPromise: Promise = Promise.resolve(); + const createCompletionPromise = (): Promise => { streamEnded = false; + streamStartedPromise = new Promise((resolve) => { + resolveStreamStarted = resolve; + }); return new Promise((resolve, reject) => { resolveCompletion = resolve; rejectCompletion = reject; @@ -811,9 +909,35 @@ async function main(): Promise { } }; + const waitForStreamStarted = async (timeoutMs?: number): Promise => { + let timer: ReturnType | null = null; + const streamFailedOrEndedBeforeStart = completionPromise.then(() => { + throw new Error("Goal continuation stream ended before it started"); + }); + const waits: Array> = [streamStartedPromise, streamFailedOrEndedBeforeStart]; + if (timeoutMs != null) { + waits.push( + new Promise((_, reject) => { + timer = setTimeout(() => { + reject(new Error("Timed out waiting for goal continuation stream to start")); + }, timeoutMs); + timer.unref?.(); + }) + ); + } + try { + await Promise.race(waits); + } finally { + if (timer != null) { + clearTimeout(timer); + } + } + }; + const resetCompletionHandlers = () => { resolveCompletion = null; rejectCompletion = null; + resolveStreamStarted = null; }; const rejectStream = (error: Error) => { @@ -855,6 +979,11 @@ async function main(): Promise { 
await waitForCompletion(); }; + const getGoal = async (): Promise => { + if (!hasGoal) return null; + return workspaceGoalService.getGoal(workspaceId); + }; + const handleToolStart = (payload: WorkspaceChatMessage): boolean => { if (!isToolCallStart(payload)) return false; @@ -941,6 +1070,7 @@ async function main(): Promise { ); return; } + resolveStreamStarted?.(); activeMessageId = payload.messageId; return; } @@ -1133,6 +1263,9 @@ async function main(): Promise { } }; + let finalGoalRecord: GoalRecordV1 | null = null; + let goalDriverError: unknown = null; + const unsubscribe = await session.subscribeChat(chatListener); try { @@ -1145,7 +1278,10 @@ async function main(): Promise { const planWasProposed = planProposed; planProposed = false; if (initialMode === "plan" && !planWasProposed) { - throw new Error("Plan mode was requested, but the assistant never proposed a plan."); + const goalAfterFirstTurn = await getGoal(); + if (!hasGoal || goalAfterFirstTurn?.status !== "budget_limited") { + throw new Error("Plan mode was requested, but the assistant never proposed a plan."); + } } if (planWasProposed) { writeHumanLineClosed( @@ -1153,23 +1289,73 @@ async function main(): Promise { ); await sendAndAwait("Plan approved. 
Execute it.", buildSendOptions("exec")); } + if (hasGoal && !budgetExceeded) { + try { + await driveCliGoalUntilTerminal({ + workspaceId, + getGoal, + buildExecSendOptions: () => buildSendOptions("exec"), + requestContinuationAfterStreamEnd: (input) => + workspaceGoalService.requestContinuationAfterStreamEnd({ + workspaceId, + ...input, + }), + requestDispatch: () => + idleDispatcher.requestDispatch(workspaceId, GOAL_CONTINUATION_IDLE_CONSUMER_NAME), + checkGoalContinuationEligibility: (nowMs) => + workspaceGoalService.checkGoalContinuationEligibility(workspaceId, nowMs), + prepareForContinuation: () => { + completionPromise = createCompletionPromise(); + }, + waitForStreamStarted, + waitForCompletion, + streamStartTimeoutMs: CLI_GOAL_STREAM_START_TIMEOUT_MS, + isSessionBudgetExceeded: () => budgetExceeded, + nowMs: Date.now, + emitJsonLine, + writeHumanLineClosed, + setGoalStopReason: (reason) => { + goalStopReason = reason; + }, + describeError: getErrorMessage, + }); + } catch (error) { + goalDriverError = error; + goalStopReason = getErrorMessage(error); + } + } + } + + finalGoalRecord = await getGoal(); + + if ( + budgetExceeded && + hasGoal && + goalStopReason == null && + finalGoalRecord?.status !== "complete" + ) { + goalStopReason = "session budget exceeded"; } // Output final result for --quiet mode if (quiet) { - let finalEvent: WorkspaceChatMessage | undefined; - for (let i = liveEvents.length - 1; i >= 0; i--) { - if (isStreamEnd(liveEvents[i])) { - finalEvent = liveEvents[i]; - break; + if (finalGoalRecord?.status === "complete" && finalGoalRecord.completionSummary) { + console.log(finalGoalRecord.completionSummary); + } else { + let finalEvent: WorkspaceChatMessage | undefined; + for (let i = liveEvents.length - 1; i >= 0; i--) { + if (isStreamEnd(liveEvents[i])) { + finalEvent = liveEvents[i]; + break; + } } - } - if (finalEvent && isStreamEnd(finalEvent)) { - const parts = (finalEvent as unknown as { parts?: unknown[] }).parts ?? 
[]; - for (const part of parts) { - if (part && typeof part === "object" && "type" in part && part.type === "text") { - const text = (part as { text?: string }).text; - if (text) console.log(text); + if (finalEvent && isStreamEnd(finalEvent)) { + const parts = (finalEvent as unknown as { parts?: unknown[] }).parts ?? []; + for (const part of parts) { + if (part && typeof part === "object" && "type" in part && part.type === "text") { + const text = (part as { text?: string }).text; + if (text) console.log(text); + } } } } @@ -1194,6 +1380,16 @@ async function main(): Promise { } : null, cost_usd: totalCost ?? null, + goal: finalGoalRecord + ? { + status: finalGoalRecord.status, + goalId: finalGoalRecord.goalId, + completionSummary: finalGoalRecord.completionSummary ?? null, + stopReason: goalStopReason, + costCents: finalGoalRecord.costCents, + turnsUsed: finalGoalRecord.turnsUsed, + } + : null, }); } @@ -1216,8 +1412,19 @@ async function main(): Promise { } } - // Exit codes: 2 for budget exceeded, agent-specified exit code, or 0 for success if (budgetExceeded) return 2; + if (hasGoal && (goalDriverError != null || finalGoalRecord?.status !== "complete")) { + const reason = goalStopReason ?? describeCliGoalStop(finalGoalRecord); + writeHumanLineClosed(`[goal] stopped: ${reason}`); + emitJsonLine({ + type: "goal-incomplete", + workspaceId, + goalId: finalGoalRecord?.goalId ?? null, + status: finalGoalRecord?.status ?? null, + stopReason: reason, + }); + return 3; + } return agentExitCode ?? 
0; } diff --git a/src/constants/goals.ts b/src/constants/goals.ts index f73e5a3a2b..f101d8d31f 100644 --- a/src/constants/goals.ts +++ b/src/constants/goals.ts @@ -1,6 +1,14 @@ export const GOAL_CONTINUATION_IDLE_CONSUMER_NAME = "goal_continuation"; export const GOAL_CONTINUATION_IDLE_CONSUMER_PRIORITY = 100; export const DEFAULT_GOAL_CONTINUATION_COOLDOWN_MS = 60_000; +export const CLI_GOAL_CONTINUATION_SAFETY_LIMIT = 10_000; + +/** + * Upper bound for waiting on a CLI goal continuation to actually start. This is + * intentionally much longer than normal stream startup so slow CI/runtime warmup + * does not fail goal runs, while still preventing indefinite benchmark hangs. + */ +export const CLI_GOAL_STREAM_START_TIMEOUT_MS = 5 * 60 * 1000; export const GOAL_CONTINUATION_KIND = "goal_continuation"; export const GOAL_BUDGET_LIMIT_KIND = "goal_budget_limit"; export const GOAL_OBJECTIVE_OPEN_TAG = ""; diff --git a/src/node/builtinSkills/mux-docs.md b/src/node/builtinSkills/mux-docs.md index c7549df8e2..786ad58e5b 100644 --- a/src/node/builtinSkills/mux-docs.md +++ b/src/node/builtinSkills/mux-docs.md @@ -108,6 +108,7 @@ Use this index to find a page's: - Experimental Image Generation Tool (`/adr/0001-experimental-image-generation-tool`) → `references/docs/adr/0001-experimental-image-generation-tool.md` — Architecture decision for Mux's experimental image generation tool and generated-image display messages - Image Editing Uses a Separate General-Purpose Tool (`/adr/0002-image-editing-visual-mockups`) → `references/docs/adr/0002-image-editing-visual-mockups.md` — Architecture decision for Mux's experimental image editing tool and edited image display messages - Context Boundaries for Compaction and Reset (`/adr/0003-context-boundaries-for-compaction-and-reset`) → `references/docs/adr/0003-context-boundaries-for-compaction-and-reset.md` — Architecture decision for modeling provider context windows separately from transcript history + - CLI Goal Runs are not strict 
/goal aliases (`/adr/0004-cli-goal-runs-are-not-strict-goal-aliases`) → `references/docs/adr/0004-cli-goal-runs-are-not-strict-goal-aliases.md` — Architecture decision for giving mux run --goal CLI-specific completion and limit semantics - AGENTS.md (`/AGENTS`) → `references/docs/AGENTS.md` — Agent instructions for AI assistants working on the Mux codebase diff --git a/src/node/services/agentSkills/builtInSkillContent.generated.ts b/src/node/services/agentSkills/builtInSkillContent.generated.ts index 9a68090ee1..7b42fe3c55 100644 --- a/src/node/services/agentSkills/builtInSkillContent.generated.ts +++ b/src/node/services/agentSkills/builtInSkillContent.generated.ts @@ -403,6 +403,40 @@ export const BUILTIN_SKILL_FILES: Record> = { "- Persisted boundary metadata should distinguish boundary kinds instead of representing context resets as fake compaction summaries.", "", ].join("\n"), + "references/docs/adr/0004-cli-goal-runs-are-not-strict-goal-aliases.md": [ + "---", + "title: CLI Goal Runs are not strict /goal aliases", + "description: Architecture decision for giving mux run --goal CLI-specific completion and limit semantics", + "---", + "", + "# 0004. CLI Goal Runs are not strict /goal aliases", + "", + "## Status", + "", + "Accepted", + "", + "## Context", + "", + "`mux run` is designed for automation: it normally sends one request, streams the result, and exits. Interactive `/goal` is a workspace lifecycle command with defaults, controls, and cooldown behavior that assume a user can intervene from the UI.", + "", + "Adding `mux run --goal` creates a different automation need. 
A script needs one process to keep driving an objective until there is an authoritative completion signal, while still preserving goal accounting and model-facing goal tools.", + "", + "## Decision", + "", + "Mux will model `mux run --goal` as a CLI Goal Run, not as a strict alias for interactive `/goal`.", + "", + "A CLI Goal Run creates an ephemeral goal for the `mux run` process, sends either the provided message/stdin or the goal text as the kickoff message, and continues in exec mode until the persisted goal status is `complete` or a stop condition is reached. Interactive goal defaults are not applied; omitted `--goal-budget` and `--goal-turns` mean no goal-specific limit. The existing session `--budget` remains a separate hard stop.", + "", + "CLI Goal Runs bypass the interactive goal continuation cooldown because the process itself is the automation boundary. They still use the shared goal service for prompts, accounting, tool availability, budget-limited wrap-up, and persisted completion state.", + "", + "## Consequences", + "", + "- `mux run` remains single-request by default, with `--goal` documented as the explicit multi-continuation exception.", + "- Scripts can trust exit code `0` only when the persisted goal is complete; free-text claims are not enough unless existing goal completion fallback persisted them.", + "- Goal and session budgets can stop the same process for different reasons, so CLI output and JSON events must identify which limit won.", + "- CLI-specific continuation behavior is parameterized in the shared goal service instead of duplicating goal prompt/accounting logic in the CLI.", + "", + ].join("\n"), "references/docs/AGENTS.md": [ "---", "title: AGENTS.md", @@ -3523,6 +3557,7 @@ export const BUILTIN_SKILL_FILES: Record> = { ' "adr/0001-experimental-image-generation-tool",', ' "adr/0002-image-editing-visual-mockups",', ' "adr/0003-context-boundaries-for-compaction-and-reset",', + ' 
"adr/0004-cli-goal-runs-are-not-strict-goal-aliases",', ' "AGENTS"', " ]", " }", @@ -5247,7 +5282,7 @@ export const BUILTIN_SKILL_FILES: Record> = { " Code](https://docs.anthropic.com/en/docs/agents-and-tools/claude-code/overview) or similar TUIs.", "", "", - "Mux provides a CLI for running one-off agent tasks without the desktop app. Unlike the interactive desktop experience, `mux run` executes a single request to completion and exits.", + "Mux provides a CLI for running one-off agent tasks without the desktop app. Unlike the interactive desktop experience, `mux run` normally executes a single request to completion and exits. The `--goal` option is an explicit exception: it starts a CLI Goal Run that may perform automatic continuations until the goal is complete or a limit is reached.", "", '', " Learn how to use `mux run` in CI/CD pipelines", @@ -5299,10 +5334,42 @@ export const BUILTIN_SKILL_FILES: Record> = { "| `--mode ` | | Agent mode: `plan` or `exec` | `exec` |", "| `--thinking ` | `-t` | Thinking level: `OFF`, `LOW`, `MED`, `HIGH`, `MAX`, or `0`–`9` (model-relative, see [Models](/config/models#thinking-levels)) | `MED` |", "| `--budget ` | `-b` | Stop when session cost exceeds budget (USD) | No limit |", + "| `--goal ` | | Start a CLI Goal Run and continue until the persisted goal is complete or a limit stops it | Off |", + "| `--goal-budget ` | | Goal budget (`$5`, `5.00`, or `500c`); separate from `--budget` | No limit |", + "| `--goal-turns ` | | Maximum automatic goal continuation turns | No limit |", "| `--experiment ` | `-e` | Enable experiment (repeatable) | None |", "| `--json` | | Output NDJSON for programmatic use | Off |", "| `--quiet` | `-q` | Only output final result | Off |", "", + "### CLI Goal Runs", + "", + "Use `--goal` when a task should keep going across automatic continuations until the agent marks the persisted goal complete:", + "", + "```bash", + "# Goal text is also used as the initial message when no message/stdin is provided", + 
'mux run --goal "Fix the failing tests and verify the suite passes"', + "", + "# Provide separate kickoff instructions while keeping the objective active", + 'mux run --goal "Ship the migration safely" "Start by inspecting the schema and propose a plan"', + "", + "# Bound automatic continuations with a goal-specific budget and turn cap", + 'mux run --goal "Complete the refactor" --goal-budget 5.00 --goal-turns 10', + "```", + "", + "A CLI Goal Run is intentionally not a strict alias for interactive `/goal`. It is ephemeral to the `mux run` process, does not apply interactive goal defaults, bypasses the interactive continuation cooldown, and exits successfully only when the persisted goal status is `complete`. If neither `--goal-budget` nor `--goal-turns` is provided, Mux warns that the goal is uncapped.", + "", + "`--budget` remains the hard session spending limit in USD. `--goal-budget` is goal accounting, accepts forms like `$5`, `5.00`, and `500c`, and may allow a final budget-limit wrap-up turn. If the session `--budget` is exceeded, the run stops immediately.", + "", + "Exit codes for CLI Goal Runs:", + "", + "| Code | Meaning |", + "| ----- | ---------------------------------------------------------- |", + "| `0` | Goal completed (unless the agent set a nonzero exit code) |", + "| `1` | Operational, model, or tool error |", + "| `2` | Session `--budget` exceeded |", + "| `3` | Goal stopped incomplete, including goal budget/turn limits |", + "| `130` | User interrupt |", + "", "### Runtimes", "", "- **`local`** (default): Runs directly in the specified directory. 
Best for one-off tasks.", @@ -5319,6 +5386,9 @@ export const BUILTIN_SKILL_FILES: Record> = { "### Examples", "", "```bash", + "# Goal run with automatic continuations", + 'mux run --goal "Update dependencies, fix resulting tests, and verify the suite passes"', + "", "# Quick fix in current directory", 'mux run "Fix the TypeScript errors"', "", @@ -6599,6 +6669,7 @@ export const BUILTIN_SKILL_FILES: Record> = { " - Experimental Image Generation Tool (`/adr/0001-experimental-image-generation-tool`) → `references/docs/adr/0001-experimental-image-generation-tool.md` — Architecture decision for Mux's experimental image generation tool and generated-image display messages", " - Image Editing Uses a Separate General-Purpose Tool (`/adr/0002-image-editing-visual-mockups`) → `references/docs/adr/0002-image-editing-visual-mockups.md` — Architecture decision for Mux's experimental image editing tool and edited image display messages", " - Context Boundaries for Compaction and Reset (`/adr/0003-context-boundaries-for-compaction-and-reset`) → `references/docs/adr/0003-context-boundaries-for-compaction-and-reset.md` — Architecture decision for modeling provider context windows separately from transcript history", + " - CLI Goal Runs are not strict /goal aliases (`/adr/0004-cli-goal-runs-are-not-strict-goal-aliases`) → `references/docs/adr/0004-cli-goal-runs-are-not-strict-goal-aliases.md` — Architecture decision for giving mux run --goal CLI-specific completion and limit semantics", " - AGENTS.md (`/AGENTS`) → `references/docs/AGENTS.md` — Agent instructions for AI assistants working on the Mux codebase", "", "", diff --git a/src/node/services/coreServices.ts b/src/node/services/coreServices.ts index c672c9d28a..980f5a2f3a 100644 --- a/src/node/services/coreServices.ts +++ b/src/node/services/coreServices.ts @@ -16,6 +16,7 @@ import { log } from "@/node/services/log"; import { WorkspaceGoalService, type GoalLifecycleAnalyticsSink, + type WorkspaceGoalServiceOptions, } from 
"@/node/services/workspaceGoalService"; import { MCPConfigService } from "@/node/services/mcpConfigService"; import { MCPServerManager, type MCPServerManagerOptions } from "@/node/services/mcpServerManager"; @@ -41,6 +42,7 @@ export interface CoreServicesOptions { policyService?: PolicyService; telemetryService?: TelemetryService; analyticsService?: GoalLifecycleAnalyticsSink; + goalServiceOptions?: WorkspaceGoalServiceOptions; experimentsService?: ExperimentsService; sessionTimingService?: SessionTimingService; opResolver?: ExternalSecretResolver; @@ -83,7 +85,8 @@ export function createCoreServices(opts: CoreServicesOptions): CoreServices { config, historyService, extensionMetadata, - opts.analyticsService + opts.analyticsService, + opts.goalServiceOptions ); const aiService = new AIService( diff --git a/src/node/services/workspaceGoalService.test.ts b/src/node/services/workspaceGoalService.test.ts index 1860c05a20..763cdbd1a1 100644 --- a/src/node/services/workspaceGoalService.test.ts +++ b/src/node/services/workspaceGoalService.test.ts @@ -434,6 +434,44 @@ describe("WorkspaceGoalService", () => { ); }); + test("can suppress setGoal kickoff continuation for CLI-controlled kickoff", async () => { + service = new WorkspaceGoalService(config, historyService, extensionMetadata, analytics, { + suppressKickoffContinuation: true, + }); + const dispatcher = new IdleDispatcher(); + const execute = mock(() => Promise.resolve(true)); + service.registerGoalContinuationConsumer(dispatcher, continuationBridge(execute)); + + await setGoalOk(service, { workspaceId, objective: "Wait for the CLI kickoff message" }); + await dispatcher.requestDispatch(workspaceId, GOAL_CONTINUATION_IDLE_CONSUMER_NAME); + + expect(execute).not.toHaveBeenCalled(); + }); + + test("allows zero cooldown for immediate CLI-style continuations", async () => { + service = new WorkspaceGoalService(config, historyService, extensionMetadata, analytics, { + continuationCooldownMs: 0, + }); + await 
setGoalOk(service, { workspaceId, objective: "Keep going without idle delay" }); + const dispatcher = new IdleDispatcher(); + const execute = mock(() => Promise.resolve(true)); + service.registerGoalContinuationConsumer(dispatcher, continuationBridge(execute)); + + await service.requestContinuationAfterStreamEnd({ + workspaceId, + sendOptions: { model: "openai:gpt-4o", agentId: "exec" }, + streamEndedAtMs: 10_000, + }); + await service.requestContinuationAfterStreamEnd({ + workspaceId, + sendOptions: { model: "openai:gpt-4o", agentId: "exec" }, + streamEndedAtMs: 10_001, + }); + await dispatcher.requestDispatch(workspaceId, GOAL_CONTINUATION_IDLE_CONSUMER_NAME); + + expect(execute).toHaveBeenCalledTimes(2); + }); + test("dispatches one budget-limit wrap-up after a continuation-origin stream exhausts the budget", async () => { const created = await setGoalOk(service, { workspaceId, @@ -744,6 +782,39 @@ describe("WorkspaceGoalService", () => { }); }); + test("can allow budget-limit wrap-up after user-origin stream exhaustion", async () => { + service = new WorkspaceGoalService(config, historyService, extensionMetadata, analytics, { + allowUserOriginBudgetWrapup: true, + }); + const created = await setGoalOk(service, { + workspaceId, + objective: "CLI owns over-budget kickoff", + budgetCents: 100, + }); + const dispatcher = new IdleDispatcher(); + const execute = mock(() => Promise.resolve(true)); + service.registerGoalContinuationConsumer(dispatcher, continuationBridge(execute)); + + await service.recordStreamAccounting({ + workspaceId, + costUsd: 1.25, + streamStartedAtMs: created.createdAtMs + 1, + streamOriginKind: "user", + }); + await service.requestContinuationAfterStreamEnd({ + workspaceId, + sendOptions: { model: "openai:gpt-4o", agentId: "exec" }, + streamEndedAtMs: 20_000, + }); + await dispatcher.requestDispatch(workspaceId, GOAL_CONTINUATION_IDLE_CONSUMER_NAME); + + expect(execute).toHaveBeenCalledTimes(1); + expect(await 
service.getGoal(workspaceId)).toMatchObject({ + status: "budget_limited", + budgetLimitInjectedForGoalId: created.goalId, + }); + }); + test("recoverPendingDispatchAfterRestart re-arms a stranded budget_limited wrap-up", async () => { // Regression: Simulates a process // restart by: diff --git a/src/node/services/workspaceGoalService.ts b/src/node/services/workspaceGoalService.ts index ecaeab37bb..4f9c1c5bb1 100644 --- a/src/node/services/workspaceGoalService.ts +++ b/src/node/services/workspaceGoalService.ts @@ -329,8 +329,20 @@ function continuationSendOptions(sendOptions: SendMessageOptions): SendMessageOp return pickStartupRetrySendOptions(sendOptions) as SendMessageOptions; } +export interface WorkspaceGoalServiceOptions { + /** Override interactive continuation cooldown; CLI goal runs use 0 to drive immediately. */ + continuationCooldownMs?: number; + /** Allow CLI kickoff turns to receive the same budget-limit wrap-up as continuations. */ + allowUserOriginBudgetWrapup?: boolean; + /** Prevent setGoal from queuing an automatic kickoff when the CLI sends its own message. */ + suppressKickoffContinuation?: boolean; +} + export class WorkspaceGoalService { private readonly fileLocks = workspaceFileLocks; + private readonly continuationCooldownMs: number; + private readonly allowUserOriginBudgetWrapup: boolean; + private readonly suppressKickoffContinuation: boolean; private readonly pendingGoalMutations = new Map(); private readonly pendingGoalSnapshots = new Map(); @@ -359,8 +371,18 @@ export class WorkspaceGoalService { private readonly config: Config, private readonly historyService: HistoryService, private readonly extensionMetadata: ExtensionMetadataService, - private readonly analytics?: GoalLifecycleAnalyticsSink - ) {} + private readonly analytics?: GoalLifecycleAnalyticsSink, + options: WorkspaceGoalServiceOptions = {} + ) { + this.continuationCooldownMs = + options.continuationCooldownMs ?? 
DEFAULT_GOAL_CONTINUATION_COOLDOWN_MS; + this.allowUserOriginBudgetWrapup = options.allowUserOriginBudgetWrapup === true; + this.suppressKickoffContinuation = options.suppressKickoffContinuation === true; + assert( + Number.isFinite(this.continuationCooldownMs) && this.continuationCooldownMs >= 0, + "WorkspaceGoalService requires a non-negative continuation cooldown" + ); + } setOnActivityChange( listener: (workspaceId: string, snapshot: WorkspaceActivitySnapshot) => void @@ -1012,12 +1034,12 @@ export class WorkspaceGoalService { const lastContinuationFiredAtMs = goal.lastContinuationFiredAtMs ?? null; if ( lastContinuationFiredAtMs != null && - nowMs - lastContinuationFiredAtMs < DEFAULT_GOAL_CONTINUATION_COOLDOWN_MS + nowMs - lastContinuationFiredAtMs < this.continuationCooldownMs ) { return { eligible: false, reason: "cooldown", - deferUntilMs: lastContinuationFiredAtMs + DEFAULT_GOAL_CONTINUATION_COOLDOWN_MS, + deferUntilMs: lastContinuationFiredAtMs + this.continuationCooldownMs, }; } @@ -1115,7 +1137,7 @@ export class WorkspaceGoalService { } private isBudgetWrapupEligibleOrigin(originKind: GoalStreamOriginKind): boolean { - return originKind !== "user"; + return this.allowUserOriginBudgetWrapup || originKind !== "user"; } private async tryMarkBudgetLimitInjected( @@ -1836,6 +1858,9 @@ export class WorkspaceGoalService { } private armKickoffContinuationIfIdle(workspaceId: string, goal: GoalRecordV1): void { + if (this.suppressKickoffContinuation) { + return; + } if (goal.status !== "active") { return; }