diff --git a/benchmarks/terminal_bench/cmux-run.sh b/benchmarks/terminal_bench/cmux-run.sh index 4aaa69895..da0d33617 100644 --- a/benchmarks/terminal_bench/cmux-run.sh +++ b/benchmarks/terminal_bench/cmux-run.sh @@ -30,12 +30,6 @@ CMUX_WORKSPACE_ID="${CMUX_WORKSPACE_ID:-cmux-bench}" CMUX_THINKING_LEVEL="${CMUX_THINKING_LEVEL:-high}" CMUX_MODE="${CMUX_MODE:-exec}" -ensure_bun() { - if ! command -v bun >/dev/null 2>&1; then - fatal "bun must be installed before running the cmux agent" - fi -} - resolve_project_path() { if [[ -n "${CMUX_PROJECT_PATH}" ]]; then if [[ -d "${CMUX_PROJECT_PATH}" ]]; then @@ -59,40 +53,27 @@ resolve_project_path() { ensure_git_repo() { local project_path=$1 - if command -v git >/dev/null 2>&1; then - if git -C "${project_path}" rev-parse --is-inside-work-tree >/dev/null 2>&1; then - # Ensure trunk branch exists even on pre-existing repos. - if ! git -C "${project_path}" rev-parse --verify "${CMUX_TRUNK}" >/dev/null 2>&1; then - git -C "${project_path}" checkout -b "${CMUX_TRUNK}" >/dev/null 2>&1 || true - else - git -C "${project_path}" checkout "${CMUX_TRUNK}" >/dev/null 2>&1 || true - fi - return 0 - fi + command -v git >/dev/null 2>&1 || return 0 - log "initialising git repository at ${project_path}" - if git -C "${project_path}" init --initial-branch="${CMUX_TRUNK}" >/dev/null 2>&1; then - : - else - git -C "${project_path}" init >/dev/null - git -C "${project_path}" checkout -B "${CMUX_TRUNK}" >/dev/null - fi - git -C "${project_path}" config user.name "cmux-bench" - git -C "${project_path}" config user.email "bench@cmux.local" - git -C "${project_path}" add -A >/dev/null - git -C "${project_path}" commit -m "chore: initial snapshot" --allow-empty >/dev/null - git -C "${project_path}" branch -M "${CMUX_TRUNK}" >/dev/null - else - log "git not available; skipping repository initialisation" + if git -C "${project_path}" rev-parse --is-inside-work-tree >/dev/null 2>&1; then + git -C "${project_path}" checkout "${CMUX_TRUNK}" 2>/dev/null || \ + git -C "${project_path}" checkout -b "${CMUX_TRUNK}" 2>/dev/null || true + return 0 fi + + log "initialising git repository at ${project_path}" + git -C "${project_path}" init --initial-branch="${CMUX_TRUNK}" 2>/dev/null || \ + (git -C "${project_path}" init && git -C "${project_path}" checkout -B "${CMUX_TRUNK}") >/dev/null + git -C "${project_path}" config user.name "cmux-bench" + git -C "${project_path}" config user.email "bench@cmux.local" + git -C "${project_path}" add -A >/dev/null + git -C "${project_path}" commit -m "chore: initial snapshot" --allow-empty >/dev/null } -ensure_bun +command -v bun >/dev/null 2>&1 || fatal "bun is not installed" project_path=$(resolve_project_path) ensure_git_repo "${project_path}" -bun --version >/dev/null 2>&1 || fatal "bun not available after ensure_bun" - log "starting cmux agent session for ${project_path}" cd "${CMUX_APP_ROOT}" diff --git a/benchmarks/terminal_bench/cmux_agent.py b/benchmarks/terminal_bench/cmux_agent.py index 29cd3f9e7..9bb9d93f9 100644 --- a/benchmarks/terminal_bench/cmux_agent.py +++ b/benchmarks/terminal_bench/cmux_agent.py @@ -33,6 +33,7 @@ class CmuxAgent(AbstractInstalledAgent): "tsconfig.json", "tsconfig.main.json", "src", + "dist", ) _PROVIDER_ENV_KEYS: Sequence[str] = ( @@ -140,33 +141,22 @@ def _env(self) -> dict[str, str]: else: raise ValueError("CMUX_MODE must be one of plan, exec, or execute") - config_root = env["CMUX_CONFIG_ROOT"].strip() - app_root = env["CMUX_APP_ROOT"].strip() - workspace_id = env["CMUX_WORKSPACE_ID"].strip() - project_candidates = env["CMUX_PROJECT_CANDIDATES"].strip() - if not config_root: - raise ValueError("CMUX_CONFIG_ROOT must be set") - if not app_root: - raise ValueError("CMUX_APP_ROOT must be set") - if not workspace_id: - raise ValueError("CMUX_WORKSPACE_ID must be set") - if not project_candidates: - raise ValueError("CMUX_PROJECT_CANDIDATES must be set") - env["CMUX_CONFIG_ROOT"] = config_root - env["CMUX_APP_ROOT"] = app_root - env["CMUX_WORKSPACE_ID"] = workspace_id - env["CMUX_PROJECT_CANDIDATES"] = project_candidates - - timeout_value = env.get("CMUX_TIMEOUT_MS") - if timeout_value: - timeout_value = timeout_value.strip() - if not timeout_value.isdigit(): - raise ValueError("CMUX_TIMEOUT_MS must be an integer expressed in ms") - env["CMUX_TIMEOUT_MS"] = timeout_value - - project_path = env.get("CMUX_PROJECT_PATH") - if project_path is not None and not project_path.strip(): - raise ValueError("CMUX_PROJECT_PATH must be non-empty when provided") + # These env vars are all set with defaults above, no need to validate + for key in ( + "CMUX_CONFIG_ROOT", + "CMUX_APP_ROOT", + "CMUX_WORKSPACE_ID", + "CMUX_PROJECT_CANDIDATES", + ): + env[key] = env[key].strip() + + if timeout_value := env.get("CMUX_TIMEOUT_MS"): + if not timeout_value.strip().isdigit(): + raise ValueError("CMUX_TIMEOUT_MS must be an integer") + + if project_path := env.get("CMUX_PROJECT_PATH"): + if not project_path.strip(): + raise ValueError("CMUX_PROJECT_PATH must be non-empty when provided") return env @@ -180,37 +170,25 @@ def perform_task( session: TmuxSession, logging_dir=None, ) -> AgentResult: - if not instruction or not instruction.strip(): + if not instruction.strip(): raise ValueError("instruction must be a non-empty string") - self._ensure_payload_staged(session) - return super().perform_task( - instruction=instruction, session=session, logging_dir=logging_dir - ) + return super().perform_task(instruction, session, logging_dir) def _ensure_payload_staged(self, session: TmuxSession) -> None: container_id = getattr(session.container, "id", None) - if container_id and container_id == self._staged_container_id: + if container_id == self._staged_container_id: return - archive = self._build_archive() + if not self._archive_bytes: + self._archive_bytes = build_app_archive( + self._repo_root, self._INCLUDE_PATHS + ) + stage_payload( - session=session, - archive_bytes=archive, - archive_name=self._ARCHIVE_NAME, - runner_path=self._runner_path, + session, self._archive_bytes, self._ARCHIVE_NAME, self._runner_path ) - - if container_id: - self._staged_container_id = container_id - - def _build_archive(self) -> bytes: - if self._archive_bytes is not None: - return self._archive_bytes - - archive = build_app_archive(self._repo_root, self._INCLUDE_PATHS) - self._archive_bytes = archive - return archive + self._staged_container_id = container_id def _run_agent_commands(self, instruction: str) -> list[TerminalCommand]: escaped = shlex.quote(instruction) diff --git a/benchmarks/terminal_bench/cmux_payload.py b/benchmarks/terminal_bench/cmux_payload.py index 715d370bc..c98093c6f 100644 --- a/benchmarks/terminal_bench/cmux_payload.py +++ b/benchmarks/terminal_bench/cmux_payload.py @@ -11,8 +11,7 @@ def build_app_archive(repo_root: Path, include_paths: Iterable[str]) -> bytes: """Pack the cmux workspace into a gzipped tarball.""" - - if not repo_root or not repo_root.exists(): + if not repo_root.exists(): raise FileNotFoundError(f"cmux repo root {repo_root} not found") buffer = io.BytesIO() @@ -22,8 +21,6 @@ def build_app_archive(repo_root: Path, include_paths: Iterable[str]) -> bytes: if not source.exists(): raise FileNotFoundError(f"Required file {source} missing") archive.add(source, arcname=relative_path, recursive=True) - - buffer.seek(0) return buffer.getvalue() @@ -34,27 +31,13 @@ def stage_payload( runner_path: Path, ) -> None: """Copy the cmux bundle and runner into the task container.""" - - if not archive_bytes: - raise ValueError("archive_bytes must be non-empty") - if not runner_path or not runner_path.is_file(): - raise FileNotFoundError(f"cmux runner missing at {runner_path}") - with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as temp_file: temp_file.write(archive_bytes) temp_path = Path(temp_file.name) try: - session.copy_to_container( - paths=temp_path, - container_dir="/installed-agent", - container_filename=archive_name, - ) + session.copy_to_container(temp_path, "/installed-agent", archive_name) finally: temp_path.unlink(missing_ok=True) - session.copy_to_container( - paths=runner_path, - container_dir="/installed-agent", - container_filename=runner_path.name, - ) + session.copy_to_container(runner_path, "/installed-agent", runner_path.name) diff --git a/benchmarks/terminal_bench/cmux_setup.sh.j2 b/benchmarks/terminal_bench/cmux_setup.sh.j2 index e5b1542f0..e9b879dd9 100644 --- a/benchmarks/terminal_bench/cmux_setup.sh.j2 +++ b/benchmarks/terminal_bench/cmux_setup.sh.j2 @@ -37,23 +37,15 @@ CMUX_APP_ROOT="${CMUX_APP_ROOT:-/opt/cmux-app}" CMUX_CONFIG_ROOT="${CMUX_CONFIG_ROOT:-/root/.cmux}" CMUX_AGENT_VERSION="{{ version if version is not none else '' }}" -mkdir -p "$CMUX_APP_ROOT" - +rm -rf "${CMUX_APP_ROOT}" if [[ -n "${CMUX_AGENT_VERSION}" ]]; then - : "${CMUX_AGENT_GIT_URL:?CMUX_AGENT_GIT_URL must be set when version is provided}" + : "${CMUX_AGENT_GIT_URL:?CMUX_AGENT_GIT_URL required when version is set}" log "cloning cmux from ${CMUX_AGENT_GIT_URL} @ ${CMUX_AGENT_VERSION}" - rm -rf "${CMUX_APP_ROOT}" git clone --depth 1 --branch "${CMUX_AGENT_VERSION}" "${CMUX_AGENT_GIT_URL}" "${CMUX_APP_ROOT}" else - ARCHIVE_PATH="/installed-agent/cmux-app.tar.gz" - if [[ ! -s "${ARCHIVE_PATH}" ]]; then - printf 'Expected cmux archive at %s\n' "${ARCHIVE_PATH}" >&2 - exit 1 - fi log "extracting cmux archive" - rm -rf "${CMUX_APP_ROOT}" mkdir -p "${CMUX_APP_ROOT}" - tar -xzf "${ARCHIVE_PATH}" -C "${CMUX_APP_ROOT}" + tar -xzf "/installed-agent/cmux-app.tar.gz" -C "${CMUX_APP_ROOT}" fi cd "${CMUX_APP_ROOT}"