From f287bbbfb381657dcc03d44a2ddbdebcb56fbce4 Mon Sep 17 00:00:00 2001
From: Prakhar Khatri <prakharkhatri123@gmail.com>
Date: Mon, 20 Apr 2026 11:12:46 +0000
Subject: [PATCH 1/9] fix: capture new untracked files in codex, fix claude
 model lookup, remove duplicate codex capture
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

capture-codex.py: `git diff` misses brand-new untracked files. Add a
`git ls-files --others --exclude-standard` pass so Codex attribution
works when it creates a file from scratch. Also update `get_dirty_file_names`
to include untracked files in the pre-task snapshot for correct exclusion.

capture-claude.py: `get_model_and_prompt` was guessing the session file
path from the session ID, but Claude Code organises sessions by repo path
slug, not session ID. Switch to a recursive glob search across
~/.claude/projects/**/{session_id}.jsonl so the model name is always found.

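codex.rs: `agentdiff configure` was writing both `notify` in config.toml
AND `UserPromptSubmit`/`Stop` in hooks.json. When codex_hooks=true, Codex
fires both for the same task — doubling every session.jsonl entry. When
enabling codex_hooks, strip capture-codex.py from the `notify` key (removing
the key, or restoring the command it was forwarding to) so only hooks.json
fires our capture. A `notify` value that does not reference
capture-codex.py is left untouched.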

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 scripts/capture-claude.py | 77 ++++++++++++++++-----------------
 scripts/capture-codex.py  | 45 ++++++++++++++++---
 src/configure/codex.rs    | 91 +++++++++++++++++----------------------
 3 files changed, 118 insertions(+), 95 deletions(-)

diff --git a/scripts/capture-claude.py b/scripts/capture-claude.py
index 043ff9b..e55e7b8 100644
--- a/scripts/capture-claude.py
+++ b/scripts/capture-claude.py
@@ -86,47 +86,46 @@ def get_session_log(cwd: str):
 
 
 def get_model_and_prompt(cwd: str, session_id: str) -> tuple:
-    """Read model and prompt from Claude session JSONL."""
+    """Read model and prompt from Claude Code session JSONL.
+
+    Claude Code stores session files at:
+      ~/.claude/projects/{repo-slug}/{session_id}.jsonl
+    where the repo slug is the repo path with slashes replaced by dashes.
+    We glob-search all project dirs to avoid reconstructing the slug.
+    """
+    import glob as _glob
     try:
-        # Try to find the session file
         home = os.path.expanduser("~")
-        parts = session_id.split("-")
-        # Construct likely path
-        possible_paths = [
-            os.path.join(home, ".claude", "projects", parts[-1] if parts else "", f"{session_id}.jsonl"),
-            os.path.join(home, ".claude", "projects", session_id[:8], f"{session_id}.jsonl"),
-        ]
-
-        for session_path in possible_paths:
-            if os.path.exists(session_path):
-                with open(session_path) as f:
-                    lines = f.readlines()
-
-                # Find last assistant message for model
-                model = "unknown"
-                for line in reversed(lines):
-                    try:
-                        entry = json.loads(line)
-                        if entry.get("type") == "assistant" and entry.get("message", {}).get("model"):
-                            model = entry["message"]["model"]
-                            break
-                    except:
-                        continue
-
-                # Find last-prompt for the actual user request
-                prompt = "unknown"
-                for line in reversed(lines):
-                    try:
-                        entry = json.loads(line)
-                        if entry.get("type") == "last-prompt":
-                            prompt = entry.get("lastPrompt", "unknown")
-                            break
-                    except:
-                        continue
-
-                return model, prompt
-
-        return "unknown", "unknown"
+        pattern = os.path.join(home, ".claude", "projects", "**", f"{session_id}.jsonl")
+        matches = _glob.glob(pattern, recursive=True)
+        if not matches:
+            return "unknown", "unknown"
+
+        session_path = matches[0]
+        with open(session_path, encoding="utf-8", errors="replace") as f:
+            lines = f.readlines()
+
+        model = "unknown"
+        for line in reversed(lines):
+            try:
+                entry = json.loads(line)
+                if entry.get("type") == "assistant" and entry.get("message", {}).get("model"):
+                    model = entry["message"]["model"]
+                    break
+            except Exception:
+                continue
+
+        prompt = "unknown"
+        for line in reversed(lines):
+            try:
+                entry = json.loads(line)
+                if entry.get("type") == "last-prompt":
+                    prompt = entry.get("lastPrompt", "unknown")
+                    break
+            except Exception:
+                continue
+
+        return model, prompt
     except Exception:
         return "unknown", "unknown"
 
diff --git a/scripts/capture-codex.py b/scripts/capture-codex.py
index 2ed2dfd..1936e8a 100644
--- a/scripts/capture-codex.py
+++ b/scripts/capture-codex.py
@@ -131,21 +131,56 @@ def collect_changed_lines(repo_root: str) -> Dict[str, List[int]]:
             result.setdefault(path, [])
             result[path].extend(lines)
 
+    # git diff does not show brand-new untracked files. Detect them separately
+    # so Codex attribution works when it creates a file from scratch.
+    try:
+        untracked = subprocess.run(
+            ["git", "ls-files", "--others", "--exclude-standard"],
+            capture_output=True, text=True, cwd=repo_root,
+        )
+        if untracked.returncode == 0:
+            for rel_path in untracked.stdout.splitlines():
+                rel_path = rel_path.strip()
+                if not rel_path or rel_path in result:
+                    continue
+                abs_path = os.path.join(repo_root, rel_path)
+                try:
+                    with open(abs_path, "r", encoding="utf-8", errors="replace") as fh:
+                        line_count = sum(1 for _ in fh)
+                    if line_count > 0:
+                        result[rel_path] = list(range(1, line_count + 1))
+                    else:
+                        result[rel_path] = [1]
+                except (OSError, IOError):
+                    result[rel_path] = [1]
+    except Exception:
+        pass
+
     return {k: sorted(set(v)) for k, v in result.items() if v}
 
 
 def get_dirty_file_names(repo_root: str) -> List[str]:
-    """Return repo-relative paths of all files currently differing from HEAD."""
+    """Return repo-relative paths of all files currently differing from HEAD, including untracked."""
+    files: List[str] = []
     try:
         out = subprocess.run(
             ["git", "diff", "HEAD", "--name-only"],
             capture_output=True, text=True, cwd=repo_root,
         )
-        if out.returncode != 0:
-            return []
-        return [line.strip() for line in out.stdout.splitlines() if line.strip()]
+        if out.returncode == 0:
+            files.extend(line.strip() for line in out.stdout.splitlines() if line.strip())
     except Exception:
-        return []
+        pass
+    try:
+        untracked = subprocess.run(
+            ["git", "ls-files", "--others", "--exclude-standard"],
+            capture_output=True, text=True, cwd=repo_root,
+        )
+        if untracked.returncode == 0:
+            files.extend(line.strip() for line in untracked.stdout.splitlines() if line.strip())
+    except Exception:
+        pass
+    return list(dict.fromkeys(files))  # deduplicate, preserve order
 
 
 def pre_task_state_path(repo_root: str) -> str:
diff --git a/src/configure/codex.rs b/src/configure/codex.rs
index 3f7fbfa..cd3a496 100644
--- a/src/configure/codex.rs
+++ b/src/configure/codex.rs
@@ -17,15 +17,17 @@ pub fn step_configure_codex(config: &Config) -> Result<()> {
     Ok(())
 }
 
-/// Write notify + codex_hooks=true into ~/.codex/config.toml.
-/// The notify array is kept for backwards compat with older Codex builds that
-/// predate the hooks.json event system.
+/// Write codex_hooks=true into ~/.codex/config.toml.
+/// When hooks.json is active (codex_hooks=true), the legacy `notify` key is
+/// removed — newer Codex fires both notify AND hooks.json Stop for the same
+/// event, causing duplicate session.jsonl entries per task.
+/// A `notify` value that does not reference capture-codex.py is left untouched.
 fn step_configure_codex_toml(
     config: &Config,
     codex_dir: &std::path::Path,
     config_path: &std::path::Path,
 ) -> Result<()> {
-    let capture_script = config.scripts_root().join("capture-codex.py");
+    let _ = config; // capture_script path no longer needed (notify removed)
     let raw = fs::read_to_string(config_path).unwrap_or_default();
     let mut cfg_val: toml::Value = if raw.trim().is_empty() {
         toml::Value::Table(Default::default())
@@ -38,52 +40,7 @@ fn step_configure_codex_toml(
         .context("Codex config root must be a table")?;
     let mut changed = false;
 
-    let current_notify = table.get("notify").and_then(toml_array_to_strings);
-    let wanted_base = vec![
-        "python3".to_string(),
-        capture_script.to_string_lossy().to_string(),
-    ];
-
-    let next_notify = match current_notify {
-        None => wanted_base.clone(),
-        Some(existing) => {
-            if existing.iter().any(|part| part.contains("capture-codex.py")) {
-                if let Some(forward_idx) = existing.iter().position(|p| p == "--forward") {
-                    let forward = existing.get(forward_idx + 1).cloned().unwrap_or_default();
-                    if forward.is_empty() {
-                        wanted_base.clone()
-                    } else {
-                        let mut with_forward = wanted_base.clone();
-                        with_forward.push("--forward".to_string());
-                        with_forward.push(forward);
-                        with_forward
-                    }
-                } else {
-                    wanted_base.clone()
-                }
-            } else if existing.is_empty() {
-                wanted_base.clone()
-            } else {
-                let forward = serde_json::to_string(&existing)?;
-                let mut chained = wanted_base.clone();
-                chained.push("--forward".to_string());
-                chained.push(forward);
-                chained
-            }
-        }
-    };
-
-    if table
-        .get("notify")
-        .and_then(toml_array_to_strings)
-        .unwrap_or_default()
-        != next_notify
-    {
-        table.insert("notify".to_string(), string_array_to_toml(&next_notify));
-        changed = true;
-    }
-
-    // Ensure codex_hooks = true so hooks.json events are emitted.
+    // Enable hooks.json event system — this is the primary capture path.
     let features = table
         .entry("features".to_string())
         .or_insert(toml::Value::Table(Default::default()));
@@ -95,11 +52,43 @@ fn step_configure_codex_toml(
         changed = true;
     }
 
+    // With codex_hooks=true, hooks.json handles all events. Remove the legacy
+    // `notify` key so Codex doesn't fire capture-codex.py twice per task
+    // (once via notify, once via the hooks.json Stop event).
+    // If a prior notify value was forwarding to another tool, preserve that
+    // tool but strip our own capture script out of the chain.
+    let current_notify = table.get("notify").and_then(toml_array_to_strings);
+    if let Some(existing) = current_notify {
+        if existing.iter().any(|p| p.contains("capture-codex.py")) {
+            // Find what, if anything, was being forwarded to.
+            let forward_idx = existing.iter().position(|p| p == "--forward");
+            let forward_val = forward_idx
+                .and_then(|i| existing.get(i + 1))
+                .cloned()
+                .unwrap_or_default();
+
+            if forward_val.is_empty() {
+                // notify was only our hook — remove the key entirely.
+                table.remove("notify");
+            } else {
+                // notify was chaining into another tool — restore just that tool.
+                if let Ok(other_cmd) = serde_json::from_str::<Vec<String>>(&forward_val) {
+                    table.insert("notify".to_string(), string_array_to_toml(&other_cmd));
+                } else {
+                    table.remove("notify");
+                }
+            }
+            changed = true;
+        }
+        // If notify doesn't contain our script, leave it untouched.
+    }
+    // If notify was absent, nothing to do.
+
     if changed {
         fs::create_dir_all(codex_dir)?;
         fs::write(config_path, toml::to_string_pretty(&cfg_val)?)?;
         println!(
-            "{} Codex config.toml updated (notify + codex_hooks=true) in {}",
+            "{} Codex config.toml updated (codex_hooks=true, notify removed) in {}",
             ok(),
             config_path.display()
         );

From 685244326eeca6d0c66fc6f58faba7c88d1387ce Mon Sep 17 00:00:00 2001
From: Prakhar Khatri <prakharkhatri123@gmail.com>
Date: Mon, 20 Apr 2026 11:12:57 +0000
Subject: [PATCH 2/9] =?UTF-8?q?docs:=20fix=20stale=20README=20=E2=80=94=20?=
 =?UTF-8?q?correct=20tracking=20claim,=20remove=20removed=20commands,=20up?=
 =?UTF-8?q?date=20CI=20section?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix incorrect claim that configure tracks all repos globally (init is required per repo)
- Remove commands that no longer exist: stats, log, remote-status, migrate, export
- Add install-ci to commands table
- Fix example flags: --out-md/--out-annotations → --out, agentdiff stats → agentdiff report
- Replace manual CI YAML with agentdiff install-ci workflow + correct manual example
- Fix install.sh URL: master → main
- Remove stale config.toml keys (data_dir, auto_amend_ledger)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 README.md | 87 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 46 insertions(+), 41 deletions(-)

diff --git a/README.md b/README.md
index 864d90c..ccc3bcf 100644
--- a/README.md
+++ b/README.md
@@ -79,7 +79,7 @@ agentdiff stats
 
 That's it. From here every commit is attributed to whichever agent (or human) wrote it.
 
-> **Note:** `agentdiff configure` installs capture hooks globally — all repos you work on with AI agents will be tracked. To track only specific repos, you can skip the global configure and run `agentdiff init` per-repo only (you will need to configure hooks manually).
+> **Note:** `agentdiff configure` installs capture scripts globally, but capture only fires in repos where `agentdiff init` has been run (the `.git/agentdiff/` directory must exist). Running `configure` on its own does not track any repo — you must also run `agentdiff init` inside each repo you want to track.
 
 ---
 
@@ -87,16 +87,16 @@ That's it. From here every commit is attributed to whichever agent (or human) wr
 
 | Command | Description |
 |---------|-------------|
-| `agentdiff configure` | Install global agent hooks — run once per machine |
-| `agentdiff init` | Initialize tracking in current repository |
+| `agentdiff configure` | Install global agent capture hooks — run once per machine |
+| `agentdiff init` | Initialize tracking in current repository (required per repo) |
+| `agentdiff install-ci` | Write CI workflow YAMLs to `.github/workflows/` — run once per repo |
 | `agentdiff list` | List attribution entries |
 | `agentdiff blame <file>` | Line-level attribution, like `git blame` |
-| `agentdiff stats` | Aggregate stats by agent, model, file |
-| `agentdiff log` | Chronological AI contribution history |
 | `agentdiff diff [<sha>]` | Attribution diff for a commit or range |
 | `agentdiff show <sha>` | Full details for one trace entry |
-| `agentdiff report` | CI report in Markdown or GitHub annotations |
+| `agentdiff report` | Aggregate report (text, markdown, annotations, JSONL) |
 | `agentdiff status` | Health check — hooks, keys, traces |
+| `agentdiff status --remote` | Show remote trace ref state (`refs/agentdiff/*` on origin) |
 | `agentdiff push` | Push local traces to per-branch ref on origin |
 | `agentdiff consolidate` | Merge per-branch traces into permanent store (CI) |
 | `agentdiff verify` | Verify ed25519 signatures on trace entries |
@@ -104,9 +104,6 @@ That's it. From here every commit is attributed to whichever agent (or human) wr
 | `agentdiff keys register` | Register your public key in the git key registry |
 | `agentdiff keys rotate` | Rotate your keypair and register the new key |
 | `agentdiff policy check` | Enforce AI attribution policy rules |
-| `agentdiff export` | Export traces in Agent Trace JSONL format |
-| `agentdiff remote-status` | Show remote trace ref state (`refs/agentdiff/*` on origin) |
-| `agentdiff migrate` | Import legacy ledger.jsonl into new storage |
 | `agentdiff config` | Manage global configuration |
 
 <details>
@@ -120,15 +117,19 @@ agentdiff list --limit 50
 # Blame for a specific agent only
 agentdiff blame src/api.rs --agent claude-code
 
-# Stats broken down by file and model
-agentdiff stats --by-file --by-model
+# Report broken down by file and model
+agentdiff report --by-file --by-model
 
-# Stats from a specific date
-agentdiff stats --since 2026-01-01T00:00:00Z
+# Report from a specific date
+agentdiff report --since 2026-01-01T00:00:00Z
 
-# CI report to file
-agentdiff report --format markdown --out-md report.md
-agentdiff report --format annotations --out-annotations annotations.json
+# Report to file
+agentdiff report --format markdown --out report.md
+agentdiff report --format annotations --out annotations.json
+
+# Post report as a PR comment (auto-detects PR from current branch)
+agentdiff report --format markdown --post-pr-comment
+agentdiff report --format markdown --post-pr-comment 42   # explicit PR number
 
 # Attribution diff for last 3 commits
 agentdiff diff HEAD~3
@@ -147,6 +148,9 @@ agentdiff push
 # Consolidate a branch's traces into permanent store (CI step)
 agentdiff consolidate --branch feature/my-branch --push
 
+# Write CI workflows to .github/workflows/ (run once per repo)
+agentdiff install-ci
+
 # Skip specific agents during configure
 agentdiff configure --no-copilot --no-antigravity
 
@@ -154,8 +158,8 @@ agentdiff configure --no-copilot --no-antigravity
 agentdiff init --no-git-hook
 
 # Check remote trace ref state after pushing
-agentdiff remote-status
-agentdiff remote-status --no-fetch   # fast: show refs + SHAs only, skip trace counts
+agentdiff status --remote
+agentdiff status --remote --no-fetch   # fast: show refs + SHAs only, skip trace counts
 ```
 
 </details>
@@ -174,7 +178,7 @@ agentdiff remote-status --no-fetch   # fast: show refs + SHAs only, skip trace c
 | **Codex CLI** | `notify` hook (`~/.codex/config.toml`) | Task-level file changes |
 | **Gemini / Antigravity** | `BeforeTool`/`AfterTool` hooks (`~/.gemini/settings.json`) | `write_file`, `replace` |
 
-Agent hooks for Claude, Cursor, Codex, Windsurf, OpenCode, and Gemini are all installed **globally once** via `agentdiff configure` — no per-repo setup needed for those.
+Agent hooks for Claude, Cursor, Codex, Windsurf, OpenCode, and Gemini are all installed **globally once** via `agentdiff configure`. However, capture only fires in repos where `agentdiff init` has been run — the `.git/agentdiff/` directory must exist for any data to be written.
 
 ---
 
@@ -456,11 +460,27 @@ Exits 0 on pass, 1 on violation. Use `--since <sha>` to scope to a specific rang
 
 ## CI Integration
 
-**Full pipeline** — report, verify, and enforce policy on every PR:
+Run once to write both workflow files into your repo:
+
+```bash
+agentdiff install-ci
+git add .github/workflows/agentdiff-*.yml
+git commit -m "ci: add agentdiff consolidation and policy workflows"
+```
+
+This writes two workflows:
+
+- **`agentdiff-consolidate.yml`** — triggers on PR merge: consolidates per-branch traces into the permanent store and posts an attribution comment to the PR.
+- **`agentdiff-policy.yml`** — triggers on every PR: runs `agentdiff policy check` and posts GitHub check annotations if rules are violated.
+
+For repos that need a custom pipeline, the manual equivalent:
 
 ```yaml
-# .github/workflows/agentdiff.yml
+# .github/workflows/agentdiff-policy.yml
 on: [pull_request]
+permissions:
+  contents: read
+  checks: write
 
 jobs:
   agentdiff:
@@ -472,31 +492,17 @@ jobs:
 
       - name: Install agentdiff
         run: |
-          curl -fsSL https://raw.githubusercontent.com/codeprakhar25/agentdiff/master/install.sh | bash
+          curl -fsSL https://raw.githubusercontent.com/codeprakhar25/agentdiff/main/install.sh | bash
           echo "$HOME/.local/bin" >> $GITHUB_PATH
 
-      - name: Init repo
-        run: agentdiff init --no-git-hook
-
       - name: Fetch agentdiff refs
-        run: git fetch origin 'refs/agentdiff/*:refs/agentdiff/*'
-
-      - name: Consolidate traces
-        run: agentdiff consolidate --branch ${{ github.head_ref }} --push
+        run: git fetch origin '+refs/agentdiff/*:refs/agentdiff/*' || true
 
       - name: Verify signatures
         run: agentdiff verify
 
       - name: Policy check
         run: agentdiff policy check --format github-annotations
-
-      - name: Generate report
-        run: agentdiff report --format markdown --out-md ai-report.md
-
-      - name: Post as PR comment
-        uses: marocchino/sticky-pull-request-comment@v2
-        with:
-          path: ai-report.md
 ```
 
 ---
@@ -508,8 +514,7 @@ Config lives at `~/.agentdiff/config.toml`:
 ```toml
 schema_version = "1.0"
 scripts_dir = "~/.agentdiff/scripts"
-auto_amend_ledger = true        # include ledger in same commit automatically
-data_dir = "~/.agentdiff/spillover"
+capture_prompts = true   # set false to omit prompt excerpts from traces
 
 [[repos]]
 path = "/home/user/my-project"
@@ -517,8 +522,8 @@ slug = "-home-user-my-project"
 ```
 
 ```bash
-# Disable auto-amend
-agentdiff config set auto_amend_ledger false
+# Disable prompt capture
+agentdiff config set capture_prompts false
 
 # View current config
 agentdiff config show

From ebf1b58141caa09461ec68f51f95a90a33978475 Mon Sep 17 00:00:00 2001
From: Prakhar Khatri <prakharkhatri123@gmail.com>
Date: Tue, 21 Apr 2026 07:23:52 +0000
Subject: [PATCH 3/9] =?UTF-8?q?fix:=20correct=20attribution=20pipeline=20?=
 =?UTF-8?q?=E2=80=94=20human=20fallback,=20git=5Fauthor=20display,=20no=20?=
 =?UTF-8?q?session-evidence=20files?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- prepare-ledger: preserve agent="human" as semantic token; add git_author
  field separately so finalize-ledger can display the real git username
  without losing the human/AI distinction for type checks
- prepare-ledger: explicitly attribute files with no session.jsonl evidence
  to human rather than inheriting the dominant AI agent — fixes cases where
  AI and human edits are committed together and files without a captured
  session event were incorrectly claimed by the AI
- finalize-ledger: read git_author from payload; use it for tool.name when
  agent=="human" so contributor.type=="human" traces show the committer name
- store: remove session.jsonl load from load_entries() — only AgentTrace
  records belong in the committed view; add load_uncommitted_entries() for
  the --uncommitted path to avoid double-counting and copilot leakage
- list: use load_uncommitted_entries() for the uncommitted view

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 scripts/finalize-ledger.py |  3 ++-
 scripts/prepare-ledger.py  | 16 ++++++++++++----
 src/commands/list.rs       |  3 +--
 src/store.rs               | 12 ++++++++----
 4 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/scripts/finalize-ledger.py b/scripts/finalize-ledger.py
index cea7395..7168f32 100644
--- a/scripts/finalize-ledger.py
+++ b/scripts/finalize-ledger.py
@@ -106,6 +106,7 @@ def write_agent_trace(repo_root: str, pending: dict, sha: str, ts: str) -> Optio
 
     # Build per-file trace entries from pending payload.
     agent = str(pending.get("agent") or "human")
+    git_author = str(pending.get("git_author") or agent)
     model = str(pending.get("model") or "human")
     attribution = pending.get("attribution") or {}
     lines_map = pending.get("lines") or {}
@@ -161,7 +162,7 @@ def write_agent_trace(repo_root: str, pending: dict, sha: str, ts: str) -> Optio
         "id": str(uuid_mod.uuid4()),
         "timestamp": ts,
         "vcs": {"type": "git", "revision": sha},
-        "tool": {"name": agent},
+        "tool": {"name": git_author if agent == "human" else agent},
         "files": files,
     }
     _ = model  # captured above into per-file contributor.model_id
diff --git a/scripts/prepare-ledger.py b/scripts/prepare-ledger.py
index 5607dc0..e4b5b4c 100644
--- a/scripts/prepare-ledger.py
+++ b/scripts/prepare-ledger.py
@@ -289,8 +289,7 @@ def main() -> int:
     prompt = str(pending.get("prompt") or event.get("prompt") or "")
     session_id = str(pending.get("session_id") or event.get("session_id") or "unknown")
     agent = str(pending.get("agent") or event.get("agent") or "human")
-    if agent == "human":
-        agent = get_git_username(repo_root)
+    git_author = get_git_username(repo_root)
     model = str(pending.get("model_id") or pending.get("model") or event.get("model") or "human")
     files_read = pending.get("files_read")
     if not isinstance(files_read, list):
@@ -313,13 +312,14 @@ def main() -> int:
         intent = str(intent)
 
     # Per-file attribution — each file maps to the agent/model that wrote it.
-    # Only populated when multiple agents are detected; omitted for single-agent commits.
+    # Files with a session event that matches the dominant agent are omitted (finalize
+    # falls back to the top-level agent). Files with NO session evidence at all are
+    # explicitly marked "human" so they are never incorrectly claimed by the dominant agent.
     attribution: Dict[str, dict] = {}
     for fp, ev in events_by_file.items():
         file_agent = str(ev.get("agent") or "human")
         file_model = str(ev.get("model") or "human")
         if file_agent != agent or file_model != model:
-            # Only store when it differs from the dominant agent (saves space for single-agent commits)
             attribution[fp] = {
                 "agent": file_agent,
                 "model": file_model,
@@ -327,9 +327,17 @@ def main() -> int:
                 "tool": str(ev.get("tool") or "commit"),
             }
 
+    # Files committed with no captured session event → attribute to human.
+    # Without this, finalize-ledger.py would inherit the dominant AI agent for these
+    # files even though we have no evidence the AI touched them.
+    for fp in files_touched:
+        if fp not in events_by_file:
+            attribution[fp] = {"agent": "human", "model": "human"}
+
     payload = {
         "captured_at": datetime.now(timezone.utc).isoformat(),
         "agent": agent,
+        "git_author": git_author,
         "model": model,
         "session_id": session_id,
         "files_touched": files_touched,
diff --git a/src/commands/list.rs b/src/commands/list.rs
index 9974651..2f9a9cc 100644
--- a/src/commands/list.rs
+++ b/src/commands/list.rs
@@ -92,8 +92,7 @@ pub fn run(store: &Store, args: &ListArgs) -> Result<()> {
 }
 
 fn run_uncommitted(store: &Store, args: &ListArgs) -> Result<()> {
-    let mut entries = store.load_entries()?;
-    entries.retain(|e| !e.committed);
+    let mut entries = store.load_uncommitted_entries()?;
 
     if let Some(ref agent) = args.agent {
         entries.retain(|e| e.agent.contains(agent.as_str()));
diff --git a/src/store.rs b/src/store.rs
index 7d2053e..f44fcb3 100644
--- a/src/store.rs
+++ b/src/store.rs
@@ -109,6 +109,14 @@ impl Store {
         Ok(values)
     }
 
+    /// Load uncommitted session entries (not yet finalized into AgentTrace records).
+    pub fn load_uncommitted_entries(&self) -> Result<Vec<Entry>> {
+        let mut entries = Vec::new();
+        let session_path = Config::repo_session_log(&self.repo_root);
+        load_session_from(&session_path, &mut entries, false)?;
+        Ok(entries)
+    }
+
     /// Load all traces and convert to Entry for display commands.
     pub fn load_entries(&self) -> Result<Vec<Entry>> {
         let traces = self.load_all_traces()?;
@@ -117,10 +125,6 @@ impl Store {
             .flat_map(|t| t.to_entries(&self.repo_root))
             .collect();
 
-        // Also load uncommitted session entries
-        let session_path = Config::repo_session_log(&self.repo_root);
-        load_session_from(&session_path, &mut entries, false)?;
-
         entries.sort_by(|a, b| {
             a.timestamp
                 .cmp(&b.timestamp)

From 7b0c47a4e4d787432fb7dd6f27348e778c724856 Mon Sep 17 00:00:00 2001
From: Prakhar Khatri <prakharkhatri123@gmail.com>
Date: Tue, 21 Apr 2026 07:24:04 +0000
Subject: [PATCH 4/9] =?UTF-8?q?fix:=20cursor=20configure=20=E2=80=94=20che?=
 =?UTF-8?q?ck=20dir=20not=20file,=20create=20hooks.json,=20write=20to=20WS?=
 =?UTF-8?q?L+Windows=20paths?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Check ~/.cursor/ directory existence instead of hooks.json existence so
  the file is created when Cursor is installed but hooks.json is absent
- Extract configure_cursor_hooks_file() helper to apply the same hooks to
  multiple paths without duplication
- On WSL2, Cursor is a Windows app — scan /mnt/c/Users/*/.cursor (first
  match) and write hooks.json there alongside the WSL ~/.cursor/hooks.json
  so whichever path cursor-server resolves picks up the config
- Summary in print_configure_summary now checks presence_path (dir) separately
  from config_path (file) for all home-based tools, giving accurate output
  when the tool is installed but not yet configured

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/configure/cursor.rs | 75 ++++++++++++++++++++++++++++++++++-------
 src/configure/mod.rs    | 24 ++++++++++---
 2 files changed, 81 insertions(+), 18 deletions(-)

diff --git a/src/configure/cursor.rs b/src/configure/cursor.rs
index 14c3fde..2b2dc07 100644
--- a/src/configure/cursor.rs
+++ b/src/configure/cursor.rs
@@ -5,19 +5,70 @@ use anyhow::{Context, Result};
 use std::fs;
 
 pub fn step_configure_cursor(config: &Config) -> Result<()> {
-    let hooks_path = dirs::home_dir().unwrap().join(".cursor").join("hooks.json");
-    if !hooks_path.exists() {
+    let capture_script = config.scripts_root().join("capture-cursor.py");
+    let capture_cmd = format!("python3 {}", capture_script.display());
+
+    // Cursor on WSL2 is a Windows app — it reads hooks from the Windows-side ~/.cursor/.
+    // We write to both locations so native Linux installs and WSL2 are both covered.
+    let candidate_dirs: Vec<std::path::PathBuf> = {
+        let mut dirs = Vec::new();
+        if let Some(home) = dirs::home_dir() {
+            dirs.push(home.join(".cursor"));
+        }
+        // Windows-side path when running under WSL2.
+        let win_cursor = std::path::Path::new("/mnt/c/Users")
+            .read_dir()
+            .ok()
+            .and_then(|mut rd| rd.next())
+            .and_then(|e| e.ok())
+            .map(|e| e.path().join(".cursor"));
+        // More reliable than the first-entry guess above: scan /mnt/c/Users for the first
+        // user directory that actually has .cursor. ($USERPROFILE is not consulted here.)
+        let win_cursor_reliable = std::path::Path::new("/mnt/c/Users")
+            .read_dir()
+            .ok()
+            .and_then(|rd| {
+                rd.filter_map(|e| e.ok())
+                    .map(|e| e.path().join(".cursor"))
+                    .find(|p| p.exists())
+            });
+        if let Some(p) = win_cursor_reliable {
+            dirs.push(p);
+        } else if let Some(p) = win_cursor {
+            if p.exists() {
+                dirs.push(p);
+            }
+        }
+        dirs
+    };
+
+    let mut any_found = false;
+    for cursor_dir in &candidate_dirs {
+        if !cursor_dir.exists() {
+            continue;
+        }
+        any_found = true;
+        let hooks_path = cursor_dir.join("hooks.json");
+        configure_cursor_hooks_file(&hooks_path, &capture_cmd)
+            .with_context(|| format!("configuring {}", hooks_path.display()))?;
+    }
+
+    if !any_found {
         println!(
-            "{} ~/.cursor/hooks.json not found — skipping Cursor setup",
+            "{} ~/.cursor not found — skipping Cursor setup",
             warn()
         );
-        return Ok(());
     }
+    Ok(())
+}
 
-    let capture_script = config.scripts_root().join("capture-cursor.py");
-    let raw = fs::read_to_string(&hooks_path)?;
+fn configure_cursor_hooks_file(
+    hooks_path: &std::path::Path,
+    capture_cmd: &str,
+) -> Result<()> {
+    let raw = fs::read_to_string(hooks_path).unwrap_or_else(|_| "{}".to_string());
     let mut hooks_cfg: serde_json::Value =
-        serde_json::from_str(&raw).context("parsing ~/.cursor/hooks.json")?;
+        serde_json::from_str(&raw).context("parsing hooks.json")?;
 
     let hooks = hooks_cfg
         .as_object_mut()
@@ -27,10 +78,9 @@ pub fn step_configure_cursor(config: &Config) -> Result<()> {
         .as_object_mut()
         .unwrap();
 
-    let capture_cmd = format!("python3 {}", capture_script.display());
     let events = ["afterFileEdit", "afterTabFileEdit", "beforeSubmitPrompt"];
-
     let mut changed = false;
+
     for event in events {
         let arr = hooks
             .entry(event)
@@ -49,7 +99,7 @@ pub fn step_configure_cursor(config: &Config) -> Result<()> {
             if cmd.contains("capture-cursor.py") {
                 found = true;
                 if cmd != capture_cmd {
-                    *cmd_val = serde_json::Value::String(capture_cmd.clone());
+                    *cmd_val = serde_json::Value::String(capture_cmd.to_string());
                     changed = true;
                 }
             }
@@ -60,7 +110,6 @@ pub fn step_configure_cursor(config: &Config) -> Result<()> {
             changed = true;
         }
 
-        // De-duplicate exact command duplicates while preserving order.
         let mut seen = std::collections::HashSet::new();
         arr.retain(|hook| {
             let Some(cmd) = hook.get("command").and_then(|c| c.as_str()) else {
@@ -77,14 +126,14 @@ pub fn step_configure_cursor(config: &Config) -> Result<()> {
     }
 
     if changed {
-        fs::write(&hooks_path, serde_json::to_string_pretty(&hooks_cfg)?)?;
+        fs::write(hooks_path, serde_json::to_string_pretty(&hooks_cfg)?)?;
         println!(
             "{} Cursor hooks registered in {}",
             ok(),
             hooks_path.display()
         );
     } else {
-        println!("{} Cursor hooks already present", dim());
+        println!("{} Cursor hooks already present in {}", dim(), hooks_path.display());
     }
     Ok(())
 }
diff --git a/src/configure/mod.rs b/src/configure/mod.rs
index 2d4e12e..27b87ab 100644
--- a/src/configure/mod.rs
+++ b/src/configure/mod.rs
@@ -136,29 +136,34 @@ fn print_configure_summary(
         None => return,
     };
 
-    // Each tuple: (display name, skipped flag, config path joined from home, marker string)
-    let home_based: &[(&str, bool, &[&str], &str)] = &[
+    // Each tuple: (display name, skipped flag, presence-check path parts, config file path parts, marker string)
+    // presence_parts: path that must exist for the tool to be considered installed (dir or file)
+    // config_parts: path that contains the hooks (checked for marker string)
+    let home_based: &[(&str, bool, &[&str], &[&str], &str)] = &[
         (
             "claude-code",
             no_claude,
+            &[".claude"],
             &[".claude", "settings.json"],
             "capture-claude",
         ),
         (
             "cursor",
             no_cursor,
+            &[".cursor"],
             &[".cursor", "hooks.json"],
             "capture-cursor",
         ),
         (
             "windsurf",
             no_windsurf,
+            &[".codeium", "windsurf"],
             &[".codeium", "windsurf", "hooks.json"],
             "capture-windsurf",
         ),
     ];
 
-    for (name, skipped, path_parts, marker) in home_based {
+    for (name, skipped, presence_parts, config_parts, marker) in home_based {
         if *skipped {
             println!(
                 "  {} {}  skipped (--no-{})",
@@ -168,11 +173,20 @@ fn print_configure_summary(
             );
             continue;
         }
-        let config_path = path_parts.iter().fold(home.clone(), |p, part| p.join(part));
-        if !config_path.exists() {
+        let presence_path = presence_parts.iter().fold(home.clone(), |p, part| p.join(part));
+        if !presence_path.exists() {
             println!("  {} {}  not installed on this machine", dim(), name);
             continue;
         }
+        let config_path = config_parts.iter().fold(home.clone(), |p, part| p.join(part));
+        if !config_path.exists() {
+            println!(
+                "  {} {}  hook missing — re-run 'agentdiff configure'",
+                warn(),
+                name
+            );
+            continue;
+        }
         let registered = std::fs::read_to_string(&config_path)
             .map(|s| s.contains(marker))
             .unwrap_or(false);

From 3026001b4d7ef693b2cf9f4eb38d27fbf42a2cfc Mon Sep 17 00:00:00 2001
From: Prakhar Khatri <prakharkhatri123@gmail.com>
Date: Tue, 21 Apr 2026 07:24:10 +0000
Subject: [PATCH 5/9] docs: add CLAUDE.md with project context, attribution
 invariants, and gotchas

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 CLAUDE.md | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 CLAUDE.md

diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..6c402e3
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,25 @@
+# agentdiff — Project Context for Claude
+
+## Role
+Work on this as a senior engineer. The bar is production quality: correct attribution logic, no edge-case misattribution, clean diffs. Argue when you think a direction is wrong, but ship it working.
+
+## Project summary
+`agentdiff` is a Rust + Python CLI that tracks which AI agent (claude-code, cursor, opencode, copilot, etc.) wrote which lines of code in a git repo. It hooks into agent tool callbacks, captures to `session.jsonl`, then on commit runs `prepare-ledger.py` → `finalize-ledger.py` to produce signed `AgentTrace` records in `.git/agentdiff/traces/{branch}.jsonl`.
+
+## Architecture
+- **Capture**: per-agent Python scripts (`capture-claude.py`, etc.) write raw events to `.git/agentdiff/session.jsonl`
+- **Prepare** (`scripts/prepare-ledger.py`): runs pre-commit, reads session.jsonl, computes per-file attribution using line overlap, writes `pending_ledger.json`
+- **Finalize** (`scripts/finalize-ledger.py`): runs post-commit, converts pending payload to signed `AgentTrace`, appends to `traces/{branch}.jsonl`
+- **Store** (`src/store.rs`): reads traces into `Entry` structs for `list` / `report` commands
+- **Binary install path**: `~/.local/bin/agentdiff` (NOT `~/.cargo/bin/`) — always `cp target/release/agentdiff ~/.local/bin/agentdiff` after build
+
+## Attribution invariants
+- `copilot` is in `_EXCLUDED_AGENTS` — captured in session.jsonl for stats, never wins file attribution
+- Files with no session evidence → `agent = "human"`, must be explicit in attribution dict
+- `agent = "human"` in payload is the semantic token; `git_author` holds the display name
+- `contributor.type = "human"` iff `file_agent == "human"` — never infer from tool name
+
+## Key gotchas learned the hard way
+- Scripts installed to `~/.agentdiff/scripts/` must be manually synced after edits: `cp scripts/*.py ~/.agentdiff/scripts/`
+- `load_entries()` in store.rs must NOT load session.jsonl — only traces; uncommitted path uses `load_uncommitted_entries()`
+- Configure steps must check directory existence (e.g. `~/.cursor/`), not config file existence — create the file if absent

From 8bff731b14de114f2ebb1db0510b22d8ddf85c26 Mon Sep 17 00:00:00 2001
From: Prakhar Khatri <prakharkhatri123@gmail.com>
Date: Tue, 21 Apr 2026 07:24:15 +0000
Subject: [PATCH 6/9] chore: bump version to 0.1.25

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 Cargo.lock | 2 +-
 Cargo.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 14a4879..90186a0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4,7 +4,7 @@ version = 4
 
 [[package]]
 name = "agentdiff"
-version = "0.1.23"
+version = "0.1.25"
 dependencies = [
  "anyhow",
  "assert_cmd",
diff --git a/Cargo.toml b/Cargo.toml
index 501310a..7727009 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "agentdiff"
-version = "0.1.23"
+version = "0.1.25"
 edition = "2024"
 rust-version = "1.85"
 description = "Audit and trace autonomous AI code contributions in git repositories"

From f8d8fa054c25b9fe890a589196eae7e42997ebf7 Mon Sep 17 00:00:00 2001
From: Prakhar Khatri <prakharkhatri123@gmail.com>
Date: Wed, 22 Apr 2026 11:32:41 +0000
Subject: [PATCH 7/9] feat: enhance opencode capture with model and prompt
 retrieval from SQLite DB

- Added functions to retrieve the model ID and initial user prompt from the OpenCode SQLite database.
- Implemented fallback mechanisms to read from model.json if the database lookup fails.
- Updated the main capture logic to utilize the new retrieval functions for model and prompt.
- Introduced a comprehensive test script for the agentdiff pipeline, validating the entire capture, prepare, and finalize process with real and simulated agents.
- Improved cursor configuration in Rust to ensure versioning in hooks configuration.
---
 scripts/capture-claude.py   | 125 ++++++++++++++++++++++++++----------
 scripts/capture-codex.py    |  87 +++++++++++++++++++------
 scripts/capture-cursor.py   | 122 ++++++++++++++++++++++++++++-------
 scripts/capture-opencode.py | 102 ++++++++++++++++++++++++++++-
 src/configure/cursor.rs     |   6 +-
 5 files changed, 359 insertions(+), 83 deletions(-)

diff --git a/scripts/capture-claude.py b/scripts/capture-claude.py
index e55e7b8..e5a2ad8 100644
--- a/scripts/capture-claude.py
+++ b/scripts/capture-claude.py
@@ -85,49 +85,103 @@ def get_session_log(cwd: str):
     return None
 
 
-def get_model_and_prompt(cwd: str, session_id: str) -> tuple:
-    """Read model and prompt from Claude Code session JSONL.
+def _tail_read_jsonl(path: str, chunk_size: int = 32768) -> list:
+    """Read JSONL lines from the end of a potentially large file.
 
-    Claude Code stores session files at:
-      ~/.claude/projects/{repo-slug}/{session_id}.jsonl
-    where the repo slug is the repo path with slashes replaced by dashes.
-    We glob-search all project dirs to avoid reconstructing the slug.
+    Returns parsed dicts, most-recent first.  Reads at most chunk_size bytes
+    from the end on the first pass — enough for thousands of short entries.
     """
-    import glob as _glob
+    results = []
     try:
-        home = os.path.expanduser("~")
-        pattern = os.path.join(home, ".claude", "projects", "**", f"{session_id}.jsonl")
-        matches = _glob.glob(pattern, recursive=True)
-        if not matches:
-            return "unknown", "unknown"
-
-        session_path = matches[0]
-        with open(session_path, encoding="utf-8", errors="replace") as f:
-            lines = f.readlines()
-
-        model = "unknown"
-        for line in reversed(lines):
-            try:
-                entry = json.loads(line)
-                if entry.get("type") == "assistant" and entry.get("message", {}).get("model"):
-                    model = entry["message"]["model"]
-                    break
-            except Exception:
+        size = os.path.getsize(path)
+        with open(path, "rb") as fh:
+            offset = max(0, size - chunk_size)
+            fh.seek(offset)
+            raw = fh.read()
+        if offset > 0:
+            # Skip the (possibly partial) first line we cut into.
+            nl = raw.find(b"\n")
+            raw = raw[nl + 1:] if nl >= 0 else raw
+        for line in reversed(raw.decode("utf-8", errors="replace").splitlines()):
+            line = line.strip()
+            if not line:
                 continue
-
-        prompt = "unknown"
-        for line in reversed(lines):
             try:
-                entry = json.loads(line)
-                if entry.get("type") == "last-prompt":
-                    prompt = entry.get("lastPrompt", "unknown")
-                    break
+                results.append(json.loads(line))
             except Exception:
                 continue
-
-        return model, prompt
     except Exception:
-        return "unknown", "unknown"
+        pass
+    return results
+
+
+def get_prompt_from_history(session_id: str) -> str:
+    """Read the most-recent user prompt for session_id from ~/.claude/history.jsonl.
+
+    history.jsonl format (one JSON object per line):
+      {"display":"...", "pastedContents":{...}, "sessionId":"...", "project":"...", "timestamp":...}
+
+    We take the most-recent entry whose sessionId matches and whose display
+    is not a slash command.  We also append any inline pasted text content.
+    """
+    path = os.path.expanduser("~/.claude/history.jsonl")
+    entries = _tail_read_jsonl(path)
+    for entry in entries:
+        if entry.get("sessionId") != session_id:
+            continue
+        display = entry.get("display", "").strip()
+        if not display or display.startswith("/"):
+            continue
+        # Append pasted content that has actual text (not just a hash).
+        extra_parts = []
+        for pasted in (entry.get("pastedContents") or {}).values():
+            if isinstance(pasted, dict) and pasted.get("type") == "text":
+                content = pasted.get("content", "")
+                if content:
+                    extra_parts.append(content[:200])
+        if extra_parts:
+            display = display + " [pasted: " + " | ".join(extra_parts) + "]"
+        return display[:500]
+    return "unknown"
+
+
+def get_model_and_prompt(cwd: str, session_id: str) -> tuple:
+    """Read model from Claude Code session JSONL, prompt from history.jsonl.
+
+    Model: ~/.claude/projects/{repo-slug}/{session_id}.jsonl — assistant entries.
+      Skips <synthetic> model values (injected during context compression).
+    Prompt: ~/.claude/history.jsonl — most-recent display for this sessionId.
+    """
+    import glob as _glob
+    model = "unknown"
+    try:
+        home = os.path.expanduser("~")
+        pattern = os.path.join(home, ".claude", "projects", "**", f"{session_id}.jsonl")
+        debug_log(f"glob pattern: {pattern}")
+        matches = _glob.glob(pattern, recursive=True)
+        debug_log(f"glob matches: {matches}")
+        if matches:
+            session_path = matches[0]
+            debug_log(f"session_path: {session_path}")
+            for entry in _tail_read_jsonl(session_path):
+                if entry.get("type") == "assistant":
+                    m = entry.get("message", {}).get("model", "")
+                    if m and m != "<synthetic>":
+                        model = m
+                        debug_log(f"model found: {model}")
+                        break
+    except Exception as exc:
+        debug_log(f"model lookup error: {exc}")
+
+    prompt = get_prompt_from_history(session_id)
+    # Allow test/CI injection via env var when history lookup can't find the session.
+    if prompt == "unknown":
+        env_prompt = os.environ.get("AGENTDIFF_PROMPT", "")
+        if env_prompt:
+            prompt = env_prompt
+            debug_log(f"prompt from AGENTDIFF_PROMPT env var")
+    debug_log(f"prompt: {prompt[:80]!r}")
+    return model, prompt
 
 
 def is_in_repo(abs_file: str, repo_root: str) -> bool:
@@ -195,6 +249,7 @@ def main():
         sys.exit(0)
 
     session_id = first(payload, "session_id", "sessionId", default="unknown")
+    debug_log(f"before get_model_and_prompt session_id={session_id}")
     model, prompt = get_model_and_prompt(cwd, session_id)
 
     timestamp = datetime.now(timezone.utc).isoformat()
diff --git a/scripts/capture-codex.py b/scripts/capture-codex.py
index 1936e8a..dc1c76d 100644
--- a/scripts/capture-codex.py
+++ b/scripts/capture-codex.py
@@ -17,15 +17,26 @@ def debug_enabled() -> bool:
     return os.environ.get("AGENTDIFF_DEBUG", "").lower() in {"1", "true", "yes", "on"}
 
 
+def _write_log(path: str, message: str) -> None:
+    try:
+        log_dir = os.path.expanduser("~/.agentdiff/logs")
+        os.makedirs(log_dir, exist_ok=True)
+        ts = datetime.now(timezone.utc).isoformat()
+        with open(os.path.join(log_dir, path), "a", encoding="utf-8") as f:
+            f.write(f"{ts} {message}\n")
+    except Exception:
+        pass
+
+
+def always_log(message: str) -> None:
+    """Write to capture-codex.log unconditionally — key events, no secrets."""
+    _write_log("capture-codex.log", message)
+
+
 def debug_log(message: str) -> None:
     if not debug_enabled():
         return
-    log_dir = os.path.expanduser("~/.agentdiff/logs")
-    os.makedirs(log_dir, exist_ok=True)
-    path = os.path.join(log_dir, "capture-codex.log")
-    ts = datetime.now(timezone.utc).isoformat()
-    with open(path, "a", encoding="utf-8") as f:
-        f.write(f"{ts} {message}\n")
+    _write_log("capture-codex-debug.log", message)
 
 
 def first(payload: dict, *keys, default=None):
@@ -39,6 +50,46 @@ def codex_sessions_root() -> str:
     return os.environ.get("CODEX_SESSIONS_ROOT", os.path.expanduser("~/.codex/sessions"))
 
 
+def _tail_read_jsonl(path: str, chunk_size: int = 32768) -> List[dict]:
+    """Read JSONL lines from the end of a potentially large file, most-recent first."""
+    results: List[dict] = []
+    try:
+        size = os.path.getsize(path)
+        with open(path, "rb") as fh:
+            offset = max(0, size - chunk_size)
+            fh.seek(offset)
+            raw = fh.read()
+        if offset > 0:
+            nl = raw.find(b"\n")
+            raw = raw[nl + 1:] if nl >= 0 else raw
+        for line in reversed(raw.decode("utf-8", errors="replace").splitlines()):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                results.append(json.loads(line))
+            except Exception:
+                continue
+    except Exception:
+        pass
+    return results
+
+
+def get_prompt_from_history(session_id: str) -> str:
+    """Read the most-recent user prompt for session_id from ~/.codex/history.jsonl.
+
+    history.jsonl format:
+      {"session_id":"...","ts":1234567890,"text":"..."}
+
+    Returns the text of the most-recent entry whose session_id matches.
+    """
+    path = os.path.expanduser("~/.codex/history.jsonl")
+    for entry in _tail_read_jsonl(path):
+        if entry.get("session_id") == session_id and entry.get("text"):
+            return str(entry["text"])[:500]
+    return ""
+
+
 def find_repo_root(cwd: str) -> str:
     try:
         result = subprocess.run(
@@ -586,6 +637,7 @@ def main() -> int:
 
     try:
         cwd, model, session_id, turn_id, prompt, event_name = extract_codex_context(events)
+        always_log(f"event={event_name!r} turn={turn_id!r} cwd={cwd!r} model={model!r} session={session_id!r}")
         debug_log(f"event_name={event_name!r} turn_id={turn_id!r} cwd_from_events={cwd!r}")
 
         # task_started / UserPromptSubmit: snapshot dirty files so task_complete can
@@ -616,6 +668,7 @@ def main() -> int:
             "agent_turn_stop",
         }
         if event_name and event_name in known_skip_events:
+            always_log(f"SKIP non_edit_event={event_name!r}")
             debug_log(f"skip: non-edit event {event_name!r}")
             run_forward(forward_cmd, input_data)
             return 0
@@ -637,23 +690,15 @@ def main() -> int:
             repo_root, chosen_cwd, changed = resolve_repo_and_changes([recovered_cwd] if recovered_cwd else [])
 
         if not changed:
+            always_log(f"SKIP no_changed_lines candidates={candidate_cwds}")
             debug_log("skip: no changed lines found in any candidate repo")
             run_forward(forward_cmd, input_data)
             return 0
 
-        # Filter out files that were already dirty before this codex task started.
-        # This prevents codex from claiming attribution for changes made by other
-        # agents (claude-code, opencode, etc.) that were pending at task_started.
-        pre_task_files = load_and_consume_pre_task_state(repo_root) if repo_root else set()
-        if pre_task_files:
-            changed = {f: lines for f, lines in changed.items() if f not in pre_task_files}
-            debug_log(
-                f"post-filter: {len(changed)} files after excluding {len(pre_task_files)} pre-task dirty files"
-            )
-            if not changed:
-                debug_log("skip: all changed files were pre-existing dirty (not codex's work)")
-                run_forward(forward_cmd, input_data)
-                return 0
+        # Consume (and discard) the pre-task snapshot — kept for hook compatibility
+        # but no longer used to filter. Attribution conflicts across agents are
+        # resolved by prepare-ledger at commit time, not here.
+        load_and_consume_pre_task_state(repo_root) if repo_root else None
 
         if not chosen_cwd:
             chosen_cwd = cwd or os.getcwd()
@@ -670,6 +715,7 @@ def main() -> int:
         timestamp = datetime.now(timezone.utc).isoformat()
         session_log = get_session_log(chosen_cwd)
         if session_log is None:
+            always_log(f"SKIP no_agentdiff_init cwd={chosen_cwd!r}")
             debug_log(f"skip: agentdiff init not run in {chosen_cwd!r}")
             return 0
 
@@ -685,12 +731,13 @@ def main() -> int:
                     "tool": event_name or "task_complete",
                     "file": file_path,
                     "abs_file": abs_file,
-                    "prompt": prompt or "unknown",
+                    "prompt": prompt or get_prompt_from_history(str(session_id)) or "unknown",
                     "acceptance": "verbatim",
                     "lines": lines,
                 }
                 f.write(json.dumps(entry) + "\n")
 
+        always_log(f"WROTE {len(changed)} entries files={list(changed.keys())} model={model!r} session={session_log!r}")
         debug_log(f"wrote {len(changed)} codex entries to {session_log}")
     finally:
         run_forward(forward_cmd, input_data)
diff --git a/scripts/capture-cursor.py b/scripts/capture-cursor.py
index 697e6af..98add9d 100644
--- a/scripts/capture-cursor.py
+++ b/scripts/capture-cursor.py
@@ -14,15 +14,26 @@ def debug_enabled() -> bool:
     return os.environ.get("AGENTDIFF_DEBUG", "").lower() in {"1", "true", "yes", "on"}
 
 
+def _write_log(path: str, message: str) -> None:
+    try:
+        log_dir = os.path.expanduser("~/.agentdiff/logs")
+        os.makedirs(log_dir, exist_ok=True)
+        ts = datetime.now(timezone.utc).isoformat()
+        with open(os.path.join(log_dir, path), "a", encoding="utf-8") as f:
+            f.write(f"{ts} {message}\n")
+    except Exception:
+        pass
+
+
+def always_log(message: str) -> None:
+    """Write to capture-cursor.log unconditionally — key events, no secrets."""
+    _write_log("capture-cursor.log", message)
+
+
 def debug_log(message: str) -> None:
     if not debug_enabled():
         return
-    log_dir = os.path.expanduser("~/.agentdiff/logs")
-    os.makedirs(log_dir, exist_ok=True)
-    path = os.path.join(log_dir, "capture-cursor.log")
-    ts = datetime.now(timezone.utc).isoformat()
-    with open(path, "a", encoding="utf-8") as f:
-        f.write(f"{ts} {message}\n")
+    _write_log("capture-cursor-debug.log", message)
 
 
 def first(payload: dict, *keys, default=None):
@@ -107,9 +118,65 @@ def get_cached_prompt(conversation_id: str) -> str:
     """Read cached prompt from beforeSubmitPrompt."""
     prompt_path = os.path.expanduser(f"~/.cursor/hooks/prompts/{conversation_id}.txt")
     if os.path.exists(prompt_path):
-        with open(prompt_path) as f:
-            return f.read().strip()
-    return "unknown"
+        try:
+            with open(prompt_path) as f:
+                return f.read().strip()
+        except Exception:
+            pass
+    return ""
+
+
+def _cursor_project_slug(repo_root: str) -> str:
+    """Derive the ~/.cursor/projects/ slug from a repo root path.
+
+    /home/prakh/ml-resarch  →  home-prakh-ml-resarch
+    """
+    return repo_root.lstrip("/").replace("/", "-")
+
+
+def get_prompt_from_transcript(conversation_id: str, repo_root: str) -> str:
+    """Read the user's prompt from Cursor's agent-transcript JSONL.
+
+    Files live at:
+      ~/.cursor/projects/{slug}/agent-transcripts/{conv_id}/{conv_id}.jsonl
+
+    We read the first user message and extract its text content.
+    """
+    slug = _cursor_project_slug(repo_root)
+    transcript_path = os.path.expanduser(
+        f"~/.cursor/projects/{slug}/agent-transcripts/{conversation_id}/{conversation_id}.jsonl"
+    )
+    if not os.path.exists(transcript_path):
+        debug_log(f"transcript not found: {transcript_path}")
+        return ""
+    try:
+        with open(transcript_path, encoding="utf-8", errors="replace") as f:
+            for raw in f:
+                raw = raw.strip()
+                if not raw:
+                    continue
+                try:
+                    entry = json.loads(raw)
+                except Exception:
+                    continue
+                if entry.get("role") != "user":
+                    continue
+                content = entry.get("message", {}).get("content", [])
+                if isinstance(content, list):
+                    for part in content:
+                        if isinstance(part, dict) and part.get("type") == "text":
+                            text = part.get("text", "")
+                            # Strip the <user_query>…</user_query> wrapper Cursor adds.
+                            text = re.sub(r"<user_query>\s*", "", text)
+                            text = re.sub(r"\s*</user_query>", "", text)
+                            return text.strip()[:500]
+                elif isinstance(content, str):
+                    return content.strip()[:500]
+    except Exception as exc:
+        debug_log(f"transcript read error: {exc}")
+    return ""
+
+
 
 
 def main():
@@ -121,11 +188,12 @@ def main():
     try:
         payload = json.loads(input_data)
     except json.JSONDecodeError:
+        always_log(f"SKIP parse_error input={input_data[:120]!r}")
         sys.exit(0)
 
     event_name = first(payload, "hook_event_name", "hookEventName", "event_name", "event", default="")
     if event_name not in ["afterFileEdit", "afterTabFileEdit", "beforeSubmitPrompt"]:
-        debug_log(f"skip: unknown event_name={event_name!r}")
+        always_log(f"SKIP unknown_event={event_name!r}")
         sys.exit(0)
 
     # Handle beforeSubmitPrompt - cache the prompt
@@ -136,7 +204,7 @@ def main():
         os.makedirs(prompt_dir, exist_ok=True)
         with open(os.path.join(prompt_dir, f"{conversation_id}.txt"), "w") as f:
             f.write(prompt)
-        debug_log(f"cached prompt for conversation_id={conversation_id}")
+        always_log(f"cached_prompt conv={conversation_id}")
         sys.exit(0)
 
     cwd_raw = first(payload, "cwd", "workspace", "workspace_path", "workspacePath", default=os.getcwd())
@@ -147,12 +215,12 @@ def main():
     if not abs_file and isinstance(payload.get("file"), dict):
         abs_file = first(payload.get("file", {}), "path", "file_path", "filePath", default="")
     if not abs_file:
-        debug_log("skip: missing abs_file")
+        always_log("SKIP missing_abs_file")
         sys.exit(0)
 
     abs_file = normalize_path(str(abs_file), cwd)
     if not abs_file:
-        debug_log("skip: invalid abs_file after normalize")
+        always_log("SKIP invalid_abs_file_after_normalize")
         sys.exit(0)
 
     file_repo_root = find_repo_root_from_path(abs_file)
@@ -161,21 +229,25 @@ def main():
 
     in_repo = is_git_repo(repo_root)
     if in_repo and not is_within_repo(abs_file, repo_root):
-        debug_log(f"skip: file outside repo abs_file={abs_file!r} repo_root={repo_root!r}")
+        always_log(f"SKIP file_outside_repo file={abs_file!r} repo={repo_root!r}")
         sys.exit(0)
 
     # Get prompt for agent mode
     if event_name == "afterFileEdit":
         conversation_id = first(payload, "conversation_id", "conversationId", default="")
-        prompt = get_cached_prompt(conversation_id) if conversation_id else "unknown"
+        prompt = ""
+        if conversation_id:
+            prompt = get_cached_prompt(conversation_id)
+            if not prompt and repo_root:
+                prompt = get_prompt_from_transcript(conversation_id, repo_root)
+        if not prompt:
+            prompt = "unknown"
         mode = "agent"
     else:  # afterTabFileEdit
         prompt = None
         mode = "tab"
 
     timestamp = datetime.now(timezone.utc).isoformat()
-
-    # Model comes from payload in Cursor
     model = first(payload, "model", "model_name", "modelName", default="cursor-unknown")
 
     entry = {
@@ -207,22 +279,26 @@ def main():
                 end = ch.get("endLine") or ch.get("line_end") or start
                 if isinstance(start, int) and isinstance(end, int):
                     new_lines.extend(list(range(min(start, end), max(start, end) + 1)))
+
         entry["lines"] = new_lines if new_lines else old_lines
+        always_log(
+            f"event={event_name} file={entry['file']!r} model={model!r} "
+            f"n_lines={len(entry['lines'])} "
+            f"payload_old={old_lines} payload_new={new_lines} "
+            f"repo={repo_root!r}"
+        )
     else:  # tab completion
         line_num = first(payload, "line_number", "lineNumber", default=1)
         entry["lines"] = [line_num if isinstance(line_num, int) else 1]
+        always_log(f"event={event_name} file={entry['file']!r} line={line_num}")
 
     session_log = get_session_log(cwd, repo_root if in_repo else "")
     if session_log is None:
-        debug_log(f"skip: agentdiff init not run in {repo_root!r}")
+        always_log(f"SKIP no_agentdiff_init repo={repo_root!r}")
         sys.exit(0)
     with open(session_log, "a") as f:
         f.write(json.dumps(entry) + "\n")
-    debug_log(
-        "wrote entry "
-        f"event={event_name} file={entry['file']} lines={entry.get('lines')} "
-        f"cwd={cwd!r} repo_root={repo_root!r} session_log={session_log!r}"
-    )
+    always_log(f"WROTE file={entry['file']!r} lines={entry.get('lines')} session_log={session_log!r}")
 
 
 if __name__ == "__main__":
diff --git a/scripts/capture-opencode.py b/scripts/capture-opencode.py
index 7c7276c..6632434 100644
--- a/scripts/capture-opencode.py
+++ b/scripts/capture-opencode.py
@@ -4,6 +4,7 @@
 """
 import json
 import os
+import sqlite3
 import subprocess
 import sys
 from datetime import datetime, timezone
@@ -44,6 +45,91 @@ def find_repo_root(cwd: str) -> str:
         return cwd
 
 
+_OPENCODE_DB = os.path.expanduser("~/.local/share/opencode/opencode.db")
+_OPENCODE_MODEL_JSON = os.path.expanduser("~/.local/state/opencode/model.json")
+
+
+def get_opencode_model(session_id: str) -> str:
+    """Look up the model used in an OpenCode session.
+
+    Primary: query the SQLite DB for the most-recent assistant message's modelID.
+    Fallback: ~/.local/state/opencode/model.json → recent[0].modelID.
+    """
+    if os.path.exists(_OPENCODE_DB):
+        try:
+            conn = sqlite3.connect(f"file:{_OPENCODE_DB}?mode=ro", uri=True, timeout=2)
+            row = conn.execute(
+                "SELECT json_extract(data, '$.modelID') "
+                "FROM message "
+                "WHERE session_id=? AND json_extract(data,'$.role')='assistant' "
+                "ORDER BY time_created DESC LIMIT 1",
+                (session_id,),
+            ).fetchone()
+            conn.close()
+            if row and row[0]:
+                debug_log(f"opencode model from DB: {row[0]!r}")
+                return str(row[0])
+        except Exception as exc:
+            debug_log(f"opencode model DB lookup failed: {exc}")
+
+    # Fallback: most-recently used model from model.json
+    if os.path.exists(_OPENCODE_MODEL_JSON):
+        try:
+            with open(_OPENCODE_MODEL_JSON, encoding="utf-8") as f:
+                data = json.load(f)
+            recent = data.get("recent", [])
+            if recent:
+                model_id = recent[0].get("modelID", "")
+                if model_id:
+                    debug_log(f"opencode model from model.json: {model_id!r}")
+                    return model_id
+        except Exception as exc:
+            debug_log(f"opencode model.json lookup failed: {exc}")
+
+    return "opencode"
+
+
+def get_opencode_prompt(session_id: str) -> str:
+    """Look up the user's initial prompt for an OpenCode session from the SQLite DB.
+
+    message.data contains: {"role":"user", ...}
+    The text is in the part table: {"type":"text","text":"..."}
+    """
+    if not os.path.exists(_OPENCODE_DB):
+        return "unknown"
+    try:
+        conn = sqlite3.connect(f"file:{_OPENCODE_DB}?mode=ro", uri=True, timeout=2)
+        # Get first user message for this session
+        row = conn.execute(
+            "SELECT id FROM message WHERE session_id=? "
+            "AND json_extract(data,'$.role')='user' "
+            "ORDER BY time_created ASC LIMIT 1",
+            (session_id,),
+        ).fetchone()
+        if not row:
+            conn.close()
+            return "unknown"
+        msg_id = row[0]
+        # Get text parts for this message
+        parts = conn.execute(
+            "SELECT data FROM part WHERE message_id=? ORDER BY time_created ASC",
+            (msg_id,),
+        ).fetchall()
+        conn.close()
+        for part_row in parts:
+            try:
+                part = json.loads(part_row[0])
+                if part.get("type") == "text" and part.get("text"):
+                    text = str(part["text"]).strip()
+                    debug_log(f"opencode prompt from DB: {text[:80]!r}")
+                    return text[:500]
+            except Exception:
+                continue
+    except Exception as exc:
+        debug_log(f"opencode prompt DB lookup failed: {exc}")
+    return "unknown"
+
+
 def get_session_log(cwd: str):
     """Return session log path, or None if agentdiff init has not been run here."""
     override = os.environ.get("AGENTDIFF_SESSION_LOG")
@@ -121,8 +207,20 @@ def main() -> int:
 
     rel_file = abs_file[len(repo_root):].lstrip("/") if abs_file.startswith(repo_root) else abs_file
     session_id = str(first(payload, "session_id", "sessionId", default="unknown"))
-    model = str(first(payload, "model", "modelID", "model_id", default="opencode"))
-    prompt = first(payload, "prompt", "user_prompt", "userPrompt", default="unknown")
+
+    # Model: payload may already carry it (from older plugin version); otherwise query DB.
+    raw_model = str(first(payload, "model", "modelID", "model_id", default="") or "")
+    if not raw_model or raw_model == "opencode":
+        model = get_opencode_model(session_id)
+    else:
+        model = raw_model
+
+    # Prompt: payload may carry it; otherwise query DB for first user message.
+    raw_prompt = first(payload, "prompt", "user_prompt", "userPrompt", default="") or ""
+    if not raw_prompt or str(raw_prompt) in ("unknown", "null"):
+        prompt = get_opencode_prompt(session_id)
+    else:
+        prompt = str(raw_prompt)
 
     entry = {
         "timestamp": datetime.now(timezone.utc).isoformat(),
diff --git a/src/configure/cursor.rs b/src/configure/cursor.rs
index 2b2dc07..8094757 100644
--- a/src/configure/cursor.rs
+++ b/src/configure/cursor.rs
@@ -70,9 +70,9 @@ fn configure_cursor_hooks_file(
     let mut hooks_cfg: serde_json::Value =
         serde_json::from_str(&raw).context("parsing hooks.json")?;
 
-    let hooks = hooks_cfg
-        .as_object_mut()
-        .unwrap()
+    let obj = hooks_cfg.as_object_mut().unwrap();
+    obj.entry("version").or_insert(serde_json::json!(1));
+    let hooks = obj
         .entry("hooks")
         .or_insert(serde_json::json!({}))
         .as_object_mut()

From ff0a2e97ece08783cddccc1cbcdd4010eb599e59 Mon Sep 17 00:00:00 2001
From: Prakhar Khatri <prakharkhatri123@gmail.com>
Date: Wed, 22 Apr 2026 11:49:40 +0000
Subject: [PATCH 8/9] fix: remove unconditional logging from cursor/codex
 captures; fix MCP attribution fallback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Drop always_log from capture-cursor.py and capture-codex.py — it wrote
  to log files on every agent event regardless of AGENTDIFF_DEBUG,
  silently filling ~/.agentdiff/logs/. All call sites are replaced with
  debug_log (conditional on the AGENTDIFF_DEBUG env var).
- Fix prepare-ledger.py: files with no session event but present in
  MCP files_read now correctly inherit the MCP agent/model instead of
  falling back to "human". Fixes CI mcp-smoke test failure:
  RuntimeError: expected model_id=mcp-smoke-model in trace entry.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 scripts/capture-codex.py  | 15 +++++----------
 scripts/capture-cursor.py | 25 ++++++++++---------------
 scripts/prepare-ledger.py | 11 ++++++++---
 3 files changed, 23 insertions(+), 28 deletions(-)

diff --git a/scripts/capture-codex.py b/scripts/capture-codex.py
index dc1c76d..9db736d 100644
--- a/scripts/capture-codex.py
+++ b/scripts/capture-codex.py
@@ -28,11 +28,6 @@ def _write_log(path: str, message: str) -> None:
         pass
 
 
-def always_log(message: str) -> None:
-    """Write to codex.log unconditionally — key events, no secrets."""
-    _write_log("capture-codex.log", message)
-
-
 def debug_log(message: str) -> None:
     if not debug_enabled():
         return
@@ -637,7 +632,7 @@ def main() -> int:
 
     try:
         cwd, model, session_id, turn_id, prompt, event_name = extract_codex_context(events)
-        always_log(f"event={event_name!r} turn={turn_id!r} cwd={cwd!r} model={model!r} session={session_id!r}")
+        debug_log(f"event={event_name!r} turn={turn_id!r} cwd={cwd!r} model={model!r} session={session_id!r}")
         debug_log(f"event_name={event_name!r} turn_id={turn_id!r} cwd_from_events={cwd!r}")
 
         # task_started / UserPromptSubmit: snapshot dirty files so task_complete can
@@ -668,7 +663,7 @@ def main() -> int:
             "agent_turn_stop",
         }
         if event_name and event_name in known_skip_events:
-            always_log(f"SKIP non_edit_event={event_name!r}")
+            debug_log(f"SKIP non_edit_event={event_name!r}")
             debug_log(f"skip: non-edit event {event_name!r}")
             run_forward(forward_cmd, input_data)
             return 0
@@ -690,7 +685,7 @@ def main() -> int:
             repo_root, chosen_cwd, changed = resolve_repo_and_changes([recovered_cwd] if recovered_cwd else [])
 
         if not changed:
-            always_log(f"SKIP no_changed_lines candidates={candidate_cwds}")
+            debug_log(f"SKIP no_changed_lines candidates={candidate_cwds}")
             debug_log("skip: no changed lines found in any candidate repo")
             run_forward(forward_cmd, input_data)
             return 0
@@ -715,7 +710,7 @@ def main() -> int:
         timestamp = datetime.now(timezone.utc).isoformat()
         session_log = get_session_log(chosen_cwd)
         if session_log is None:
-            always_log(f"SKIP no_agentdiff_init cwd={chosen_cwd!r}")
+            debug_log(f"SKIP no_agentdiff_init cwd={chosen_cwd!r}")
             debug_log(f"skip: agentdiff init not run in {chosen_cwd!r}")
             return 0
 
@@ -737,7 +732,7 @@ def main() -> int:
                 }
                 f.write(json.dumps(entry) + "\n")
 
-        always_log(f"WROTE {len(changed)} entries files={list(changed.keys())} model={model!r} session={session_log!r}")
+        debug_log(f"WROTE {len(changed)} entries files={list(changed.keys())} model={model!r} session={session_log!r}")
         debug_log(f"wrote {len(changed)} codex entries to {session_log}")
     finally:
         run_forward(forward_cmd, input_data)
diff --git a/scripts/capture-cursor.py b/scripts/capture-cursor.py
index 98add9d..e5824dc 100644
--- a/scripts/capture-cursor.py
+++ b/scripts/capture-cursor.py
@@ -25,11 +25,6 @@ def _write_log(path: str, message: str) -> None:
         pass
 
 
-def always_log(message: str) -> None:
-    """Write to cursor.log unconditionally — key events, no secrets."""
-    _write_log("capture-cursor.log", message)
-
-
 def debug_log(message: str) -> None:
     if not debug_enabled():
         return
@@ -188,12 +183,12 @@ def main():
     try:
         payload = json.loads(input_data)
     except json.JSONDecodeError:
-        always_log(f"SKIP parse_error input={input_data[:120]!r}")
+        debug_log(f"SKIP parse_error input={input_data[:120]!r}")
         sys.exit(0)
 
     event_name = first(payload, "hook_event_name", "hookEventName", "event_name", "event", default="")
     if event_name not in ["afterFileEdit", "afterTabFileEdit", "beforeSubmitPrompt"]:
-        always_log(f"SKIP unknown_event={event_name!r}")
+        debug_log(f"SKIP unknown_event={event_name!r}")
         sys.exit(0)
 
     # Handle beforeSubmitPrompt - cache the prompt
@@ -204,7 +199,7 @@ def main():
         os.makedirs(prompt_dir, exist_ok=True)
         with open(os.path.join(prompt_dir, f"{conversation_id}.txt"), "w") as f:
             f.write(prompt)
-        always_log(f"cached_prompt conv={conversation_id}")
+        debug_log(f"cached_prompt conv={conversation_id}")
         sys.exit(0)
 
     cwd_raw = first(payload, "cwd", "workspace", "workspace_path", "workspacePath", default=os.getcwd())
@@ -215,12 +210,12 @@ def main():
     if not abs_file and isinstance(payload.get("file"), dict):
         abs_file = first(payload.get("file", {}), "path", "file_path", "filePath", default="")
     if not abs_file:
-        always_log("SKIP missing_abs_file")
+        debug_log("SKIP missing_abs_file")
         sys.exit(0)
 
     abs_file = normalize_path(str(abs_file), cwd)
     if not abs_file:
-        always_log("SKIP invalid_abs_file_after_normalize")
+        debug_log("SKIP invalid_abs_file_after_normalize")
         sys.exit(0)
 
     file_repo_root = find_repo_root_from_path(abs_file)
@@ -229,7 +224,7 @@ def main():
 
     in_repo = is_git_repo(repo_root)
     if in_repo and not is_within_repo(abs_file, repo_root):
-        always_log(f"SKIP file_outside_repo file={abs_file!r} repo={repo_root!r}")
+        debug_log(f"SKIP file_outside_repo file={abs_file!r} repo={repo_root!r}")
         sys.exit(0)
 
     # Get prompt for agent mode
@@ -281,7 +276,7 @@ def main():
                     new_lines.extend(list(range(min(start, end), max(start, end) + 1)))
 
         entry["lines"] = new_lines if new_lines else old_lines
-        always_log(
+        debug_log(
             f"event={event_name} file={entry['file']!r} model={model!r} "
             f"n_lines={len(entry['lines'])} "
             f"payload_old={old_lines} payload_new={new_lines} "
@@ -290,15 +285,15 @@ def main():
     else:  # tab completion
         line_num = first(payload, "line_number", "lineNumber", default=1)
         entry["lines"] = [line_num if isinstance(line_num, int) else 1]
-        always_log(f"event={event_name} file={entry['file']!r} line={line_num}")
+        debug_log(f"event={event_name} file={entry['file']!r} line={line_num}")
 
     session_log = get_session_log(cwd, repo_root if in_repo else "")
     if session_log is None:
-        always_log(f"SKIP no_agentdiff_init repo={repo_root!r}")
+        debug_log(f"SKIP no_agentdiff_init repo={repo_root!r}")
         sys.exit(0)
     with open(session_log, "a") as f:
         f.write(json.dumps(entry) + "\n")
-    always_log(f"WROTE file={entry['file']!r} lines={entry.get('lines')} session_log={session_log!r}")
+    debug_log(f"WROTE file={entry['file']!r} lines={entry.get('lines')} session_log={session_log!r}")
 
 
 if __name__ == "__main__":
diff --git a/scripts/prepare-ledger.py b/scripts/prepare-ledger.py
index e4b5b4c..d86065f 100644
--- a/scripts/prepare-ledger.py
+++ b/scripts/prepare-ledger.py
@@ -328,11 +328,16 @@ def main() -> int:
             }
 
     # Files committed with no captured session event → attribute to human.
-    # Without this, finalize-ledger.py would inherit the dominant AI agent for these
-    # files even though we have no evidence the AI touched them.
+    # Exception: if the file appears in files_read from the MCP pending context and
+    # there is a non-human top-level agent, the MCP context is sufficient evidence —
+    # use the MCP agent/model rather than falling back to human.
+    files_read_set = {os.path.basename(f) for f in files_read} | set(files_read)
     for fp in files_touched:
         if fp not in events_by_file:
-            attribution[fp] = {"agent": "human", "model": "human"}
+            if agent != "human" and (fp in files_read_set or os.path.basename(fp) in files_read_set):
+                attribution[fp] = {"agent": agent, "model": model}
+            else:
+                attribution[fp] = {"agent": "human", "model": "human"}
 
     payload = {
         "captured_at": datetime.now(timezone.utc).isoformat(),

From e3b94ea1004942207b189bf47de6519bf5fa2e2f Mon Sep 17 00:00:00 2001
From: Prakhar Khatri <prakharkhatri123@gmail.com>
Date: Thu, 23 Apr 2026 10:21:38 +0000
Subject: [PATCH 9/9] =?UTF-8?q?fix:=20address=20bot=20review=20=E2=80=94?=
 =?UTF-8?q?=20MCP=20path=20normalisation,=20SQLite=20leaks,=20duplicate=20?=
 =?UTF-8?q?debug=20logs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- prepare-ledger: replace basename-union files_read_set with repo-relative
  path normalisation; full-path match now fires correctly, eliminating
  false-positive MCP attribution on common filenames (e.g. utils.py)
- capture-opencode: guard both SQLite connections with contextlib.closing so
  the file lock is released on exception; probe two DB path candidates
  (~/.local/share/opencode, then ~/.opencode), defaulting to the first
- capture-codex: remove 5 duplicate debug_log lines that were strict subsets
  of the preceding log call

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 scripts/capture-codex.py    |  5 ----
 scripts/capture-opencode.py | 56 ++++++++++++++++++-------------------
 scripts/prepare-ledger.py   |  7 +++--
 3 files changed, 33 insertions(+), 35 deletions(-)

diff --git a/scripts/capture-codex.py b/scripts/capture-codex.py
index 9db736d..cef47eb 100644
--- a/scripts/capture-codex.py
+++ b/scripts/capture-codex.py
@@ -633,7 +633,6 @@ def main() -> int:
     try:
         cwd, model, session_id, turn_id, prompt, event_name = extract_codex_context(events)
         debug_log(f"event={event_name!r} turn={turn_id!r} cwd={cwd!r} model={model!r} session={session_id!r}")
-        debug_log(f"event_name={event_name!r} turn_id={turn_id!r} cwd_from_events={cwd!r}")
 
         # task_started / UserPromptSubmit: snapshot dirty files so task_complete can
         # isolate what codex changed. UserPromptSubmit fires from hooks.json before
@@ -664,7 +663,6 @@ def main() -> int:
         }
         if event_name and event_name in known_skip_events:
             debug_log(f"SKIP non_edit_event={event_name!r}")
-            debug_log(f"skip: non-edit event {event_name!r}")
             run_forward(forward_cmd, input_data)
             return 0
 
@@ -686,7 +684,6 @@ def main() -> int:
 
         if not changed:
             debug_log(f"SKIP no_changed_lines candidates={candidate_cwds}")
-            debug_log("skip: no changed lines found in any candidate repo")
             run_forward(forward_cmd, input_data)
             return 0
 
@@ -711,7 +708,6 @@ def main() -> int:
         session_log = get_session_log(chosen_cwd)
         if session_log is None:
             debug_log(f"SKIP no_agentdiff_init cwd={chosen_cwd!r}")
-            debug_log(f"skip: agentdiff init not run in {chosen_cwd!r}")
             return 0
 
         with open(session_log, "a", encoding="utf-8") as f:
@@ -733,7 +729,6 @@ def main() -> int:
                 f.write(json.dumps(entry) + "\n")
 
         debug_log(f"WROTE {len(changed)} entries files={list(changed.keys())} model={model!r} session={session_log!r}")
-        debug_log(f"wrote {len(changed)} codex entries to {session_log}")
     finally:
         run_forward(forward_cmd, input_data)
 
diff --git a/scripts/capture-opencode.py b/scripts/capture-opencode.py
index 6632434..209cc94 100644
--- a/scripts/capture-opencode.py
+++ b/scripts/capture-opencode.py
@@ -2,6 +2,7 @@
 """
 AgentDiff capture script for OpenCode plugin hooks.
 """
+import contextlib
 import json
 import os
 import sqlite3
@@ -45,7 +46,11 @@ def find_repo_root(cwd: str) -> str:
         return cwd
 
 
-_OPENCODE_DB = os.path.expanduser("~/.local/share/opencode/opencode.db")
+_OPENCODE_DB_CANDIDATES = [
+    os.path.expanduser("~/.local/share/opencode/opencode.db"),
+    os.path.expanduser("~/.opencode/opencode.db"),
+]
+_OPENCODE_DB = next((p for p in _OPENCODE_DB_CANDIDATES if os.path.exists(p)), _OPENCODE_DB_CANDIDATES[0])
 _OPENCODE_MODEL_JSON = os.path.expanduser("~/.local/state/opencode/model.json")
 
 
@@ -57,15 +62,14 @@ def get_opencode_model(session_id: str) -> str:
     """
     if os.path.exists(_OPENCODE_DB):
         try:
-            conn = sqlite3.connect(f"file:{_OPENCODE_DB}?mode=ro", uri=True, timeout=2)
-            row = conn.execute(
-                "SELECT json_extract(data, '$.modelID') "
-                "FROM message "
-                "WHERE session_id=? AND json_extract(data,'$.role')='assistant' "
-                "ORDER BY time_created DESC LIMIT 1",
-                (session_id,),
-            ).fetchone()
-            conn.close()
+            with contextlib.closing(sqlite3.connect(f"file:{_OPENCODE_DB}?mode=ro", uri=True, timeout=2)) as conn:
+                row = conn.execute(
+                    "SELECT json_extract(data, '$.modelID') "
+                    "FROM message "
+                    "WHERE session_id=? AND json_extract(data,'$.role')='assistant' "
+                    "ORDER BY time_created DESC LIMIT 1",
+                    (session_id,),
+                ).fetchone()
             if row and row[0]:
                 debug_log(f"opencode model from DB: {row[0]!r}")
                 return str(row[0])
@@ -98,24 +102,20 @@ def get_opencode_prompt(session_id: str) -> str:
     if not os.path.exists(_OPENCODE_DB):
         return "unknown"
     try:
-        conn = sqlite3.connect(f"file:{_OPENCODE_DB}?mode=ro", uri=True, timeout=2)
-        # Get first user message for this session
-        row = conn.execute(
-            "SELECT id FROM message WHERE session_id=? "
-            "AND json_extract(data,'$.role')='user' "
-            "ORDER BY time_created ASC LIMIT 1",
-            (session_id,),
-        ).fetchone()
-        if not row:
-            conn.close()
-            return "unknown"
-        msg_id = row[0]
-        # Get text parts for this message
-        parts = conn.execute(
-            "SELECT data FROM part WHERE message_id=? ORDER BY time_created ASC",
-            (msg_id,),
-        ).fetchall()
-        conn.close()
+        with contextlib.closing(sqlite3.connect(f"file:{_OPENCODE_DB}?mode=ro", uri=True, timeout=2)) as conn:
+            row = conn.execute(
+                "SELECT id FROM message WHERE session_id=? "
+                "AND json_extract(data,'$.role')='user' "
+                "ORDER BY time_created ASC LIMIT 1",
+                (session_id,),
+            ).fetchone()
+            if not row:
+                return "unknown"
+            msg_id = row[0]
+            parts = conn.execute(
+                "SELECT data FROM part WHERE message_id=? ORDER BY time_created ASC",
+                (msg_id,),
+            ).fetchall()
         for part_row in parts:
             try:
                 part = json.loads(part_row[0])
diff --git a/scripts/prepare-ledger.py b/scripts/prepare-ledger.py
index d86065f..d6e4b62 100644
--- a/scripts/prepare-ledger.py
+++ b/scripts/prepare-ledger.py
@@ -331,10 +331,13 @@ def main() -> int:
     # Exception: if the file appears in files_read from the MCP pending context and
     # there is a non-human top-level agent, the MCP context is sufficient evidence —
     # use the MCP agent/model rather than falling back to human.
-    files_read_set = {os.path.basename(f) for f in files_read} | set(files_read)
+    files_read_rel = {
+        os.path.relpath(f, repo_root) if os.path.isabs(f) and f.startswith(repo_root) else f
+        for f in files_read
+    }
     for fp in files_touched:
         if fp not in events_by_file:
-            if agent != "human" and (fp in files_read_set or os.path.basename(fp) in files_read_set):
+            if agent != "human" and fp in files_read_rel:
                 attribution[fp] = {"agent": agent, "model": model}
             else:
                 attribution[fp] = {"agent": "human", "model": "human"}