From f287bbbfb381657dcc03d44a2ddbdebcb56fbce4 Mon Sep 17 00:00:00 2001 From: Prakhar Khatri Date: Mon, 20 Apr 2026 11:12:46 +0000 Subject: [PATCH 1/9] fix: capture new untracked files in codex, fix claude model lookup, remove duplicate codex capture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit capture-codex.py: `git diff` misses brand-new untracked files. Add `git ls-files --others --exclude-standard` pass so Codex attribution works when it creates a file from scratch. Also updated `get_dirty_file_names` to include untracked files in the pre-task snapshot for correct exclusion. capture-claude.py: `get_model_and_prompt` was guessing the session file path from the session ID, but Claude Code organises sessions by repo path slug, not session ID. Switch to a recursive glob search across ~/.claude/projects/**/{session_id}.jsonl so the model name is always found. codex.rs: `agentdiff configure` was writing both `notify` in config.toml AND `UserPromptSubmit`/`Stop` in hooks.json. When codex_hooks=true, Codex fires both for the same task — doubling every session.jsonl entry. Remove the `notify` key when enabling codex_hooks so only hooks.json fires. Co-Authored-By: Claude Sonnet 4.6 --- scripts/capture-claude.py | 77 ++++++++++++++++----------------- scripts/capture-codex.py | 45 ++++++++++++++++--- src/configure/codex.rs | 91 +++++++++++++++++---------------------- 3 files changed, 118 insertions(+), 95 deletions(-) diff --git a/scripts/capture-claude.py b/scripts/capture-claude.py index 043ff9b..e55e7b8 100644 --- a/scripts/capture-claude.py +++ b/scripts/capture-claude.py @@ -86,47 +86,46 @@ def get_session_log(cwd: str): def get_model_and_prompt(cwd: str, session_id: str) -> tuple: - """Read model and prompt from Claude session JSONL.""" + """Read model and prompt from Claude Code session JSONL. 
+ + Claude Code stores session files at: + ~/.claude/projects/{repo-slug}/{session_id}.jsonl + where the repo slug is the repo path with slashes replaced by dashes. + We glob-search all project dirs to avoid reconstructing the slug. + """ + import glob as _glob try: - # Try to find the session file home = os.path.expanduser("~") - parts = session_id.split("-") - # Construct likely path - possible_paths = [ - os.path.join(home, ".claude", "projects", parts[-1] if parts else "", f"{session_id}.jsonl"), - os.path.join(home, ".claude", "projects", session_id[:8], f"{session_id}.jsonl"), - ] - - for session_path in possible_paths: - if os.path.exists(session_path): - with open(session_path) as f: - lines = f.readlines() - - # Find last assistant message for model - model = "unknown" - for line in reversed(lines): - try: - entry = json.loads(line) - if entry.get("type") == "assistant" and entry.get("message", {}).get("model"): - model = entry["message"]["model"] - break - except: - continue - - # Find last-prompt for the actual user request - prompt = "unknown" - for line in reversed(lines): - try: - entry = json.loads(line) - if entry.get("type") == "last-prompt": - prompt = entry.get("lastPrompt", "unknown") - break - except: - continue - - return model, prompt - - return "unknown", "unknown" + pattern = os.path.join(home, ".claude", "projects", "**", f"{session_id}.jsonl") + matches = _glob.glob(pattern, recursive=True) + if not matches: + return "unknown", "unknown" + + session_path = matches[0] + with open(session_path, encoding="utf-8", errors="replace") as f: + lines = f.readlines() + + model = "unknown" + for line in reversed(lines): + try: + entry = json.loads(line) + if entry.get("type") == "assistant" and entry.get("message", {}).get("model"): + model = entry["message"]["model"] + break + except Exception: + continue + + prompt = "unknown" + for line in reversed(lines): + try: + entry = json.loads(line) + if entry.get("type") == "last-prompt": + prompt = 
entry.get("lastPrompt", "unknown") + break + except Exception: + continue + + return model, prompt except Exception: return "unknown", "unknown" diff --git a/scripts/capture-codex.py b/scripts/capture-codex.py index 2ed2dfd..1936e8a 100644 --- a/scripts/capture-codex.py +++ b/scripts/capture-codex.py @@ -131,21 +131,56 @@ def collect_changed_lines(repo_root: str) -> Dict[str, List[int]]: result.setdefault(path, []) result[path].extend(lines) + # git diff does not show brand-new untracked files. Detect them separately + # so Codex attribution works when it creates a file from scratch. + try: + untracked = subprocess.run( + ["git", "ls-files", "--others", "--exclude-standard"], + capture_output=True, text=True, cwd=repo_root, + ) + if untracked.returncode == 0: + for rel_path in untracked.stdout.splitlines(): + rel_path = rel_path.strip() + if not rel_path or rel_path in result: + continue + abs_path = os.path.join(repo_root, rel_path) + try: + with open(abs_path, "r", encoding="utf-8", errors="replace") as fh: + line_count = sum(1 for _ in fh) + if line_count > 0: + result[rel_path] = list(range(1, line_count + 1)) + else: + result[rel_path] = [1] + except (OSError, IOError): + result[rel_path] = [1] + except Exception: + pass + return {k: sorted(set(v)) for k, v in result.items() if v} def get_dirty_file_names(repo_root: str) -> List[str]: - """Return repo-relative paths of all files currently differing from HEAD.""" + """Return repo-relative paths of all files currently differing from HEAD, including untracked.""" + files: List[str] = [] try: out = subprocess.run( ["git", "diff", "HEAD", "--name-only"], capture_output=True, text=True, cwd=repo_root, ) - if out.returncode != 0: - return [] - return [line.strip() for line in out.stdout.splitlines() if line.strip()] + if out.returncode == 0: + files.extend(line.strip() for line in out.stdout.splitlines() if line.strip()) except Exception: - return [] + pass + try: + untracked = subprocess.run( + ["git", "ls-files", 
"--others", "--exclude-standard"], + capture_output=True, text=True, cwd=repo_root, + ) + if untracked.returncode == 0: + files.extend(line.strip() for line in untracked.stdout.splitlines() if line.strip()) + except Exception: + pass + return list(dict.fromkeys(files)) # deduplicate, preserve order def pre_task_state_path(repo_root: str) -> str: diff --git a/src/configure/codex.rs b/src/configure/codex.rs index 3f7fbfa..cd3a496 100644 --- a/src/configure/codex.rs +++ b/src/configure/codex.rs @@ -17,15 +17,17 @@ pub fn step_configure_codex(config: &Config) -> Result<()> { Ok(()) } -/// Write notify + codex_hooks=true into ~/.codex/config.toml. -/// The notify array is kept for backwards compat with older Codex builds that -/// predate the hooks.json event system. +/// Write codex_hooks=true into ~/.codex/config.toml. +/// When hooks.json is active (codex_hooks=true), the legacy `notify` key is +/// removed — newer Codex fires both notify AND hooks.json Stop for the same +/// event, causing duplicate session.jsonl entries per task. +/// The notify key is only kept when codex_hooks cannot be enabled (old Codex). 
fn step_configure_codex_toml( config: &Config, codex_dir: &std::path::Path, config_path: &std::path::Path, ) -> Result<()> { - let capture_script = config.scripts_root().join("capture-codex.py"); + let _ = config; // capture_script path no longer needed (notify removed) let raw = fs::read_to_string(config_path).unwrap_or_default(); let mut cfg_val: toml::Value = if raw.trim().is_empty() { toml::Value::Table(Default::default()) @@ -38,52 +40,7 @@ fn step_configure_codex_toml( .context("Codex config root must be a table")?; let mut changed = false; - let current_notify = table.get("notify").and_then(toml_array_to_strings); - let wanted_base = vec![ - "python3".to_string(), - capture_script.to_string_lossy().to_string(), - ]; - - let next_notify = match current_notify { - None => wanted_base.clone(), - Some(existing) => { - if existing.iter().any(|part| part.contains("capture-codex.py")) { - if let Some(forward_idx) = existing.iter().position(|p| p == "--forward") { - let forward = existing.get(forward_idx + 1).cloned().unwrap_or_default(); - if forward.is_empty() { - wanted_base.clone() - } else { - let mut with_forward = wanted_base.clone(); - with_forward.push("--forward".to_string()); - with_forward.push(forward); - with_forward - } - } else { - wanted_base.clone() - } - } else if existing.is_empty() { - wanted_base.clone() - } else { - let forward = serde_json::to_string(&existing)?; - let mut chained = wanted_base.clone(); - chained.push("--forward".to_string()); - chained.push(forward); - chained - } - } - }; - - if table - .get("notify") - .and_then(toml_array_to_strings) - .unwrap_or_default() - != next_notify - { - table.insert("notify".to_string(), string_array_to_toml(&next_notify)); - changed = true; - } - - // Ensure codex_hooks = true so hooks.json events are emitted. + // Enable hooks.json event system — this is the primary capture path. 
let features = table .entry("features".to_string()) .or_insert(toml::Value::Table(Default::default())); @@ -95,11 +52,43 @@ fn step_configure_codex_toml( changed = true; } + // With codex_hooks=true, hooks.json handles all events. Remove the legacy + // `notify` key so Codex doesn't fire capture-codex.py twice per task + // (once via notify, once via the hooks.json Stop event). + // If a prior notify value was forwarding to another tool, preserve that + // tool but strip our own capture script out of the chain. + let current_notify = table.get("notify").and_then(toml_array_to_strings); + if let Some(existing) = current_notify { + if existing.iter().any(|p| p.contains("capture-codex.py")) { + // Find what, if anything, was being forwarded to. + let forward_idx = existing.iter().position(|p| p == "--forward"); + let forward_val = forward_idx + .and_then(|i| existing.get(i + 1)) + .cloned() + .unwrap_or_default(); + + if forward_val.is_empty() { + // notify was only our hook — remove the key entirely. + table.remove("notify"); + } else { + // notify was chaining into another tool — restore just that tool. + if let Ok(other_cmd) = serde_json::from_str::<Vec<String>>(&forward_val) { + table.insert("notify".to_string(), string_array_to_toml(&other_cmd)); + } else { + table.remove("notify"); + } + } + changed = true; + } + // If notify doesn't contain our script, leave it untouched. + } + // If notify was absent, nothing to do. 
+ if changed { fs::create_dir_all(codex_dir)?; fs::write(config_path, toml::to_string_pretty(&cfg_val)?)?; println!( - "{} Codex config.toml updated (notify + codex_hooks=true) in {}", + "{} Codex config.toml updated (codex_hooks=true, notify removed) in {}", ok(), config_path.display() ); From 685244326eeca6d0c66fc6f58faba7c88d1387ce Mon Sep 17 00:00:00 2001 From: Prakhar Khatri Date: Mon, 20 Apr 2026 11:12:57 +0000 Subject: [PATCH 2/9] =?UTF-8?q?docs:=20fix=20stale=20README=20=E2=80=94=20?= =?UTF-8?q?correct=20tracking=20claim,=20remove=20removed=20commands,=20up?= =?UTF-8?q?date=20CI=20section?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix incorrect claim that configure tracks all repos globally (init is required per repo) - Remove commands that no longer exist: stats, log, remote-status, migrate, export - Add install-ci to commands table - Fix example flags: --out-md/--out-annotations → --out, agentdiff stats → agentdiff report - Replace manual CI YAML with agentdiff install-ci workflow + correct manual example - Fix install.sh URL: master → main - Remove stale config.toml keys (data_dir, auto_amend_ledger) Co-Authored-By: Claude Sonnet 4.6 --- README.md | 87 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 46 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index 864d90c..ccc3bcf 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ agentdiff stats That's it. From here every commit is attributed to whichever agent (or human) wrote it. -> **Note:** `agentdiff configure` installs capture hooks globally — all repos you work on with AI agents will be tracked. To track only specific repos, you can skip the global configure and run `agentdiff init` per-repo only (you will need to configure hooks manually). 
+> **Note:** `agentdiff configure` installs capture scripts globally, but capture only fires in repos where `agentdiff init` has been run (the `.git/agentdiff/` directory must exist). Running `configure` on its own does not track any repo — you must also run `agentdiff init` inside each repo you want to track. --- @@ -87,16 +87,16 @@ That's it. From here every commit is attributed to whichever agent (or human) wr | Command | Description | |---------|-------------| -| `agentdiff configure` | Install global agent hooks — run once per machine | -| `agentdiff init` | Initialize tracking in current repository | +| `agentdiff configure` | Install global agent capture hooks — run once per machine | +| `agentdiff init` | Initialize tracking in current repository (required per repo) | +| `agentdiff install-ci` | Write CI workflow YAMLs to `.github/workflows/` — run once per repo | | `agentdiff list` | List attribution entries | | `agentdiff blame ` | Line-level attribution, like `git blame` | -| `agentdiff stats` | Aggregate stats by agent, model, file | -| `agentdiff log` | Chronological AI contribution history | | `agentdiff diff []` | Attribution diff for a commit or range | | `agentdiff show ` | Full details for one trace entry | -| `agentdiff report` | CI report in Markdown or GitHub annotations | +| `agentdiff report` | Aggregate report (text, markdown, annotations, JSONL) | | `agentdiff status` | Health check — hooks, keys, traces | +| `agentdiff status --remote` | Show remote trace ref state (`refs/agentdiff/*` on origin) | | `agentdiff push` | Push local traces to per-branch ref on origin | | `agentdiff consolidate` | Merge per-branch traces into permanent store (CI) | | `agentdiff verify` | Verify ed25519 signatures on trace entries | @@ -104,9 +104,6 @@ That's it. 
From here every commit is attributed to whichever agent (or human) wr | `agentdiff keys register` | Register your public key in the git key registry | | `agentdiff keys rotate` | Rotate your keypair and register the new key | | `agentdiff policy check` | Enforce AI attribution policy rules | -| `agentdiff export` | Export traces in Agent Trace JSONL format | -| `agentdiff remote-status` | Show remote trace ref state (`refs/agentdiff/*` on origin) | -| `agentdiff migrate` | Import legacy ledger.jsonl into new storage | | `agentdiff config` | Manage global configuration |
@@ -120,15 +117,19 @@ agentdiff list --limit 50 # Blame for a specific agent only agentdiff blame src/api.rs --agent claude-code -# Stats broken down by file and model -agentdiff stats --by-file --by-model +# Report broken down by file and model +agentdiff report --by-file --by-model -# Stats from a specific date -agentdiff stats --since 2026-01-01T00:00:00Z +# Report from a specific date +agentdiff report --since 2026-01-01T00:00:00Z -# CI report to file -agentdiff report --format markdown --out-md report.md -agentdiff report --format annotations --out-annotations annotations.json +# Report to file +agentdiff report --format markdown --out report.md +agentdiff report --format annotations --out annotations.json + +# Post report as a PR comment (auto-detects PR from current branch) +agentdiff report --format markdown --post-pr-comment +agentdiff report --format markdown --post-pr-comment 42 # explicit PR number # Attribution diff for last 3 commits agentdiff diff HEAD~3 @@ -147,6 +148,9 @@ agentdiff push # Consolidate a branch's traces into permanent store (CI step) agentdiff consolidate --branch feature/my-branch --push +# Write CI workflows to .github/workflows/ (run once per repo) +agentdiff install-ci + # Skip specific agents during configure agentdiff configure --no-copilot --no-antigravity @@ -154,8 +158,8 @@ agentdiff configure --no-copilot --no-antigravity agentdiff init --no-git-hook # Check remote trace ref state after pushing -agentdiff remote-status -agentdiff remote-status --no-fetch # fast: show refs + SHAs only, skip trace counts +agentdiff status --remote +agentdiff status --remote --no-fetch # fast: show refs + SHAs only, skip trace counts ```
@@ -174,7 +178,7 @@ agentdiff remote-status --no-fetch # fast: show refs + SHAs only, skip trace c | **Codex CLI** | `notify` hook (`~/.codex/config.toml`) | Task-level file changes | | **Gemini / Antigravity** | `BeforeTool`/`AfterTool` hooks (`~/.gemini/settings.json`) | `write_file`, `replace` | -Agent hooks for Claude, Cursor, Codex, Windsurf, OpenCode, and Gemini are all installed **globally once** via `agentdiff configure` — no per-repo setup needed for those. +Agent hooks for Claude, Cursor, Codex, Windsurf, OpenCode, and Gemini are all installed **globally once** via `agentdiff configure`. However, capture only fires in repos where `agentdiff init` has been run — the `.git/agentdiff/` directory must exist for any data to be written. --- @@ -456,11 +460,27 @@ Exits 0 on pass, 1 on violation. Use `--since ` to scope to a specific rang ## CI Integration -**Full pipeline** — report, verify, and enforce policy on every PR: +Run once to write both workflow files into your repo: + +```bash +agentdiff install-ci +git add .github/workflows/agentdiff-*.yml +git commit -m "ci: add agentdiff consolidation and policy workflows" +``` + +This writes two workflows: + +- **`agentdiff-consolidate.yml`** — triggers on PR merge: consolidates per-branch traces into the permanent store and posts an attribution comment to the PR. +- **`agentdiff-policy.yml`** — triggers on every PR: runs `agentdiff policy check` and posts GitHub check annotations if rules are violated. 
+ +For repos that need a custom pipeline, the manual equivalent: ```yaml -# .github/workflows/agentdiff.yml +# .github/workflows/agentdiff-policy.yml on: [pull_request] +permissions: + contents: read + checks: write jobs: agentdiff: @@ -472,31 +492,17 @@ jobs: - name: Install agentdiff run: | - curl -fsSL https://raw.githubusercontent.com/codeprakhar25/agentdiff/master/install.sh | bash + curl -fsSL https://raw.githubusercontent.com/codeprakhar25/agentdiff/main/install.sh | bash echo "$HOME/.local/bin" >> $GITHUB_PATH - - name: Init repo - run: agentdiff init --no-git-hook - - name: Fetch agentdiff refs - run: git fetch origin 'refs/agentdiff/*:refs/agentdiff/*' - - - name: Consolidate traces - run: agentdiff consolidate --branch ${{ github.head_ref }} --push + run: git fetch origin '+refs/agentdiff/*:refs/agentdiff/*' || true - name: Verify signatures run: agentdiff verify - name: Policy check run: agentdiff policy check --format github-annotations - - - name: Generate report - run: agentdiff report --format markdown --out-md ai-report.md - - - name: Post as PR comment - uses: marocchino/sticky-pull-request-comment@v2 - with: - path: ai-report.md ``` --- @@ -508,8 +514,7 @@ Config lives at `~/.agentdiff/config.toml`: ```toml schema_version = "1.0" scripts_dir = "~/.agentdiff/scripts" -auto_amend_ledger = true # include ledger in same commit automatically -data_dir = "~/.agentdiff/spillover" +capture_prompts = true # set false to omit prompt excerpts from traces [[repos]] path = "/home/user/my-project" @@ -517,8 +522,8 @@ slug = "-home-user-my-project" ``` ```bash -# Disable auto-amend -agentdiff config set auto_amend_ledger false +# Disable prompt capture +agentdiff config set capture_prompts false # View current config agentdiff config show From ebf1b58141caa09461ec68f51f95a90a33978475 Mon Sep 17 00:00:00 2001 From: Prakhar Khatri Date: Tue, 21 Apr 2026 07:23:52 +0000 Subject: [PATCH 3/9] =?UTF-8?q?fix:=20correct=20attribution=20pipeline=20?= 
=?UTF-8?q?=E2=80=94=20human=20fallback,=20git=5Fauthor=20display,=20no=20?= =?UTF-8?q?session-evidence=20files?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - prepare-ledger: preserve agent="human" as semantic token; add git_author field separately so finalize-ledger can display the real git username without losing the human/AI distinction for type checks - prepare-ledger: explicitly attribute files with no session.jsonl evidence to human rather than inheriting the dominant AI agent — fixes cases where AI and human edits are committed together and untracked files were incorrectly claimed by the AI - finalize-ledger: read git_author from payload; use it for tool.name when agent=="human" so contributor.type=="human" traces show the committer name - store: remove session.jsonl load from load_entries() — only AgentTrace records belong in the committed view; add load_uncommitted_entries() for the --uncommitted path to avoid double-counting and copilot leakage - list: use load_uncommitted_entries() for the uncommitted view Co-Authored-By: Claude Sonnet 4.6 --- scripts/finalize-ledger.py | 3 ++- scripts/prepare-ledger.py | 16 ++++++++++++---- src/commands/list.rs | 3 +-- src/store.rs | 12 ++++++++---- 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/scripts/finalize-ledger.py b/scripts/finalize-ledger.py index cea7395..7168f32 100644 --- a/scripts/finalize-ledger.py +++ b/scripts/finalize-ledger.py @@ -106,6 +106,7 @@ def write_agent_trace(repo_root: str, pending: dict, sha: str, ts: str) -> Optio # Build per-file trace entries from pending payload. 
agent = str(pending.get("agent") or "human") + git_author = str(pending.get("git_author") or agent) model = str(pending.get("model") or "human") attribution = pending.get("attribution") or {} lines_map = pending.get("lines") or {} @@ -161,7 +162,7 @@ def write_agent_trace(repo_root: str, pending: dict, sha: str, ts: str) -> Optio "id": str(uuid_mod.uuid4()), "timestamp": ts, "vcs": {"type": "git", "revision": sha}, - "tool": {"name": agent}, + "tool": {"name": git_author if agent == "human" else agent}, "files": files, } _ = model # captured above into per-file contributor.model_id diff --git a/scripts/prepare-ledger.py b/scripts/prepare-ledger.py index 5607dc0..e4b5b4c 100644 --- a/scripts/prepare-ledger.py +++ b/scripts/prepare-ledger.py @@ -289,8 +289,7 @@ def main() -> int: prompt = str(pending.get("prompt") or event.get("prompt") or "") session_id = str(pending.get("session_id") or event.get("session_id") or "unknown") agent = str(pending.get("agent") or event.get("agent") or "human") - if agent == "human": - agent = get_git_username(repo_root) + git_author = get_git_username(repo_root) model = str(pending.get("model_id") or pending.get("model") or event.get("model") or "human") files_read = pending.get("files_read") if not isinstance(files_read, list): @@ -313,13 +312,14 @@ def main() -> int: intent = str(intent) # Per-file attribution — each file maps to the agent/model that wrote it. - # Only populated when multiple agents are detected; omitted for single-agent commits. + # Files with a session event that matches the dominant agent are omitted (finalize + # falls back to the top-level agent). Files with NO session evidence at all are + # explicitly marked "human" so they are never incorrectly claimed by the dominant agent. 
attribution: Dict[str, dict] = {} for fp, ev in events_by_file.items(): file_agent = str(ev.get("agent") or "human") file_model = str(ev.get("model") or "human") if file_agent != agent or file_model != model: - # Only store when it differs from the dominant agent (saves space for single-agent commits) attribution[fp] = { "agent": file_agent, "model": file_model, @@ -327,9 +327,17 @@ def main() -> int: "tool": str(ev.get("tool") or "commit"), } + # Files committed with no captured session event → attribute to human. + # Without this, finalize-ledger.py would inherit the dominant AI agent for these + # files even though we have no evidence the AI touched them. + for fp in files_touched: + if fp not in events_by_file: + attribution[fp] = {"agent": "human", "model": "human"} + payload = { "captured_at": datetime.now(timezone.utc).isoformat(), "agent": agent, + "git_author": git_author, "model": model, "session_id": session_id, "files_touched": files_touched, diff --git a/src/commands/list.rs b/src/commands/list.rs index 9974651..2f9a9cc 100644 --- a/src/commands/list.rs +++ b/src/commands/list.rs @@ -92,8 +92,7 @@ pub fn run(store: &Store, args: &ListArgs) -> Result<()> { } fn run_uncommitted(store: &Store, args: &ListArgs) -> Result<()> { - let mut entries = store.load_entries()?; - entries.retain(|e| !e.committed); + let mut entries = store.load_uncommitted_entries()?; if let Some(ref agent) = args.agent { entries.retain(|e| e.agent.contains(agent.as_str())); diff --git a/src/store.rs b/src/store.rs index 7d2053e..f44fcb3 100644 --- a/src/store.rs +++ b/src/store.rs @@ -109,6 +109,14 @@ impl Store { Ok(values) } + /// Load uncommitted session entries (not yet finalized into AgentTrace records). 
+ pub fn load_uncommitted_entries(&self) -> Result<Vec<Entry>> { + let mut entries = Vec::new(); + let session_path = Config::repo_session_log(&self.repo_root); + load_session_from(&session_path, &mut entries, false)?; + Ok(entries) + } + /// Load all traces and convert to Entry for display commands. pub fn load_entries(&self) -> Result<Vec<Entry>> { let traces = self.load_all_traces()?; @@ -117,10 +125,6 @@ impl Store { .flat_map(|t| t.to_entries(&self.repo_root)) .collect(); - // Also load uncommitted session entries - let session_path = Config::repo_session_log(&self.repo_root); - load_session_from(&session_path, &mut entries, false)?; - entries.sort_by(|a, b| { a.timestamp .cmp(&b.timestamp) From 7b0c47a4e4d787432fb7dd6f27348e778c724856 Mon Sep 17 00:00:00 2001 From: Prakhar Khatri Date: Tue, 21 Apr 2026 07:24:04 +0000 Subject: [PATCH 4/9] =?UTF-8?q?fix:=20cursor=20configure=20=E2=80=94=20che?= =?UTF-8?q?ck=20dir=20not=20file,=20create=20hooks.json,=20write=20to=20WS?= =?UTF-8?q?L+Windows=20paths?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Check ~/.cursor/ directory existence instead of hooks.json existence so the file is created when Cursor is installed but hooks.json is absent - Extract configure_cursor_hooks_file() helper to apply the same hooks to multiple paths without duplication - On WSL2, Cursor is a Windows app — scan /mnt/c/Users/*/\.cursor and write hooks.json there alongside the WSL ~/.cursor/hooks.json so whichever path cursor-server resolves picks up the config - Summary in print_configure_summary now checks presence_path (dir) separately from config_path (file) for all home-based tools, giving accurate output when the tool is installed but not yet configured Co-Authored-By: Claude Sonnet 4.6 --- src/configure/cursor.rs | 75 ++++++++++++++++++++++++++++++++++------- src/configure/mod.rs | 24 ++++++++++--- 2 files changed, 81 insertions(+), 18 deletions(-) diff --git a/src/configure/cursor.rs b/src/configure/cursor.rs 
index 14c3fde..2b2dc07 100644 --- a/src/configure/cursor.rs +++ b/src/configure/cursor.rs @@ -5,19 +5,70 @@ use anyhow::{Context, Result}; use std::fs; pub fn step_configure_cursor(config: &Config) -> Result<()> { - let hooks_path = dirs::home_dir().unwrap().join(".cursor").join("hooks.json"); - if !hooks_path.exists() { + let capture_script = config.scripts_root().join("capture-cursor.py"); + let capture_cmd = format!("python3 {}", capture_script.display()); + + // Cursor on WSL2 is a Windows app — it reads hooks from the Windows-side ~/.cursor/. + // We write to both locations so native Linux installs and WSL2 are both covered. + let candidate_dirs: Vec<std::path::PathBuf> = { + let mut dirs = Vec::new(); + if let Some(home) = dirs::home_dir() { + dirs.push(home.join(".cursor")); + } + // Windows-side path when running under WSL2. + let win_cursor = std::path::Path::new("/mnt/c/Users") + .read_dir() + .ok() + .and_then(|mut rd| rd.next()) + .and_then(|e| e.ok()) + .map(|e| e.path().join(".cursor")); + // More reliable: derive from $USERPROFILE or the actual Windows username. + // Fall back to scanning /mnt/c/Users for the first user directory that has .cursor. 
+ let win_cursor_reliable = std::path::Path::new("/mnt/c/Users") + .read_dir() + .ok() + .and_then(|rd| { + rd.filter_map(|e| e.ok()) + .map(|e| e.path().join(".cursor")) + .find(|p| p.exists()) + }); + if let Some(p) = win_cursor_reliable { + dirs.push(p); + } else if let Some(p) = win_cursor { + if p.exists() { + dirs.push(p); + } + } + dirs + }; + + let mut any_found = false; + for cursor_dir in &candidate_dirs { + if !cursor_dir.exists() { + continue; + } + any_found = true; + let hooks_path = cursor_dir.join("hooks.json"); + configure_cursor_hooks_file(&hooks_path, &capture_cmd) + .with_context(|| format!("configuring {}", hooks_path.display()))?; + } + + if !any_found { println!( - "{} ~/.cursor/hooks.json not found — skipping Cursor setup", + "{} ~/.cursor not found — skipping Cursor setup", warn() ); - return Ok(()); } + Ok(()) +} - let capture_script = config.scripts_root().join("capture-cursor.py"); - let raw = fs::read_to_string(&hooks_path)?; +fn configure_cursor_hooks_file( + hooks_path: &std::path::Path, + capture_cmd: &str, +) -> Result<()> { + let raw = fs::read_to_string(hooks_path).unwrap_or_else(|_| "{}".to_string()); let mut hooks_cfg: serde_json::Value = - serde_json::from_str(&raw).context("parsing ~/.cursor/hooks.json")?; + serde_json::from_str(&raw).context("parsing hooks.json")?; let hooks = hooks_cfg .as_object_mut() @@ -27,10 +78,9 @@ pub fn step_configure_cursor(config: &Config) -> Result<()> { .as_object_mut() .unwrap(); - let capture_cmd = format!("python3 {}", capture_script.display()); let events = ["afterFileEdit", "afterTabFileEdit", "beforeSubmitPrompt"]; - let mut changed = false; + for event in events { let arr = hooks .entry(event) @@ -49,7 +99,7 @@ pub fn step_configure_cursor(config: &Config) -> Result<()> { if cmd.contains("capture-cursor.py") { found = true; if cmd != capture_cmd { - *cmd_val = serde_json::Value::String(capture_cmd.clone()); + *cmd_val = serde_json::Value::String(capture_cmd.to_string()); changed = true; } 
} @@ -60,7 +110,6 @@ pub fn step_configure_cursor(config: &Config) -> Result<()> { changed = true; } - // De-duplicate exact command duplicates while preserving order. let mut seen = std::collections::HashSet::new(); arr.retain(|hook| { let Some(cmd) = hook.get("command").and_then(|c| c.as_str()) else { @@ -77,14 +126,14 @@ pub fn step_configure_cursor(config: &Config) -> Result<()> { } if changed { - fs::write(&hooks_path, serde_json::to_string_pretty(&hooks_cfg)?)?; + fs::write(hooks_path, serde_json::to_string_pretty(&hooks_cfg)?)?; println!( "{} Cursor hooks registered in {}", ok(), hooks_path.display() ); } else { - println!("{} Cursor hooks already present", dim()); + println!("{} Cursor hooks already present in {}", dim(), hooks_path.display()); } Ok(()) } diff --git a/src/configure/mod.rs b/src/configure/mod.rs index 2d4e12e..27b87ab 100644 --- a/src/configure/mod.rs +++ b/src/configure/mod.rs @@ -136,29 +136,34 @@ fn print_configure_summary( None => return, }; - // Each tuple: (display name, skipped flag, config path joined from home, marker string) - let home_based: &[(&str, bool, &[&str], &str)] = &[ + // Each tuple: (display name, skipped flag, presence-check path parts, config file path parts, marker string) + // presence_parts: path that must exist for the tool to be considered installed (dir or file) + // config_parts: path that contains the hooks (checked for marker string) + let home_based: &[(&str, bool, &[&str], &[&str], &str)] = &[ ( "claude-code", no_claude, + &[".claude"], &[".claude", "settings.json"], "capture-claude", ), ( "cursor", no_cursor, + &[".cursor"], &[".cursor", "hooks.json"], "capture-cursor", ), ( "windsurf", no_windsurf, + &[".codeium", "windsurf"], &[".codeium", "windsurf", "hooks.json"], "capture-windsurf", ), ]; - for (name, skipped, path_parts, marker) in home_based { + for (name, skipped, presence_parts, config_parts, marker) in home_based { if *skipped { println!( " {} {} skipped (--no-{})", @@ -168,11 +173,20 @@ fn 
print_configure_summary( ); continue; } - let config_path = path_parts.iter().fold(home.clone(), |p, part| p.join(part)); - if !config_path.exists() { + let presence_path = presence_parts.iter().fold(home.clone(), |p, part| p.join(part)); + if !presence_path.exists() { println!(" {} {} not installed on this machine", dim(), name); continue; } + let config_path = config_parts.iter().fold(home.clone(), |p, part| p.join(part)); + if !config_path.exists() { + println!( + " {} {} hook missing — re-run 'agentdiff configure'", + warn(), + name + ); + continue; + } let registered = std::fs::read_to_string(&config_path) .map(|s| s.contains(marker)) .unwrap_or(false); From 3026001b4d7ef693b2cf9f4eb38d27fbf42a2cfc Mon Sep 17 00:00:00 2001 From: Prakhar Khatri Date: Tue, 21 Apr 2026 07:24:10 +0000 Subject: [PATCH 5/9] docs: add CLAUDE.md with project context, attribution invariants, and gotchas Co-Authored-By: Claude Sonnet 4.6 --- CLAUDE.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 CLAUDE.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..6c402e3 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,25 @@ +# agentdiff — Project Context for Claude + +## Role +Work on this as a senior engineer. The bar is production quality: correct attribution logic, no edge-case misattribution, clean diffs. Argue when you think a direction is wrong, but ship it working. + +## Project summary +`agentdiff` is a Rust + Python CLI that tracks which AI agent (claude-code, cursor, opencode, copilot, etc.) wrote which lines of code in a git repo. It hooks into agent tool callbacks, captures to `session.jsonl`, then on commit runs `prepare-ledger.py` → `finalize-ledger.py` to produce signed `AgentTrace` records in `.git/agentdiff/traces/{branch}.jsonl`. + +## Architecture +- **Capture**: per-agent Python scripts (`capture-claude.py`, etc.) 
write raw events to `.git/agentdiff/session.jsonl` +- **Prepare** (`scripts/prepare-ledger.py`): runs pre-commit, reads session.jsonl, computes per-file attribution using line overlap, writes `pending_ledger.json` +- **Finalize** (`scripts/finalize-ledger.py`): runs post-commit, converts pending payload to signed `AgentTrace`, appends to `traces/{branch}.jsonl` +- **Store** (`src/store.rs`): reads traces into `Entry` structs for `list` / `report` commands +- **Binary install path**: `~/.local/bin/agentdiff` (NOT `~/.cargo/bin/`) — always `cp target/release/agentdiff ~/.local/bin/agentdiff` after build + +## Attribution invariants +- `copilot` is in `_EXCLUDED_AGENTS` — captured in session.jsonl for stats, never wins file attribution +- Files with no session evidence → `agent = "human"`, must be explicit in attribution dict +- `agent = "human"` in payload is the semantic token; `git_author` holds the display name +- `contributor.type = "human"` iff `file_agent == "human"` — never infer from tool name + +## Key gotchas learned the hard way +- Scripts installed to `~/.agentdiff/scripts/` must be manually synced after edits: `cp scripts/*.py ~/.agentdiff/scripts/` +- `load_entries()` in store.rs must NOT load session.jsonl — only traces; uncommitted path uses `load_uncommitted_entries()` +- Configure steps must check directory existence (e.g. 
`~/.cursor/`), not config file existence — create the file if absent From 8bff731b14de114f2ebb1db0510b22d8ddf85c26 Mon Sep 17 00:00:00 2001 From: Prakhar Khatri Date: Tue, 21 Apr 2026 07:24:15 +0000 Subject: [PATCH 6/9] chore: bump version to 0.1.25 Co-Authored-By: Claude Sonnet 4.6 --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 14a4879..90186a0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,7 +4,7 @@ version = 4 [[package]] name = "agentdiff" -version = "0.1.23" +version = "0.1.25" dependencies = [ "anyhow", "assert_cmd", diff --git a/Cargo.toml b/Cargo.toml index 501310a..7727009 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "agentdiff" -version = "0.1.23" +version = "0.1.25" edition = "2024" rust-version = "1.85" description = "Audit and trace autonomous AI code contributions in git repositories" From f8d8fa054c25b9fe890a589196eae7e42997ebf7 Mon Sep 17 00:00:00 2001 From: Prakhar Khatri Date: Wed, 22 Apr 2026 11:32:41 +0000 Subject: [PATCH 7/9] feat: enhance opencode capture with model and prompt retrieval from SQLite DB - Added functions to retrieve the model ID and initial user prompt from the OpenCode SQLite database. - Implemented fallback mechanisms to read from model.json if the database lookup fails. - Updated the main capture logic to utilize the new retrieval functions for model and prompt. - Introduced a comprehensive test script for the agentdiff pipeline, validating the entire capture, prepare, and finalize process with real and simulated agents. - Improved cursor configuration in Rust to ensure versioning in hooks configuration. 
--- scripts/capture-claude.py | 125 ++++++++++++++++++++++++++---------- scripts/capture-codex.py | 87 +++++++++++++++++++------ scripts/capture-cursor.py | 122 ++++++++++++++++++++++++++++------- scripts/capture-opencode.py | 102 ++++++++++++++++++++++++++++- src/configure/cursor.rs | 6 +- 5 files changed, 359 insertions(+), 83 deletions(-) diff --git a/scripts/capture-claude.py b/scripts/capture-claude.py index e55e7b8..e5a2ad8 100644 --- a/scripts/capture-claude.py +++ b/scripts/capture-claude.py @@ -85,49 +85,103 @@ def get_session_log(cwd: str): return None -def get_model_and_prompt(cwd: str, session_id: str) -> tuple: - """Read model and prompt from Claude Code session JSONL. +def _tail_read_jsonl(path: str, chunk_size: int = 32768) -> list: + """Read JSONL lines from the end of a potentially large file. - Claude Code stores session files at: - ~/.claude/projects/{repo-slug}/{session_id}.jsonl - where the repo slug is the repo path with slashes replaced by dashes. - We glob-search all project dirs to avoid reconstructing the slug. + Returns parsed dicts, most-recent first. Reads at most chunk_size bytes + from the end on the first pass — enough for thousands of short entries. 
""" - import glob as _glob + results = [] try: - home = os.path.expanduser("~") - pattern = os.path.join(home, ".claude", "projects", "**", f"{session_id}.jsonl") - matches = _glob.glob(pattern, recursive=True) - if not matches: - return "unknown", "unknown" - - session_path = matches[0] - with open(session_path, encoding="utf-8", errors="replace") as f: - lines = f.readlines() - - model = "unknown" - for line in reversed(lines): - try: - entry = json.loads(line) - if entry.get("type") == "assistant" and entry.get("message", {}).get("model"): - model = entry["message"]["model"] - break - except Exception: + size = os.path.getsize(path) + with open(path, "rb") as fh: + offset = max(0, size - chunk_size) + fh.seek(offset) + raw = fh.read() + if offset > 0: + # Skip the (possibly partial) first line we cut into. + nl = raw.find(b"\n") + raw = raw[nl + 1:] if nl >= 0 else raw + for line in reversed(raw.decode("utf-8", errors="replace").splitlines()): + line = line.strip() + if not line: continue - - prompt = "unknown" - for line in reversed(lines): try: - entry = json.loads(line) - if entry.get("type") == "last-prompt": - prompt = entry.get("lastPrompt", "unknown") - break + results.append(json.loads(line)) except Exception: continue - - return model, prompt except Exception: - return "unknown", "unknown" + pass + return results + + +def get_prompt_from_history(session_id: str) -> str: + """Read the most-recent user prompt for session_id from ~/.claude/history.jsonl. + + history.jsonl format (one JSON object per line): + {"display":"...", "pastedContents":{...}, "sessionId":"...", "project":"...", "timestamp":...} + + We take the most-recent entry whose sessionId matches and whose display + is not a slash command. We also append any inline pasted text content. 
+ """ + path = os.path.expanduser("~/.claude/history.jsonl") + entries = _tail_read_jsonl(path) + for entry in entries: + if entry.get("sessionId") != session_id: + continue + display = entry.get("display", "").strip() + if not display or display.startswith("/"): + continue + # Append pasted content that has actual text (not just a hash). + extra_parts = [] + for pasted in (entry.get("pastedContents") or {}).values(): + if isinstance(pasted, dict) and pasted.get("type") == "text": + content = pasted.get("content", "") + if content: + extra_parts.append(content[:200]) + if extra_parts: + display = display + " [pasted: " + " | ".join(extra_parts) + "]" + return display[:500] + return "unknown" + + +def get_model_and_prompt(cwd: str, session_id: str) -> tuple: + """Read model from Claude Code session JSONL, prompt from history.jsonl. + + Model: ~/.claude/projects/{repo-slug}/{session_id}.jsonl — assistant entries. + Skips <synthetic> model values (injected during context compression). + Prompt: ~/.claude/history.jsonl — most-recent display for this sessionId. + """ + import glob as _glob + model = "unknown" + try: + home = os.path.expanduser("~") + pattern = os.path.join(home, ".claude", "projects", "**", f"{session_id}.jsonl") + debug_log(f"glob pattern: {pattern}") + matches = _glob.glob(pattern, recursive=True) + debug_log(f"glob matches: {matches}") + if matches: + session_path = matches[0] + debug_log(f"session_path: {session_path}") + for entry in _tail_read_jsonl(session_path): + if entry.get("type") == "assistant": + m = entry.get("message", {}).get("model", "") + if m and m != "<synthetic>": + model = m + debug_log(f"model found: {model}") + break + except Exception as exc: + debug_log(f"model lookup error: {exc}") + + prompt = get_prompt_from_history(session_id) + # Allow test/CI injection via env var when history lookup can't find the session.
+ if prompt == "unknown": + env_prompt = os.environ.get("AGENTDIFF_PROMPT", "") + if env_prompt: + prompt = env_prompt + debug_log(f"prompt from AGENTDIFF_PROMPT env var") + debug_log(f"prompt: {prompt[:80]!r}") + return model, prompt def is_in_repo(abs_file: str, repo_root: str) -> bool: @@ -195,6 +249,7 @@ def main(): sys.exit(0) session_id = first(payload, "session_id", "sessionId", default="unknown") + debug_log(f"before get_model_and_prompt session_id={session_id}") model, prompt = get_model_and_prompt(cwd, session_id) timestamp = datetime.now(timezone.utc).isoformat() diff --git a/scripts/capture-codex.py b/scripts/capture-codex.py index 1936e8a..dc1c76d 100644 --- a/scripts/capture-codex.py +++ b/scripts/capture-codex.py @@ -17,15 +17,26 @@ def debug_enabled() -> bool: return os.environ.get("AGENTDIFF_DEBUG", "").lower() in {"1", "true", "yes", "on"} +def _write_log(path: str, message: str) -> None: + try: + log_dir = os.path.expanduser("~/.agentdiff/logs") + os.makedirs(log_dir, exist_ok=True) + ts = datetime.now(timezone.utc).isoformat() + with open(os.path.join(log_dir, path), "a", encoding="utf-8") as f: + f.write(f"{ts} {message}\n") + except Exception: + pass + + +def always_log(message: str) -> None: + """Write to codex.log unconditionally — key events, no secrets.""" + _write_log("capture-codex.log", message) + + def debug_log(message: str) -> None: if not debug_enabled(): return - log_dir = os.path.expanduser("~/.agentdiff/logs") - os.makedirs(log_dir, exist_ok=True) - path = os.path.join(log_dir, "capture-codex.log") - ts = datetime.now(timezone.utc).isoformat() - with open(path, "a", encoding="utf-8") as f: - f.write(f"{ts} {message}\n") + _write_log("capture-codex-debug.log", message) def first(payload: dict, *keys, default=None): @@ -39,6 +50,46 @@ def codex_sessions_root() -> str: return os.environ.get("CODEX_SESSIONS_ROOT", os.path.expanduser("~/.codex/sessions")) +def _tail_read_jsonl(path: str, chunk_size: int = 32768) -> List[dict]: + 
"""Read JSONL lines from the end of a potentially large file, most-recent first.""" + results: List[dict] = [] + try: + size = os.path.getsize(path) + with open(path, "rb") as fh: + offset = max(0, size - chunk_size) + fh.seek(offset) + raw = fh.read() + if offset > 0: + nl = raw.find(b"\n") + raw = raw[nl + 1:] if nl >= 0 else raw + for line in reversed(raw.decode("utf-8", errors="replace").splitlines()): + line = line.strip() + if not line: + continue + try: + results.append(json.loads(line)) + except Exception: + continue + except Exception: + pass + return results + + +def get_prompt_from_history(session_id: str) -> str: + """Read the most-recent user prompt for session_id from ~/.codex/history.jsonl. + + history.jsonl format: + {"session_id":"...","ts":1234567890,"text":"..."} + + Returns the text of the most-recent entry whose session_id matches. + """ + path = os.path.expanduser("~/.codex/history.jsonl") + for entry in _tail_read_jsonl(path): + if entry.get("session_id") == session_id and entry.get("text"): + return str(entry["text"])[:500] + return "" + + def find_repo_root(cwd: str) -> str: try: result = subprocess.run( @@ -586,6 +637,7 @@ def main() -> int: try: cwd, model, session_id, turn_id, prompt, event_name = extract_codex_context(events) + always_log(f"event={event_name!r} turn={turn_id!r} cwd={cwd!r} model={model!r} session={session_id!r}") debug_log(f"event_name={event_name!r} turn_id={turn_id!r} cwd_from_events={cwd!r}") # task_started / UserPromptSubmit: snapshot dirty files so task_complete can @@ -616,6 +668,7 @@ def main() -> int: "agent_turn_stop", } if event_name and event_name in known_skip_events: + always_log(f"SKIP non_edit_event={event_name!r}") debug_log(f"skip: non-edit event {event_name!r}") run_forward(forward_cmd, input_data) return 0 @@ -637,23 +690,15 @@ def main() -> int: repo_root, chosen_cwd, changed = resolve_repo_and_changes([recovered_cwd] if recovered_cwd else []) if not changed: + always_log(f"SKIP no_changed_lines 
candidates={candidate_cwds}") debug_log("skip: no changed lines found in any candidate repo") run_forward(forward_cmd, input_data) return 0 - # Filter out files that were already dirty before this codex task started. - # This prevents codex from claiming attribution for changes made by other - # agents (claude-code, opencode, etc.) that were pending at task_started. - pre_task_files = load_and_consume_pre_task_state(repo_root) if repo_root else set() - if pre_task_files: - changed = {f: lines for f, lines in changed.items() if f not in pre_task_files} - debug_log( - f"post-filter: {len(changed)} files after excluding {len(pre_task_files)} pre-task dirty files" - ) - if not changed: - debug_log("skip: all changed files were pre-existing dirty (not codex's work)") - run_forward(forward_cmd, input_data) - return 0 + # Consume (and discard) the pre-task snapshot — kept for hook compatibility + # but no longer used to filter. Attribution conflicts across agents are + # resolved by prepare-ledger at commit time, not here. 
+ load_and_consume_pre_task_state(repo_root) if repo_root else None if not chosen_cwd: chosen_cwd = cwd or os.getcwd() @@ -670,6 +715,7 @@ def main() -> int: timestamp = datetime.now(timezone.utc).isoformat() session_log = get_session_log(chosen_cwd) if session_log is None: + always_log(f"SKIP no_agentdiff_init cwd={chosen_cwd!r}") debug_log(f"skip: agentdiff init not run in {chosen_cwd!r}") return 0 @@ -685,12 +731,13 @@ def main() -> int: "tool": event_name or "task_complete", "file": file_path, "abs_file": abs_file, - "prompt": prompt or "unknown", + "prompt": prompt or get_prompt_from_history(str(session_id)) or "unknown", "acceptance": "verbatim", "lines": lines, } f.write(json.dumps(entry) + "\n") + always_log(f"WROTE {len(changed)} entries files={list(changed.keys())} model={model!r} session={session_log!r}") debug_log(f"wrote {len(changed)} codex entries to {session_log}") finally: run_forward(forward_cmd, input_data) diff --git a/scripts/capture-cursor.py b/scripts/capture-cursor.py index 697e6af..98add9d 100644 --- a/scripts/capture-cursor.py +++ b/scripts/capture-cursor.py @@ -14,15 +14,26 @@ def debug_enabled() -> bool: return os.environ.get("AGENTDIFF_DEBUG", "").lower() in {"1", "true", "yes", "on"} +def _write_log(path: str, message: str) -> None: + try: + log_dir = os.path.expanduser("~/.agentdiff/logs") + os.makedirs(log_dir, exist_ok=True) + ts = datetime.now(timezone.utc).isoformat() + with open(os.path.join(log_dir, path), "a", encoding="utf-8") as f: + f.write(f"{ts} {message}\n") + except Exception: + pass + + +def always_log(message: str) -> None: + """Write to cursor.log unconditionally — key events, no secrets.""" + _write_log("capture-cursor.log", message) + + def debug_log(message: str) -> None: if not debug_enabled(): return - log_dir = os.path.expanduser("~/.agentdiff/logs") - os.makedirs(log_dir, exist_ok=True) - path = os.path.join(log_dir, "capture-cursor.log") - ts = datetime.now(timezone.utc).isoformat() - with open(path, "a", 
encoding="utf-8") as f: - f.write(f"{ts} {message}\n") + _write_log("capture-cursor-debug.log", message) def first(payload: dict, *keys, default=None): @@ -107,9 +118,65 @@ def get_cached_prompt(conversation_id: str) -> str: """Read cached prompt from beforeSubmitPrompt.""" prompt_path = os.path.expanduser(f"~/.cursor/hooks/prompts/{conversation_id}.txt") if os.path.exists(prompt_path): - with open(prompt_path) as f: - return f.read().strip() - return "unknown" + try: + with open(prompt_path) as f: + return f.read().strip() + except Exception: + pass + return "" + + +def _cursor_project_slug(repo_root: str) -> str: + """Derive the ~/.cursor/projects/ slug from a repo root path. + + /home/prakh/ml-resarch → home-prakh-ml-resarch + """ + return repo_root.lstrip("/").replace("/", "-") + + +def get_prompt_from_transcript(conversation_id: str, repo_root: str) -> str: + """Read the user's prompt from Cursor's agent-transcript JSONL. + + Files live at: + ~/.cursor/projects/{slug}/agent-transcripts/{conv_id}/{conv_id}.jsonl + + We read the first user message and extract its text content. + """ + slug = _cursor_project_slug(repo_root) + transcript_path = os.path.expanduser( + f"~/.cursor/projects/{slug}/agent-transcripts/{conversation_id}/{conversation_id}.jsonl" + ) + if not os.path.exists(transcript_path): + debug_log(f"transcript not found: {transcript_path}") + return "" + try: + with open(transcript_path, encoding="utf-8", errors="replace") as f: + for raw in f: + raw = raw.strip() + if not raw: + continue + try: + entry = json.loads(raw) + except Exception: + continue + if entry.get("role") != "user": + continue + content = entry.get("message", {}).get("content", []) + if isinstance(content, list): + for part in content: + if isinstance(part, dict) and part.get("type") == "text": + text = part.get("text", "") + # Strip the wrapper Cursor adds. 
+ text = re.sub(r"<user_query>\s*", "", text) + text = re.sub(r"\s*</user_query>", "", text) + return text.strip()[:500] + elif isinstance(content, str): + return content.strip()[:500] + except Exception as exc: + debug_log(f"transcript read error: {exc}") + return "" + + def main(): @@ -121,11 +188,12 @@ def main(): try: payload = json.loads(input_data) except json.JSONDecodeError: + always_log(f"SKIP parse_error input={input_data[:120]!r}") sys.exit(0) event_name = first(payload, "hook_event_name", "hookEventName", "event_name", "event", default="") if event_name not in ["afterFileEdit", "afterTabFileEdit", "beforeSubmitPrompt"]: - debug_log(f"skip: unknown event_name={event_name!r}") + always_log(f"SKIP unknown_event={event_name!r}") sys.exit(0) # Handle beforeSubmitPrompt - cache the prompt @@ -136,7 +204,7 @@ def main(): os.makedirs(prompt_dir, exist_ok=True) with open(os.path.join(prompt_dir, f"{conversation_id}.txt"), "w") as f: f.write(prompt) - debug_log(f"cached prompt for conversation_id={conversation_id}") + always_log(f"cached_prompt conv={conversation_id}") sys.exit(0) cwd_raw = first(payload, "cwd", "workspace", "workspace_path", "workspacePath", default=os.getcwd()) @@ -147,12 +215,12 @@ def main(): if not abs_file and isinstance(payload.get("file"), dict): abs_file = first(payload.get("file", {}), "path", "file_path", "filePath", default="") if not abs_file: - debug_log("skip: missing abs_file") + always_log("SKIP missing_abs_file") sys.exit(0) abs_file = normalize_path(str(abs_file), cwd) if not abs_file: - debug_log("skip: invalid abs_file after normalize") + always_log("SKIP invalid_abs_file_after_normalize") sys.exit(0) file_repo_root = find_repo_root_from_path(abs_file) @@ -161,21 +229,25 @@ def main(): in_repo = is_git_repo(repo_root) if in_repo and not is_within_repo(abs_file, repo_root): - debug_log(f"skip: file outside repo abs_file={abs_file!r} repo_root={repo_root!r}") + always_log(f"SKIP file_outside_repo file={abs_file!r} repo={repo_root!r}") sys.exit(0) # Get
prompt for agent mode if event_name == "afterFileEdit": conversation_id = first(payload, "conversation_id", "conversationId", default="") - prompt = get_cached_prompt(conversation_id) if conversation_id else "unknown" + prompt = "" + if conversation_id: + prompt = get_cached_prompt(conversation_id) + if not prompt and repo_root: + prompt = get_prompt_from_transcript(conversation_id, repo_root) + if not prompt: + prompt = "unknown" mode = "agent" else: # afterTabFileEdit prompt = None mode = "tab" timestamp = datetime.now(timezone.utc).isoformat() - - # Model comes from payload in Cursor model = first(payload, "model", "model_name", "modelName", default="cursor-unknown") entry = { @@ -207,22 +279,26 @@ def main(): end = ch.get("endLine") or ch.get("line_end") or start if isinstance(start, int) and isinstance(end, int): new_lines.extend(list(range(min(start, end), max(start, end) + 1))) + entry["lines"] = new_lines if new_lines else old_lines + always_log( + f"event={event_name} file={entry['file']!r} model={model!r} " + f"n_lines={len(entry['lines'])} " + f"payload_old={old_lines} payload_new={new_lines} " + f"repo={repo_root!r}" + ) else: # tab completion line_num = first(payload, "line_number", "lineNumber", default=1) entry["lines"] = [line_num if isinstance(line_num, int) else 1] + always_log(f"event={event_name} file={entry['file']!r} line={line_num}") session_log = get_session_log(cwd, repo_root if in_repo else "") if session_log is None: - debug_log(f"skip: agentdiff init not run in {repo_root!r}") + always_log(f"SKIP no_agentdiff_init repo={repo_root!r}") sys.exit(0) with open(session_log, "a") as f: f.write(json.dumps(entry) + "\n") - debug_log( - "wrote entry " - f"event={event_name} file={entry['file']} lines={entry.get('lines')} " - f"cwd={cwd!r} repo_root={repo_root!r} session_log={session_log!r}" - ) + always_log(f"WROTE file={entry['file']!r} lines={entry.get('lines')} session_log={session_log!r}") if __name__ == "__main__": diff --git 
a/scripts/capture-opencode.py b/scripts/capture-opencode.py index 7c7276c..6632434 100644 --- a/scripts/capture-opencode.py +++ b/scripts/capture-opencode.py @@ -4,6 +4,7 @@ """ import json import os +import sqlite3 import subprocess import sys from datetime import datetime, timezone @@ -44,6 +45,91 @@ def find_repo_root(cwd: str) -> str: return cwd +_OPENCODE_DB = os.path.expanduser("~/.local/share/opencode/opencode.db") +_OPENCODE_MODEL_JSON = os.path.expanduser("~/.local/state/opencode/model.json") + + +def get_opencode_model(session_id: str) -> str: + """Look up the model used in an OpenCode session. + + Primary: query the SQLite DB for the most-recent assistant message's modelID. + Fallback: ~/.local/state/opencode/model.json → recent[0].modelID. + """ + if os.path.exists(_OPENCODE_DB): + try: + conn = sqlite3.connect(f"file:{_OPENCODE_DB}?mode=ro", uri=True, timeout=2) + row = conn.execute( + "SELECT json_extract(data, '$.modelID') " + "FROM message " + "WHERE session_id=? AND json_extract(data,'$.role')='assistant' " + "ORDER BY time_created DESC LIMIT 1", + (session_id,), + ).fetchone() + conn.close() + if row and row[0]: + debug_log(f"opencode model from DB: {row[0]!r}") + return str(row[0]) + except Exception as exc: + debug_log(f"opencode model DB lookup failed: {exc}") + + # Fallback: most-recently used model from model.json + if os.path.exists(_OPENCODE_MODEL_JSON): + try: + with open(_OPENCODE_MODEL_JSON, encoding="utf-8") as f: + data = json.load(f) + recent = data.get("recent", []) + if recent: + model_id = recent[0].get("modelID", "") + if model_id: + debug_log(f"opencode model from model.json: {model_id!r}") + return model_id + except Exception as exc: + debug_log(f"opencode model.json lookup failed: {exc}") + + return "opencode" + + +def get_opencode_prompt(session_id: str) -> str: + """Look up the user's initial prompt for an OpenCode session from the SQLite DB. 
+ + message.data contains: {"role":"user", ...} + The text is in the part table: {"type":"text","text":"..."} + """ + if not os.path.exists(_OPENCODE_DB): + return "unknown" + try: + conn = sqlite3.connect(f"file:{_OPENCODE_DB}?mode=ro", uri=True, timeout=2) + # Get first user message for this session + row = conn.execute( + "SELECT id FROM message WHERE session_id=? " + "AND json_extract(data,'$.role')='user' " + "ORDER BY time_created ASC LIMIT 1", + (session_id,), + ).fetchone() + if not row: + conn.close() + return "unknown" + msg_id = row[0] + # Get text parts for this message + parts = conn.execute( + "SELECT data FROM part WHERE message_id=? ORDER BY time_created ASC", + (msg_id,), + ).fetchall() + conn.close() + for part_row in parts: + try: + part = json.loads(part_row[0]) + if part.get("type") == "text" and part.get("text"): + text = str(part["text"]).strip() + debug_log(f"opencode prompt from DB: {text[:80]!r}") + return text[:500] + except Exception: + continue + except Exception as exc: + debug_log(f"opencode prompt DB lookup failed: {exc}") + return "unknown" + + def get_session_log(cwd: str): """Return session log path, or None if agentdiff init has not been run here.""" override = os.environ.get("AGENTDIFF_SESSION_LOG") @@ -121,8 +207,20 @@ def main() -> int: rel_file = abs_file[len(repo_root):].lstrip("/") if abs_file.startswith(repo_root) else abs_file session_id = str(first(payload, "session_id", "sessionId", default="unknown")) - model = str(first(payload, "model", "modelID", "model_id", default="opencode")) - prompt = first(payload, "prompt", "user_prompt", "userPrompt", default="unknown") + + # Model: payload may already carry it (from older plugin version); otherwise query DB. 
+ raw_model = str(first(payload, "model", "modelID", "model_id", default="") or "") + if not raw_model or raw_model == "opencode": + model = get_opencode_model(session_id) + else: + model = raw_model + + # Prompt: payload may carry it; otherwise query DB for first user message. + raw_prompt = first(payload, "prompt", "user_prompt", "userPrompt", default="") or "" + if not raw_prompt or str(raw_prompt) in ("unknown", "null"): + prompt = get_opencode_prompt(session_id) + else: + prompt = str(raw_prompt) entry = { "timestamp": datetime.now(timezone.utc).isoformat(), diff --git a/src/configure/cursor.rs b/src/configure/cursor.rs index 2b2dc07..8094757 100644 --- a/src/configure/cursor.rs +++ b/src/configure/cursor.rs @@ -70,9 +70,9 @@ fn configure_cursor_hooks_file( let mut hooks_cfg: serde_json::Value = serde_json::from_str(&raw).context("parsing hooks.json")?; - let hooks = hooks_cfg - .as_object_mut() - .unwrap() + let obj = hooks_cfg.as_object_mut().unwrap(); + obj.entry("version").or_insert(serde_json::json!(1)); + let hooks = obj .entry("hooks") .or_insert(serde_json::json!({})) .as_object_mut() From ff0a2e97ece08783cddccc1cbcdd4010eb599e59 Mon Sep 17 00:00:00 2001 From: Prakhar Khatri Date: Wed, 22 Apr 2026 11:49:40 +0000 Subject: [PATCH 8/9] fix: remove unconditional logging from cursor/codex captures; fix MCP attribution fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Drop always_log from capture-cursor.py and capture-codex.py — was writing to log files on every agent event regardless of AGENTDIFF_DEBUG, silently filling ~/.agentdiff/logs/. All call sites replaced with debug_log (conditional on AGENTDIFF_DEBUG env var). - Fix prepare-ledger.py: files with no session event but present in MCP files_read now correctly inherit the MCP agent/model instead of falling back to "human". Fixes CI mcp-smoke test failure: RuntimeError: expected model_id=mcp-smoke-model in trace entry. 
Co-Authored-By: Claude Sonnet 4.6 --- scripts/capture-codex.py | 15 +++++---------- scripts/capture-cursor.py | 25 ++++++++++--------------- scripts/prepare-ledger.py | 11 ++++++++--- 3 files changed, 23 insertions(+), 28 deletions(-) diff --git a/scripts/capture-codex.py b/scripts/capture-codex.py index dc1c76d..9db736d 100644 --- a/scripts/capture-codex.py +++ b/scripts/capture-codex.py @@ -28,11 +28,6 @@ def _write_log(path: str, message: str) -> None: pass -def always_log(message: str) -> None: - """Write to codex.log unconditionally — key events, no secrets.""" - _write_log("capture-codex.log", message) - - def debug_log(message: str) -> None: if not debug_enabled(): return @@ -637,7 +632,7 @@ def main() -> int: try: cwd, model, session_id, turn_id, prompt, event_name = extract_codex_context(events) - always_log(f"event={event_name!r} turn={turn_id!r} cwd={cwd!r} model={model!r} session={session_id!r}") + debug_log(f"event={event_name!r} turn={turn_id!r} cwd={cwd!r} model={model!r} session={session_id!r}") debug_log(f"event_name={event_name!r} turn_id={turn_id!r} cwd_from_events={cwd!r}") # task_started / UserPromptSubmit: snapshot dirty files so task_complete can @@ -668,7 +663,7 @@ def main() -> int: "agent_turn_stop", } if event_name and event_name in known_skip_events: - always_log(f"SKIP non_edit_event={event_name!r}") + debug_log(f"SKIP non_edit_event={event_name!r}") debug_log(f"skip: non-edit event {event_name!r}") run_forward(forward_cmd, input_data) return 0 @@ -690,7 +685,7 @@ def main() -> int: repo_root, chosen_cwd, changed = resolve_repo_and_changes([recovered_cwd] if recovered_cwd else []) if not changed: - always_log(f"SKIP no_changed_lines candidates={candidate_cwds}") + debug_log(f"SKIP no_changed_lines candidates={candidate_cwds}") debug_log("skip: no changed lines found in any candidate repo") run_forward(forward_cmd, input_data) return 0 @@ -715,7 +710,7 @@ def main() -> int: timestamp = datetime.now(timezone.utc).isoformat() session_log = 
get_session_log(chosen_cwd) if session_log is None: - always_log(f"SKIP no_agentdiff_init cwd={chosen_cwd!r}") + debug_log(f"SKIP no_agentdiff_init cwd={chosen_cwd!r}") debug_log(f"skip: agentdiff init not run in {chosen_cwd!r}") return 0 @@ -737,7 +732,7 @@ def main() -> int: } f.write(json.dumps(entry) + "\n") - always_log(f"WROTE {len(changed)} entries files={list(changed.keys())} model={model!r} session={session_log!r}") + debug_log(f"WROTE {len(changed)} entries files={list(changed.keys())} model={model!r} session={session_log!r}") debug_log(f"wrote {len(changed)} codex entries to {session_log}") finally: run_forward(forward_cmd, input_data) diff --git a/scripts/capture-cursor.py b/scripts/capture-cursor.py index 98add9d..e5824dc 100644 --- a/scripts/capture-cursor.py +++ b/scripts/capture-cursor.py @@ -25,11 +25,6 @@ def _write_log(path: str, message: str) -> None: pass -def always_log(message: str) -> None: - """Write to cursor.log unconditionally — key events, no secrets.""" - _write_log("capture-cursor.log", message) - - def debug_log(message: str) -> None: if not debug_enabled(): return @@ -188,12 +183,12 @@ def main(): try: payload = json.loads(input_data) except json.JSONDecodeError: - always_log(f"SKIP parse_error input={input_data[:120]!r}") + debug_log(f"SKIP parse_error input={input_data[:120]!r}") sys.exit(0) event_name = first(payload, "hook_event_name", "hookEventName", "event_name", "event", default="") if event_name not in ["afterFileEdit", "afterTabFileEdit", "beforeSubmitPrompt"]: - always_log(f"SKIP unknown_event={event_name!r}") + debug_log(f"SKIP unknown_event={event_name!r}") sys.exit(0) # Handle beforeSubmitPrompt - cache the prompt @@ -204,7 +199,7 @@ def main(): os.makedirs(prompt_dir, exist_ok=True) with open(os.path.join(prompt_dir, f"{conversation_id}.txt"), "w") as f: f.write(prompt) - always_log(f"cached_prompt conv={conversation_id}") + debug_log(f"cached_prompt conv={conversation_id}") sys.exit(0) cwd_raw = first(payload, "cwd", 
"workspace", "workspace_path", "workspacePath", default=os.getcwd()) @@ -215,12 +210,12 @@ def main(): if not abs_file and isinstance(payload.get("file"), dict): abs_file = first(payload.get("file", {}), "path", "file_path", "filePath", default="") if not abs_file: - always_log("SKIP missing_abs_file") + debug_log("SKIP missing_abs_file") sys.exit(0) abs_file = normalize_path(str(abs_file), cwd) if not abs_file: - always_log("SKIP invalid_abs_file_after_normalize") + debug_log("SKIP invalid_abs_file_after_normalize") sys.exit(0) file_repo_root = find_repo_root_from_path(abs_file) @@ -229,7 +224,7 @@ def main(): in_repo = is_git_repo(repo_root) if in_repo and not is_within_repo(abs_file, repo_root): - always_log(f"SKIP file_outside_repo file={abs_file!r} repo={repo_root!r}") + debug_log(f"SKIP file_outside_repo file={abs_file!r} repo={repo_root!r}") sys.exit(0) # Get prompt for agent mode @@ -281,7 +276,7 @@ def main(): new_lines.extend(list(range(min(start, end), max(start, end) + 1))) entry["lines"] = new_lines if new_lines else old_lines - always_log( + debug_log( f"event={event_name} file={entry['file']!r} model={model!r} " f"n_lines={len(entry['lines'])} " f"payload_old={old_lines} payload_new={new_lines} " @@ -290,15 +285,15 @@ def main(): else: # tab completion line_num = first(payload, "line_number", "lineNumber", default=1) entry["lines"] = [line_num if isinstance(line_num, int) else 1] - always_log(f"event={event_name} file={entry['file']!r} line={line_num}") + debug_log(f"event={event_name} file={entry['file']!r} line={line_num}") session_log = get_session_log(cwd, repo_root if in_repo else "") if session_log is None: - always_log(f"SKIP no_agentdiff_init repo={repo_root!r}") + debug_log(f"SKIP no_agentdiff_init repo={repo_root!r}") sys.exit(0) with open(session_log, "a") as f: f.write(json.dumps(entry) + "\n") - always_log(f"WROTE file={entry['file']!r} lines={entry.get('lines')} session_log={session_log!r}") + debug_log(f"WROTE file={entry['file']!r} 
lines={entry.get('lines')} session_log={session_log!r}") if __name__ == "__main__": diff --git a/scripts/prepare-ledger.py b/scripts/prepare-ledger.py index e4b5b4c..d86065f 100644 --- a/scripts/prepare-ledger.py +++ b/scripts/prepare-ledger.py @@ -328,11 +328,16 @@ def main() -> int: } # Files committed with no captured session event → attribute to human. - # Without this, finalize-ledger.py would inherit the dominant AI agent for these - # files even though we have no evidence the AI touched them. + # Exception: if the file appears in files_read from the MCP pending context and + # there is a non-human top-level agent, the MCP context is sufficient evidence — + # use the MCP agent/model rather than falling back to human. + files_read_set = {os.path.basename(f) for f in files_read} | set(files_read) for fp in files_touched: if fp not in events_by_file: - attribution[fp] = {"agent": "human", "model": "human"} + if agent != "human" and (fp in files_read_set or os.path.basename(fp) in files_read_set): + attribution[fp] = {"agent": agent, "model": model} + else: + attribution[fp] = {"agent": "human", "model": "human"} payload = { "captured_at": datetime.now(timezone.utc).isoformat(), From e3b94ea1004942207b189bf47de6519bf5fa2e2f Mon Sep 17 00:00:00 2001 From: Prakhar Khatri Date: Thu, 23 Apr 2026 10:21:38 +0000 Subject: [PATCH 9/9] =?UTF-8?q?fix:=20address=20bot=20review=20=E2=80=94?= =?UTF-8?q?=20MCP=20path=20normalisation,=20SQLite=20leaks,=20duplicate=20?= =?UTF-8?q?debug=20logs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - prepare-ledger: replace basename-union files_read_set with repo-relative path normalisation; full-path match now fires correctly, eliminating false-positive MCP attribution on common filenames (e.g. 
utils.py) - capture-opencode: guard both SQLite connections with contextlib.closing so the file lock is released on exception; probe both DB path candidates (~/.local/share/opencode and ~/.opencode) to cover all install methods - capture-codex: remove 5 duplicate debug_log lines that were strict subsets of the preceding log call Co-Authored-By: Claude Sonnet 4.6 --- scripts/capture-codex.py | 5 ---- scripts/capture-opencode.py | 56 ++++++++++++++++++------------------- scripts/prepare-ledger.py | 7 +++-- 3 files changed, 33 insertions(+), 35 deletions(-) diff --git a/scripts/capture-codex.py b/scripts/capture-codex.py index 9db736d..cef47eb 100644 --- a/scripts/capture-codex.py +++ b/scripts/capture-codex.py @@ -633,7 +633,6 @@ def main() -> int: try: cwd, model, session_id, turn_id, prompt, event_name = extract_codex_context(events) debug_log(f"event={event_name!r} turn={turn_id!r} cwd={cwd!r} model={model!r} session={session_id!r}") - debug_log(f"event_name={event_name!r} turn_id={turn_id!r} cwd_from_events={cwd!r}") # task_started / UserPromptSubmit: snapshot dirty files so task_complete can # isolate what codex changed. 
UserPromptSubmit fires from hooks.json before @@ -664,7 +663,6 @@ def main() -> int: } if event_name and event_name in known_skip_events: debug_log(f"SKIP non_edit_event={event_name!r}") - debug_log(f"skip: non-edit event {event_name!r}") run_forward(forward_cmd, input_data) return 0 @@ -686,7 +684,6 @@ def main() -> int: if not changed: debug_log(f"SKIP no_changed_lines candidates={candidate_cwds}") - debug_log("skip: no changed lines found in any candidate repo") run_forward(forward_cmd, input_data) return 0 @@ -711,7 +708,6 @@ def main() -> int: session_log = get_session_log(chosen_cwd) if session_log is None: debug_log(f"SKIP no_agentdiff_init cwd={chosen_cwd!r}") - debug_log(f"skip: agentdiff init not run in {chosen_cwd!r}") return 0 with open(session_log, "a", encoding="utf-8") as f: @@ -733,7 +729,6 @@ def main() -> int: f.write(json.dumps(entry) + "\n") debug_log(f"WROTE {len(changed)} entries files={list(changed.keys())} model={model!r} session={session_log!r}") - debug_log(f"wrote {len(changed)} codex entries to {session_log}") finally: run_forward(forward_cmd, input_data) diff --git a/scripts/capture-opencode.py b/scripts/capture-opencode.py index 6632434..209cc94 100644 --- a/scripts/capture-opencode.py +++ b/scripts/capture-opencode.py @@ -2,6 +2,7 @@ """ AgentDiff capture script for OpenCode plugin hooks. 
""" +import contextlib import json import os import sqlite3 @@ -45,7 +46,11 @@ def find_repo_root(cwd: str) -> str: return cwd -_OPENCODE_DB = os.path.expanduser("~/.local/share/opencode/opencode.db") +_OPENCODE_DB_CANDIDATES = [ + os.path.expanduser("~/.local/share/opencode/opencode.db"), + os.path.expanduser("~/.opencode/opencode.db"), +] +_OPENCODE_DB = next((p for p in _OPENCODE_DB_CANDIDATES if os.path.exists(p)), _OPENCODE_DB_CANDIDATES[0]) _OPENCODE_MODEL_JSON = os.path.expanduser("~/.local/state/opencode/model.json") @@ -57,15 +62,14 @@ def get_opencode_model(session_id: str) -> str: """ if os.path.exists(_OPENCODE_DB): try: - conn = sqlite3.connect(f"file:{_OPENCODE_DB}?mode=ro", uri=True, timeout=2) - row = conn.execute( - "SELECT json_extract(data, '$.modelID') " - "FROM message " - "WHERE session_id=? AND json_extract(data,'$.role')='assistant' " - "ORDER BY time_created DESC LIMIT 1", - (session_id,), - ).fetchone() - conn.close() + with contextlib.closing(sqlite3.connect(f"file:{_OPENCODE_DB}?mode=ro", uri=True, timeout=2)) as conn: + row = conn.execute( + "SELECT json_extract(data, '$.modelID') " + "FROM message " + "WHERE session_id=? AND json_extract(data,'$.role')='assistant' " + "ORDER BY time_created DESC LIMIT 1", + (session_id,), + ).fetchone() if row and row[0]: debug_log(f"opencode model from DB: {row[0]!r}") return str(row[0]) @@ -98,24 +102,20 @@ def get_opencode_prompt(session_id: str) -> str: if not os.path.exists(_OPENCODE_DB): return "unknown" try: - conn = sqlite3.connect(f"file:{_OPENCODE_DB}?mode=ro", uri=True, timeout=2) - # Get first user message for this session - row = conn.execute( - "SELECT id FROM message WHERE session_id=? " - "AND json_extract(data,'$.role')='user' " - "ORDER BY time_created ASC LIMIT 1", - (session_id,), - ).fetchone() - if not row: - conn.close() - return "unknown" - msg_id = row[0] - # Get text parts for this message - parts = conn.execute( - "SELECT data FROM part WHERE message_id=? 
ORDER BY time_created ASC", - (msg_id,), - ).fetchall() - conn.close() + with contextlib.closing(sqlite3.connect(f"file:{_OPENCODE_DB}?mode=ro", uri=True, timeout=2)) as conn: + row = conn.execute( + "SELECT id FROM message WHERE session_id=? " + "AND json_extract(data,'$.role')='user' " + "ORDER BY time_created ASC LIMIT 1", + (session_id,), + ).fetchone() + if not row: + return "unknown" + msg_id = row[0] + parts = conn.execute( + "SELECT data FROM part WHERE message_id=? ORDER BY time_created ASC", + (msg_id,), + ).fetchall() for part_row in parts: try: part = json.loads(part_row[0]) diff --git a/scripts/prepare-ledger.py b/scripts/prepare-ledger.py index d86065f..d6e4b62 100644 --- a/scripts/prepare-ledger.py +++ b/scripts/prepare-ledger.py @@ -331,10 +331,13 @@ def main() -> int: # Exception: if the file appears in files_read from the MCP pending context and # there is a non-human top-level agent, the MCP context is sufficient evidence — # use the MCP agent/model rather than falling back to human. - files_read_set = {os.path.basename(f) for f in files_read} | set(files_read) + files_read_rel = { + os.path.relpath(f, repo_root) if os.path.isabs(f) and f.startswith(repo_root) else f + for f in files_read + } for fp in files_touched: if fp not in events_by_file: - if agent != "human" and (fp in files_read_set or os.path.basename(fp) in files_read_set): + if agent != "human" and fp in files_read_rel: attribution[fp] = {"agent": agent, "model": model} else: attribution[fp] = {"agent": "human", "model": "human"}