From edea64ccb0cd58ac85d935dd5218d48652d01e10 Mon Sep 17 00:00:00 2001 From: Jean Mertz Date: Fri, 15 May 2026 08:32:56 +0200 Subject: [PATCH 1/2] fix(attachment, github): Use fallback when diff is too large GitHub's unified-diff endpoint (`application/vnd.github.diff`) caps responses at 20,000 lines and returns HTTP 406 beyond that. Previously, `jp` would surface that as an opaque error, making large PRs unusable via the `gh:pr` attachment URI. When the 406 is detected, the handler now falls back to the paginated `/pulls/{N}/files` endpoint and synthesizes a unified-diff-like blob from the per-file `DiffEntry` records. Each file gets the standard `diff --git a/PATH b/PATH` header plus the patch hunks as returned by GitHub. Renames and copies include the appropriate `rename from`/ `rename to` (or `copy from`/`copy to`) markers so downstream consumers can still identify the nature of each change. When GitHub omits the `patch` field for a file (binary files or files exceeding GitHub's per-file limit), a placeholder line is emitted that surfaces the file status and line counts, ensuring the LLM still knows the file was modified even without hunks. The existing exclude-pattern logic applies equally to the fallback path, so files like `Cargo.lock` or snapshot files are dropped before the synthesized blob is assembled. A notice is injected into the PR header whenever the fallback is used, including the count of patch-truncated files when non-zero. Signed-off-by: Jean Mertz --- crates/jp_attachment_github/src/lib.rs | 166 ++++++++++++++- crates/jp_attachment_github/src/lib_tests.rs | 210 +++++++++++++++++++ 2 files changed, 366 insertions(+), 10 deletions(-) diff --git a/crates/jp_attachment_github/src/lib.rs b/crates/jp_attachment_github/src/lib.rs index 2ba95a64..da6bc86c 100644 --- a/crates/jp_attachment_github/src/lib.rs +++ b/crates/jp_attachment_github/src/lib.rs @@ -54,8 +54,11 @@ use jp_attachment::{ Attachment, BoxedHandler, HANDLERS, Handler, distributed_slice, linkme, typetag, }; use jp_github::{ - Octocrab, - models::pulls::{PullRequest, Review, ReviewComment, Side}, + Error as GhError, Octocrab, + models::{ + pulls::{PullRequest, Review, ReviewComment, Side}, + repos::{DiffEntry, DiffEntryStatus}, + }, }; use jp_mcp::Client; use serde::{Deserialize, Serialize}; @@ -312,19 +315,156 @@ async fn fetch_pr_diff( // Metadata first (title, description, refs). let pr = pulls.get(number).await?; - - // Then the unified diff via custom Accept header. - let diff_text = pulls.diff(number).await?; - let patterns = compile_excludes(&parsed.excludes, parsed.no_defaults)?; - let (filtered, included, excluded) = filter_diff(&diff_text, &patterns); - let header = render_diff_header(uri, &pr, included, excluded, parsed); - let body = format!("{header}\n\n{filtered}"); + // Try the unified-diff endpoint first. GitHub caps it at 20,000 + // lines and returns 406 above that; fall back to the paginated + // /pulls/N/files endpoint and synthesize a diff-like blob from the + // per-file patches. The fallback has different limits (max ~300 + // files per PR; individual patches are dropped when too large or + // binary), so we surface both file count and patch-truncation count + // in the header. + let (body, included, excluded, notice) = match pulls.diff(number).await { + Ok(diff_text) => { + let (filtered, included, excluded) = filter_diff(&diff_text, &patterns); + (filtered, included, excluded, None) + } + Err(err) if is_diff_too_large(&err) => { + debug!( + uri = %uri, + "Unified diff exceeded GitHub's 20k-line cap; falling back to /pulls/N/files." + ); + let page = pulls.list_files(number).await?; + let entries = client.all_pages(page).await?; + let synth = synthesize_diff_from_files(&entries, &patterns); + let mut note = format!( + "Note: PR diff exceeded GitHub's 20,000-line cap; reconstructed from the \ + paginated /pulls/{number}/files endpoint." + ); + if synth.truncated > 0 { + note.push_str(&format!( + " {} file(s) had their patch omitted by GitHub (too large or binary); the \ + `diff --git` headers are still present.", + synth.truncated + )); + } + (synth.text, synth.included, synth.excluded, Some(note)) + } + Err(err) => return Err(err.into()), + }; + + let header = render_diff_header(uri, &pr, included, excluded, parsed, notice.as_deref()); + let body = format!("{header}\n\n{body}"); Ok(Attachment::text(uri.to_string(), body)) } +/// Did the unified-diff endpoint reject the PR for being too large? +/// +/// GitHub returns HTTP 406 (Not Acceptable) for `application/vnd.github.diff` +/// once the diff exceeds 20,000 lines. The same status is unlikely to come +/// back from this endpoint for any other reason, so a status-only check is +/// sufficient — we don't pattern-match the human-readable message because +/// GitHub has rewritten it before. +fn is_diff_too_large(err: &GhError) -> bool { + matches!(err, GhError::GitHub { source, .. } if source.status_code.as_u16() == 406) +} + +/// Output of [`synthesize_diff_from_files`]. +struct SynthesizedDiff { + text: String, + included: usize, + excluded: usize, + /// Files whose `patch` field was `None` — GitHub omits it for files + /// that are too large (~3000 lines) or binary. The `diff --git` header + /// is still emitted so the LLM can see *which* files changed even when + /// the hunks are missing. + truncated: usize, +} + +/// Build a unified-diff-like blob from the per-file entries returned by +/// `/pulls/{N}/files`. Used as the fallback when the unified-diff endpoint +/// exceeds GitHub's 20k-line cap. +/// +/// The output is *not* a byte-for-byte equivalent of `git diff`: GitHub's +/// per-file `patch` field starts at the first hunk (`@@ ...`) and omits the +/// `index`/`---`/`+++` lines. We add the `diff --git a/PATH b/PATH` header +/// per file so downstream consumers (and the LLM) can still identify file +/// boundaries the standard way. Renames and copies get the usual +/// `rename from`/`rename to` (or `copy from`/`copy to`) markers, mirroring +/// what `git diff` would emit. +fn synthesize_diff_from_files(entries: &[DiffEntry], patterns: &[Pattern]) -> SynthesizedDiff { + let mut text = String::new(); + let mut included = 0_usize; + let mut excluded = 0_usize; + let mut truncated = 0_usize; + + for entry in entries { + if path_matches_any(&entry.filename, patterns) { + excluded += 1; + continue; + } + + let renamed = matches!( + entry.status, + DiffEntryStatus::Renamed | DiffEntryStatus::Copied + ); + let prev = entry.previous_filename.as_deref().filter(|_| renamed); + let a_path = prev.unwrap_or(entry.filename.as_str()); + let b_path = entry.filename.as_str(); + + text.push_str(&format!("diff --git a/{a_path} b/{b_path}\n")); + if let Some(prev) = prev { + let label = if matches!(entry.status, DiffEntryStatus::Copied) { + "copy" + } else { + "rename" + }; + text.push_str(&format!("{label} from {prev}\n")); + text.push_str(&format!("{label} to {}\n", entry.filename)); + } + + if let Some(patch) = entry.patch.as_deref() { + text.push_str(patch); + if !patch.ends_with('\n') { + text.push('\n'); + } + } else { + truncated += 1; + text.push_str(&format!( + "[patch omitted by GitHub: file is binary or exceeds the per-file Files API \ + limit; status={status}, +{add}/-{del} ({changes} changes)]\n", + status = status_str(entry.status), + add = entry.additions, + del = entry.deletions, + changes = entry.changes, + )); + } + + included += 1; + } + + SynthesizedDiff { + text, + included, + excluded, + truncated, + } +} + +const fn status_str(s: DiffEntryStatus) -> &'static str { + match s { + DiffEntryStatus::Added => "added", + DiffEntryStatus::Removed => "removed", + DiffEntryStatus::Modified => "modified", + DiffEntryStatus::Renamed => "renamed", + DiffEntryStatus::Copied => "copied", + DiffEntryStatus::Changed => "changed", + DiffEntryStatus::Unchanged => "unchanged", + DiffEntryStatus::Unknown => "unknown", + } +} + async fn fetch_pr_reviews( uri: &Url, parsed: &ParsedUri, @@ -445,6 +585,7 @@ fn render_diff_header( included: usize, excluded: usize, parsed: &ParsedUri, + notice: Option<&str>, ) -> String { let title = pr.title.as_deref().unwrap_or("(no title)"); let author = pr.user.as_ref().map_or("(unknown)", |u| u.login.as_str()); @@ -485,9 +626,14 @@ fn render_diff_header( format!("Files included: {included}, excluded by defaults + `?exclude`: {excluded}.") }; + let notice_line = match notice { + Some(n) => format!("\n{n}"), + None => String::new(), + }; + format!( "PR #{number}: {title}\n\nSource: {uri}\nURL: {html_url}\nAuthor: {author}\nState: \ - {state}\nBase: {base}\nHead: {head}\n\n{filter_line}{body_section}", + {state}\nBase: {base}\nHead: {head}\n\n{filter_line}{notice_line}{body_section}", number = pr.number, ) } diff --git a/crates/jp_attachment_github/src/lib_tests.rs b/crates/jp_attachment_github/src/lib_tests.rs index 5d3a09a7..46db223f 100644 --- a/crates/jp_attachment_github/src/lib_tests.rs +++ b/crates/jp_attachment_github/src/lib_tests.rs @@ -1,10 +1,41 @@ use jp_github::models::{ User, pulls::{Review, ReviewComment, ReviewState, Side}, + repos::{DiffEntry, DiffEntryStatus}, }; use super::*; +fn entry( + filename: &str, + status: DiffEntryStatus, + additions: u64, + deletions: u64, + patch: Option<&str>, +) -> DiffEntry { + DiffEntry { + filename: filename.to_owned(), + status, + additions, + deletions, + changes: additions + deletions, + previous_filename: None, + patch: patch.map(str::to_owned), + } +} + +fn renamed_entry(prev: &str, new: &str, patch: Option<&str>) -> DiffEntry { + DiffEntry { + filename: new.to_owned(), + status: DiffEntryStatus::Renamed, + additions: 0, + deletions: 0, + changes: 0, + previous_filename: Some(prev.to_owned()), + patch: patch.map(str::to_owned), + } +} + fn url(s: &str) -> Url { Url::parse(s).unwrap() } @@ -696,3 +727,182 @@ index 111..222 assert_eq!(excluded, 0); assert!(filtered.contains("snapshots/foo.snap")); } + +#[test] +fn synthesizes_unified_diff_from_files() { + // Happy path: two modified files, both with patches. Output must + // carry a `diff --git` header per file plus the patch verbatim, so + // it can be consumed by anything that already parses unified diffs. + let entries = vec![ + entry( + "src/lib.rs", + DiffEntryStatus::Modified, + 1, + 1, + Some("@@ -1 +1 @@\n-old\n+new\n"), + ), + entry( + "src/main.rs", + DiffEntryStatus::Modified, + 1, + 0, + Some("@@ -1,2 +1,3 @@\n a\n+inserted\n b\n"), + ), + ]; + + let patterns = compile_excludes(&[], true).unwrap(); + let synth = synthesize_diff_from_files(&entries, &patterns); + + assert_eq!(synth.included, 2); + assert_eq!(synth.excluded, 0); + assert_eq!(synth.truncated, 0); + assert_eq!( + synth.text, + "\ +diff --git a/src/lib.rs b/src/lib.rs +@@ -1 +1 @@ +-old ++new +diff --git a/src/main.rs b/src/main.rs +@@ -1,2 +1,3 @@ + a ++inserted + b +" + ); +} + +#[test] +fn synthesizer_filters_entries_before_emitting_patches() { + // The exclude filter must drop matching entries entirely — not just + // hide them in the output. This is the win over the diff-endpoint + // path: we never have to look at a 5000-line lockfile patch to + // decide we don't want it. + let entries = vec![ + entry( + "src/lib.rs", + DiffEntryStatus::Modified, + 1, + 1, + Some("@@ -1 +1 @@\n-a\n+b\n"), + ), + entry( + "Cargo.lock", + DiffEntryStatus::Modified, + 5000, + 5000, + Some("@@ -1 +1 @@\n-x\n+y\n"), + ), + entry( + "snapshots/foo.snap", + DiffEntryStatus::Modified, + 10, + 10, + Some("@@ -1 +1 @@\n-s\n+s2\n"), + ), + ]; + + let patterns = compile_excludes(&[], false).unwrap(); + let synth = synthesize_diff_from_files(&entries, &patterns); + + assert_eq!(synth.included, 1); + assert_eq!(synth.excluded, 2); + assert!(synth.text.contains("diff --git a/src/lib.rs b/src/lib.rs")); + assert!( + !synth.text.contains("Cargo.lock"), + "excluded file must not appear in output:\n{}", + synth.text + ); + assert!( + !synth.text.contains("snapshots/foo.snap"), + "excluded file must not appear in output:\n{}", + synth.text + ); +} + +#[test] +fn synthesizer_emits_placeholder_for_missing_patch() { + // GitHub omits `patch` for files too large or binary. We still emit + // the `diff --git` header so the LLM can see the file changed, plus + // a placeholder summarizing the status and line counts. + let entries = vec![entry( + "assets/logo.png", + DiffEntryStatus::Modified, + 0, + 0, + None, + )]; + + let patterns = compile_excludes(&[], true).unwrap(); + let synth = synthesize_diff_from_files(&entries, &patterns); + + assert_eq!(synth.included, 1); + assert_eq!(synth.truncated, 1); + assert!( + synth + .text + .contains("diff --git a/assets/logo.png b/assets/logo.png") + ); + assert!( + synth.text.contains("[patch omitted by GitHub"), + "placeholder line should mention GitHub omitted the patch:\n{}", + synth.text + ); + assert!( + synth.text.contains("status=modified"), + "placeholder must surface the change status so reviewers know what kind of edit it \ + was:\n{}", + synth.text + ); +} + +#[test] +fn synthesizer_emits_rename_markers() { + // Renames need the `rename from`/`rename to` markers AND the `a/` + // path must point at the previous filename, otherwise downstream + // consumers can't distinguish a rename from a delete+add. With a + // patch attached, the markers come before the hunks. + let entries = vec![ + renamed_entry("src/old_name.rs", "src/new_name.rs", None), + renamed_entry( + "src/touched_old.rs", + "src/touched_new.rs", + Some("@@ -1 +1 @@\n-a\n+b\n"), + ), + ]; + + let patterns = compile_excludes(&[], true).unwrap(); + let synth = synthesize_diff_from_files(&entries, &patterns); + + assert_eq!(synth.included, 2); + assert!( + synth + .text + .contains("diff --git a/src/old_name.rs b/src/new_name.rs"), + "a/ side must use the previous filename for renames:\n{}", + synth.text + ); + assert!(synth.text.contains("rename from src/old_name.rs")); + assert!(synth.text.contains("rename to src/new_name.rs")); + assert!( + synth + .text + .contains("rename from src/touched_old.rs\nrename to src/touched_new.rs\n@@ "), + "rename markers must appear before the hunks:\n{}", + synth.text + ); +} + +#[test] +fn synthesizer_returns_empty_for_no_entries() { + // Defensive: a PR with no changed files (rare, but legal — e.g. + // a force-push to identical content) must produce no body and zero + // counters, not a malformed blob. + let patterns = compile_excludes(&[], true).unwrap(); + let synth = synthesize_diff_from_files(&[], &patterns); + + assert_eq!(synth.included, 0); + assert_eq!(synth.excluded, 0); + assert_eq!(synth.truncated, 0); + assert!(synth.text.is_empty()); +} From c5f96e1e7ddad04bf56d6aadeaa883ce78fba784 Mon Sep 17 00:00:00 2001 From: Jean Mertz Date: Sat, 16 May 2026 08:10:00 +0200 Subject: [PATCH 2/2] ci: kick merge ref Signed-off-by: Jean Mertz