Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 116 additions & 2 deletions .config/jp/tools/src/web/fetch/html.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,32 @@ use crate::{
/// Upper bound on the preview text kept per section in the listing.
///
/// Passed as `max` to `truncate_str`, whose length check compares against
/// `str::len` (a byte length), so for non-ASCII text this is not strictly a
/// count of characters.
const PREVIEW_MAX: usize = 120;

/// Element IDs that rustdoc-generated pages (docs.rs, local `cargo doc`
/// output) attach to their top-level wrappers.
///
/// The ancestor-id fallback in `resolve_heading_id` can surface these as
/// section anchors even though they are page chrome rather than content.
/// They are filtered at the listing layer only, so an explicit
/// `sections=["main-content"]` request keeps working on non-rustdoc pages.
const RUSTDOC_SCAFFOLDING_IDS: &[&str] = &[
    "rustdoc_body_wrapper",
    "rustdoc-toc",
    "rustdoc-modnav",
    "main-content",
];

/// Returns `true` when `id` is exactly (case-sensitively) one of the
/// entries in [`RUSTDOC_SCAFFOLDING_IDS`].
fn is_rustdoc_scaffolding_id(id: &str) -> bool {
    RUSTDOC_SCAFFOLDING_IDS.iter().any(|&known| known == id)
}

/// Reports whether `el` is a `<div>` whose `class` attribute lists
/// `docblock` (the container rustdoc uses for documentation prose).
///
/// The attribute is split on ASCII whitespace, so `class="docblock foo"`
/// matches while `class="docblock-short"` does not.
fn is_docblock(el: &ElementRef<'_>) -> bool {
    let element = el.value();
    if element.name() != "div" {
        return false;
    }

    match element.attr("class") {
        Some(classes) => classes
            .split_ascii_whitespace()
            .any(|class| class == "docblock"),
        None => false,
    }
}

/// Model name string. NOTE(review): presumably the model requested from the
/// Anthropic API; the use site is outside this view — confirm.
const HAIKU_MODEL: &str = "claude-haiku-4-5";
/// URL of the Anthropic `/v1/messages` endpoint. NOTE(review): the code that
/// issues the request is outside this view.
const ANTHROPIC_API_URL: &str = "https://api.anthropic.com/v1/messages";

Expand Down Expand Up @@ -207,6 +233,37 @@ fn extract_anchor_html(html: &str, anchor: &str) -> Option<String> {
))
}

/// Build the extraction for a rustdoc `<section>` item: the section itself
/// plus the immediately following sibling `<div class="docblock">`.
///
/// When the section is wrapped in `<summary>` (rustdoc's `<details>` toggle
/// layout), the docblock is a sibling of the summary, not of the section.
/// We do **not** return the whole `<details>` because impl headers nest
/// every method inside a `<div class="impl-items">` sibling — returning
/// the details would pull in every method's signature and docs.
fn extract_rustdoc_section(section: &ElementRef<'_>) -> String {
let mut parts = vec![section.html()];

let docblock_anchor = section
.parent()
.and_then(ElementRef::wrap)
.filter(|el| el.value().name() == "summary")
.unwrap_or(*section);

for sib in docblock_anchor.next_siblings() {
let Some(el) = ElementRef::wrap(sib) else {
continue;
};
if is_docblock(&el) {
parts.push(el.html());
}
// First element sibling decides: docblock or unrelated, stop either way.
break;
}

parts.join("")
}

/// Extracts a heading element and all following siblings up to (but not
/// including) the next heading of the same or higher level.
fn extract_heading_section(heading: &ElementRef<'_>) -> String {
Expand Down Expand Up @@ -429,7 +486,13 @@ fn list_section_headers(html: &str) -> Vec<SectionHeader> {
current_heading_level = level;

let id = match resolve_heading_id(&el) {
Some(id) if !id.is_empty() && seen_ids.insert(id.clone()) => id,
Some(id)
if !id.is_empty()
&& !is_rustdoc_scaffolding_id(&id)
&& seen_ids.insert(id.clone()) =>
{
id
}
_ => continue,
};

Expand All @@ -442,7 +505,13 @@ fn list_section_headers(html: &str) -> Vec<SectionHeader> {
} else {
// <dt> element. Skip if we can't resolve a usable anchor ID.
let id = match resolve_dt_id(&el) {
Some(id) if !id.is_empty() && seen_ids.insert(id.clone()) => id,
Some(id)
if !id.is_empty()
&& !is_rustdoc_scaffolding_id(&id)
&& seen_ids.insert(id.clone()) =>
{
id
}
_ => continue,
};

Expand Down Expand Up @@ -580,9 +649,45 @@ fn extract_preview_after_heading(heading: &ElementRef<'_>) -> String {
}
}

// Rustdoc puts the signature heading inside `<section>` with no useful
// following siblings; the prose lives in a sibling `<div class="docblock">`
// of the section (or of its enclosing `<summary>` in toggle layout).
// Without this fallback, every method on every docs.rs page reports an
// empty preview.
if text.is_empty()
&& let Some(parent) = heading.parent().and_then(ElementRef::wrap)
&& parent.value().name() == "section"
{
text = preview_from_section_docblock(parent);
}

truncate_str(&text, PREVIEW_MAX)
}

/// Produces whitespace-normalised preview text from the docblock that
/// belongs to a rustdoc `section`, or an empty string when there is none.
///
/// The docblock is looked up as the first element sibling of the section,
/// or of its parent when that parent is a `<summary>` (toggle layout), so
/// that listing previews read from the same place as extracted bodies.
fn preview_from_section_docblock(section: ElementRef<'_>) -> String {
    let start = match section.parent().and_then(ElementRef::wrap) {
        Some(parent) if parent.value().name() == "summary" => parent,
        _ => section,
    };

    // Skip text/comment nodes; only the first element sibling counts.
    let first_element = start.next_siblings().find_map(ElementRef::wrap);

    match first_element {
        Some(docblock) if is_docblock(&docblock) => {
            // Concatenate all text nodes first, then collapse every run of
            // whitespace into a single space.
            let raw = docblock.text().collect::<String>();
            raw.split_whitespace().collect::<Vec<_>>().join(" ")
        }
        _ => String::new(),
    }
}

fn truncate_str(s: &str, max: usize) -> String {
if s.len() <= max {
return s.to_owned();
Expand Down Expand Up @@ -639,6 +744,15 @@ fn extract_section_html_from_doc(doc: &Html, anchor: &str) -> Option<String> {
});
}

// Rustdoc wraps each item in `<section id="...">` (e.g. `method.X`,
// `variant.Y`, `impl-Foo-for-Bar`). The signature heading is inside
// the section; the documentation is in a sibling `<div class="docblock">`,
// optionally further wrapped in a `<details>` toggle. The container
// fallback below would only return the heading and miss the docs.
if target.value().name() == "section" {
return Some(extract_rustdoc_section(&target));
}

// Container element with an internal heading (e.g. `<section id="x"><h3>`).
let heading_sel = Selector::parse("h1, h2, h3, h4, h5, h6").ok()?;
if let Some(inner_heading) = target.select(&heading_sel).next() {
Expand Down
214 changes: 214 additions & 0 deletions .config/jp/tools/src/web/fetch/html_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,220 @@ mod extract_preview_after_heading {
}
}

/// Tests covering rustdoc-generated item pages (the docs.rs / `cargo doc`
/// shape). Two structural quirks matter:
///
/// - Method/variant/assoc-type signatures live inside `<section id="...">`
/// and the docs are in a sibling `<div class="docblock">`, optionally
/// wrapped in a `<details>` toggle. The heading's own siblings are empty,
/// so a naïve walk misses every method's documentation.
/// - The page has a handful of structural IDs (`rustdoc_body_wrapper`,
/// `rustdoc-toc`, `rustdoc-modnav`, `main-content`) that get picked up by
/// the ancestor-id fallback and clutter the section listing.
mod rustdoc_pages {
use scraper::Html;

use super::*;

const RUSTDOC_ITEM_PAGE: &str = r#"<!DOCTYPE html>
<html>
<head>
<title>Value in serde_json - Rust</title>
<meta name="generator" content="rustdoc">
</head>
<body>
<div id="rustdoc_body_wrapper">
<nav id="rustdoc-modnav"><h2>In crate</h2></nav>
<main>
<nav id="rustdoc-toc"><h2>TOC</h2></nav>
<section id="main-content">
<h1>Enum Value</h1>
<h2 id="variants">Variants</h2>
<ul>
<li>
<section id="variant.Null"><h3>Null</h3></section>
<div class="docblock"><p>Represents a JSON null value.</p></div>
</li>
</ul>
<h2 id="implementations">Implementations</h2>
<details class="toggle">
<summary><section id="impl-Default-for-Value">
<h3>impl Default for Value</h3>
</section></summary>
<div class="docblock"><p>The default value is Value::Null.</p></div>
<div class="impl-items">
<details class="toggle">
<summary><section id="method.pointer">
<h4>pub fn pointer(&amp;self) -&gt; Option</h4>
</section></summary>
<div class="docblock">
<p>Looks up a value by a JSON Pointer.</p>
<h5 id="examples">§Examples</h5>
<pre>let data = json!({});</pre>
</div>
</details>
</div>
</details>
</section>
</main>
</div>
</body>
</html>"#;

#[test]
fn list_skips_rustdoc_scaffolding_ids() {
let headers = list_section_headers(RUSTDOC_ITEM_PAGE);
let ids: Vec<&str> = headers.iter().map(|h| h.id.as_str()).collect();

for forbidden in [
"rustdoc_body_wrapper",
"rustdoc-toc",
"rustdoc-modnav",
"main-content",
] {
assert!(
!ids.contains(&forbidden),
"scaffolding id `{forbidden}` should be skipped, got {ids:?}"
);
}
}

#[test]
fn list_includes_item_anchors() {
let headers = list_section_headers(RUSTDOC_ITEM_PAGE);
let ids: Vec<&str> = headers.iter().map(|h| h.id.as_str()).collect();

for required in [
"variants",
"implementations",
"variant.Null",
"impl-Default-for-Value",
"method.pointer",
] {
assert!(
ids.contains(&required),
"expected anchor `{required}` in listing, got {ids:?}"
);
}
}

#[test]
fn list_h4_method_preview_uses_sibling_docblock() {
// The h4 signature lives inside a `<section>` with no useful
// siblings; the docs are in a `<div class="docblock">` that's a
// sibling of the enclosing `<summary>`. The preview must peek
// through to that docblock or every method on the page reports an
// empty description.
let headers = list_section_headers(RUSTDOC_ITEM_PAGE);
let pointer = headers
.iter()
.find(|h| h.id == "method.pointer")
.expect("method.pointer in headers");
assert!(
pointer.preview.contains("Looks up a value"),
"preview should peek into sibling docblock; got {:?}",
pointer.preview
);
}

#[test]
fn extract_method_in_toggle_includes_signature_and_docs() {
let doc = Html::parse_document(RUSTDOC_ITEM_PAGE);
let result = extract_section_html_from_doc(&doc, "method.pointer").unwrap();

assert!(
result.contains("pub fn pointer"),
"missing signature: {result}"
);
assert!(
result.contains("Looks up a value"),
"missing docblock prose: {result}"
);
assert!(
result.contains("§Examples"),
"missing examples heading: {result}"
);
}

#[test]
fn extract_method_does_not_bleed_into_neighbors() {
let doc = Html::parse_document(RUSTDOC_ITEM_PAGE);
let result = extract_section_html_from_doc(&doc, "method.pointer").unwrap();

assert!(
!result.contains("Represents a JSON null"),
"variant doc bled in: {result}"
);
assert!(
!result.contains("The default value"),
"impl doc bled in: {result}"
);
}

#[test]
fn extract_impl_returns_header_and_docs_without_nested_methods() {
// The impl block's `<details>` also contains a `<div class="impl-items">`
// with every method inside. Extracting the impl must return only the
// signature and its own docblock — not every nested method.
let doc = Html::parse_document(RUSTDOC_ITEM_PAGE);
let result = extract_section_html_from_doc(&doc, "impl-Default-for-Value").unwrap();

assert!(
result.contains("impl Default for Value"),
"missing header: {result}"
);
assert!(
result.contains("The default value"),
"missing impl docblock: {result}"
);
assert!(
!result.contains("pub fn pointer"),
"impl extraction should not include nested method: {result}"
);
assert!(
!result.contains("Looks up a value"),
"impl extraction should not include nested method docs: {result}"
);
}

#[test]
fn extract_variant_with_flat_docblock_returns_both() {
// variant.Null is the no-toggle case: a `<section>` followed by a
// sibling `<div class="docblock">` directly inside the parent `<li>`.
let doc = Html::parse_document(RUSTDOC_ITEM_PAGE);
let result = extract_section_html_from_doc(&doc, "variant.Null").unwrap();

assert!(result.contains(">Null<"), "missing variant name: {result}");
assert!(
result.contains("Represents a JSON null"),
"missing docblock: {result}"
);
assert!(
!result.contains("Looks up a value"),
"method doc bled in: {result}"
);
}

#[test]
fn extract_heading_anchor_unaffected_by_section_branch() {
// h2-anchored sections (e.g. `variants`) still extract via the
// existing heading code path: they include everything up to the
// next sibling heading of the same level.
let doc = Html::parse_document(RUSTDOC_ITEM_PAGE);
let result = extract_section_html_from_doc(&doc, "variants").unwrap();

assert!(result.contains("Variants"));
assert!(
result.contains("Represents a JSON null"),
"variants section should contain the variant: {result}"
);
assert!(
!result.contains("Implementations"),
"variants section should stop at the next h2: {result}"
);
}
}

/// Tests covering AsciiDoctor-style horizontal definition lists, as used by
/// git's manpages on git-scm.com. Fixture is reduced from the actual
/// `gitglossary` page; each `<dt>` carries three IDs (its own auto-generated
Expand Down
Loading