diff --git a/crates/html-extractor/src/lib.rs b/crates/html-extractor/src/lib.rs index fe62748..1456e45 100644 --- a/crates/html-extractor/src/lib.rs +++ b/crates/html-extractor/src/lib.rs @@ -68,15 +68,54 @@ pub fn extract(html: &str, options: &ExtractOptions) -> Result= 200 && kept_text_len * 100 < body_text_len * 15; let (final_root, quality, used_fallback) = if let Some(idx) = selected_root { if kept_text_len < min_len { + // (b): too short to be useful — fall through. let (fb_root, q) = fallback::fallback(&tree, options); (fb_root.or(Some(idx)), q.max(0.15), true) + } else if suspiciously_small { + // (c): try the fallback chain and pick whichever produced more + // text-excluding-links content. + let (fb_root, fb_q) = fallback::fallback(&tree, options); + let fb_text = fb_root + .map(|i| tree.text_len_excluding_links(i)) + .unwrap_or(0); + if fb_text > kept_text_len * 2 { + (fb_root, fb_q.max(0.2), true) + } else { + ( + Some(idx), + confidence_from_score(score, kept_text_len), + false, + ) + } } else { ( Some(idx),