Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ url: https://example.com
status_code: 200
source_content_type: text/html; charset=UTF-8
source_size: 1256
quality_score: 1.00
extraction_method: "full"
---
# Example Domain

Expand Down Expand Up @@ -236,6 +238,7 @@ response = tool.fetch("https://example.com")
| `saved_path` | string? | Filesystem path when `save_to_file` succeeded |
| `bytes_written` | int? | Bytes saved to file |
| `metadata` | object? | Structured `PageMetadata` (title, description, links, headings, extraction method, …) |
| `quality` | object? | Agent-facing `PageQuality` (score, warnings, link density, extraction method, suggested next action) |
| `word_count` | int? | Word count of returned content |
| `redirect_chain` | string[] | URLs visited during redirects (empty if none) |
| `is_paywall` | bool? | Heuristic paywall signal (soft, not guaranteed) |
Expand Down
27 changes: 26 additions & 1 deletion crates/fetchkit-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,20 @@ fn format_md_with_frontmatter(response: &fetchkit::FetchResponse) -> String {
output.push_str("truncated: true\n");
}
}
if let Some(ref quality) = response.quality {
output.push_str(&format!("quality_score: {:.2}\n", quality.score));
if !quality.warnings.is_empty() {
let warnings =
serde_json::to_string(&quality.warnings).unwrap_or_else(|_| "[]".to_string());
output.push_str(&format!("quality_warnings: {}\n", warnings));
}
if let Some(ref method) = quality.extraction_method {
output.push_str(&format!("extraction_method: {}\n", yaml_quote(method)));
}
if let Some(ref action) = quality.suggested_next_action {
output.push_str(&format!("suggested_next_action: {}\n", yaml_quote(action)));
}
}
output.push_str("---\n");

// Append content, or error as body for unsupported content
Expand Down Expand Up @@ -291,7 +305,7 @@ fn writeln_safe(s: &str) {
#[cfg(test)]
mod tests {
use super::*;
use fetchkit::FetchResponse;
use fetchkit::{FetchResponse, PageQuality};

#[test]
fn test_format_md_basic() {
Expand Down Expand Up @@ -322,6 +336,13 @@ mod tests {
last_modified: Some("Wed, 01 Jan 2025 00:00:00 GMT".to_string()),
filename: Some("page.html".to_string()),
truncated: Some(true),
quality: Some(PageQuality {
score: 0.72,
warnings: vec!["low_content".to_string()],
extraction_method: Some("agent_main".to_string()),
suggested_next_action: Some("retry_with_agent_focus_or_crawl".to_string()),
..Default::default()
}),
content: Some("Content here".to_string()),
..Default::default()
};
Expand All @@ -332,6 +353,10 @@ mod tests {
assert!(output.contains("last_modified: \"Wed, 01 Jan 2025 00:00:00 GMT\"\n"));
assert!(output.contains("filename: \"page.html\"\n"));
assert!(output.contains("truncated: true\n"));
assert!(output.contains("quality_score: 0.72\n"));
assert!(output.contains("quality_warnings: [\"low_content\"]\n"));
assert!(output.contains("extraction_method: \"agent_main\"\n"));
assert!(output.contains("suggested_next_action: \"retry_with_agent_focus_or_crawl\"\n"));
}

#[test]
Expand Down
43 changes: 43 additions & 0 deletions crates/fetchkit-cli/src/mcp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,20 @@ fn format_md_with_frontmatter(response: &fetchkit::FetchResponse) -> String {
output.push_str("truncated: true\n");
}
}
if let Some(ref quality) = response.quality {
output.push_str(&format!("quality_score: {:.2}\n", quality.score));
if !quality.warnings.is_empty() {
let warnings =
serde_json::to_string(&quality.warnings).unwrap_or_else(|_| "[]".to_string());
output.push_str(&format!("quality_warnings: {}\n", warnings));
}
if let Some(ref method) = quality.extraction_method {
output.push_str(&format!("extraction_method: {}\n", yaml_quote(method)));
}
if let Some(ref action) = quality.suggested_next_action {
output.push_str(&format!("suggested_next_action: {}\n", yaml_quote(action)));
}
}
output.push_str("---\n");

// Append content, or error as body for unsupported content
Expand Down Expand Up @@ -273,3 +287,32 @@ pub async fn run_server(tool: Tool) {
let _ = stdout.flush();
}
}

#[cfg(test)]
mod tests {
    use super::*;
    use fetchkit::{FetchResponse, PageQuality};

    /// A response carrying a `PageQuality` must surface the score (two decimal
    /// places), the warnings (as a JSON array), the extraction method and the
    /// suggested next action in the generated front matter.
    #[test]
    fn test_format_md_includes_quality_frontmatter() {
        let quality = PageQuality {
            score: 0.4,
            warnings: vec![String::from("low_content")],
            extraction_method: Some(String::from("agent_main")),
            suggested_next_action: Some(String::from("retry_with_agent_focus_or_crawl")),
            ..Default::default()
        };
        let response = FetchResponse {
            url: String::from("https://example.com"),
            status_code: 200,
            quality: Some(quality),
            content: Some(String::from("short")),
            ..Default::default()
        };

        let rendered = format_md_with_frontmatter(&response);

        assert!(rendered.contains("quality_score: 0.40\n"));
        assert!(rendered.contains("quality_warnings: [\"low_content\"]\n"));
        assert!(rendered.contains("extraction_method: \"agent_main\"\n"));
        assert!(rendered.contains("suggested_next_action: \"retry_with_agent_focus_or_crawl\"\n"));
    }
}
191 changes: 190 additions & 1 deletion crates/fetchkit/src/fetchers/default.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use crate::error::FetchError;
use crate::fetchers::Fetcher;
use crate::file_saver::FileSaver;
use crate::transport::{BodyStream, TransportMethod, TransportRequest, TransportResponse};
use crate::types::{FetchRequest, FetchResponse, HttpMethod};
use crate::types::{FetchRequest, FetchResponse, HttpMethod, PageQuality};
use crate::DEFAULT_USER_AGENT;
use async_trait::async_trait;
use bytes::Bytes;
Expand Down Expand Up @@ -312,6 +312,7 @@ impl Fetcher for DefaultFetcher {
"Binary content is not supported. Only textual content (HTML, text, JSON, etc.) can be fetched."
.to_string(),
),
quality: Some(binary_quality_signal()),
..Default::default()
});
}
Expand Down Expand Up @@ -404,6 +405,14 @@ impl Fetcher for DefaultFetcher {
if let (Some(metadata), Some(method)) = (&mut page_metadata, extraction_method) {
metadata.extraction_method = Some(method.to_string());
}
let quality = compute_quality_signal(
&final_content,
status_code,
truncated,
is_paywall,
extraction_method,
word_count,
);

Ok(FetchResponse {
url: final_url,
Expand All @@ -417,6 +426,7 @@ impl Fetcher for DefaultFetcher {
content: Some(final_content),
truncated: if truncated { Some(true) } else { None },
metadata: page_metadata,
quality: Some(quality),
word_count: Some(word_count),
redirect_chain,
is_paywall: if is_paywall { Some(true) } else { None },
Expand Down Expand Up @@ -788,6 +798,145 @@ fn count_words(text: &str) -> u64 {
text.split_whitespace().count() as u64
}

fn binary_quality_signal() -> PageQuality {
PageQuality {
score: 0.0,
warnings: vec!["binary_content".to_string()],
suggested_next_action: Some("use_save_to_file".to_string()),
..Default::default()
}
}

/// Scores how usable a fetched text body is and records why points were lost.
///
/// The score starts at `1.0`. Each check below that fires adds its warning
/// label and subtracts its penalty; the final score is clamped to `0.0..=1.0`.
///
/// | warning                 | fires when                                   | penalty |
/// |-------------------------|----------------------------------------------|---------|
/// | `http_error`            | `status_code >= 400`                         | 0.35    |
/// | `truncated`             | `truncated` is set                           | 0.20    |
/// | `low_content`           | `word_count < 30`                            | 0.30    |
/// | `too_many_links`        | at least 20 links and link density > 0.15    | 0.20    |
/// | `possible_paywall`      | `is_paywall` is set                          | 0.25    |
/// | `possible_login_wall`   | lowercased content has a login-wall phrase   | 0.25    |
/// | `possible_consent_wall` | lowercased content has a consent-wall phrase | 0.20    |
/// | `javascript_required`   | lowercased content asks for JavaScript       | 0.30    |
///
/// Link density is `links / word_count` (zero when `word_count` is zero) and is
/// always reported. `extraction_method` is copied through unchanged, and the
/// suggested next action is derived from the collected warnings.
fn compute_quality_signal(
    content: &str,
    status_code: u16,
    truncated: bool,
    is_paywall: bool,
    extraction_method: Option<&str>,
    word_count: u64,
) -> PageQuality {
    let link_count = count_markdown_links(content);
    // Guard the division: a body with no words has a density of zero.
    let link_density = match word_count {
        0 => 0.0,
        words => link_count as f32 / words as f32,
    };
    // The phrase detectors compare against lowercase needles.
    let lowered = content.to_lowercase();

    // (fired, warning label, penalty) — evaluated and applied in this order.
    let checks = [
        (status_code >= 400, "http_error", 0.35f32),
        (truncated, "truncated", 0.20),
        (word_count < 30, "low_content", 0.30),
        (link_count >= 20 && link_density > 0.15, "too_many_links", 0.20),
        (is_paywall, "possible_paywall", 0.25),
        (looks_like_login_wall(&lowered), "possible_login_wall", 0.25),
        (looks_like_consent_wall(&lowered), "possible_consent_wall", 0.20),
        (looks_like_javascript_required(&lowered), "javascript_required", 0.30),
    ];

    let mut warnings = Vec::new();
    let mut score = 1.0f32;
    for &(fired, label, penalty) in checks.iter() {
        if fired {
            push_warning(&mut warnings, label);
            score -= penalty;
        }
    }

    // Resolve the action before `warnings` is moved into the result.
    let next_action = suggested_next_action(&warnings).map(str::to_owned);

    PageQuality {
        score: score.clamp(0.0, 1.0),
        suggested_next_action: next_action,
        warnings,
        link_density: Some(link_density),
        extraction_method: extraction_method.map(str::to_owned),
    }
}

/// Counts occurrences of the two-character sequence `](` in `content`.
///
/// That sequence closes the text part of a Markdown link (`[text](url)`), so
/// the count serves as a cheap approximation of the number of links. Image
/// syntax (`![alt](src)`) contains the same sequence and is counted as well.
fn count_markdown_links(content: &str) -> usize {
    // Both characters are ASCII, so scanning raw bytes cannot produce a match
    // inside a multi-byte UTF-8 character.
    content
        .as_bytes()
        .windows(2)
        .filter(|pair| pair[0] == b']' && pair[1] == b'(')
        .count()
}

/// Appends `warning` to `warnings` unless an equal entry is already present,
/// so each label appears at most once and insertion order is preserved.
fn push_warning(warnings: &mut Vec<String>, warning: &str) {
    let already_listed = warnings
        .iter()
        .map(String::as_str)
        .any(|seen| seen == warning);
    if already_listed {
        return;
    }
    warnings.push(warning.to_owned());
}

fn suggested_next_action(warnings: &[String]) -> Option<&'static str> {
if has_warning(warnings, "javascript_required") {
Some("retry_with_browser_rendering")
} else if has_warning(warnings, "possible_login_wall") {
Some("authenticate_or_use_browser")
} else if has_warning(warnings, "possible_paywall") {
Some("try_alternate_source")
} else if has_warning(warnings, "truncated") {
Some("retry_with_larger_limit_or_narrower_scope")
} else if has_warning(warnings, "low_content") || has_warning(warnings, "too_many_links") {
Some("retry_with_agent_focus_or_crawl")
} else if has_warning(warnings, "http_error") {
Some("check_url_or_retry_later")
} else {
None
}
}

/// Reports whether `warnings` contains an entry exactly equal to `warning`.
fn has_warning(warnings: &[String], warning: &str) -> bool {
    for existing in warnings {
        if existing.as_str() == warning {
            return true;
        }
    }
    false
}

/// Reports whether the text contains a phrase typical of a login wall.
///
/// Matching is a case-sensitive substring search against lowercase phrases, so
/// `lower_content` is expected to be lowercased by the caller.
fn looks_like_login_wall(lower_content: &str) -> bool {
    const LOGIN_PHRASES: [&str; 6] = [
        "sign in to continue",
        "log in to continue",
        "please sign in",
        "please log in",
        "login required",
        "sign in required",
    ];

    for &phrase in LOGIN_PHRASES.iter() {
        if lower_content.contains(phrase) {
            return true;
        }
    }
    false
}

/// Reports whether the text contains a phrase typical of a cookie/consent
/// banner.
///
/// Matching is a case-sensitive substring search against lowercase phrases, so
/// `lower_content` is expected to be lowercased by the caller.
fn looks_like_consent_wall(lower_content: &str) -> bool {
    const CONSENT_PHRASES: [&str; 6] = [
        "accept cookies",
        "cookie consent",
        "manage cookies",
        "privacy choices",
        "we use cookies",
        "consent preferences",
    ];

    for &phrase in CONSENT_PHRASES.iter() {
        if lower_content.contains(phrase) {
            return true;
        }
    }
    false
}

/// Reports whether the text looks like a "this page needs JavaScript" notice.
///
/// Matching is case-sensitive against lowercase phrases, so `lower_content` is
/// expected to be lowercased by the caller.
///
/// The long phrases (`enable javascript`, `javascript is disabled`,
/// `requires javascript`) match as plain substrings. The short form
/// `enable js` only counts when `js` is a whole word, i.e. it is followed by
/// the end of the text or by a character that is neither alphanumeric nor `_`.
/// Without that boundary, unrelated text such as "enable json output" or
/// "enable jsx" would be reported as requiring JavaScript.
fn looks_like_javascript_required(lower_content: &str) -> bool {
    const PHRASES: [&str; 3] = [
        "enable javascript",
        "javascript is disabled",
        "requires javascript",
    ];
    // Also covers "please enable js", which contains this needle.
    const SHORT_FORM: &str = "enable js";

    if PHRASES.iter().any(|&phrase| lower_content.contains(phrase)) {
        return true;
    }

    lower_content.match_indices(SHORT_FORM).any(|(start, _)| {
        // `start + len` is a char boundary because the needle is ASCII.
        lower_content[start + SHORT_FORM.len()..]
            .chars()
            .next()
            .map_or(true, |next| !(next.is_alphanumeric() || next == '_'))
    })
}

/// Common paywall indicators in raw HTML content.
const PAYWALL_INDICATORS: &[&str] = &[
"paywall",
Expand Down Expand Up @@ -1302,6 +1451,46 @@ mod tests {
assert_eq!(count_words("word"), 1);
}

/// A successful, untruncated, link-free body of more than 30 words with no
/// wall or JavaScript phrases must score above 0.9 with no warnings and no
/// suggested action, and must carry the extraction method through.
#[test]
fn test_compute_quality_signal_clean_content() {
    let content = "This page has enough useful words for an AI agent to answer with confidence. It includes actual content instead of just a menu, and it gives a short but complete explanation that should be useful for downstream reasoning.";
    let word_count = count_words(content);

    let quality = compute_quality_signal(
        content,
        200,
        false,
        false,
        Some("agent_readable"),
        word_count,
    );

    assert!(quality.score > 0.9, "{quality:?}");
    assert!(quality.warnings.is_empty(), "{quality:?}");
    assert_eq!(quality.extraction_method.as_deref(), Some("agent_readable"));
    assert!(quality.suggested_next_action.is_none());
}

/// A seven-word "enable JavaScript" stub must be flagged as both low-content
/// and JavaScript-required, score below 0.5, and recommend browser rendering
/// (the JavaScript warning outranks the low-content one).
#[test]
fn test_compute_quality_signal_low_js_content() {
    let content = "Please enable JavaScript to view this app.";
    // Whitespace-separated word count of `content`.
    let word_count = 7;

    let quality = compute_quality_signal(content, 200, false, false, Some("full"), word_count);

    assert!(quality.score < 0.5, "{quality:?}");
    assert!(quality.warnings.contains(&"low_content".to_string()));
    assert!(quality
        .warnings
        .contains(&"javascript_required".to_string()));
    assert_eq!(
        quality.suggested_next_action.as_deref(),
        Some("retry_with_browser_rendering")
    );
}

#[test]
fn test_detect_paywall() {
assert!(detect_paywall("<div class=\"paywall\">Subscribe</div>"));
Expand Down
2 changes: 1 addition & 1 deletion crates/fetchkit/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ pub use transport::{
BodyStream, HttpTransport, ReqwestTransport, TransportError, TransportMethod, TransportRequest,
TransportResponse,
};
pub use types::{FetchRequest, FetchResponse, HttpMethod, PageLink, PageMetadata};
pub use types::{FetchRequest, FetchResponse, HttpMethod, PageLink, PageMetadata, PageQuality};

#[cfg(feature = "bot-auth")]
pub use bot_auth::{BotAuthConfig, BotAuthError};
Expand Down
Loading
Loading