Skip to content

Commit

Permalink
apply skip_prefixes before parsing external link domain (#1833)
Browse files Browse the repository at this point in the history
* apply skip_prefixes before parsing external link domain

* log number of links skipped by skip_prefixes
  • Loading branch information
mwcz authored Apr 26, 2022
1 parent 896ea59 commit 92e80b5
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 31 deletions.
79 changes: 51 additions & 28 deletions components/site/src/link_checking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> {
}
}

/// Returns `true` when `link` starts with any of the configured
/// `skip_prefixes`, meaning the external-link checker should not try to
/// fetch it.
///
/// Takes `&str` / `&[String]` instead of `&String` / `&Vec<String>`
/// (clippy `ptr_arg`): existing call sites that pass `&String` and
/// `&Vec<String>` coerce automatically via deref, so the signature is
/// backward-compatible while also accepting plain slices and literals.
fn should_skip_by_prefix(link: &str, skip_prefixes: &[String]) -> bool {
    skip_prefixes.iter().any(|prefix| link.starts_with(prefix))
}

fn get_link_domain(link: &str) -> Result<String> {
return match Url::parse(link) {
Ok(url) => match url.host_str().map(String::from) {
Expand All @@ -109,36 +113,58 @@ fn get_link_domain(link: &str) -> Result<String> {
pub fn check_external_links(site: &Site) -> Result<()> {
let library = site.library.write().expect("Get lock for check_external_links");

let mut all_links: Vec<(PathBuf, String, String)> = vec![];
// One occurrence of an external link found in the site's content:
// the file it came from, the link itself, and the link's domain
// (the domain is used below to group checks per host).
struct LinkDef {
    // Path of the page/section source file containing the link.
    file_path: PathBuf,
    // The external URL exactly as written in the content.
    external_link: String,
    // Host part parsed from `external_link` (see `get_link_domain`).
    domain: String,
}

impl LinkDef {
    // Simple field-by-field constructor.
    pub fn new(file_path: PathBuf, external_link: String, domain: String) -> Self {
        Self { file_path, external_link, domain }
    }
}

let mut checked_links: Vec<LinkDef> = vec![];
let mut skipped_link_count: u32 = 0;

for p in library.pages_values().into_iter() {
for external_link in p.clone().external_links.into_iter() {
let domain = get_link_domain(&external_link)?;
all_links.push((p.file.path.clone(), external_link, domain));
if should_skip_by_prefix(&external_link, &site.config.link_checker.skip_prefixes) {
skipped_link_count += 1;
} else {
let domain = get_link_domain(&external_link)?;
checked_links.push(LinkDef::new(p.file.path.clone(), external_link, domain));
}
}
}

for s in library.sections_values().into_iter() {
for external_link in s.clone().external_links.into_iter() {
let domain = get_link_domain(&external_link)?;
all_links.push((s.file.path.clone(), external_link, domain));
if should_skip_by_prefix(&external_link, &site.config.link_checker.skip_prefixes) {
skipped_link_count += 1;
} else {
let domain = get_link_domain(&external_link)?;
checked_links.push(LinkDef::new(s.file.path.clone(), external_link, domain));
}
}
}

println!("Checking {} external link(s).", all_links.len());
println!(
"Checking {} external link(s). Skipping {} external link(s).",
checked_links.len(),
skipped_link_count
);

let mut links_by_domain: HashMap<String, Vec<(PathBuf, String)>> = HashMap::new();
let mut links_by_domain: HashMap<String, Vec<&LinkDef>> = HashMap::new();

for link in all_links.iter() {
links_by_domain.entry(link.2.to_string()).or_default();
for link in checked_links.iter() {
links_by_domain.entry(link.domain.to_string()).or_default();
// Insert content path and link under the domain key
links_by_domain
.get_mut(&link.2.to_string())
.unwrap()
.push((link.0.clone(), link.1.clone()));
links_by_domain.get_mut(&link.domain).unwrap().push(&link);
}

if all_links.is_empty() {
if checked_links.is_empty() {
return Ok(());
}

Expand All @@ -155,20 +181,13 @@ pub fn check_external_links(site: &Site) -> Result<()> {
let mut links_to_process = links.len();
links
.iter()
.filter_map(move |(page_path, link)| {
.filter_map(move |link_def| {
links_to_process -= 1;

if site
.config
.link_checker
.skip_prefixes
.iter()
.any(|prefix| link.starts_with(prefix))
{
return None;
}

let res = link_checker::check_url(link, &site.config.link_checker);
let res = link_checker::check_url(
&link_def.external_link,
&site.config.link_checker,
);

if links_to_process > 0 {
// Prevent rate-limiting, wait before next crawl unless we're done with this domain
Expand All @@ -178,7 +197,7 @@ pub fn check_external_links(site: &Site) -> Result<()> {
if link_checker::is_valid(&res) {
None
} else {
Some((page_path, link, res))
Some((&link_def.file_path, &link_def.external_link, res))
}
})
.collect::<Vec<_>>()
Expand All @@ -187,7 +206,11 @@ pub fn check_external_links(site: &Site) -> Result<()> {
.collect::<Vec<_>>()
});

println!("> Checked {} external link(s): {} error(s) found.", all_links.len(), errors.len());
println!(
"> Checked {} external link(s): {} error(s) found.",
checked_links.len(),
errors.len()
);

if errors.is_empty() {
return Ok(());
Expand Down
33 changes: 30 additions & 3 deletions components/site/tests/site.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ fn can_parse_site() {
let library = site.library.read().unwrap();

// Correct number of pages (sections do not count as pages, draft are ignored)
assert_eq!(library.pages().len(), 32);
assert_eq!(library.pages().len(), 33);
let posts_path = path.join("content").join("posts");

// Make sure the page with a url doesn't have any sections
Expand Down Expand Up @@ -596,7 +596,7 @@ fn can_build_site_with_pagination_for_taxonomy() {
"tags/a/page/1/index.html",
"http-equiv=\"refresh\" content=\"0; url=https://replace-this-with-your-url.com/tags/a/\""
));
assert!(file_contains!(public, "tags/a/index.html", "Num pagers: 8"));
assert!(file_contains!(public, "tags/a/index.html", "Num pagers: 9"));
assert!(file_contains!(public, "tags/a/index.html", "Page size: 2"));
assert!(file_contains!(public, "tags/a/index.html", "Current index: 1"));
assert!(!file_contains!(public, "tags/a/index.html", "has_prev"));
Expand All @@ -609,7 +609,7 @@ fn can_build_site_with_pagination_for_taxonomy() {
assert!(file_contains!(
public,
"tags/a/index.html",
"Last: https://replace-this-with-your-url.com/tags/a/page/8/"
"Last: https://replace-this-with-your-url.com/tags/a/page/9/"
));
assert!(!file_contains!(public, "tags/a/index.html", "has_prev"));

Expand Down Expand Up @@ -774,8 +774,35 @@ fn check_site() {
site.config.link_checker.skip_anchor_prefixes,
vec!["https://github.com/rust-lang/rust/blob/"]
);
assert_eq!(
site.config.link_checker.skip_prefixes,
vec!["http://[2001:db8::]/", "http://invaliddomain"]
);

site.config.enable_check_mode();
site.load().expect("link check test_site");
}

#[test]
#[should_panic]
fn panics_on_invalid_external_domain() {
    let (mut site, _tmp_dir, _public) = build_site("test_site");

    // Drop the "http://invaliddomain" entry from skip_prefixes so the bad
    // link in the test site is no longer exempt from checking.
    let idx = site
        .config
        .link_checker
        .skip_prefixes
        .iter()
        .position(|prefix| prefix == "http://invaliddomain")
        .unwrap();
    site.config.link_checker.skip_prefixes.remove(idx);

    // Sanity check: only the IPv6 prefix should remain configured.
    assert_eq!(site.config.link_checker.skip_prefixes, vec!["http://[2001:db8::]/"]);

    // Re-run the link check without the skip prefix; parsing the invalid
    // domain should now fail and panic via the `expect`.
    site.config.enable_check_mode();
    site.load().expect("link check test_site");
}
Expand Down
1 change: 1 addition & 0 deletions test_site/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ anchors = "on"
[link_checker]
skip_prefixes = [
"http://[2001:db8::]/",
"http://invaliddomain",
]

skip_anchor_prefixes = [
Expand Down
4 changes: 4 additions & 0 deletions test_site/content/posts/skip_prefixes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
+++
+++

[test skip 1](http://invaliddomain</)

0 comments on commit 92e80b5

Please sign in to comment.