From fab77b1efcbc4d9aa7cbd80a033821da5d4403c9 Mon Sep 17 00:00:00 2001
From: Mykhailo Chalyi <mike@chaliy.name>
Date: Sun, 17 May 2026 11:47:16 -0500
Subject: [PATCH] fix(fetchers): enforce fetch options on youtube secondary
 requests

---
 crates/fetchkit/src/fetchers/youtube.rs | 28 +++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)
diff --git a/crates/fetchkit/src/fetchers/youtube.rs b/crates/fetchkit/src/fetchers/youtube.rs
index 0a197f3..b58ed8c 100644
--- a/crates/fetchkit/src/fetchers/youtube.rs
+++ b/crates/fetchkit/src/fetchers/youtube.rs
@@ -102,7 +102,7 @@ impl Fetcher for YouTubeFetcher {
         let mut client_builder = reqwest::Client::builder()
             .connect_timeout(API_TIMEOUT)
             .timeout(API_TIMEOUT)
-            .redirect(reqwest::redirect::Policy::limited(3));
+            .redirect(reqwest::redirect::Policy::none());
 
         if !options.respect_proxy_env {
             client_builder = client_builder.no_proxy();
@@ -120,6 +120,7 @@ impl Fetcher for YouTubeFetcher {
         // Fetch oEmbed metadata
         // The canonical URL only contains safe ASCII chars, so it can be passed directly
         let mut oembed_url = Url::parse("https://www.youtube.com/oembed").unwrap();
+        options.validate_url(&oembed_url)?;
         oembed_url
             .query_pairs_mut()
             .append_pair("url", &canonical_url)
@@ -144,8 +145,7 @@ impl Fetcher for YouTubeFetcher {
         let author_url = oembed.as_ref().and_then(|o| o.author_url.clone());
 
         // Attempt transcript extraction via timedtext API
-        let transcript =
-            fetch_transcript(&client, &ua_header, &video_id, options.max_body_size).await;
+        let transcript = fetch_transcript(&client, &ua_header, &video_id, options).await;
 
         let content = format_youtube_response(
             &title,
@@ -173,7 +173,7 @@ async fn fetch_transcript(
     client: &reqwest::Client,
     ua: &HeaderValue,
     video_id: &str,
-    max_body_size: Option<usize>,
+    options: &FetchOptions,
 ) -> Option<String> {
     // Try the legacy timedtext API (auto-generated English captions)
     let timedtext_url = format!(
@@ -181,8 +181,11 @@ async fn fetch_transcript(
         video_id
     );
 
+    let timedtext_url = Url::parse(&timedtext_url).ok()?;
+    options.validate_url(&timedtext_url).ok()?;
+
     let resp = client
-        .get(&timedtext_url)
+        .get(timedtext_url.as_str())
         .header(USER_AGENT, ua.clone())
         .send()
         .await
@@ -193,7 +196,7 @@ async fn fetch_transcript(
     }
 
     let xml = resp.text().await.ok()?;
-    if let Some(max_body_size) = max_body_size {
+    if let Some(max_body_size) = options.max_body_size {
         if xml.len() > max_body_size {
             return None;
         }
@@ -475,6 +478,19 @@ mod tests {
         assert!(segments.is_empty());
     }
 
+    #[tokio::test]
+    async fn test_fetch_blocked_secondary_host() {
+        let fetcher = YouTubeFetcher::new(); // primary URL below is on youtu.be, not youtube.com
+        let request = FetchRequest::new("https://youtu.be/dQw4w9WgXcQ");
+        let options = FetchOptions {
+            blocked_hosts: vec![".youtube.com".to_string()], // suffix rule? TODO confirm
+            ..Default::default()
+        };
+
+        let result = fetcher.fetch(&request, &options).await; // oEmbed URL is on www.youtube.com
+        assert!(matches!(result, Err(FetchError::BlockedUrl))); // whole fetch must fail
+    }
+
     #[test]
     fn test_decode_xml_entities() {
         assert_eq!(decode_xml_entities("a &amp; b"), "a & b");