From a9d52b633c18b39826d0d3dd4a853dc4d0f26cf0 Mon Sep 17 00:00:00 2001
From: Mykhailo Chalyi <mike@chaliy.name>
Date: Thu, 26 Mar 2026 23:02:55 +0000
Subject: [PATCH] feat(fetchers): add GitHubCodeFetcher for source file
 fetching
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes #51 — Adds a dedicated fetcher for GitHub blob URLs that returns
raw source file content with language metadata via the GitHub API.

Matches github.com/{owner}/{repo}/blob/{ref}/{path}. Returns file content
in a fenced code block with language detection, plus metadata (repo, ref,
size, language). Handles binary files gracefully (metadata only).
---
 crates/fetchkit/src/fetchers/github_code.rs | 531 ++++++++++++++++++++
 crates/fetchkit/src/fetchers/mod.rs         |  24 +-
 crates/fetchkit/src/lib.rs                  |   4 +-
 3 files changed, 549 insertions(+), 10 deletions(-)
 create mode 100644 crates/fetchkit/src/fetchers/github_code.rs
diff --git a/crates/fetchkit/src/fetchers/github_code.rs b/crates/fetchkit/src/fetchers/github_code.rs
new file mode 100644
index 0000000..9fc5720
--- /dev/null
+++ b/crates/fetchkit/src/fetchers/github_code.rs
@@ -0,0 +1,531 @@
+//! GitHub source file fetcher
+//!
+//! Handles GitHub blob URLs, returning raw source file content with language
+//! metadata via the GitHub API, optimized for LLM consumption.
+
+use crate::client::FetchOptions;
+use crate::error::FetchError;
+use crate::fetchers::Fetcher;
+use crate::types::{FetchRequest, FetchResponse};
+use crate::DEFAULT_USER_AGENT;
+use async_trait::async_trait;
+use reqwest::header::{HeaderValue, ACCEPT, USER_AGENT};
+use serde::Deserialize;
+use std::time::Duration;
+use url::Url;
+
+const API_TIMEOUT: Duration = Duration::from_secs(10);
+
+/// Max file size we'll return inline (1 MB, matching GitHub contents API limit)
+const MAX_INLINE_SIZE: u64 = 1_048_576;
+
+/// GitHub source file fetcher
+///
+/// Matches `https://github.com/{owner}/{repo}/blob/{ref}/{path}` and returns
+/// raw file content with language metadata.
+pub struct GitHubCodeFetcher;
+
+impl GitHubCodeFetcher {
+    /// Create a new fetcher.
+    ///
+    /// The type is a unit struct, so there is nothing to configure.
+    pub fn new() -> Self {
+        Self
+    }
+
+    /// Split a GitHub blob URL into owner, repo, ref, and file path.
+    ///
+    /// Returns `None` unless the host is exactly `github.com`, the path looks
+    /// like `/{owner}/{repo}/blob/{ref}/{path...}` with every part non-empty,
+    /// and the owner is not one of the reserved top-level paths.
+    ///
+    /// Only the single segment after `blob` is taken as the ref; all remaining
+    /// segments are re-joined with `/` to form the file path, so a URL that
+    /// stops at the ref (`.../blob/main`) is rejected.
+    fn parse_url(url: &Url) -> Option<ParsedBlobUrl> {
+        // Reserved top-level paths that must not be treated as an owner
+        const RESERVED_OWNERS: [&str; 20] = [
+            "settings",
+            "explore",
+            "trending",
+            "collections",
+            "events",
+            "sponsors",
+            "notifications",
+            "marketplace",
+            "pulls",
+            "issues",
+            "codespaces",
+            "features",
+            "enterprise",
+            "organizations",
+            "pricing",
+            "about",
+            "team",
+            "security",
+            "login",
+            "join",
+        ];
+
+        if url.host_str() != Some("github.com") {
+            return None;
+        }
+
+        // Pull the four fixed segments; fewer than that cannot be a blob URL
+        let mut segments = url.path_segments()?;
+        let owner = segments.next()?;
+        let repo = segments.next()?;
+        let kind = segments.next()?;
+        let git_ref = segments.next()?;
+        // Path is everything after the ref
+        let file_path = segments.collect::<Vec<_>>().join("/");
+
+        let has_empty_part =
+            owner.is_empty() || repo.is_empty() || git_ref.is_empty() || file_path.is_empty();
+        if has_empty_part || kind != "blob" {
+            return None;
+        }
+        if RESERVED_OWNERS.contains(&owner) {
+            return None;
+        }
+
+        Some(ParsedBlobUrl {
+            owner: owner.to_string(),
+            repo: repo.to_string(),
+            git_ref: git_ref.to_string(),
+            path: file_path,
+        })
+    }
+}
+
+impl Default for GitHubCodeFetcher {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+struct ParsedBlobUrl {
+    owner: String,
+    repo: String,
+    git_ref: String,
+    path: String,
+}
+
+#[derive(Debug, Deserialize)]
+struct GitHubContents {
+    name: String,
+    path: String,
+    size: u64,
+    #[serde(rename = "type")]
+    content_type: String,
+    content: Option<String>,
+    html_url: Option<String>,
+}
+
+#[async_trait]
+impl Fetcher for GitHubCodeFetcher {
+    fn name(&self) -> &'static str {
+        "github_code"
+    }
+
+    /// A URL matches when it parses as a GitHub blob URL.
+    fn matches(&self, url: &Url) -> bool {
+        Self::parse_url(url).is_some()
+    }
+
+    /// Fetch a file through the GitHub contents API and render it as markdown.
+    ///
+    /// HTTP-level API failures and non-file paths are reported through the
+    /// `error` field of an `Ok` response. `Err` is returned for an invalid or
+    /// non-blob URL, a blocked DNS lookup, client build or transport failures,
+    /// and an API body that cannot be deserialized.
+    async fn fetch(
+        &self,
+        request: &FetchRequest,
+        options: &FetchOptions,
+    ) -> Result<FetchResponse, FetchError> {
+        let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?;
+
+        let parsed = Self::parse_url(&url)
+            .ok_or_else(|| FetchError::FetcherError("Not a valid GitHub blob URL".to_string()))?;
+
+        // Build HTTP client
+        let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT);
+        let mut client_builder = reqwest::Client::builder()
+            .connect_timeout(API_TIMEOUT)
+            .timeout(API_TIMEOUT)
+            .redirect(reqwest::redirect::Policy::none());
+
+        if !options.respect_proxy_env {
+            // THREAT[TM-NET-004]: Ignore ambient proxy env by default
+            client_builder = client_builder.no_proxy();
+        }
+
+        if options.dns_policy.block_private {
+            let validated_addr = options
+                .dns_policy
+                .resolve_and_validate("api.github.com", 443)
+                .map_err(|_| FetchError::BlockedUrl)?;
+            // THREAT[TM-SSRF-010]: Pin DNS
+            client_builder = client_builder.resolve("api.github.com", validated_addr);
+        }
+
+        let client = client_builder
+            .build()
+            .map_err(FetchError::ClientBuildError)?;
+
+        let ua_header = HeaderValue::from_str(user_agent)
+            .unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT));
+        let accept_header = HeaderValue::from_static("application/vnd.github+json");
+
+        // `+` and `&` pass through a URL path segment unescaped but are special in
+        // a query string, so escape them before using the ref as a query value
+        let ref_param = parsed.git_ref.replace('+', "%2B").replace('&', "%26");
+
+        // Fetch file contents via GitHub API
+        let api_url = format!(
+            "https://api.github.com/repos/{}/{}/contents/{}?ref={}",
+            parsed.owner, parsed.repo, parsed.path, ref_param
+        );
+
+        let response = client
+            .get(&api_url)
+            .header(USER_AGENT, ua_header)
+            .header(ACCEPT, accept_header)
+            .send()
+            .await
+            .map_err(FetchError::from_reqwest)?;
+
+        let status_code = response.status().as_u16();
+        if !response.status().is_success() {
+            let error_msg = if status_code == 404 {
+                format!(
+                    "{}/{}:{} {} not found",
+                    parsed.owner, parsed.repo, parsed.git_ref, parsed.path
+                )
+            } else if status_code == 403 || status_code == 429 {
+                "GitHub API rate limit exceeded".to_string()
+            } else {
+                format!("GitHub API error: HTTP {}", status_code)
+            };
+            return Ok(FetchResponse {
+                url: request.url.clone(),
+                status_code,
+                error: Some(error_msg),
+                ..Default::default()
+            });
+        }
+
+        let contents: GitHubContents = response
+            .json()
+            .await
+            .map_err(|e| FetchError::FetcherError(format!("Failed to parse contents: {}", e)))?;
+
+        // Anything other than a regular file (e.g. "dir") is reported as an error
+        if contents.content_type != "file" {
+            return Ok(FetchResponse {
+                url: request.url.clone(),
+                status_code: 200,
+                format: Some("github_file".to_string()),
+                error: Some(format!("Path is a {} (not a file)", contents.content_type)),
+                ..Default::default()
+            });
+        }
+
+        // Metadata-only response shared by the large-file and binary-file paths
+        let metadata_only = |error: Option<String>| FetchResponse {
+            url: request.url.clone(),
+            status_code: 200,
+            content_type: Some("text/markdown".to_string()),
+            format: Some("github_file".to_string()),
+            content: Some(format_metadata_only(&parsed, &contents)),
+            size: Some(contents.size),
+            error,
+            ..Default::default()
+        };
+
+        // Handle large files and responses without inline content
+        if contents.size > MAX_INLINE_SIZE || contents.content.is_none() {
+            return Ok(metadata_only(None));
+        }
+
+        // Decode base64 content; anything that is not valid UTF-8 is binary
+        let decoded = contents
+            .content
+            .as_deref()
+            .and_then(decode_base64_content)
+            .and_then(|bytes| String::from_utf8(bytes).ok());
+
+        let Some(text) = decoded else {
+            return Ok(metadata_only(Some("Binary file — metadata only".to_string())));
+        };
+
+        let lang = detect_language(&contents.name);
+        let content = format_file_response(&parsed, &contents, Some(&text), lang);
+
+        Ok(FetchResponse {
+            url: request.url.clone(),
+            status_code: 200,
+            content_type: Some("text/markdown".to_string()),
+            format: Some("github_file".to_string()),
+            content: Some(content),
+            size: Some(contents.size),
+            ..Default::default()
+        })
+    }
+}
+
+/// Decode base64 after dropping whitespace (GitHub API wraps base64 with newlines)
+fn decode_base64_content(encoded: &str) -> Option<Vec<u8>> {
+    let compact = encoded.split_whitespace().collect::<String>();
+    base64_decode(&compact)
+}
+
+/// Basic base64 decoder (same approach as github_repo.rs)
+///
+/// Accepts the standard alphabet with `=` padding. Returns `None` when the
+/// length is not a multiple of 4, when a character is outside the alphabet,
+/// or when `=` appears anywhere other than the last one or two positions.
+fn base64_decode(input: &str) -> Option<Vec<u8>> {
+    fn sextet(c: u8) -> Option<u8> {
+        match c {
+            b'A'..=b'Z' => Some(c - b'A'),
+            b'a'..=b'z' => Some(c - b'a' + 26),
+            b'0'..=b'9' => Some(c - b'0' + 52),
+            b'+' => Some(62),
+            b'/' => Some(63),
+            _ => None,
+        }
+    }
+
+    // Only one or two trailing `=` are legal padding; any `=` still left in
+    // `data` is rejected by `sextet` below
+    let data = input.trim_end_matches('=').as_bytes();
+    if !input.len().is_multiple_of(4) || input.len() - data.len() > 2 {
+        return None;
+    }
+
+    let mut result = Vec::with_capacity(data.len() * 3 / 4);
+    for chunk in data.chunks(4) {
+        // Left-align the sextets in 24 bits; a short final chunk (2 or 3
+        // characters) contributes only 1 or 2 bytes
+        let mut packed = 0u32;
+        for &c in chunk {
+            packed = (packed << 6) | u32::from(sextet(c)?);
+        }
+        packed <<= 6 * (4 - chunk.len());
+        result.extend_from_slice(&packed.to_be_bytes()[1..chunk.len()]);
+    }
+
+    Some(result)
+}
+
+/// Simple language detection from file extension (plus extension-less `Dockerfile`)
+fn detect_language(filename: &str) -> Option<&'static str> {
+    let ext = filename.rsplit_once('.').map_or("", |(_, ext)| ext);
+    match ext.to_ascii_lowercase().as_str() {
+        "" if filename.eq_ignore_ascii_case("dockerfile") => Some("dockerfile"),
+        "rs" => Some("rust"),
+        "py" => Some("python"),
+        "js" => Some("javascript"),
+        "ts" => Some("typescript"),
+        "tsx" => Some("tsx"),
+        "jsx" => Some("jsx"),
+        "rb" => Some("ruby"),
+        "go" => Some("go"),
+        "java" => Some("java"),
+        "kt" | "kts" => Some("kotlin"),
+        "swift" => Some("swift"),
+        "c" => Some("c"),
+        "cpp" | "cc" | "cxx" | "h" | "hpp" => Some("cpp"),
+        "cs" => Some("csharp"),
+        "php" => Some("php"),
+        "sh" | "bash" => Some("bash"),
+        "zsh" => Some("zsh"),
+        "fish" => Some("fish"),
+        "yml" | "yaml" => Some("yaml"),
+        "json" => Some("json"),
+        "toml" => Some("toml"),
+        "xml" => Some("xml"),
+        "html" | "htm" => Some("html"),
+        "css" => Some("css"),
+        "scss" | "sass" => Some("scss"),
+        "sql" => Some("sql"),
+        "md" | "markdown" => Some("markdown"),
+        "dockerfile" => Some("dockerfile"),
+        "tf" => Some("terraform"),
+        "ex" | "exs" => Some("elixir"),
+        "erl" => Some("erlang"),
+        "hs" => Some("haskell"),
+        "ml" | "mli" => Some("ocaml"),
+        "r" => Some("r"),
+        "scala" => Some("scala"),
+        "lua" => Some("lua"),
+        "zig" => Some("zig"),
+        "nim" => Some("nim"),
+        "v" => Some("v"),
+        "dart" => Some("dart"),
+        "proto" => Some("protobuf"),
+        "graphql" | "gql" => Some("graphql"),
+        _ => None,
+    }
+}
+
+/// Shared markdown header: title with the file path plus a "File Info" list
+/// (repository, ref, size, and the language and URL when known)
+fn format_header(parsed: &ParsedBlobUrl, contents: &GitHubContents, lang: Option<&str>) -> String {
+    let mut out = String::new();
+    out.push_str(&format!("# {}\n\n", contents.path));
+    out.push_str("## File Info\n\n");
+    out.push_str(&format!(
+        "- **Repository:** {}/{}\n",
+        parsed.owner, parsed.repo
+    ));
+    out.push_str(&format!("- **Ref:** {}\n", parsed.git_ref));
+    out.push_str(&format!("- **Size:** {} bytes\n", contents.size));
+    if let Some(lang) = lang {
+        out.push_str(&format!("- **Language:** {}\n", lang));
+    }
+    if let Some(url) = &contents.html_url {
+        out.push_str(&format!("- **URL:** {}\n", url));
+    }
+    out
+}
+
+/// Markdown for files whose content is not inlined (binary or too large)
+fn format_metadata_only(parsed: &ParsedBlobUrl, contents: &GitHubContents) -> String {
+    format_header(parsed, contents, detect_language(&contents.name))
+}
+
+/// Markdown for a text file: the header plus the content in a fenced code block.
+///
+/// The fence is one backtick longer than the longest backtick run in the content
+/// (minimum three), so fences inside the file cannot close the block early.
+fn format_file_response(
+    parsed: &ParsedBlobUrl,
+    contents: &GitHubContents,
+    file_content: Option<&str>,
+    lang: Option<&str>,
+) -> String {
+    let mut out = format_header(parsed, contents, lang);
+
+    if let Some(content) = file_content {
+        let longest_run = content
+            .split(|c: char| c != '`')
+            .map(str::len)
+            .max()
+            .unwrap_or(0);
+        let fence = "`".repeat(longest_run.max(2) + 1);
+        out.push_str(&format!(
+            "\n## Content\n\n{}{}\n{}\n{}\n",
+            fence, lang.unwrap_or(""), content, fence
+        ));
+    }
+
+    out
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_blob_url() {
+        let url = Url::parse("https://github.com/owner/repo/blob/main/src/lib.rs").unwrap();
+        let parsed = GitHubCodeFetcher::parse_url(&url).unwrap();
+        assert_eq!(parsed.owner, "owner");
+        assert_eq!(parsed.repo, "repo");
+        assert_eq!(parsed.git_ref, "main");
+        assert_eq!(parsed.path, "src/lib.rs");
+    }
+
+    #[test]
+    fn test_parse_blob_url_nested_path() {
+        let url = Url::parse("https://github.com/owner/repo/blob/v1.0.0/crates/core/src/main.rs")
+            .unwrap();
+        let parsed = GitHubCodeFetcher::parse_url(&url).unwrap();
+        assert_eq!(parsed.git_ref, "v1.0.0");
+        assert_eq!(parsed.path, "crates/core/src/main.rs");
+    }
+
+    #[test]
+    fn test_rejects_non_blob() {
+        let url = Url::parse("https://github.com/owner/repo/tree/main/src").unwrap();
+        assert!(GitHubCodeFetcher::parse_url(&url).is_none());
+    }
+
+    #[test]
+    fn test_rejects_too_few_segments() {
+        let url = Url::parse("https://github.com/owner/repo/blob/main").unwrap();
+        assert!(GitHubCodeFetcher::parse_url(&url).is_none());
+    }
+
+    #[test]
+    fn test_rejects_non_github() {
+        let url = Url::parse("https://gitlab.com/owner/repo/blob/main/file.rs").unwrap();
+        assert!(GitHubCodeFetcher::parse_url(&url).is_none());
+    }
+
+    #[test]
+    fn test_rejects_reserved_owner() {
+        let url = Url::parse("https://github.com/settings/repo/blob/main/file.rs").unwrap();
+        assert!(GitHubCodeFetcher::parse_url(&url).is_none());
+    }
+
+    #[test]
+    fn test_fetcher_matches() {
+        let fetcher = GitHubCodeFetcher::new();
+
+        let url = Url::parse("https://github.com/rust-lang/rust/blob/master/Cargo.toml").unwrap();
+        assert!(fetcher.matches(&url));
+
+        let url = Url::parse("https://github.com/rust-lang/rust").unwrap();
+        assert!(!fetcher.matches(&url));
+
+        let url = Url::parse("https://github.com/rust-lang/rust/issues/1").unwrap();
+        assert!(!fetcher.matches(&url));
+    }
+
+    #[test]
+    fn test_detect_language() {
+        assert_eq!(detect_language("main.rs"), Some("rust"));
+        assert_eq!(detect_language("app.py"), Some("python"));
+        assert_eq!(detect_language("index.tsx"), Some("tsx"));
+        assert_eq!(detect_language("Cargo.toml"), Some("toml"));
+        assert_eq!(detect_language("unknown.xyz"), None);
+        assert_eq!(detect_language("Dockerfile"), Some("dockerfile"));
+    }
+
+    #[test]
+    fn test_format_file_response() {
+        let parsed = ParsedBlobUrl {
+            owner: "owner".to_string(),
+            repo: "repo".to_string(),
+            git_ref: "main".to_string(),
+            path: "src/lib.rs".to_string(),
+        };
+        let contents = GitHubContents {
+            name: "lib.rs".to_string(),
+            path: "src/lib.rs".to_string(),
+            size: 42,
+            content_type: "file".to_string(),
+            content: None,
+            html_url: Some("https://github.com/owner/repo/blob/main/src/lib.rs".to_string()),
+        };
+
+        let output = format_file_response(&parsed, &contents, Some("fn main() {}"), Some("rust"));
+
+        assert!(output.contains("# src/lib.rs"));
+        assert!(output.contains("**Repository:** owner/repo"));
+        assert!(output.contains("**Language:** rust"));
+        assert!(output.contains("```rust\nfn main() {}\n```"));
+    }
+
+    #[test]
+    fn test_base64_decode() {
+        // "Hello" in base64
+        assert_eq!(base64_decode("SGVsbG8="), Some(b"Hello".to_vec()));
+        assert_eq!(base64_decode(""), Some(vec![]));
+        assert_eq!(base64_decode("abc"), None);
+    }
+}
diff --git a/crates/fetchkit/src/fetchers/mod.rs b/crates/fetchkit/src/fetchers/mod.rs
index 494ec71..dc90c9f 100644
--- a/crates/fetchkit/src/fetchers/mod.rs
+++ b/crates/fetchkit/src/fetchers/mod.rs
@@ -4,11 +4,13 @@
 //! FetcherRegistry dispatches to the first matching fetcher.
 
 mod default;
+mod github_code;
 mod github_issue;
 mod github_repo;
 mod twitter;
 
 pub use default::DefaultFetcher;
+pub use github_code::GitHubCodeFetcher;
 pub use github_issue::GitHubIssueFetcher;
 pub use github_repo::GitHubRepoFetcher;
 pub use twitter::TwitterFetcher;
@@ -113,13 +115,16 @@ impl FetcherRegistry {
     /// Create a registry with default fetchers pre-registered
     ///
     /// Includes (in order of priority):
-    /// 1. GitHubRepoFetcher - handles GitHub repository URLs
-    /// 2. TwitterFetcher - handles Twitter/X tweet URLs
-    /// 3. DefaultFetcher - handles all HTTP/HTTPS URLs
+    /// 1. GitHubCodeFetcher - handles GitHub blob/file URLs
+    /// 2. GitHubIssueFetcher - handles GitHub issue/PR URLs
+    /// 3. GitHubRepoFetcher - handles GitHub repository URLs
+    /// 4. TwitterFetcher - handles Twitter/X tweet URLs
+    /// 5. DefaultFetcher - handles all HTTP/HTTPS URLs
     pub fn with_defaults() -> Self {
         let mut registry = Self::new();
         // Register specialized fetchers first (higher priority)
-        // GitHubIssueFetcher before GitHubRepoFetcher (more specific path)
+        // GitHub fetchers: code > issue > repo (most specific first)
+        registry.register(Box::new(GitHubCodeFetcher::new()));
         registry.register(Box::new(GitHubIssueFetcher::new()));
         registry.register(Box::new(GitHubRepoFetcher::new()));
         registry.register(Box::new(TwitterFetcher::new()));
@@ -279,11 +284,12 @@ mod tests {
     #[test]
     fn test_registry_with_defaults() {
         let registry = FetcherRegistry::with_defaults();
-        assert_eq!(registry.fetchers.len(), 4);
-        assert_eq!(registry.fetchers[0].name(), "github_issue");
-        assert_eq!(registry.fetchers[1].name(), "github_repo");
-        assert_eq!(registry.fetchers[2].name(), "twitter_tweet");
-        assert_eq!(registry.fetchers[3].name(), "default");
+        assert_eq!(registry.fetchers.len(), 5);
+        assert_eq!(registry.fetchers[0].name(), "github_code");
+        assert_eq!(registry.fetchers[1].name(), "github_issue");
+        assert_eq!(registry.fetchers[2].name(), "github_repo");
+        assert_eq!(registry.fetchers[3].name(), "twitter_tweet");
+        assert_eq!(registry.fetchers[4].name(), "default");
     }
 
     #[test]
diff --git a/crates/fetchkit/src/lib.rs b/crates/fetchkit/src/lib.rs
index 763a1a7..5a38960 100644
--- a/crates/fetchkit/src/lib.rs
+++ b/crates/fetchkit/src/lib.rs
@@ -59,6 +59,7 @@
 //!
 //! Built-in fetchers:
 //! - [`DefaultFetcher`] - General HTTP/HTTPS fetcher with HTML conversion
+//! - [`GitHubCodeFetcher`] - GitHub source file content with language metadata
 //! - [`GitHubIssueFetcher`] - GitHub issue and PR metadata with comments
 //! - [`GitHubRepoFetcher`] - GitHub repository metadata and README
 //! - [`TwitterFetcher`] - Twitter/X tweet content with article metadata
@@ -80,7 +81,8 @@ pub use convert::{html_to_markdown, html_to_text};
 pub use dns::DnsPolicy;
 pub use error::{FetchError, ToolError};
 pub use fetchers::{
-    DefaultFetcher, Fetcher, FetcherRegistry, GitHubIssueFetcher, GitHubRepoFetcher, TwitterFetcher,
+    DefaultFetcher, Fetcher, FetcherRegistry, GitHubCodeFetcher, GitHubIssueFetcher,
+    GitHubRepoFetcher, TwitterFetcher,
 };
 pub use file_saver::{FileSaveError, FileSaver, LocalFileSaver, SaveResult};
 pub use tool::{