From 219a6cd58ae732b908c169229079ba303c9be788 Mon Sep 17 00:00:00 2001 From: Juan Cruz-Benito Date: Fri, 27 Jun 2025 16:36:12 +0200 Subject: [PATCH 1/2] Improve checks for github.com links & adding compatibility with GHE companies that use github..xxx type of links --- src/gitingest/cloning.py | 7 +-- src/gitingest/utils/git_utils.py | 53 ++++++++++++++++++----- src/gitingest/utils/query_parser_utils.py | 6 +-- 3 files changed, 48 insertions(+), 18 deletions(-) diff --git a/src/gitingest/cloning.py b/src/gitingest/cloning.py index 284b353e..682e5ff0 100644 --- a/src/gitingest/cloning.py +++ b/src/gitingest/cloning.py @@ -6,6 +6,7 @@ from gitingest.config import DEFAULT_TIMEOUT from gitingest.schemas import CloneConfig from gitingest.utils.git_utils import ( + _is_github_host, check_repo_exists, create_git_auth_header, create_git_command, @@ -48,7 +49,7 @@ async def clone_repo(config: CloneConfig, token: Optional[str] = None) -> None: partial_clone: bool = config.subpath != "/" # Validate token if provided - if token and url.startswith("https://github.com"): + if token and _is_github_host(url): validate_github_token(token) # Create parent directory if it doesn't exist @@ -59,8 +60,8 @@ async def clone_repo(config: CloneConfig, token: Optional[str] = None) -> None: raise ValueError("Repository not found. Make sure it is public or that you have provided a valid token.") clone_cmd = ["git"] - if token and url.startswith("https://github.com"): - clone_cmd += ["-c", create_git_auth_header(token)] + if token and _is_github_host(url): + clone_cmd += ["-c", create_git_auth_header(token, url)] clone_cmd += ["clone", "--single-branch"] # TODO: Re-enable --recurse-submodules when submodule support is needed diff --git a/src/gitingest/utils/git_utils.py b/src/gitingest/utils/git_utils.py index 7d18499e..acf7d806 100644 --- a/src/gitingest/utils/git_utils.py +++ b/src/gitingest/utils/git_utils.py @@ -10,6 +10,26 @@ GITHUB_PAT_PATTERN = r"^(?:github_pat_|ghp_)[A-Za-z0-9_]{36,}$" +def _is_github_host(url: str) -> bool: + """ + Check if a URL is from a GitHub host (github.com or GitHub Enterprise). + + Parameters + ---------- + url : str + The URL to check + + Returns + ------- + bool + True if the URL is from a GitHub host, False otherwise + """ + from urllib.parse import urlparse + parsed = urlparse(url) + hostname = parsed.hostname or "" + return hostname == "github.com" or hostname.startswith("github.") + + async def run_command(*args: str) -> Tuple[bytes, bytes]: """ Execute a shell command asynchronously and return (stdout, stderr) bytes. @@ -80,7 +100,7 @@ async def check_repo_exists(url: str, token: Optional[str] = None) -> bool: RuntimeError If the curl command returns an unexpected status code. """ - if token and "github.com" in url: + if token and _is_github_host(url): return await _check_github_repo_exists(url, token) proc = await asyncio.create_subprocess_exec( @@ -131,12 +151,18 @@ async def _check_github_repo_exists(url: str, token: Optional[str] = None) -> bo RuntimeError If the repository is not found, if the provided URL is invalid, or if the token format is invalid. """ - m = re.match(r"https?://github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$", url) + m = re.match(r"https?://github\.([^/]*)/([^/]+)/([^/]+?)(?:\.git)?/?$", url) if not m: - raise ValueError(f"Un-recognised GitHub URL: {url!r}") - owner, repo = m.groups() - - api = f"https://api.github.com/repos/{owner}/{repo}" + m = re.match(r"https?://github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$", url) + if not m: + raise ValueError(f"Un-recognised GitHub URL: {url!r}") + owner, repo = m.groups() + api = f"https://api.github.com/repos/{owner}/{repo}" + else: + domain, owner, repo = m.groups() + from urllib.parse import urlparse + parsed = urlparse(url) + api = f"https://{parsed.hostname}/api/v3/repos/{owner}/{repo}" cmd = [ "curl", "--silent", @@ -189,8 +215,8 @@ async def fetch_remote_branch_list(url: str, token: Optional[str] = None) -> Lis fetch_branches_command = ["git"] # Add authentication if needed - if token and "github.com" in url: - fetch_branches_command += ["-c", create_git_auth_header(token)] + if token and _is_github_host(url): + fetch_branches_command += ["-c", create_git_auth_header(token, url)] fetch_branches_command += ["ls-remote", "--heads", url] @@ -225,13 +251,13 @@ def create_git_command(base_cmd: List[str], local_path: str, url: str, token: Op The git command with authentication if needed """ cmd = base_cmd + ["-C", local_path] - if token and url.startswith("https://github.com"): + if token and _is_github_host(url): validate_github_token(token) - cmd += ["-c", create_git_auth_header(token)] + cmd += ["-c", create_git_auth_header(token, url)] return cmd -def create_git_auth_header(token: str) -> str: +def create_git_auth_header(token: str, url: str = "https://github.com") -> str: """Create a Basic authentication header for GitHub git operations. Parameters @@ -244,8 +270,11 @@ def create_git_auth_header(token: str) -> str: str The git config command for setting the authentication header """ + from urllib.parse import urlparse + parsed = urlparse(url) + hostname = parsed.hostname or "github.com" basic = base64.b64encode(f"x-oauth-basic:{token}".encode()).decode() - return f"http.https://github.com/.extraheader=Authorization: Basic {basic}" + return f"http.https://{hostname}/.extraheader=Authorization: Basic {basic}" def validate_github_token(token: str) -> None: diff --git a/src/gitingest/utils/query_parser_utils.py b/src/gitingest/utils/query_parser_utils.py index 2922dd2a..0181d371 100644 --- a/src/gitingest/utils/query_parser_utils.py +++ b/src/gitingest/utils/query_parser_utils.py @@ -85,8 +85,8 @@ def _looks_like_git_host(host: str) -> bool: """ Check if the given host looks like a Git host. - The current heuristic returns `True` when the host starts with `git.` (e.g. `git.example.com`) or starts with - `gitlab.` (e.g. `gitlab.company.com`). + The current heuristic returns `True` when the host starts with `git.` (e.g. `git.example.com`), starts with + `gitlab.` (e.g. `gitlab.company.com`), or starts with `github.` (e.g. `github.company.com` for GitHub Enterprise). Parameters ---------- @@ -99,7 +99,7 @@ def _looks_like_git_host(host: str) -> bool: True if the host looks like a Git host, otherwise False. """ host = host.lower() - return host.startswith(("git.", "gitlab.")) + return host.startswith(("git.", "gitlab.", "github.")) def _validate_url_scheme(scheme: str) -> None: From 5344fb521a89804f3fc2e558d66672eb081c30d4 Mon Sep 17 00:00:00 2001 From: Juan Cruz-Benito Date: Fri, 27 Jun 2025 17:14:07 +0200 Subject: [PATCH 2/2] Fully fixing retro-compatibility with existing tests & adding GHE-related tests --- src/gitingest/cloning.py | 9 ++- src/gitingest/utils/git_utils.py | 31 ++++++-- tests/test_git_utils.py | 122 +++++++++++++++++++++++++++++++ 3 files changed, 153 insertions(+), 9 deletions(-) diff --git a/src/gitingest/cloning.py b/src/gitingest/cloning.py index 682e5ff0..1d4487fb 100644 --- a/src/gitingest/cloning.py +++ b/src/gitingest/cloning.py @@ -2,6 +2,7 @@ from pathlib import Path from typing import Optional +from urllib.parse import urlparse from gitingest.config import DEFAULT_TIMEOUT from gitingest.schemas import CloneConfig @@ -61,7 +62,13 @@ async def clone_repo(config: CloneConfig, token: Optional[str] = None) -> None: clone_cmd = ["git"] if token and _is_github_host(url): - clone_cmd += ["-c", create_git_auth_header(token, url)] + # Only pass URL if it's not the default github.com to maintain backward compatibility + + parsed = urlparse(url) + if parsed.hostname == "github.com": + clone_cmd += ["-c", create_git_auth_header(token)] + else: + clone_cmd += ["-c", create_git_auth_header(token, url)] clone_cmd += ["clone", "--single-branch"] # TODO: Re-enable --recurse-submodules when submodule support is needed diff --git a/src/gitingest/utils/git_utils.py b/src/gitingest/utils/git_utils.py index acf7d806..5735c27e 100644 --- a/src/gitingest/utils/git_utils.py +++ b/src/gitingest/utils/git_utils.py @@ -4,6 +4,7 @@ import base64 import re from typing import List, Optional, Tuple +from urllib.parse import urlparse from gitingest.utils.exceptions import InvalidGitHubTokenError @@ -13,18 +14,17 @@ def _is_github_host(url: str) -> bool: """ Check if a URL is from a GitHub host (github.com or GitHub Enterprise). - + Parameters ---------- url : str The URL to check - + Returns ------- bool True if the URL is from a GitHub host, False otherwise """ - from urllib.parse import urlparse parsed = urlparse(url) hostname = parsed.hostname or "" return hostname == "github.com" or hostname.startswith("github.") @@ -159,8 +159,8 @@ async def _check_github_repo_exists(url: str, token: Optional[str] = None) -> bo owner, repo = m.groups() api = f"https://api.github.com/repos/{owner}/{repo}" else: - domain, owner, repo = m.groups() - from urllib.parse import urlparse + _, owner, repo = m.groups() + parsed = urlparse(url) api = f"https://{parsed.hostname}/api/v3/repos/{owner}/{repo}" cmd = [ @@ -216,7 +216,13 @@ async def fetch_remote_branch_list(url: str, token: Optional[str] = None) -> Lis # Add authentication if needed if token and _is_github_host(url): - fetch_branches_command += ["-c", create_git_auth_header(token, url)] + # Only pass URL if it's not the default github.com to maintain backward compatibility + + parsed = urlparse(url) + if parsed.hostname == "github.com": + fetch_branches_command += ["-c", create_git_auth_header(token)] + else: + fetch_branches_command += ["-c", create_git_auth_header(token, url)] fetch_branches_command += ["ls-remote", "--heads", url] @@ -253,7 +259,13 @@ def create_git_command(base_cmd: List[str], local_path: str, url: str, token: Op cmd = base_cmd + ["-C", local_path] if token and _is_github_host(url): validate_github_token(token) - cmd += ["-c", create_git_auth_header(token, url)] + # Only pass URL if it's not the default github.com to maintain backward compatibility + + parsed = urlparse(url) + if parsed.hostname == "github.com": + cmd += ["-c", create_git_auth_header(token)] + else: + cmd += ["-c", create_git_auth_header(token, url)] return cmd @@ -264,13 +276,16 @@ def create_git_auth_header(token: str, url: str = "https://github.com") -> str: ---------- token : str GitHub personal access token + url : str + The GitHub URL to create the authentication header for. + Defaults to "https://github.com". Returns ------- str The git config command for setting the authentication header """ - from urllib.parse import urlparse + parsed = urlparse(url) hostname = parsed.hostname or "github.com" basic = base64.b64encode(f"x-oauth-basic:{token}".encode()).decode() diff --git a/tests/test_git_utils.py b/tests/test_git_utils.py index 9d4e842d..df6e4e72 100644 --- a/tests/test_git_utils.py +++ b/tests/test_git_utils.py @@ -11,6 +11,7 @@ from gitingest.utils.exceptions import InvalidGitHubTokenError from gitingest.utils.git_utils import ( + _is_github_host, create_git_auth_header, create_git_command, validate_github_token, @@ -140,3 +141,124 @@ def test_create_git_command_helper_calls(mocker, url, token, should_call): header_mock.assert_not_called() # HEADER should not be included in command list assert "HEADER" not in cmd + + +@pytest.mark.parametrize( + "url, expected", + [ + # GitHub.com URLs + ("https://github.com/owner/repo.git", True), + ("http://github.com/owner/repo.git", True), + ("https://github.com/owner/repo", True), + # GitHub Enterprise URLs + ("https://github.company.com/owner/repo.git", True), + ("https://github.enterprise.org/owner/repo.git", True), + ("http://github.internal/owner/repo.git", True), + ("https://github.example.co.uk/owner/repo.git", True), + # Non-GitHub URLs + ("https://gitlab.com/owner/repo.git", False), + ("https://bitbucket.org/owner/repo.git", False), + ("https://git.example.com/owner/repo.git", False), + ("https://mygithub.com/owner/repo.git", False), # doesn't start with "github." + ("https://subgithub.com/owner/repo.git", False), + ("https://example.com/github/repo.git", False), + # Edge cases + ("", False), + ("not-a-url", False), + ("ftp://github.com/owner/repo.git", True), # Different protocol but still github.com + ], +) +def test_is_github_host(url, expected): + """_is_github_host should correctly identify GitHub and GitHub Enterprise URLs.""" + assert _is_github_host(url) == expected + + +@pytest.mark.parametrize( + "token, url, expected_hostname", + [ + # GitHub.com URLs (default) + ("ghp_" + "a" * 36, "https://github.com", "github.com"), + ("ghp_" + "a" * 36, "https://github.com/owner/repo.git", "github.com"), + # GitHub Enterprise URLs + ("ghp_" + "b" * 36, "https://github.company.com", "github.company.com"), + ("ghp_" + "c" * 36, "https://github.enterprise.org/owner/repo.git", "github.enterprise.org"), + ("ghp_" + "d" * 36, "http://github.internal", "github.internal"), + ], +) +def test_create_git_auth_header_with_ghe_url(token, url, expected_hostname): + """create_git_auth_header should handle GitHub Enterprise URLs correctly.""" + header = create_git_auth_header(token, url) + expected_basic = base64.b64encode(f"x-oauth-basic:{token}".encode()).decode() + expected = f"http.https://{expected_hostname}/.extraheader=Authorization: Basic {expected_basic}" + assert header == expected + + +@pytest.mark.parametrize( + "base_cmd, local_path, url, token, expected_auth_hostname", + [ + # GitHub.com URLs - should use default hostname + ( + ["git", "clone"], + "/some/path", + "https://github.com/owner/repo.git", + "ghp_" + "a" * 36, + "github.com", + ), + # GitHub Enterprise URLs - should use custom hostname + ( + ["git", "clone"], + "/some/path", + "https://github.company.com/owner/repo.git", + "ghp_" + "b" * 36, + "github.company.com", + ), + ( + ["git", "clone"], + "/some/path", + "https://github.enterprise.org/owner/repo.git", + "ghp_" + "c" * 36, + "github.enterprise.org", + ), + ( + ["git", "clone"], + "/some/path", + "http://github.internal/owner/repo.git", + "ghp_" + "d" * 36, + "github.internal", + ), + ], +) +def test_create_git_command_with_ghe_urls(base_cmd, local_path, url, token, expected_auth_hostname): + """create_git_command should handle GitHub Enterprise URLs correctly.""" + cmd = create_git_command(base_cmd, local_path, url, token) + + # Should have base command and -C option + expected_prefix = base_cmd + ["-C", local_path] + assert cmd[: len(expected_prefix)] == expected_prefix + + # Should have -c and auth header + assert "-c" in cmd + auth_header_index = cmd.index("-c") + 1 + auth_header = cmd[auth_header_index] + + # Verify the auth header contains the expected hostname + assert f"http.https://{expected_auth_hostname}/" in auth_header + assert "Authorization: Basic" in auth_header + + +@pytest.mark.parametrize( + "base_cmd, local_path, url, token", + [ + # Should NOT add auth headers for non-GitHub URLs + (["git", "clone"], "/some/path", "https://gitlab.com/owner/repo.git", "ghp_" + "a" * 36), + (["git", "clone"], "/some/path", "https://bitbucket.org/owner/repo.git", "ghp_" + "b" * 36), + (["git", "clone"], "/some/path", "https://git.example.com/owner/repo.git", "ghp_" + "c" * 36), + ], +) +def test_create_git_command_ignores_non_github_urls(base_cmd, local_path, url, token): + """create_git_command should not add auth headers for non-GitHub URLs.""" + cmd = create_git_command(base_cmd, local_path, url, token) + + # Should only have base command and -C option, no auth headers + expected = base_cmd + ["-C", local_path] + assert cmd == expected