Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions src/gitingest/cloning.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@

from pathlib import Path
from typing import Optional
from urllib.parse import urlparse

from gitingest.config import DEFAULT_TIMEOUT
from gitingest.schemas import CloneConfig
from gitingest.utils.git_utils import (
_is_github_host,
check_repo_exists,
create_git_auth_header,
create_git_command,
Expand Down Expand Up @@ -48,7 +50,7 @@ async def clone_repo(config: CloneConfig, token: Optional[str] = None) -> None:
partial_clone: bool = config.subpath != "/"

# Validate token if provided
if token and url.startswith("https://github.com"):
if token and _is_github_host(url):
validate_github_token(token)

# Create parent directory if it doesn't exist
Expand All @@ -59,8 +61,14 @@ async def clone_repo(config: CloneConfig, token: Optional[str] = None) -> None:
raise ValueError("Repository not found. Make sure it is public or that you have provided a valid token.")

clone_cmd = ["git"]
if token and url.startswith("https://github.com"):
clone_cmd += ["-c", create_git_auth_header(token)]
if token and _is_github_host(url):
# Only pass URL if it's not the default github.com to maintain backward compatibility

parsed = urlparse(url)
if parsed.hostname == "github.com":
clone_cmd += ["-c", create_git_auth_header(token)]
else:
clone_cmd += ["-c", create_git_auth_header(token, url)]

clone_cmd += ["clone", "--single-branch"]
# TODO: Re-enable --recurse-submodules when submodule support is needed
Expand Down
68 changes: 56 additions & 12 deletions src/gitingest/utils/git_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,32 @@
import base64
import re
from typing import List, Optional, Tuple
from urllib.parse import urlparse

from gitingest.utils.exceptions import InvalidGitHubTokenError

GITHUB_PAT_PATTERN = r"^(?:github_pat_|ghp_)[A-Za-z0-9_]{36,}$"


def _is_github_host(url: str) -> bool:
"""
Check if a URL is from a GitHub host (github.com or GitHub Enterprise).

Parameters
----------
url : str
The URL to check

Returns
-------
bool
True if the URL is from a GitHub host, False otherwise
"""
parsed = urlparse(url)
hostname = parsed.hostname or ""
return hostname == "github.com" or hostname.startswith("github.")


async def run_command(*args: str) -> Tuple[bytes, bytes]:
"""
Execute a shell command asynchronously and return (stdout, stderr) bytes.
Expand Down Expand Up @@ -80,7 +100,7 @@ async def check_repo_exists(url: str, token: Optional[str] = None) -> bool:
RuntimeError
If the curl command returns an unexpected status code.
"""
if token and "github.com" in url:
if token and _is_github_host(url):
return await _check_github_repo_exists(url, token)

proc = await asyncio.create_subprocess_exec(
Expand Down Expand Up @@ -131,12 +151,18 @@ async def _check_github_repo_exists(url: str, token: Optional[str] = None) -> bo
RuntimeError
If the repository is not found, if the provided URL is invalid, or if the token format is invalid.
"""
m = re.match(r"https?://github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$", url)
m = re.match(r"https?://github\.([^/]*)/([^/]+)/([^/]+?)(?:\.git)?/?$", url)
if not m:
raise ValueError(f"Un-recognised GitHub URL: {url!r}")
owner, repo = m.groups()

api = f"https://api.github.com/repos/{owner}/{repo}"
m = re.match(r"https?://github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$", url)
if not m:
raise ValueError(f"Un-recognised GitHub URL: {url!r}")
owner, repo = m.groups()
api = f"https://api.github.com/repos/{owner}/{repo}"
else:
_, owner, repo = m.groups()

parsed = urlparse(url)
api = f"https://{parsed.hostname}/api/v3/repos/{owner}/{repo}"
cmd = [
"curl",
"--silent",
Expand Down Expand Up @@ -189,8 +215,14 @@ async def fetch_remote_branch_list(url: str, token: Optional[str] = None) -> Lis
fetch_branches_command = ["git"]

# Add authentication if needed
if token and "github.com" in url:
fetch_branches_command += ["-c", create_git_auth_header(token)]
if token and _is_github_host(url):
# Only pass URL if it's not the default github.com to maintain backward compatibility

parsed = urlparse(url)
if parsed.hostname == "github.com":
fetch_branches_command += ["-c", create_git_auth_header(token)]
else:
fetch_branches_command += ["-c", create_git_auth_header(token, url)]

fetch_branches_command += ["ls-remote", "--heads", url]

Expand Down Expand Up @@ -225,27 +257,39 @@ def create_git_command(base_cmd: List[str], local_path: str, url: str, token: Op
The git command with authentication if needed
"""
cmd = base_cmd + ["-C", local_path]
if token and url.startswith("https://github.com"):
if token and _is_github_host(url):
validate_github_token(token)
cmd += ["-c", create_git_auth_header(token)]
# Only pass URL if it's not the default github.com to maintain backward compatibility

parsed = urlparse(url)
if parsed.hostname == "github.com":
cmd += ["-c", create_git_auth_header(token)]
else:
cmd += ["-c", create_git_auth_header(token, url)]
return cmd


def create_git_auth_header(token: str, url: str = "https://github.com") -> str:
    """Create a Basic authentication header for GitHub git operations.

    The result is a ``key=value`` string meant to be passed to ``git -c``. The key is
    ``http.https://<host>[:<port>]/.extraheader``, so the header is scoped to the host
    (and explicit port, if any) taken from ``url``.

    Parameters
    ----------
    token : str
        GitHub personal access token
    url : str
        The GitHub URL to create the authentication header for.
        Defaults to "https://github.com".

    Returns
    -------
    str
        The git config command for setting the authentication header
    """
    parsed = urlparse(url)
    hostname = parsed.hostname or "github.com"

    try:
        port = parsed.port
    except ValueError:
        # A non-numeric or out-of-range port cannot be used in the key; fall back to the bare host.
        port = None

    # git compares the port in an ``http.<url>.*`` key exactly, so an explicit non-default port
    # must be kept or the header would never be sent to that server. 443 is the https default
    # and is left implicit so the key for plain github.com stays unchanged.
    authority = hostname if port in (None, 443) else f"{hostname}:{port}"

    basic = base64.b64encode(f"x-oauth-basic:{token}".encode()).decode()
    # The key always uses the https scheme, whatever scheme ``url`` itself carries.
    return f"http.https://{authority}/.extraheader=Authorization: Basic {basic}"


def validate_github_token(token: str) -> None:
Expand Down
6 changes: 3 additions & 3 deletions src/gitingest/utils/query_parser_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,8 @@ def _looks_like_git_host(host: str) -> bool:
"""
Check if the given host looks like a Git host.

The current heuristic returns `True` when the host starts with `git.` (e.g. `git.example.com`) or starts with
`gitlab.` (e.g. `gitlab.company.com`).
The current heuristic returns `True` when the host starts with `git.` (e.g. `git.example.com`), starts with
`gitlab.` (e.g. `gitlab.company.com`), or starts with `github.` (e.g. `github.company.com` for GitHub Enterprise).

Parameters
----------
Expand All @@ -99,7 +99,7 @@ def _looks_like_git_host(host: str) -> bool:
True if the host looks like a Git host, otherwise False.
"""
host = host.lower()
return host.startswith(("git.", "gitlab."))
return host.startswith(("git.", "gitlab.", "github."))


def _validate_url_scheme(scheme: str) -> None:
Expand Down
122 changes: 122 additions & 0 deletions tests/test_git_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from gitingest.utils.exceptions import InvalidGitHubTokenError
from gitingest.utils.git_utils import (
_is_github_host,
create_git_auth_header,
create_git_command,
validate_github_token,
Expand Down Expand Up @@ -140,3 +141,124 @@ def test_create_git_command_helper_calls(mocker, url, token, should_call):
header_mock.assert_not_called()
# HEADER should not be included in command list
assert "HEADER" not in cmd


@pytest.mark.parametrize(
    "url, expected",
    [
        # GitHub.com URLs
        ("https://github.com/owner/repo.git", True),
        ("http://github.com/owner/repo.git", True),
        ("https://github.com/owner/repo", True),
        # GitHub Enterprise URLs
        ("https://github.company.com/owner/repo.git", True),
        ("https://github.enterprise.org/owner/repo.git", True),
        ("http://github.internal/owner/repo.git", True),
        ("https://github.example.co.uk/owner/repo.git", True),
        # Non-GitHub URLs
        ("https://gitlab.com/owner/repo.git", False),
        ("https://bitbucket.org/owner/repo.git", False),
        ("https://git.example.com/owner/repo.git", False),
        ("https://mygithub.com/owner/repo.git", False),  # doesn't start with "github."
        ("https://subgithub.com/owner/repo.git", False),
        ("https://example.com/github/repo.git", False),
        # Edge cases
        ("", False),
        ("not-a-url", False),
        ("ftp://github.com/owner/repo.git", True),  # Different protocol but still github.com
    ],
)
def test_is_github_host(url, expected):
    """Host detection accepts github.com and ``github.*`` hosts and rejects every other input."""
    detected = _is_github_host(url)
    assert detected == expected


@pytest.mark.parametrize(
    "token, url, expected_hostname",
    [
        # GitHub.com URLs (default)
        ("ghp_" + "a" * 36, "https://github.com", "github.com"),
        ("ghp_" + "a" * 36, "https://github.com/owner/repo.git", "github.com"),
        # GitHub Enterprise URLs
        ("ghp_" + "b" * 36, "https://github.company.com", "github.company.com"),
        ("ghp_" + "c" * 36, "https://github.enterprise.org/owner/repo.git", "github.enterprise.org"),
        ("ghp_" + "d" * 36, "http://github.internal", "github.internal"),
    ],
)
def test_create_git_auth_header_with_ghe_url(token, url, expected_hostname):
    """The auth header key must be scoped to the hostname of the URL that was passed in."""
    credentials = f"x-oauth-basic:{token}".encode()
    encoded = base64.b64encode(credentials).decode()

    assert create_git_auth_header(token, url) == (
        f"http.https://{expected_hostname}/.extraheader=Authorization: Basic {encoded}"
    )


@pytest.mark.parametrize(
    "base_cmd, local_path, url, token, expected_auth_hostname",
    [
        # GitHub.com URLs - should use default hostname
        (
            ["git", "clone"],
            "/some/path",
            "https://github.com/owner/repo.git",
            "ghp_" + "a" * 36,
            "github.com",
        ),
        # GitHub Enterprise URLs - should use custom hostname
        (
            ["git", "clone"],
            "/some/path",
            "https://github.company.com/owner/repo.git",
            "ghp_" + "b" * 36,
            "github.company.com",
        ),
        (
            ["git", "clone"],
            "/some/path",
            "https://github.enterprise.org/owner/repo.git",
            "ghp_" + "c" * 36,
            "github.enterprise.org",
        ),
        (
            ["git", "clone"],
            "/some/path",
            "http://github.internal/owner/repo.git",
            "ghp_" + "d" * 36,
            "github.internal",
        ),
    ],
)
def test_create_git_command_with_ghe_urls(base_cmd, local_path, url, token, expected_auth_hostname):
    """A token plus a GitHub or GitHub Enterprise URL yields a command carrying a host-scoped auth header."""
    cmd = create_git_command(base_cmd, local_path, url, token)

    # The command starts with the base command followed by the -C <path> option.
    prefix = [*base_cmd, "-C", local_path]
    assert cmd[: len(prefix)] == prefix

    # The value right after the first -c flag is the auth header config entry.
    assert "-c" in cmd
    auth_header = cmd[cmd.index("-c") + 1]

    # The entry must be scoped to the expected host and carry Basic credentials.
    assert f"http.https://{expected_auth_hostname}/" in auth_header
    assert "Authorization: Basic" in auth_header


@pytest.mark.parametrize(
    "base_cmd, local_path, url, token",
    [
        # Should NOT add auth headers for non-GitHub URLs
        (["git", "clone"], "/some/path", "https://gitlab.com/owner/repo.git", "ghp_" + "a" * 36),
        (["git", "clone"], "/some/path", "https://bitbucket.org/owner/repo.git", "ghp_" + "b" * 36),
        (["git", "clone"], "/some/path", "https://git.example.com/owner/repo.git", "ghp_" + "c" * 36),
    ],
)
def test_create_git_command_ignores_non_github_urls(base_cmd, local_path, url, token):
    """For hosts that are not GitHub, the token is ignored and no auth header is appended."""
    cmd = create_git_command(base_cmd, local_path, url, token)

    # Nothing beyond the base command and the -C <path> option may be present.
    assert cmd == [*base_cmd, "-C", local_path]
Loading