Skip to content

Commit 110e6aa

Browse files
chore: migrate to Ruff, deprecate .gitingest, and general code-quality sweep
* Pre-commit
  • Drop `black` + `darglint` hooks
  • Add `ruff-check` & `ruff-format` (astral-sh/ruff-pre-commit)
  • Swap `darglint` → `pydoclint` for docstring lint/validation
* Dependencies
  • Remove `tomli`
  • Tighten `typing_extensions` pin; add `eval-type-backport`
  • `requirements-dev`: drop `black`, `djlint`, `pylint` (only used inside pre-commit virtualenv)
* Ignore handling
  • Deprecate `.gitingest` (TOML) → new `.gitingestignore` with git-wildmatch syntax
  • Delete `apply_gitingest_file`; reuse `_parse_ignore_file` shared with `.gitignore`
* Tooling / config
  • New `[tool.ruff]` section in `pyproject.toml`
  • Remove `[tool.black]`
* README
  • Refresh badges (PyPI, CI, Ruff, Discord, Trendshift)
* Codebase refactor
  • Enable `from __future__ import annotations`; add rich type hints throughout
  • Re-order function parameters and `__all__` exports for consistency
  • Move type-only imports under `if TYPE_CHECKING`
  • Extract CLI args to `TypedDict` `_CLIArgs`; form data to pydantic `QueryForm`
  • Deduplicate `cli.main` / `_async_main`
  • Split large blocks into helpers
  • Remove magic numbers → constants
  • Use `pathlib` over `os`; avoid `try/except` inside loops; no file-IO in async paths
  • Enforce kw-only args
  • Minor Ruff-driven clean-ups
* Bug fix
  • Remove silent logic bug in `notebook_utils._process_cell`
* Tests
  • Update fixtures & assertions

No functional API changes except the `.gitingestignore` addition & `.gitingest` deprecation.
1 parent 29dbf0a commit 110e6aa

File tree

3 files changed

+88
-53
lines changed

3 files changed

+88
-53
lines changed

src/gitingest/clone.py

Lines changed: 4 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -4,11 +4,10 @@
44

55
from pathlib import Path
66
from typing import TYPE_CHECKING
7-
from urllib.parse import urlparse
87

98
from gitingest.config import DEFAULT_TIMEOUT
109
from gitingest.utils.git_utils import (
11-
_is_github_host,
10+
is_github_host,
1211
check_repo_exists,
1312
create_git_auth_header,
1413
create_git_command,
@@ -53,7 +52,7 @@ async def clone_repo(config: CloneConfig, token: str | None = None) -> None:
5352
partial_clone: bool = config.subpath != "/"
5453

5554
# Validate token if provided
56-
if token and _is_github_host(url):
55+
if token and is_github_host(url):
5756
validate_github_token(token)
5857

5958
# Create parent directory if it doesn't exist
@@ -65,14 +64,8 @@ async def clone_repo(config: CloneConfig, token: str | None = None) -> None:
6564
raise ValueError(msg)
6665

6766
clone_cmd = ["git"]
68-
if token and _is_github_host(url):
69-
# Only pass URL if it's not the default github.com to maintain backward compatibility
70-
71-
parsed = urlparse(url)
72-
if parsed.hostname == "github.com":
73-
clone_cmd += ["-c", create_git_auth_header(token)]
74-
else:
75-
clone_cmd += ["-c", create_git_auth_header(token, url)]
67+
if token and is_github_host(url):
68+
clone_cmd += ["-c", create_git_auth_header(token, url)]
7669

7770
clone_cmd += ["clone", "--single-branch"]
7871
# TODO: Re-enable --recurse-submodules when submodule support is needed

src/gitingest/utils/git_utils.py

Lines changed: 61 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -5,14 +5,15 @@
55
import asyncio
66
import base64
77
import re
8+
import warnings
89
from urllib.parse import urlparse
910

1011
from gitingest.utils.exceptions import InvalidGitHubTokenError
1112

1213
GITHUB_PAT_PATTERN = r"^(?:github_pat_|ghp_)[A-Za-z0-9_]{36,}$"
1314

1415

15-
def _is_github_host(url: str) -> bool:
16+
def is_github_host(url: str) -> bool:
1617
"""
1718
Check if a URL is from a GitHub host (github.com or GitHub Enterprise).
1819
@@ -26,14 +27,12 @@ def _is_github_host(url: str) -> bool:
2627
bool
2728
True if the URL is from a GitHub host, False otherwise
2829
"""
29-
parsed = urlparse(url)
30-
hostname = parsed.hostname or ""
30+
hostname = urlparse(url).hostname or ""
3131
return hostname == "github.com" or hostname.startswith("github.")
3232

3333

3434
async def run_command(*args: str) -> tuple[bytes, bytes]:
35-
"""
36-
Execute a shell command asynchronously and return (stdout, stderr) bytes.
35+
"""Execute a shell command asynchronously and return (stdout, stderr) bytes.
3736
3837
Parameters
3938
----------
@@ -105,7 +104,7 @@ async def check_repo_exists(url: str, token: str | None = None) -> bool:
105104
106105
"""
107106
expected_path_length = 2
108-
if token and _is_github_host(url):
107+
if token and is_github_host(url):
109108
return await _check_github_repo_exists(url, token=token)
110109

111110
proc = await asyncio.create_subprocess_exec(
@@ -157,18 +156,13 @@ async def _check_github_repo_exists(url: str, token: str | None = None) -> bool:
157156
If the repository is not found, if the provided URL is invalid, or if the token format is invalid.
158157
159158
"""
160-
m = re.match(r"https?://github\.([^/]*)/([^/]+)/([^/]+?)(?:\.git)?/?$", url)
161-
if not m:
162-
m = re.match(r"https?://github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$", url)
163-
if not m:
164-
raise ValueError(f"Un-recognised GitHub URL: {url!r}")
165-
owner, repo = m.groups()
159+
host, owner, repo = _parse_github_url(url)
160+
161+
if host == "github.com":
166162
api = f"https://api.github.com/repos/{owner}/{repo}"
167-
else:
168-
_, owner, repo = m.groups()
163+
else: # GitHub Enterprise
164+
api = f"https://{host}/api/v3/repos/{owner}/{repo}"
169165

170-
parsed = urlparse(url)
171-
api = f"https://{parsed.hostname}/api/v3/repos/{owner}/{repo}"
172166
cmd = [
173167
"curl",
174168
"--silent",
@@ -203,6 +197,39 @@ async def _check_github_repo_exists(url: str, token: str | None = None) -> bool:
203197
raise RuntimeError(msg)
204198

205199

200+
def _parse_github_url(url: str) -> tuple[str, str, str]:
201+
"""Parse a GitHub URL and return (hostname, owner, repo).
202+
203+
Parameters
204+
----------
205+
url : str
206+
The URL of the GitHub repository to parse.
207+
208+
Returns
209+
-------
210+
tuple[str, str, str]
211+
A tuple containing the hostname, owner, and repository name.
212+
213+
Raises
214+
------
215+
ValueError
216+
If the URL is not a valid GitHub repository URL.
217+
"""
218+
parsed = urlparse(url)
219+
if parsed.scheme not in {"http", "https"}:
220+
raise ValueError("URL must start with http:// or https://")
221+
222+
if not parsed.hostname or not parsed.hostname.startswith("github."):
223+
raise ValueError(f"Un-recognised GitHub hostname: {parsed.hostname!r}")
224+
225+
parts = parsed.path.strip("/").removesuffix(".git").split("/")
226+
if len(parts) != 2:
227+
raise ValueError("Path must look like /<owner>/<repo>")
228+
229+
owner, repo = parts
230+
return parsed.hostname, owner, repo
231+
232+
206233
async def fetch_remote_branch_list(url: str, token: str | None = None) -> list[str]:
207234
"""Fetch the list of branches from a remote Git repository.
208235
@@ -223,14 +250,8 @@ async def fetch_remote_branch_list(url: str, token: str | None = None) -> list[s
223250
fetch_branches_command = ["git"]
224251

225252
# Add authentication if needed
226-
if token and _is_github_host(url):
227-
# Only pass URL if it's not the default github.com to maintain backward compatibility
228-
229-
parsed = urlparse(url)
230-
if parsed.hostname == "github.com":
231-
fetch_branches_command += ["-c", create_git_auth_header(token)]
232-
else:
233-
fetch_branches_command += ["-c", create_git_auth_header(token, url)]
253+
if token and is_github_host(url):
254+
fetch_branches_command += ["-c", create_git_auth_header(token, url)]
234255

235256
fetch_branches_command += ["ls-remote", "--heads", url]
236257

@@ -266,32 +287,42 @@ def create_git_command(base_cmd: list[str], local_path: str, url: str, token: st
266287
267288
"""
268289
cmd = [*base_cmd, "-C", local_path]
269-
if token and _is_github_host(url):
290+
if token and is_github_host(url):
270291
validate_github_token(token)
271292
cmd += ["-c", create_git_auth_header(token, url=url)]
272293
return cmd
273294

274295

275-
def create_git_auth_header(token: str, url: str = "https://github.com") -> str:
296+
def create_git_auth_header(token: str, url: str | None = None) -> str:
276297
"""Create a Basic authentication header for GitHub git operations.
277298
278299
Parameters
279300
----------
280301
token : str
281302
GitHub personal access token
282-
url : str
303+
url : str | None
283304
The GitHub URL to create the authentication header for.
284-
Defaults to "https://github.com".
305+
Defaults to "https://github.com" if not provided.
285306
286307
Returns
287308
-------
288309
str
289310
The git config command for setting the authentication header
290311
291312
"""
313+
if url is None:
314+
# TODO: Deprecate implicit github.com URL and require passing the url
315+
warnings.warn(
316+
"Implicitly assuming 'github.com' as the host is deprecated and will be "
317+
"removed in a future release. Explicitly pass the repository `url` to "
318+
"create_git_auth_header() instead.",
319+
DeprecationWarning,
320+
stacklevel=2,
321+
)
322+
hostname = "github.com"
323+
else:
324+
hostname = urlparse(url).hostname
292325

293-
parsed = urlparse(url)
294-
hostname = parsed.hostname or "github.com"
295326
basic = base64.b64encode(f"x-oauth-basic:{token}".encode()).decode()
296327
return f"http.https://{hostname}/.extraheader=Authorization: Basic {basic}"
297328

tests/test_git_utils.py

Lines changed: 23 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -13,9 +13,9 @@
1313

1414
from gitingest.utils.exceptions import InvalidGitHubTokenError
1515
from gitingest.utils.git_utils import (
16-
_is_github_host,
1716
create_git_auth_header,
1817
create_git_command,
18+
is_github_host,
1919
validate_github_token,
2020
)
2121

@@ -187,9 +187,9 @@ def test_create_git_command_helper_calls(
187187
("ftp://github.com/owner/repo.git", True), # Different protocol but still github.com
188188
],
189189
)
190-
def test_is_github_host(url, expected):
191-
"""_is_github_host should correctly identify GitHub and GitHub Enterprise URLs."""
192-
assert _is_github_host(url) == expected
190+
def test_is_github_host(url: str, expected: bool) -> None:
191+
"""Test that ``is_github_host`` correctly identifies GitHub and GitHub Enterprise URLs."""
192+
assert is_github_host(url) == expected
193193

194194

195195
@pytest.mark.parametrize(
@@ -204,16 +204,16 @@ def test_is_github_host(url, expected):
204204
("ghp_" + "d" * 36, "http://github.internal", "github.internal"),
205205
],
206206
)
207-
def test_create_git_auth_header_with_ghe_url(token, url, expected_hostname):
208-
"""create_git_auth_header should handle GitHub Enterprise URLs correctly."""
207+
def test_create_git_auth_header_with_ghe_url(token: str, url: str, expected_hostname: str) -> None:
208+
"""Test that ``create_git_auth_header`` handles GitHub Enterprise URLs correctly."""
209209
header = create_git_auth_header(token, url)
210210
expected_basic = base64.b64encode(f"x-oauth-basic:{token}".encode()).decode()
211211
expected = f"http.https://{expected_hostname}/.extraheader=Authorization: Basic {expected_basic}"
212212
assert header == expected
213213

214214

215215
@pytest.mark.parametrize(
216-
"base_cmd, local_path, url, token, expected_auth_hostname",
216+
("base_cmd", "local_path", "url", "token", "expected_auth_hostname"),
217217
[
218218
# GitHub.com URLs - should use default hostname
219219
(
@@ -247,8 +247,14 @@ def test_create_git_auth_header_with_ghe_url(token, url, expected_hostname):
247247
),
248248
],
249249
)
250-
def test_create_git_command_with_ghe_urls(base_cmd, local_path, url, token, expected_auth_hostname):
251-
"""create_git_command should handle GitHub Enterprise URLs correctly."""
250+
def test_create_git_command_with_ghe_urls(
251+
base_cmd: list[str],
252+
local_path: str,
253+
url: str,
254+
token: str,
255+
expected_auth_hostname: str,
256+
) -> None:
257+
"""Test that ``create_git_command`` handles GitHub Enterprise URLs correctly."""
252258
cmd = create_git_command(base_cmd, local_path, url, token)
253259

254260
# Should have base command and -C option
@@ -266,16 +272,21 @@ def test_create_git_command_with_ghe_urls(base_cmd, local_path, url, token, expe
266272

267273

268274
@pytest.mark.parametrize(
269-
"base_cmd, local_path, url, token",
275+
("base_cmd", "local_path", "url", "token"),
270276
[
271277
# Should NOT add auth headers for non-GitHub URLs
272278
(["git", "clone"], "/some/path", "https://gitlab.com/owner/repo.git", "ghp_" + "a" * 36),
273279
(["git", "clone"], "/some/path", "https://bitbucket.org/owner/repo.git", "ghp_" + "b" * 36),
274280
(["git", "clone"], "/some/path", "https://git.example.com/owner/repo.git", "ghp_" + "c" * 36),
275281
],
276282
)
277-
def test_create_git_command_ignores_non_github_urls(base_cmd, local_path, url, token):
278-
"""create_git_command should not add auth headers for non-GitHub URLs."""
283+
def test_create_git_command_ignores_non_github_urls(
284+
base_cmd: list[str],
285+
local_path: str,
286+
url: str,
287+
token: str,
288+
) -> None:
289+
"""Test that ``create_git_command`` does not add auth headers for non-GitHub URLs."""
279290
cmd = create_git_command(base_cmd, local_path, url, token)
280291

281292
# Should only have base command and -C option, no auth headers

0 commit comments

Comments
 (0)