-
+
diff --git a/src/templates/github.jinja b/src/templates/git.jinja
similarity index 97%
rename from src/templates/github.jinja
rename to src/templates/git.jinja
index c373367c..62def5c1 100644
--- a/src/templates/github.jinja
+++ b/src/templates/git.jinja
@@ -6,7 +6,7 @@
data-message="{{ error_message }}">{{ error_message }}
{% endif %}
{% with is_index=true, show_examples=false %}
- {% include 'components/github_form.jinja' %}
+ {% include 'components/git_form.jinja' %}
{% endwith %}
{% if loading %}
diff --git a/src/templates/index.jinja b/src/templates/index.jinja
index 467b2f3f..f5beac08 100644
--- a/src/templates/index.jinja
+++ b/src/templates/index.jinja
@@ -73,10 +73,10 @@
data-message="{{ error_message }}">{{ error_message }}
{% endif %}
{% with is_index=true, show_examples=true %}
- {% include 'components/github_form.jinja' %}
+ {% include 'components/git_form.jinja' %}
{% endwith %}
- You can also replace 'hub' with 'ingest' in any Github URL
+ You can also replace 'hub' with 'ingest' in any GitHub URL.
{% include 'components/result.jinja' %}
{% endblock %}
From 95b5e27bbcd8225be9db93dc11f7bfdf28eab8f8 Mon Sep 17 00:00:00 2001
From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Date: Fri, 10 Jan 2025 12:41:22 +0100
Subject: [PATCH 2/7] refactor: convert key functions and tests to asynchronous
- Made `parse_query` in query_parser.py asynchronous and awaited it in query_processor.py
- Made `main` in cli.py asynchronous
- Made `ingest` in repository_ingest.py asynchronous
- Updated test functions in test_query_parser.py to support async
---
src/gitingest/cli.py | 4 +-
src/gitingest/query_parser.py | 2 +-
src/gitingest/repository_ingest.py | 4 +-
src/query_processor.py | 2 +-
tests/test_query_parser.py | 80 +++++++++++++++---------------
5 files changed, 46 insertions(+), 46 deletions(-)
diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py
index ada231a4..371263a7 100644
--- a/src/gitingest/cli.py
+++ b/src/gitingest/cli.py
@@ -14,7 +14,7 @@
@click.option("--max-size", "-s", default=MAX_FILE_SIZE, help="Maximum file size to process in bytes")
@click.option("--exclude-pattern", "-e", multiple=True, help="Patterns to exclude")
@click.option("--include-pattern", "-i", multiple=True, help="Patterns to include")
-def main(
+async def main(
source: str,
output: str | None,
max_size: int,
@@ -54,7 +54,7 @@ def main(
if not output:
output = "digest.txt"
- summary, _, _ = ingest(source, max_size, include_patterns, exclude_patterns, output=output)
+ summary, _, _ = await ingest(source, max_size, include_patterns, exclude_patterns, output=output)
click.echo(f"Analysis complete! Output written to: {output}")
click.echo("\nSummary:")
diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py
index f232e63e..9b796ac9 100644
--- a/src/gitingest/query_parser.py
+++ b/src/gitingest/query_parser.py
@@ -15,7 +15,7 @@
HEX_DIGITS = set(string.hexdigits)
-def parse_query(
+async def parse_query(
source: str,
max_file_size: int,
from_web: bool,
diff --git a/src/gitingest/repository_ingest.py b/src/gitingest/repository_ingest.py
index a1149847..c7efa942 100644
--- a/src/gitingest/repository_ingest.py
+++ b/src/gitingest/repository_ingest.py
@@ -10,7 +10,7 @@
from gitingest.repository_clone import CloneConfig, clone_repo
-def ingest(
+async def ingest(
source: str,
max_file_size: int = 10 * 1024 * 1024, # 10 MB
include_patterns: list[str] | str | None = None,
@@ -52,7 +52,7 @@ def ingest(
If `clone_repo` does not return a coroutine, or if the `source` is of an unsupported type.
"""
try:
- query = parse_query(
+ query = await parse_query(
source=source,
max_file_size=max_file_size,
from_web=False,
diff --git a/src/query_processor.py b/src/query_processor.py
index 544a2eea..a66bdd3c 100644
--- a/src/query_processor.py
+++ b/src/query_processor.py
@@ -77,7 +77,7 @@ async def process_query(
}
try:
- query = parse_query(
+ query = await parse_query(
source=input_text,
max_file_size=max_file_size,
from_web=True,
diff --git a/tests/test_query_parser.py b/tests/test_query_parser.py
index 97a829d9..472875f7 100644
--- a/tests/test_query_parser.py
+++ b/tests/test_query_parser.py
@@ -8,9 +8,9 @@
from gitingest.query_parser import _parse_patterns, _parse_url, parse_query
-def test_parse_url_valid_https() -> None:
+async def test_parse_url_valid_https() -> None:
"""
- Test `_parse_url` with valid HTTPS URLs from supported platforms (GitHub, GitLab, Bitbucket).
+ Test `_parse_url` with valid HTTPS URLs from supported platforms (GitHub, GitLab, Bitbucket, Gitea).
Verifies that user and repository names are correctly extracted.
"""
test_cases = [
@@ -19,13 +19,13 @@ def test_parse_url_valid_https() -> None:
"https://bitbucket.org/user/repo",
]
for url in test_cases:
- result = _parse_url(url)
+ result = await _parse_url(url)
assert result["user_name"] == "user"
assert result["repo_name"] == "repo"
assert result["url"] == url
-def test_parse_url_valid_http() -> None:
+async def test_parse_url_valid_http() -> None:
"""
Test `_parse_url` with valid HTTP URLs from supported platforms.
Verifies that user and repository names, as well as the slug, are correctly extracted.
@@ -36,88 +36,88 @@ def test_parse_url_valid_http() -> None:
"http://bitbucket.org/user/repo",
]
for url in test_cases:
- result = _parse_url(url)
+ result = await _parse_url(url)
assert result["user_name"] == "user"
assert result["repo_name"] == "repo"
assert result["slug"] == "user-repo"
-def test_parse_url_invalid() -> None:
+async def test_parse_url_invalid() -> None:
"""
Test `_parse_url` with an invalid URL that does not include a repository structure.
Verifies that a ValueError is raised with an appropriate error message.
"""
- url = "https://only-domain.com"
+ url = "https://github.com"
with pytest.raises(ValueError, match="Invalid repository URL"):
- _parse_url(url)
+ await _parse_url(url)
-def test_parse_query_basic() -> None:
+async def test_parse_query_basic() -> None:
"""
Test `parse_query` with basic inputs including valid repository URLs.
Verifies that user and repository names, URL, and ignore patterns are correctly parsed.
"""
test_cases = ["https://github.com/user/repo", "https://gitlab.com/user/repo"]
for url in test_cases:
- result = parse_query(url, max_file_size=50, from_web=True, ignore_patterns="*.txt")
+ result = await parse_query(url, max_file_size=50, from_web=True, ignore_patterns="*.txt")
assert result["user_name"] == "user"
assert result["repo_name"] == "repo"
assert result["url"] == url
assert "*.txt" in result["ignore_patterns"]
-def test_parse_query_mixed_case() -> None:
+async def test_parse_query_mixed_case() -> None:
"""
Test `parse_query` with mixed case URLs.
"""
url = "Https://GitHub.COM/UsEr/rEpO"
- result = parse_query(url, max_file_size=50, from_web=True)
+ result = await parse_query(url, max_file_size=50, from_web=True)
assert result["user_name"] == "user"
assert result["repo_name"] == "repo"
-def test_parse_query_include_pattern() -> None:
+async def test_parse_query_include_pattern() -> None:
"""
Test `parse_query` with an include pattern.
Verifies that the include pattern is set correctly and default ignore patterns are applied.
"""
url = "https://github.com/user/repo"
- result = parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py")
+ result = await parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py")
assert result["include_patterns"] == ["*.py"]
assert set(result["ignore_patterns"]) == set(DEFAULT_IGNORE_PATTERNS)
-def test_parse_query_invalid_pattern() -> None:
+async def test_parse_query_invalid_pattern() -> None:
"""
Test `parse_query` with an invalid pattern containing special characters.
Verifies that a ValueError is raised with an appropriate error message.
"""
url = "https://github.com/user/repo"
with pytest.raises(ValueError, match="Pattern.*contains invalid characters"):
- parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py;rm -rf")
+ await parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py;rm -rf")
-def test_parse_url_with_subpaths() -> None:
+async def test_parse_url_with_subpaths() -> None:
"""
Test `_parse_url` with a URL containing a branch and subpath.
Verifies that user name, repository name, branch, and subpath are correctly extracted.
"""
url = "https://github.com/user/repo/tree/main/subdir/file"
- result = _parse_url(url)
+ result = await _parse_url(url)
assert result["user_name"] == "user"
assert result["repo_name"] == "repo"
assert result["branch"] == "main"
assert result["subpath"] == "/subdir/file"
-def test_parse_url_invalid_repo_structure() -> None:
+async def test_parse_url_invalid_repo_structure() -> None:
"""
Test `_parse_url` with an invalid repository structure in the URL.
Verifies that a ValueError is raised with an appropriate error message.
"""
url = "https://github.com/user"
with pytest.raises(ValueError, match="Invalid repository URL"):
- _parse_url(url)
+ await _parse_url(url)
def test_parse_patterns_valid() -> None:
@@ -140,35 +140,35 @@ def test_parse_patterns_invalid_characters() -> None:
_parse_patterns(patterns)
-def test_parse_query_with_large_file_size() -> None:
+async def test_parse_query_with_large_file_size() -> None:
"""
Test `parse_query` with a very large file size limit.
Verifies that the file size limit and default ignore patterns are set correctly.
"""
url = "https://github.com/user/repo"
- result = parse_query(url, max_file_size=10**9, from_web=True)
+ result = await parse_query(url, max_file_size=10**9, from_web=True)
assert result["max_file_size"] == 10**9
assert result["ignore_patterns"] == DEFAULT_IGNORE_PATTERNS
-def test_parse_query_empty_patterns() -> None:
+async def test_parse_query_empty_patterns() -> None:
"""
Test `parse_query` with empty include and ignore patterns.
Verifies that the include patterns are set to None and default ignore patterns are applied.
"""
url = "https://github.com/user/repo"
- result = parse_query(url, max_file_size=50, from_web=True, include_patterns="", ignore_patterns="")
+ result = await parse_query(url, max_file_size=50, from_web=True, include_patterns="", ignore_patterns="")
assert result["include_patterns"] is None
assert result["ignore_patterns"] == DEFAULT_IGNORE_PATTERNS
-def test_parse_query_include_and_ignore_overlap() -> None:
+async def test_parse_query_include_and_ignore_overlap() -> None:
"""
Test `parse_query` with overlapping include and ignore patterns.
Verifies that overlapping patterns are removed from the ignore patterns.
"""
url = "https://github.com/user/repo"
- result = parse_query(
+ result = await parse_query(
url,
max_file_size=50,
from_web=True,
@@ -180,41 +180,41 @@ def test_parse_query_include_and_ignore_overlap() -> None:
assert "*.txt" in result["ignore_patterns"]
-def test_parse_query_local_path() -> None:
+async def test_parse_query_local_path() -> None:
"""
Test `parse_query` with a local file path.
Verifies that the local path is set, a unique ID is generated, and the slug is correctly created.
"""
path = "/home/user/project"
- result = parse_query(path, max_file_size=100, from_web=False)
+ result = await parse_query(path, max_file_size=100, from_web=False)
tail = Path("home/user/project")
assert result["local_path"].parts[-len(tail.parts) :] == tail.parts
assert result["id"] is not None
assert result["slug"] == "user/project"
-def test_parse_query_relative_path() -> None:
+async def test_parse_query_relative_path() -> None:
"""
Test `parse_query` with a relative file path.
Verifies that the local path and slug are correctly resolved.
"""
path = "./project"
- result = parse_query(path, max_file_size=100, from_web=False)
+ result = await parse_query(path, max_file_size=100, from_web=False)
tail = Path("project")
assert result["local_path"].parts[-len(tail.parts) :] == tail.parts
assert result["slug"].endswith("project")
-def test_parse_query_empty_source() -> None:
+async def test_parse_query_empty_source() -> None:
"""
Test `parse_query` with an empty source input.
Verifies that a ValueError is raised with an appropriate error message.
"""
with pytest.raises(ValueError, match="Invalid repository URL"):
- parse_query("", max_file_size=100, from_web=True)
+ await parse_query("", max_file_size=100, from_web=True)
-def test_parse_url_branch_and_commit_distinction() -> None:
+async def test_parse_url_branch_and_commit_distinction() -> None:
"""
Test `_parse_url` with URLs containing either a branch name or a commit hash.
Verifies that the branch and commit are correctly distinguished.
@@ -222,8 +222,8 @@ def test_parse_url_branch_and_commit_distinction() -> None:
url_branch = "https://github.com/user/repo/tree/main"
url_commit = "https://github.com/user/repo/tree/abcd1234abcd1234abcd1234abcd1234abcd1234"
- result_branch = _parse_url(url_branch)
- result_commit = _parse_url(url_commit)
+ result_branch = await _parse_url(url_branch)
+ result_commit = await _parse_url(url_commit)
assert result_branch["branch"] == "main"
assert result_branch["commit"] is None
@@ -232,23 +232,23 @@ def test_parse_url_branch_and_commit_distinction() -> None:
assert result_commit["commit"] == "abcd1234abcd1234abcd1234abcd1234abcd1234"
-def test_parse_query_uuid_uniqueness() -> None:
+async def test_parse_query_uuid_uniqueness() -> None:
"""
Test `parse_query` to ensure that each call generates a unique UUID for the query result.
"""
path = "/home/user/project"
- result1 = parse_query(path, max_file_size=100, from_web=False)
- result2 = parse_query(path, max_file_size=100, from_web=False)
+ result1 = await parse_query(path, max_file_size=100, from_web=False)
+ result2 = await parse_query(path, max_file_size=100, from_web=False)
assert result1["id"] != result2["id"]
-def test_parse_url_with_query_and_fragment() -> None:
+async def test_parse_url_with_query_and_fragment() -> None:
"""
Test `_parse_url` with a URL containing query parameters and a fragment.
Verifies that the URL is cleaned and other fields are correctly extracted.
"""
url = "https://github.com/user/repo?arg=value#fragment"
- result = _parse_url(url)
+ result = await _parse_url(url)
assert result["user_name"] == "user"
assert result["repo_name"] == "repo"
assert result["url"] == "https://github.com/user/repo" # URL should be cleaned
From 9a19c92142625b20d4428b4dff3b08e42fac1ba6 Mon Sep 17 00:00:00 2001
From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Date: Fri, 10 Jan 2025 13:17:48 +0100
Subject: [PATCH 3/7] refactor: rename _parse_url and standardize docstrings
- Renamed `_parse_url` to `_parse_repo_source` in query_parser.py
- Adjusted docstrings to adhere to PEP 257 by using the imperative mood
---
src/gitingest/query_parser.py | 14 +++++++-------
tests/test_query_parser.py | 32 ++++++++++++++++----------------
2 files changed, 23 insertions(+), 23 deletions(-)
diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py
index 9b796ac9..bc356988 100644
--- a/src/gitingest/query_parser.py
+++ b/src/gitingest/query_parser.py
@@ -53,7 +53,7 @@ async def parse_query(
# Determine the parsing method based on the source type
if from_web or source.startswith("https://") or "github.com" in source:
- query = _parse_url(source)
+ query = _parse_repo_source(source)
else:
query = _parse_path(source)
@@ -80,7 +80,7 @@ async def parse_query(
return query
-def _parse_url(url: str) -> dict[str, Any]:
+def _parse_repo_source(url: str) -> dict[str, Any]:
"""
Parse a GitHub repository URL into a structured query dictionary.
@@ -165,7 +165,7 @@ def _parse_url(url: str) -> dict[str, Any]:
def _is_valid_git_commit_hash(commit: str) -> bool:
"""
- Validates if the provided string is a valid Git commit hash.
+ Validate if the provided string is a valid Git commit hash.
This function checks if the commit hash is a 40-character string consisting only
of hexadecimal digits, which is the standard format for Git commit hashes.
@@ -185,7 +185,7 @@ def _is_valid_git_commit_hash(commit: str) -> bool:
def _normalize_pattern(pattern: str) -> str:
"""
- Normalizes the given pattern by removing leading separators and appending a wildcard.
+ Normalize the given pattern by removing leading separators and appending a wildcard.
This function processes the pattern string by stripping leading directory separators
and appending a wildcard (`*`) if the pattern ends with a separator.
@@ -249,7 +249,7 @@ def _parse_patterns(pattern: list[str] | str) -> list[str]:
def _override_ignore_patterns(ignore_patterns: list[str], include_patterns: list[str]) -> list[str]:
"""
- Removes patterns from ignore_patterns that are present in include_patterns using set difference.
+ Remove patterns from ignore_patterns that are present in include_patterns using set difference.
Parameters
----------
@@ -268,7 +268,7 @@ def _override_ignore_patterns(ignore_patterns: list[str], include_patterns: list
def _parse_path(path_str: str) -> dict[str, Any]:
"""
- Parses a file path into a structured query dictionary.
+ Parse a file path into a structured query dictionary.
This function takes a file path and constructs a query dictionary that includes
relevant details such as the absolute path and the slug (a combination of the
@@ -297,7 +297,7 @@ def _parse_path(path_str: str) -> dict[str, Any]:
def _is_valid_pattern(pattern: str) -> bool:
"""
- Validates if the given pattern contains only valid characters.
+ Validate if the given pattern contains only valid characters.
This function checks if the pattern contains only alphanumeric characters or one
of the following allowed characters: dash (`-`), underscore (`_`), dot (`.`),
diff --git a/tests/test_query_parser.py b/tests/test_query_parser.py
index 472875f7..1fe666b6 100644
--- a/tests/test_query_parser.py
+++ b/tests/test_query_parser.py
@@ -5,12 +5,12 @@
import pytest
from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS
-from gitingest.query_parser import _parse_patterns, _parse_url, parse_query
+from gitingest.query_parser import _parse_patterns, _parse_repo_source, parse_query
async def test_parse_url_valid_https() -> None:
"""
- Test `_parse_url` with valid HTTPS URLs from supported platforms (GitHub, GitLab, Bitbucket, Gitea).
+ Test `_parse_repo_source` with valid HTTPS URLs from supported platforms (GitHub, GitLab, Bitbucket, Gitea).
Verifies that user and repository names are correctly extracted.
"""
test_cases = [
@@ -19,7 +19,7 @@ async def test_parse_url_valid_https() -> None:
"https://bitbucket.org/user/repo",
]
for url in test_cases:
- result = await _parse_url(url)
+ result = await _parse_repo_source(url)
assert result["user_name"] == "user"
assert result["repo_name"] == "repo"
assert result["url"] == url
@@ -27,7 +27,7 @@ async def test_parse_url_valid_https() -> None:
async def test_parse_url_valid_http() -> None:
"""
- Test `_parse_url` with valid HTTP URLs from supported platforms.
+ Test `_parse_repo_source` with valid HTTP URLs from supported platforms.
Verifies that user and repository names, as well as the slug, are correctly extracted.
"""
test_cases = [
@@ -36,7 +36,7 @@ async def test_parse_url_valid_http() -> None:
"http://bitbucket.org/user/repo",
]
for url in test_cases:
- result = await _parse_url(url)
+ result = await _parse_repo_source(url)
assert result["user_name"] == "user"
assert result["repo_name"] == "repo"
assert result["slug"] == "user-repo"
@@ -44,12 +44,12 @@ async def test_parse_url_valid_http() -> None:
async def test_parse_url_invalid() -> None:
"""
- Test `_parse_url` with an invalid URL that does not include a repository structure.
+ Test `_parse_repo_source` with an invalid URL that does not include a repository structure.
Verifies that a ValueError is raised with an appropriate error message.
"""
url = "https://github.com"
with pytest.raises(ValueError, match="Invalid repository URL"):
- await _parse_url(url)
+ await _parse_repo_source(url)
async def test_parse_query_basic() -> None:
@@ -99,11 +99,11 @@ async def test_parse_query_invalid_pattern() -> None:
async def test_parse_url_with_subpaths() -> None:
"""
- Test `_parse_url` with a URL containing a branch and subpath.
+ Test `_parse_repo_source` with a URL containing a branch and subpath.
Verifies that user name, repository name, branch, and subpath are correctly extracted.
"""
url = "https://github.com/user/repo/tree/main/subdir/file"
- result = await _parse_url(url)
+ result = await _parse_repo_source(url)
assert result["user_name"] == "user"
assert result["repo_name"] == "repo"
assert result["branch"] == "main"
@@ -112,12 +112,12 @@ async def test_parse_url_with_subpaths() -> None:
async def test_parse_url_invalid_repo_structure() -> None:
"""
- Test `_parse_url` with an invalid repository structure in the URL.
+ Test `_parse_repo_source` with an invalid repository structure in the URL.
Verifies that a ValueError is raised with an appropriate error message.
"""
url = "https://github.com/user"
with pytest.raises(ValueError, match="Invalid repository URL"):
- await _parse_url(url)
+ await _parse_repo_source(url)
def test_parse_patterns_valid() -> None:
@@ -216,14 +216,14 @@ async def test_parse_query_empty_source() -> None:
async def test_parse_url_branch_and_commit_distinction() -> None:
"""
- Test `_parse_url` with URLs containing either a branch name or a commit hash.
+ Test `_parse_repo_source` with URLs containing either a branch name or a commit hash.
Verifies that the branch and commit are correctly distinguished.
"""
url_branch = "https://github.com/user/repo/tree/main"
url_commit = "https://github.com/user/repo/tree/abcd1234abcd1234abcd1234abcd1234abcd1234"
- result_branch = await _parse_url(url_branch)
- result_commit = await _parse_url(url_commit)
+ result_branch = await _parse_repo_source(url_branch)
+ result_commit = await _parse_repo_source(url_commit)
assert result_branch["branch"] == "main"
assert result_branch["commit"] is None
@@ -244,11 +244,11 @@ async def test_parse_query_uuid_uniqueness() -> None:
async def test_parse_url_with_query_and_fragment() -> None:
"""
- Test `_parse_url` with a URL containing query parameters and a fragment.
+ Test `_parse_repo_source` with a URL containing query parameters and a fragment.
Verifies that the URL is cleaned and other fields are correctly extracted.
"""
url = "https://github.com/user/repo?arg=value#fragment"
- result = await _parse_url(url)
+ result = await _parse_repo_source(url)
assert result["user_name"] == "user"
assert result["repo_name"] == "repo"
assert result["url"] == "https://github.com/user/repo" # URL should be cleaned
From a57f614987746058b23a2b72380b0af607b9607a Mon Sep 17 00:00:00 2001
From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Date: Fri, 10 Jan 2025 13:28:42 +0100
Subject: [PATCH 4/7] refactor: implement _get_status_code, adjust
_check_repo_exists, and update tests
- Implemented function `_get_status_code` in repository_clone.py to extract the status code from an HTTP response
- Adjusted `_check_repo_exists` in repository_clone.py to utilize the new `_get_status_code` function
- Modified `_check_repo_exists` to return True for status codes 200 and 301, and False for 404 and 302
- Updated `test_check_repo_exists_with_redirect` in test_repository_clone.py to verify that `_check_repo_exists` returns False for status code 302
- Implemented test `test_check_repo_exists_with_permanent_redirect` in test_repository_clone.py to verify that `_check_repo_exists` returns True for status code 301
---
src/gitingest/repository_clone.py | 39 ++++++++++++++++++++++++++++---
tests/test_repository_clone.py | 22 +++++++++++++++--
2 files changed, 56 insertions(+), 5 deletions(-)
diff --git a/src/gitingest/repository_clone.py b/src/gitingest/repository_clone.py
index 57374ada..d251a6f1 100644
--- a/src/gitingest/repository_clone.py
+++ b/src/gitingest/repository_clone.py
@@ -111,6 +111,11 @@ async def _check_repo_exists(url: str) -> bool:
-------
bool
True if the repository exists, False otherwise.
+
+ Raises
+ ------
+ RuntimeError
+ If the curl command returns an unexpected status code.
"""
proc = await asyncio.create_subprocess_exec(
"curl",
@@ -120,11 +125,20 @@ async def _check_repo_exists(url: str) -> bool:
stderr=asyncio.subprocess.PIPE,
)
stdout, _ = await proc.communicate()
+
if proc.returncode != 0:
return False
- # Check if stdout contains "404" status code
- stdout_str = stdout.decode()
- return "HTTP/1.1 404" not in stdout_str and "HTTP/2 404" not in stdout_str
+
+ response = stdout.decode()
+ status_code = _get_status_code(response)
+
+ if status_code in (200, 301):
+ return True
+
+ if status_code in (404, 302):
+ return False
+
+ raise RuntimeError(f"Unexpected status code: {status_code}")
async def _run_git_command(*args: str) -> tuple[bytes, bytes]:
@@ -157,3 +171,22 @@ async def _run_git_command(*args: str) -> tuple[bytes, bytes]:
raise RuntimeError(f"Git command failed: {' '.join(args)}\nError: {error_message}")
return stdout, stderr
+
+
+def _get_status_code(response: str) -> int:
+ """
+ Extract the status code from an HTTP response.
+
+ Parameters
+ ----------
+ response : str
+ The HTTP response string.
+
+ Returns
+ -------
+ int
+ The status code of the response
+ """
+ status_line = response.splitlines()[0].strip()
+ status_code = int(status_line.split(" ", 2)[1])
+ return status_code
diff --git a/tests/test_repository_clone.py b/tests/test_repository_clone.py
index 892bd04e..3bfa3b2f 100644
--- a/tests/test_repository_clone.py
+++ b/tests/test_repository_clone.py
@@ -204,8 +204,9 @@ async def test_clone_repo_commit_without_branch() -> None:
@pytest.mark.asyncio
async def test_check_repo_exists_with_redirect() -> None:
"""
- Test the `_check_repo_exists` function for handling HTTP redirects (302 Found).
- Verifies that it correctly identifies the repository's existence.
+ Test the `_check_repo_exists` function when the repository URL returns a redirect response.
+
+ Verifies that the function returns False when a 302 Found response is received.
"""
url = "https://github.com/user/repo"
with patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) as mock_exec:
@@ -214,4 +215,21 @@ async def test_check_repo_exists_with_redirect() -> None:
mock_process.returncode = 0 # Simulate successful request
mock_exec.return_value = mock_process
+ assert await _check_repo_exists(url) is False
+
+
+@pytest.mark.asyncio
+async def test_check_repo_exists_with_permanent_redirect() -> None:
+ """
+ Test the `_check_repo_exists` function when the repository URL returns a redirect response.
+
+ Verifies that the function returns True when a 301 Found response is received.
+ """
+ url = "https://github.com/user/repo"
+ with patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) as mock_exec:
+ mock_process = AsyncMock()
+ mock_process.communicate.return_value = (b"HTTP/1.1 301 Found\n", b"")
+ mock_process.returncode = 0 # Simulate successful request
+ mock_exec.return_value = mock_process
+
assert await _check_repo_exists(url)
From 9bdee8f7012cdf42485387eb11c09d741b37435d Mon Sep 17 00:00:00 2001
From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Date: Fri, 10 Jan 2025 13:50:05 +0100
Subject: [PATCH 5/7] feat: make parser domain-agnostic to support multiple Git
hosts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- added list of known domains/Git hosts in `query_parser.py`
- fixed bug from [#115](https://github.com/cyclotruc/gitingest/pull/115): corrected case handling for URL components—scheme, domain, username, and repository are case-insensitive, but paths beyond (e.g., file names, branches) are case-sensitive
- implemented `try_domains_for_user_and_repo` in `query_parser.py` to iteratively guess the correct domain until success or supported hosts are exhausted
- added helper functions `_get_user_and_repo_from_path`, `_validate_host`, and `_validate_scheme` in `query_parser.py`
- extended `_parse_repo_source` in `query_parser.py` to be Git host agnostic by using `try_domains_for_user_and_repo`
- added tests `test_parse_url_unsupported_host` and `test_parse_query_with_branch` in `test_query_parser.py`
- created new file `test_git_host_agnostic.py` to verify domain/Git host agnostic behavior
---
src/gitingest/query_parser.py | 208 ++++++++++++++-----
src/main.py | 2 +-
tests/query_parser/test_git_host_agnostic.py | 71 +++++++
tests/test_query_parser.py | 22 ++
4 files changed, 251 insertions(+), 52 deletions(-)
create mode 100644 tests/query_parser/test_git_host_agnostic.py
diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py
index bc356988..2981f097 100644
--- a/src/gitingest/query_parser.py
+++ b/src/gitingest/query_parser.py
@@ -11,8 +11,16 @@
from config import TMP_BASE_PATH
from gitingest.exceptions import InvalidPatternError
from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS
+from gitingest.repository_clone import _check_repo_exists
-HEX_DIGITS = set(string.hexdigits)
+HEX_DIGITS: set[str] = set(string.hexdigits)
+
+KNOWN_GIT_HOSTS: list[str] = [
+ "github.com",
+ "gitlab.com",
+ "bitbucket.org",
+ "gitea.com",
+]
async def parse_query(
@@ -48,16 +56,16 @@ async def parse_query(
A dictionary containing the parsed query parameters, including 'max_file_size',
'ignore_patterns', and 'include_patterns'.
"""
- # Normalize and clean up the source string to make it case-insensitive
- source = source.lower().strip()
# Determine the parsing method based on the source type
- if from_web or source.startswith("https://") or "github.com" in source:
- query = _parse_repo_source(source)
+ if from_web or urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS):
+ # We either have a full URL or a domain-less slug
+ query = await _parse_repo_source(source)
else:
+ # Local path scenario
query = _parse_path(source)
- # Process ignore patterns
+ # Combine ignore patterns
ignore_patterns_list = DEFAULT_IGNORE_PATTERNS.copy()
if ignore_patterns:
ignore_patterns_list += _parse_patterns(ignore_patterns)
@@ -69,7 +77,6 @@ async def parse_query(
else:
parsed_include = None
- # Update the query dictionary with max_file_size and processed patterns
query.update(
{
"max_file_size": max_file_size,
@@ -80,52 +87,54 @@ async def parse_query(
return query
-def _parse_repo_source(url: str) -> dict[str, Any]:
+async def _parse_repo_source(source: str) -> dict[str, Any]:
"""
- Parse a GitHub repository URL into a structured query dictionary.
+ Parse a repository URL into a structured query dictionary.
- This function extracts relevant information from a GitHub URL, such as the username,
- repository name, commit, branch, and subpath, and returns them in a structured format.
+ If source is:
+ - A fully qualified URL (https://gitlab.com/...), parse & verify that domain
+ - A URL missing 'https://' (gitlab.com/...), add 'https://' and parse
+ - A 'slug' (like 'pandas-dev/pandas'), attempt known domains until we find one that exists.
Parameters
----------
- url : str
- The GitHub URL to parse.
+ source : str
+ The URL or domain-less slug to parse.
Returns
-------
dict[str, Any]
- A dictionary containing the parsed details of the GitHub repository, including
- the username, repository name, commit, branch, and other relevant information.
-
- Raises
- ------
- ValueError
- If the URL is invalid or does not correspond to a valid Git repository.
+ A dictionary containing the parsed details of the repository, including the username,
+ repository name, commit, branch, and other relevant information.
"""
- # Clean up the URL
- url = url.split(" ")[0] # remove trailing text
- url = unquote(url) # decode URL-encoded characters
+ source = unquote(source)
- if not url.startswith(("https://", "http://")):
- url = "https://" + url
+ # Attempt to parse
+ parsed_url = urlparse(source)
- # Parse URL and reconstruct it without query parameters and fragments
- parsed_url = urlparse(url)
- url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
+ if parsed_url.scheme:
+ _validate_scheme(parsed_url.scheme)
+ _validate_host(parsed_url.netloc.lower())
- # Extract domain and path
- url_parts = url.split("/")
- domain = url_parts[2]
- path_parts = url_parts[3:]
+ else: # Will be of the form 'host/user/repo' or 'user/repo'
+ tmp_host = source.split("/")[0].lower()
+ if "." in tmp_host:
+ _validate_host(tmp_host)
+ else:
+ # No scheme, no domain => user typed "user/repo", so we'll guess the domain.
+ host = await try_domains_for_user_and_repo(*_get_user_and_repo_from_path(source))
+ source = f"{host}/{source}"
- if len(path_parts) < 2:
- raise ValueError("Invalid repository URL. Please provide a valid Git repository URL.")
+ source = "https://" + source
+ parsed_url = urlparse(source)
+
+ host = parsed_url.netloc.lower()
+ user_name, repo_name = _get_user_and_repo_from_path(parsed_url.path)
- user_name = path_parts[0]
- repo_name = path_parts[1]
_id = str(uuid.uuid4())
slug = f"{user_name}-{repo_name}"
+ local_path = Path(TMP_BASE_PATH) / _id / slug
+ url = f"https://{host}/{user_name}/{repo_name}"
parsed = {
"user_name": user_name,
@@ -134,31 +143,39 @@ def _parse_repo_source(url: str) -> dict[str, Any]:
"branch": None,
"commit": None,
"subpath": "/",
- "local_path": Path(TMP_BASE_PATH) / _id / slug,
- "url": f"https://{domain}/{user_name}/{repo_name}",
- "slug": slug,
+ "local_path": local_path,
+ "url": url,
+ "slug": slug, # e.g. "pandas-dev-pandas"
"id": _id,
}
- # If this is an issues page or pull requests, return early without processing subpath
- if len(path_parts) > 2 and (path_parts[2] == "issues" or path_parts[2] == "pull"):
+ remaining_parts = parsed_url.path.strip("/").split("/")[2:]
+
+ if not remaining_parts:
return parsed
+ possible_type = remaining_parts.pop(0) # e.g. 'issues', 'pull', 'tree', 'blob'
+
# If no extra path parts, just return
- if len(path_parts) < 4:
+ if not remaining_parts:
+ return parsed
+
+ # If this is an issues page or pull requests, return early without processing subpath
+ if remaining_parts and possible_type in ("issues", "pull"):
return parsed
- parsed["type"] = path_parts[2] # Usually 'tree' or 'blob'
- commit = path_parts[3]
+ parsed["type"] = possible_type
- if _is_valid_git_commit_hash(commit):
- parsed["commit"] = commit
- if len(path_parts) > 4:
- parsed["subpath"] += "/".join(path_parts[4:])
+ # Commit or branch
+ commit_or_branch = remaining_parts.pop(0)
+ if _is_valid_git_commit_hash(commit_or_branch):
+ parsed["commit"] = commit_or_branch
else:
- parsed["branch"] = commit
- if len(path_parts) > 4:
- parsed["subpath"] += "/".join(path_parts[4:])
+ parsed["branch"] = commit_or_branch
+
+ # Subpath if anything left
+ if remaining_parts:
+ parsed["subpath"] += "/".join(remaining_parts)
return parsed
@@ -314,3 +331,92 @@ def _is_valid_pattern(pattern: str) -> bool:
True if the pattern is valid, otherwise False.
"""
return all(c.isalnum() or c in "-_./+*" for c in pattern)
+
+
+async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str:
+    """
+    Probe the known Git hosts, in order, for a repository named 'user_name/repo_name'.
+
+    Parameters
+    ----------
+    user_name : str
+        Owner segment of the repository path.
+    repo_name : str
+        Repository segment of the repository path.
+
+    Returns
+    -------
+    str
+        The first entry of KNOWN_GIT_HOSTS for which `_check_repo_exists` reports the repository.
+
+    Raises
+    ------
+    ValueError
+        If the repository is not reported on any of the known hosts.
+    """
+    for known_host in KNOWN_GIT_HOSTS:
+        repo_found = await _check_repo_exists(f"https://{known_host}/{user_name}/{repo_name}")
+        if repo_found:
+            return known_host
+    raise ValueError(f"Could not find a valid repository host for '{user_name}/{repo_name}'.")
+
+
+def _get_user_and_repo_from_path(path: str) -> tuple[str, str]:
+    """
+    Return the first two '/'-separated segments of a path as (user, repository).
+
+    Parameters
+    ----------
+    path : str
+        The path to split; it is lowercased and its leading/trailing slashes are stripped first.
+
+    Returns
+    -------
+    tuple[str, str]
+        The lowercased user name and repository name; any further segments are ignored.
+
+    Raises
+    ------
+    ValueError
+        If fewer than two segments remain after stripping the outer slashes.
+    """
+    segments = path.lower().strip("/").split("/")
+    if len(segments) >= 2:
+        return segments[0], segments[1]
+    raise ValueError(f"Invalid repository URL '{path}'")
+
+
+def _validate_host(host: str) -> None:
+ """
+ Ensure the given host is a member of KNOWN_GIT_HOSTS.
+
+ Parameters
+ ----------
+ host : str
+ The host to validate; it is compared as-is, so any case normalization is up to the caller.
+
+ Raises
+ ------
+ ValueError
+ If the host is not in KNOWN_GIT_HOSTS; the message quotes the rejected host.
+ """
+ if host not in KNOWN_GIT_HOSTS:
+ raise ValueError(f"Unknown domain '{host}' in URL")
+
+
+def _validate_scheme(scheme: str) -> None:
+ """
+ Ensure the given URL scheme is either 'http' or 'https'.
+
+ Parameters
+ ----------
+ scheme : str
+ The scheme to validate; the comparison is case-sensitive.
+
+ Raises
+ ------
+ ValueError
+ If the scheme is anything other than 'http' or 'https' (including an empty string).
+ """
+ if scheme not in ("https", "http"):
+ raise ValueError(f"Invalid URL scheme '{scheme}' in URL")
diff --git a/src/main.py b/src/main.py
index 7ba36a83..f2b63fdd 100644
--- a/src/main.py
+++ b/src/main.py
@@ -78,7 +78,7 @@ async def process_folder(folder: Path) -> None:
# Extract owner and repository name from the filename
if txt_files and "-" in (filename := txt_files[0].stem):
owner, repo = filename.split("-", 1)
- repo_url = f"https://github.com/{owner}/{repo}"
+ repo_url = f"{owner}/{repo}"
with open("history.txt", mode="a", encoding="utf-8") as history:
history.write(f"{repo_url}\n")
diff --git a/tests/query_parser/test_git_host_agnostic.py b/tests/query_parser/test_git_host_agnostic.py
new file mode 100644
index 00000000..18308111
--- /dev/null
+++ b/tests/query_parser/test_git_host_agnostic.py
@@ -0,0 +1,71 @@
+""" Tests to verify that the query parser is Git host agnostic. """
+
+import pytest
+
+from gitingest.query_parser import parse_query
+
+
+@pytest.mark.parametrize(
+    "urls, expected_user, expected_repo, expected_url",
+    [
+        (
+            [
+                "https://github.com/tiangolo/fastapi",
+                "github.com/tiangolo/fastapi",
+                "tiangolo/fastapi",
+            ],
+            "tiangolo",
+            "fastapi",
+            "https://github.com/tiangolo/fastapi",
+        ),
+        (
+            [
+                "https://gitlab.com/gitlab-org/gitlab-runner",
+                "gitlab.com/gitlab-org/gitlab-runner",
+                "gitlab-org/gitlab-runner",
+            ],
+            "gitlab-org",
+            "gitlab-runner",
+            "https://gitlab.com/gitlab-org/gitlab-runner",
+        ),
+        (
+            [
+                "https://bitbucket.org/na-dna/llm-knowledge-share",
+                "bitbucket.org/na-dna/llm-knowledge-share",
+                "na-dna/llm-knowledge-share",
+            ],
+            "na-dna",
+            "llm-knowledge-share",
+            "https://bitbucket.org/na-dna/llm-knowledge-share",
+        ),
+        (
+            [
+                "https://gitea.com/xorm/xorm",
+                "gitea.com/xorm/xorm",
+                "xorm/xorm",
+            ],
+            "xorm",
+            "xorm",
+            "https://gitea.com/xorm/xorm",
+        ),
+    ],
+)
+@pytest.mark.asyncio
+async def test_parse_query_without_host(
+    urls: list[str],
+    expected_user: str,
+    expected_repo: str,
+    expected_url: str,
+) -> None:
+    """Full URL, host-prefixed and bare 'user/repo' spellings must all parse to the same repository."""
+    expected_slug = f"{expected_user}-{expected_repo}"
+    for candidate in urls:
+        parsed = await parse_query(candidate, max_file_size=50, from_web=True)
+        assert parsed["user_name"] == expected_user
+        assert parsed["repo_name"] == expected_repo
+        assert parsed["url"] == expected_url
+        assert parsed["slug"] == expected_slug
+        assert parsed["id"] is not None
+        assert parsed["subpath"] == "/"
+        for unset_key in ("branch", "commit", "type"):
+            assert parsed[unset_key] is None
diff --git a/tests/test_query_parser.py b/tests/test_query_parser.py
index 1fe666b6..0db65d3b 100644
--- a/tests/test_query_parser.py
+++ b/tests/test_query_parser.py
@@ -252,3 +252,25 @@ async def test_parse_url_with_query_and_fragment() -> None:
assert result["user_name"] == "user"
assert result["repo_name"] == "repo"
assert result["url"] == "https://github.com/user/repo" # URL should be cleaned
+
+
+async def test_parse_url_unsupported_host() -> None:
+    """A URL whose host is not a known Git host must be rejected with a ValueError."""
+    url = "https://only-domain.com"
+    with pytest.raises(ValueError, match=r"Unknown domain 'only-domain\.com' in URL"):
+        await _parse_repo_source(url)
+
+
+async def test_parse_query_with_branch() -> None:
+    """A blob URL on the '2.2.x' ref must report that ref as the branch, plus the in-repo subpath."""
+    url = "https://github.com/pandas-dev/pandas/blob/2.2.x/.github/ISSUE_TEMPLATE/documentation_improvement.yaml"
+    result = await parse_query(url, max_file_size=10**9, from_web=True)
+    assert result["user_name"] == "pandas-dev"
+    assert result["repo_name"] == "pandas"
+    assert result["url"] == "https://github.com/pandas-dev/pandas"
+    assert result["slug"] == "pandas-dev-pandas"
+    assert result["id"] is not None
+    assert result["subpath"] == "/.github/ISSUE_TEMPLATE/documentation_improvement.yaml"
+    assert result["branch"] == "2.2.x"
+    assert result["commit"] is None
+    assert result["type"] == "blob"
From cd1b14ef5689d5779099029d290e03618416ff04 Mon Sep 17 00:00:00 2001
From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Date: Fri, 10 Jan 2025 13:52:26 +0100
Subject: [PATCH 6/7] chore: move test_query_parser.py from tests/ to
tests/query_parser/
---
tests/{ => query_parser}/test_query_parser.py | 0
1 file changed, 0 insertions(+), 0 deletions(-)
rename tests/{ => query_parser}/test_query_parser.py (100%)
diff --git a/tests/test_query_parser.py b/tests/query_parser/test_query_parser.py
similarity index 100%
rename from tests/test_query_parser.py
rename to tests/query_parser/test_query_parser.py
From 31c695de35f30260337af621bbf488b159de3431 Mon Sep 17 00:00:00 2001
From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Date: Sat, 11 Jan 2025 23:45:40 +0100
Subject: [PATCH 7/7] add codeberg.org to supported git hosts
---
src/gitingest/query_parser.py | 1 +
tests/query_parser/test_git_host_agnostic.py | 10 ++++++++++
2 files changed, 11 insertions(+)
diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py
index 2981f097..78dd6cff 100644
--- a/src/gitingest/query_parser.py
+++ b/src/gitingest/query_parser.py
@@ -20,6 +20,7 @@
"gitlab.com",
"bitbucket.org",
"gitea.com",
+ "codeberg.org",
]
diff --git a/tests/query_parser/test_git_host_agnostic.py b/tests/query_parser/test_git_host_agnostic.py
index 18308111..8e863555 100644
--- a/tests/query_parser/test_git_host_agnostic.py
+++ b/tests/query_parser/test_git_host_agnostic.py
@@ -48,6 +48,16 @@
"xorm",
"https://gitea.com/xorm/xorm",
),
+ (
+ [
+ "https://codeberg.org/forgejo/forgejo",
+ "codeberg.org/forgejo/forgejo",
+ "forgejo/forgejo",
+ ],
+ "forgejo",
+ "forgejo",
+ "https://codeberg.org/forgejo/forgejo",
+ ),
],
)
@pytest.mark.asyncio