diff --git a/pyproject.toml b/pyproject.toml index 50a746cb..6eb4cedc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,5 +76,6 @@ pythonpath = ["src"] testpaths = ["tests/"] python_files = "test_*.py" asyncio_mode = "auto" +asyncio_default_fixture_loop_scope = "function" python_classes = "Test*" python_functions = "test_*" diff --git a/src/gitingest/__init__.py b/src/gitingest/__init__.py index c291fd1b..684ec14f 100644 --- a/src/gitingest/__init__.py +++ b/src/gitingest/__init__.py @@ -1,8 +1,8 @@ """ Gitingest: A package for ingesting data from Git repositories. """ -from gitingest.cloning import clone_repo +from gitingest.cloning import clone +from gitingest.entrypoint import ingest, ingest_async from gitingest.ingestion import ingest_query from gitingest.query_parsing import parse_query -from gitingest.repository_ingest import ingest, ingest_async -__all__ = ["ingest_query", "clone_repo", "parse_query", "ingest", "ingest_async"] +__all__ = ["ingest_query", "clone", "parse_query", "ingest", "ingest_async"] diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 73b49b67..d5c5c4f5 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -8,7 +8,7 @@ import click from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME -from gitingest.repository_ingest import ingest_async +from gitingest.entrypoint import ingest_async @click.command() diff --git a/src/gitingest/cloning.py b/src/gitingest/cloning.py index e24d5230..8c717b38 100644 --- a/src/gitingest/cloning.py +++ b/src/gitingest/cloning.py @@ -2,47 +2,17 @@ import asyncio import os -from dataclasses import dataclass from pathlib import Path from typing import List, Optional, Tuple +from gitingest.ingestion_schema import CloneConfig from gitingest.utils.timeout_wrapper import async_timeout TIMEOUT: int = 60 -@dataclass -class CloneConfig: - """ - Configuration for cloning a Git repository. - - This class holds the necessary parameters for cloning a repository to a local path, including - the repository's URL, the target local path, and optional parameters for a specific commit or branch. - - Attributes - ---------- - url : str - The URL of the Git repository to clone. - local_path : str - The local directory where the repository will be cloned. - commit : str, optional - The specific commit hash to check out after cloning (default is None). - branch : str, optional - The branch to clone (default is None). - subpath : str - The subpath to clone from the repository (default is "/"). - """ - - url: str - local_path: str - commit: Optional[str] = None - branch: Optional[str] = None - subpath: str = "/" - blob: bool = False - - @async_timeout(TIMEOUT) -async def clone_repo(config: CloneConfig) -> None: +async def clone(config: CloneConfig) -> None: """ Clone a repository to a local path based on the provided configuration. 
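For reference, a minimal usage sketch of the relocated pieces, assuming a reachable repository and a writable path (both values below are placeholders): `CloneConfig` now comes from `gitingest.ingestion_schema`, and the coroutine formerly named `clone_repo` is imported as `clone`.

```python
# Minimal sketch of the renamed clone API; URL and local path are placeholders.
import asyncio

from gitingest.cloning import clone                 # formerly clone_repo
from gitingest.ingestion_schema import CloneConfig  # moved out of gitingest.cloning

config = CloneConfig(
    url="https://github.com/user/repo",  # hypothetical repository
    local_path="/tmp/gitingest-demo",
    branch="main",                       # optional; None yields the default shallow clone
)

asyncio.run(clone(config))  # async, guarded by @async_timeout(TIMEOUT) with TIMEOUT = 60
```

The same coroutine is what `ingest_async` awaits internally after building the config from the parsed query.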
diff --git a/src/gitingest/repository_ingest.py b/src/gitingest/entrypoint.py similarity index 88% rename from src/gitingest/repository_ingest.py rename to src/gitingest/entrypoint.py index f30d6001..776a6397 100644 --- a/src/gitingest/repository_ingest.py +++ b/src/gitingest/entrypoint.py @@ -5,10 +5,10 @@ import shutil from typing import Optional, Set, Tuple, Union -from gitingest.cloning import clone_repo +from gitingest.cloning import clone from gitingest.config import TMP_BASE_PATH from gitingest.ingestion import ingest_query -from gitingest.query_parsing import ParsedQuery, parse_query +from gitingest.query_parsing import IngestionQuery, parse_query async def ingest_async( @@ -53,12 +53,12 @@ async def ingest_async( Raises ------ TypeError - If `clone_repo` does not return a coroutine, or if the `source` is of an unsupported type. + If `clone` does not return a coroutine, or if the `source` is of an unsupported type. """ repo_cloned = False try: - parsed_query: ParsedQuery = await parse_query( + query: IngestionQuery = await parse_query( source=source, max_file_size=max_file_size, from_web=False, @@ -66,12 +66,12 @@ async def ingest_async( ignore_patterns=exclude_patterns, ) - if parsed_query.url: - selected_branch = branch if branch else parsed_query.branch # prioritize branch argument - parsed_query.branch = selected_branch + if query.url: + selected_branch = branch if branch else query.branch # prioritize branch argument + query.branch = selected_branch - clone_config = parsed_query.extact_clone_config() - clone_coroutine = clone_repo(clone_config) + clone_config = query.extract_clone_config() + clone_coroutine = clone(clone_config) if inspect.iscoroutine(clone_coroutine): if asyncio.get_event_loop().is_running(): @@ -79,11 +79,11 @@ async def ingest_async( else: asyncio.run(clone_coroutine) else: - raise TypeError("clone_repo did not return a coroutine as expected.") + raise TypeError("clone did not return a coroutine as expected.") repo_cloned = True - summary, tree, content = ingest_query(parsed_query) + summary, tree, content = ingest_query(query) if output is not None: with open(output, "w", encoding="utf-8") as f: diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index bdfbdbf6..ec5eb754 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -7,7 +7,7 @@ from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES from gitingest.filesystem_schema import FileSystemNode, FileSystemNodeType, FileSystemStats from gitingest.output_formatters import format_node -from gitingest.query_parsing import ParsedQuery +from gitingest.query_parsing import IngestionQuery from gitingest.utils.ingestion_utils import _should_exclude, _should_include from gitingest.utils.path_utils import _is_safe_symlink @@ -17,7 +17,7 @@ import tomli as tomllib -def ingest_query(query: ParsedQuery) -> Tuple[str, str, str]: +def ingest_query(query: IngestionQuery) -> Tuple[str, str, str]: """ Run the ingestion process for a parsed query. @@ -27,7 +27,7 @@ def ingest_query(query: ParsedQuery) -> Tuple[str, str, str]: Parameters ---------- - query : ParsedQuery + query : IngestionQuery The parsed query object containing information about the repository and query parameters. 
Returns @@ -87,7 +87,7 @@ def ingest_query(query: ParsedQuery) -> Tuple[str, str, str]: return format_node(root_node, query) -def apply_gitingest_file(path: Path, query: ParsedQuery) -> None: +def apply_gitingest_file(path: Path, query: IngestionQuery) -> None: """ Apply the .gitingest file to the query object. @@ -98,7 +98,7 @@ def apply_gitingest_file(path: Path, query: ParsedQuery) -> None: ---------- path : Path The path of the directory to ingest. - query : ParsedQuery + query : IngestionQuery The parsed query object containing information about the repository and query parameters. It should have an attribute `ignore_patterns` which is either None or a set of strings. """ @@ -154,7 +154,7 @@ def apply_gitingest_file(path: Path, query: ParsedQuery) -> None: def _process_node( node: FileSystemNode, - query: ParsedQuery, + query: IngestionQuery, stats: FileSystemStats, ) -> None: """ @@ -167,7 +167,7 @@ def _process_node( ---------- node : FileSystemNode The current directory or file node being processed. - query : ParsedQuery + query : IngestionQuery The parsed query object containing information about the repository and query parameters. stats : FileSystemStats Statistics tracking object for the total file count and size. diff --git a/src/gitingest/ingestion_schema.py b/src/gitingest/ingestion_schema.py new file mode 100644 index 00000000..e28f6470 --- /dev/null +++ b/src/gitingest/ingestion_schema.py @@ -0,0 +1,90 @@ +""" This module contains the dataclasses for the ingestion process. """ + +from dataclasses import dataclass +from pathlib import Path +from typing import Optional, Set + +from pydantic import BaseModel, Field + +from gitingest.config import MAX_FILE_SIZE + + +@dataclass +class CloneConfig: + """ + Configuration for cloning a Git repository. + + This class holds the necessary parameters for cloning a repository to a local path, including + the repository's URL, the target local path, and optional parameters for a specific commit or branch. + + Attributes + ---------- + url : str + The URL of the Git repository to clone. + local_path : str + The local directory where the repository will be cloned. + commit : str, optional + The specific commit hash to check out after cloning (default is None). + branch : str, optional + The branch to clone (default is None). + subpath : str + The subpath to clone from the repository (default is "/"). + """ + + url: str + local_path: str + commit: Optional[str] = None + branch: Optional[str] = None + subpath: str = "/" + blob: bool = False + + +class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes + """ + Pydantic model to store the parsed details of the repository or file path. + """ + + user_name: Optional[str] = None + repo_name: Optional[str] = None + local_path: Path + url: Optional[str] = None + slug: str + id: str + subpath: str = "/" + type: Optional[str] = None + branch: Optional[str] = None + commit: Optional[str] = None + max_file_size: int = Field(default=MAX_FILE_SIZE) + ignore_patterns: Optional[Set[str]] = None + include_patterns: Optional[Set[str]] = None + + class Config: + """Pydantic model configuration.""" + + arbitrary_types_allowed = True + + def extract_clone_config(self) -> CloneConfig: + """ + Extract the relevant fields for the CloneConfig object. + + Returns + ------- + CloneConfig + A CloneConfig object containing the relevant fields. + + Raises + ------ + ValueError + If the 'url' parameter is not provided. 
+ """ + if not self.url: + raise ValueError("The 'url' parameter is required.") + + return CloneConfig( + url=self.url, + local_path=str(self.local_path), + commit=self.commit, + branch=self.branch, + subpath=self.subpath, + blob=self.type == "blob", + ) diff --git a/src/gitingest/output_formatters.py b/src/gitingest/output_formatters.py index 8d5a278c..5f747387 100644 --- a/src/gitingest/output_formatters.py +++ b/src/gitingest/output_formatters.py @@ -5,10 +5,10 @@ import tiktoken from gitingest.filesystem_schema import FileSystemNode, FileSystemNodeType -from gitingest.query_parsing import ParsedQuery +from gitingest.query_parsing import IngestionQuery -def format_node(node: FileSystemNode, query: ParsedQuery) -> Tuple[str, str, str]: +def format_node(node: FileSystemNode, query: IngestionQuery) -> Tuple[str, str, str]: """ Generate a summary, directory structure, and file contents for a given file system node. @@ -18,7 +18,7 @@ def format_node(node: FileSystemNode, query: ParsedQuery) -> Tuple[str, str, str ---------- node : FileSystemNode The file system node to be summarized. - query : ParsedQuery + query : IngestionQuery The parsed query object containing information about the repository and query parameters. Returns @@ -47,7 +47,7 @@ def format_node(node: FileSystemNode, query: ParsedQuery) -> Tuple[str, str, str return summary, tree, content -def _create_summary_prefix(query: ParsedQuery, single_file: bool = False) -> str: +def _create_summary_prefix(query: IngestionQuery, single_file: bool = False) -> str: """ Create a prefix string for summarizing a repository or local directory. @@ -55,7 +55,7 @@ def _create_summary_prefix(query: ParsedQuery, single_file: bool = False) -> str Parameters ---------- - query : ParsedQuery + query : IngestionQuery The parsed query object containing information about the repository and query parameters. single_file : bool A flag indicating whether the summary is for a single file, by default False. @@ -108,7 +108,7 @@ def _gather_file_contents(node: FileSystemNode) -> str: return "\n".join(_gather_file_contents(child) for child in node.children) -def _create_tree_structure(query: ParsedQuery, node: FileSystemNode, prefix: str = "", is_last: bool = True) -> str: +def _create_tree_structure(query: IngestionQuery, node: FileSystemNode, prefix: str = "", is_last: bool = True) -> str: """ Generate a tree-like string representation of the file structure. @@ -117,7 +117,7 @@ def _create_tree_structure(query: ParsedQuery, node: FileSystemNode, prefix: str Parameters ---------- - query : ParsedQuery + query : IngestionQuery The parsed query object containing information about the repository and query parameters. node : FileSystemNode The current directory or file node being processed. 
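Because `ParsedQuery` becomes the Pydantic `IngestionQuery` and the misspelled `extact_clone_config` is corrected to `extract_clone_config`, a hedged sketch of the model-to-config hand-off follows; every field value is illustrative.

```python
# Illustrative construction of the new Pydantic model; all values are placeholders.
from pathlib import Path

from gitingest.ingestion_schema import IngestionQuery

query = IngestionQuery(
    user_name="user",
    repo_name="repo",
    url="https://github.com/user/repo",
    local_path=Path("/tmp/gitingest/1234/user-repo"),
    slug="user-repo",
    id="1234",
    branch="main",
)

clone_config = query.extract_clone_config()  # raises ValueError when query.url is None
assert clone_config.local_path == str(query.local_path)  # Path is serialized to str
assert clone_config.blob is (query.type == "blob")       # blob flag derived from type
```

This mirrors how both `ingest_async` and the server's `process_query` now obtain their `CloneConfig`.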
diff --git a/src/gitingest/query_parsing.py b/src/gitingest/query_parsing.py index e2b0e0cf..434220ef 100644 --- a/src/gitingest/query_parsing.py +++ b/src/gitingest/query_parsing.py @@ -3,14 +3,14 @@ import re import uuid import warnings -from dataclasses import dataclass from pathlib import Path from typing import List, Optional, Set, Union from urllib.parse import unquote, urlparse -from gitingest.cloning import CloneConfig, _check_repo_exists, fetch_remote_branch_list -from gitingest.config import MAX_FILE_SIZE, TMP_BASE_PATH +from gitingest.cloning import _check_repo_exists, fetch_remote_branch_list +from gitingest.config import TMP_BASE_PATH from gitingest.exceptions import InvalidPatternError +from gitingest.ingestion_schema import IngestionQuery from gitingest.utils.ignore_patterns import DEFAULT_IGNORE_PATTERNS from gitingest.utils.query_parser_utils import ( KNOWN_GIT_HOSTS, @@ -23,61 +23,13 @@ ) -@dataclass -class ParsedQuery: # pylint: disable=too-many-instance-attributes - """ - Dataclass to store the parsed details of the repository or file path. - """ - - user_name: Optional[str] - repo_name: Optional[str] - local_path: Path - url: Optional[str] - slug: str - id: str - subpath: str = "/" - type: Optional[str] = None - branch: Optional[str] = None - commit: Optional[str] = None - max_file_size: int = MAX_FILE_SIZE - ignore_patterns: Optional[Set[str]] = None - include_patterns: Optional[Set[str]] = None - pattern_type: Optional[str] = None - - def extact_clone_config(self) -> CloneConfig: - """ - Extract the relevant fields for the CloneConfig object. - - Returns - ------- - CloneConfig - A CloneConfig object containing the relevant fields. - - Raises - ------ - ValueError - If the 'url' parameter is not provided. - """ - if not self.url: - raise ValueError("The 'url' parameter is required.") - - return CloneConfig( - url=self.url, - local_path=str(self.local_path), - commit=self.commit, - branch=self.branch, - subpath=self.subpath, - blob=self.type == "blob", - ) - - async def parse_query( source: str, max_file_size: int, from_web: bool, include_patterns: Optional[Union[str, Set[str]]] = None, ignore_patterns: Optional[Union[str, Set[str]]] = None, -) -> ParsedQuery: +) -> IngestionQuery: """ Parse the input source (URL or path) to extract relevant details for the query. @@ -100,17 +52,17 @@ async def parse_query( Returns ------- - ParsedQuery + IngestionQuery A dataclass object containing the parsed details of the repository or file path. 
""" # Determine the parsing method based on the source type if from_web or urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS): # We either have a full URL or a domain-less slug - parsed_query = await _parse_remote_repo(source) + query = await _parse_remote_repo(source) else: # Local path scenario - parsed_query = _parse_local_dir_path(source) + query = _parse_local_dir_path(source) # Combine default ignore patterns + custom patterns ignore_patterns_set = DEFAULT_IGNORE_PATTERNS.copy() @@ -125,24 +77,24 @@ async def parse_query( else: parsed_include = None - return ParsedQuery( - user_name=parsed_query.user_name, - repo_name=parsed_query.repo_name, - url=parsed_query.url, - subpath=parsed_query.subpath, - local_path=parsed_query.local_path, - slug=parsed_query.slug, - id=parsed_query.id, - type=parsed_query.type, - branch=parsed_query.branch, - commit=parsed_query.commit, + return IngestionQuery( + user_name=query.user_name, + repo_name=query.repo_name, + url=query.url, + subpath=query.subpath, + local_path=query.local_path, + slug=query.slug, + id=query.id, + type=query.type, + branch=query.branch, + commit=query.commit, max_file_size=max_file_size, ignore_patterns=ignore_patterns_set, include_patterns=parsed_include, ) -async def _parse_remote_repo(source: str) -> ParsedQuery: +async def _parse_remote_repo(source: str) -> IngestionQuery: """ Parse a repository URL into a structured query dictionary. @@ -158,7 +110,7 @@ async def _parse_remote_repo(source: str) -> ParsedQuery: Returns ------- - ParsedQuery + IngestionQuery A dictionary containing the parsed details of the repository. """ source = unquote(source) @@ -190,7 +142,7 @@ async def _parse_remote_repo(source: str) -> ParsedQuery: local_path = TMP_BASE_PATH / _id / slug url = f"https://{host}/{user_name}/{repo_name}" - parsed = ParsedQuery( + parsed = IngestionQuery( user_name=user_name, repo_name=repo_name, url=url, @@ -307,7 +259,7 @@ def _parse_patterns(pattern: Union[str, Set[str]]) -> Set[str]: return {_normalize_pattern(p) for p in parsed_patterns} -def _parse_local_dir_path(path_str: str) -> ParsedQuery: +def _parse_local_dir_path(path_str: str) -> IngestionQuery: """ Parse the given file path into a structured query dictionary. @@ -318,12 +270,12 @@ def _parse_local_dir_path(path_str: str) -> ParsedQuery: Returns ------- - ParsedQuery + IngestionQuery A dictionary containing the parsed details of the file path. """ path_obj = Path(path_str).resolve() slug = path_obj.name if path_str == "." 
else path_str.strip("/") - return ParsedQuery( + return IngestionQuery( user_name=None, repo_name=None, url=None, diff --git a/src/server/query_processor.py b/src/server/query_processor.py index f6cdcea2..2e751479 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -5,9 +5,9 @@ from fastapi import Request from starlette.templating import _TemplateResponse -from gitingest.cloning import clone_repo +from gitingest.cloning import clone from gitingest.ingestion import ingest_query -from gitingest.query_parsing import ParsedQuery, parse_query +from gitingest.query_parsing import IngestionQuery, parse_query from server.server_config import EXAMPLE_REPOS, MAX_DISPLAY_SIZE, templates from server.server_utils import Colors, log_slider_to_size @@ -74,25 +74,25 @@ async def process_query( } try: - parsed_query: ParsedQuery = await parse_query( + query: IngestionQuery = await parse_query( source=input_text, max_file_size=max_file_size, from_web=True, include_patterns=include_patterns, ignore_patterns=exclude_patterns, ) - if not parsed_query.url: + if not query.url: raise ValueError("The 'url' parameter is required.") - clone_config = parsed_query.extact_clone_config() - await clone_repo(clone_config) - summary, tree, content = ingest_query(parsed_query) + clone_config = query.extract_clone_config() + await clone(clone_config) + summary, tree, content = ingest_query(query) with open(f"{clone_config.local_path}.txt", "w", encoding="utf-8") as f: f.write(tree + "\n" + content) except Exception as exc: # hack to print error message when query is not defined - if "query" in locals() and parsed_query is not None and isinstance(parsed_query, dict): - _print_error(parsed_query["url"], exc, max_file_size, pattern_type, pattern) + if "query" in locals() and query is not None and isinstance(query, dict): + _print_error(query["url"], exc, max_file_size, pattern_type, pattern) else: print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") print(f"{Colors.RED}{exc}{Colors.END}") @@ -111,7 +111,7 @@ async def process_query( ) _print_success( - url=parsed_query.url, + url=query.url, max_file_size=max_file_size, pattern_type=pattern_type, pattern=pattern, @@ -124,7 +124,7 @@ async def process_query( "summary": summary, "tree": tree, "content": content, - "ingest_id": parsed_query.id, + "ingest_id": query.id, } ) diff --git a/tests/conftest.py b/tests/conftest.py index 86925005..33cf4df3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,24 +11,24 @@ import pytest -from gitingest.query_parsing import ParsedQuery +from gitingest.query_parsing import IngestionQuery WriteNotebookFunc = Callable[[str, Dict[str, Any]], Path] @pytest.fixture -def sample_query() -> ParsedQuery: +def sample_query() -> IngestionQuery: """ - Provide a default `ParsedQuery` object for use in tests. + Provide a default `IngestionQuery` object for use in tests. - This fixture returns a `ParsedQuery` pre-populated with typical fields and some default ignore patterns. + This fixture returns a `IngestionQuery` pre-populated with typical fields and some default ignore patterns. Returns ------- - ParsedQuery - The sample `ParsedQuery` object. + IngestionQuery + The sample `IngestionQuery` object. 
""" - return ParsedQuery( + return IngestionQuery( user_name="test_user", repo_name="test_repo", url=None, diff --git a/tests/query_parser/test_git_host_agnostic.py b/tests/query_parser/test_git_host_agnostic.py index 61fb512e..0039d220 100644 --- a/tests/query_parser/test_git_host_agnostic.py +++ b/tests/query_parser/test_git_host_agnostic.py @@ -82,14 +82,14 @@ async def test_parse_query_without_host( Then the parser should correctly identify the user, repo, canonical URL, and other default fields. """ for url in urls: - parsed_query = await parse_query(url, max_file_size=50, from_web=True) + query = await parse_query(url, max_file_size=50, from_web=True) - assert parsed_query.user_name == expected_user - assert parsed_query.repo_name == expected_repo - assert parsed_query.url == expected_url - assert parsed_query.slug == f"{expected_user}-{expected_repo}" - assert parsed_query.id is not None - assert parsed_query.subpath == "/" - assert parsed_query.branch is None - assert parsed_query.commit is None - assert parsed_query.type is None + assert query.user_name == expected_user + assert query.repo_name == expected_repo + assert query.url == expected_url + assert query.slug == f"{expected_user}-{expected_repo}" + assert query.id is not None + assert query.subpath == "/" + assert query.branch is None + assert query.commit is None + assert query.type is None diff --git a/tests/query_parser/test_query_parser.py b/tests/query_parser/test_query_parser.py index 51beb8d5..a01b5e0f 100644 --- a/tests/query_parser/test_query_parser.py +++ b/tests/query_parser/test_query_parser.py @@ -32,11 +32,11 @@ async def test_parse_url_valid_https() -> None: "https://gist.github.com/user/repo", ] for url in test_cases: - parsed_query = await _parse_remote_repo(url) + query = await _parse_remote_repo(url) - assert parsed_query.user_name == "user" - assert parsed_query.repo_name == "repo" - assert parsed_query.url == url + assert query.user_name == "user" + assert query.repo_name == "repo" + assert query.url == url @pytest.mark.asyncio @@ -57,11 +57,11 @@ async def test_parse_url_valid_http() -> None: "http://gist.github.com/user/repo", ] for url in test_cases: - parsed_query = await _parse_remote_repo(url) + query = await _parse_remote_repo(url) - assert parsed_query.user_name == "user" - assert parsed_query.repo_name == "repo" - assert parsed_query.slug == "user-repo" + assert query.user_name == "user" + assert query.repo_name == "repo" + assert query.slug == "user-repo" @pytest.mark.asyncio @@ -88,13 +88,13 @@ async def test_parse_query_basic(url): When `parse_query` is called, Then user/repo, URL, and ignore patterns should be parsed correctly. """ - parsed_query = await parse_query(source=url, max_file_size=50, from_web=True, ignore_patterns="*.txt") + query = await parse_query(source=url, max_file_size=50, from_web=True, ignore_patterns="*.txt") - assert parsed_query.user_name == "user" - assert parsed_query.repo_name == "repo" - assert parsed_query.url == url - assert parsed_query.ignore_patterns - assert "*.txt" in parsed_query.ignore_patterns + assert query.user_name == "user" + assert query.repo_name == "repo" + assert query.url == url + assert query.ignore_patterns + assert "*.txt" in query.ignore_patterns @pytest.mark.asyncio @@ -107,10 +107,10 @@ async def test_parse_query_mixed_case() -> None: Then the user and repo names should be normalized to lowercase. 
""" url = "Https://GitHub.COM/UsEr/rEpO" - parsed_query = await parse_query(url, max_file_size=50, from_web=True) + query = await parse_query(url, max_file_size=50, from_web=True) - assert parsed_query.user_name == "user" - assert parsed_query.repo_name == "repo" + assert query.user_name == "user" + assert query.repo_name == "repo" @pytest.mark.asyncio @@ -123,10 +123,10 @@ async def test_parse_query_include_pattern() -> None: Then the include pattern should be set, and default ignore patterns remain applied. """ url = "https://github.com/user/repo" - parsed_query = await parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py") + query = await parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py") - assert parsed_query.include_patterns == {"*.py"} - assert parsed_query.ignore_patterns == DEFAULT_IGNORE_PATTERNS + assert query.include_patterns == {"*.py"} + assert query.ignore_patterns == DEFAULT_IGNORE_PATTERNS @pytest.mark.asyncio @@ -157,12 +157,12 @@ async def test_parse_url_with_subpaths() -> None: mock_run_command.return_value = (b"refs/heads/main\nrefs/heads/dev\nrefs/heads/feature-branch\n", b"") with patch("gitingest.cloning.fetch_remote_branch_list", new_callable=AsyncMock) as mock_fetch_branches: mock_fetch_branches.return_value = ["main", "dev", "feature-branch"] - parsed_query = await _parse_remote_repo(url) + query = await _parse_remote_repo(url) - assert parsed_query.user_name == "user" - assert parsed_query.repo_name == "repo" - assert parsed_query.branch == "main" - assert parsed_query.subpath == "/subdir/file" + assert query.user_name == "user" + assert query.repo_name == "repo" + assert query.branch == "main" + assert query.subpath == "/subdir/file" @pytest.mark.asyncio @@ -216,10 +216,10 @@ async def test_parse_query_with_large_file_size() -> None: Then `max_file_size` should be set correctly and default ignore patterns remain unchanged. """ url = "https://github.com/user/repo" - parsed_query = await parse_query(url, max_file_size=10**9, from_web=True) + query = await parse_query(url, max_file_size=10**9, from_web=True) - assert parsed_query.max_file_size == 10**9 - assert parsed_query.ignore_patterns == DEFAULT_IGNORE_PATTERNS + assert query.max_file_size == 10**9 + assert query.ignore_patterns == DEFAULT_IGNORE_PATTERNS @pytest.mark.asyncio @@ -232,10 +232,10 @@ async def test_parse_query_empty_patterns() -> None: Then include_patterns becomes None and default ignore patterns apply. """ url = "https://github.com/user/repo" - parsed_query = await parse_query(url, max_file_size=50, from_web=True, include_patterns="", ignore_patterns="") + query = await parse_query(url, max_file_size=50, from_web=True, include_patterns="", ignore_patterns="") - assert parsed_query.include_patterns is None - assert parsed_query.ignore_patterns == DEFAULT_IGNORE_PATTERNS + assert query.include_patterns is None + assert query.ignore_patterns == DEFAULT_IGNORE_PATTERNS @pytest.mark.asyncio @@ -248,7 +248,7 @@ async def test_parse_query_include_and_ignore_overlap() -> None: Then "*.py" should be removed from ignore patterns. 
""" url = "https://github.com/user/repo" - parsed_query = await parse_query( + query = await parse_query( url, max_file_size=50, from_web=True, @@ -256,10 +256,10 @@ async def test_parse_query_include_and_ignore_overlap() -> None: ignore_patterns={"*.py", "*.txt"}, ) - assert parsed_query.include_patterns == {"*.py"} - assert parsed_query.ignore_patterns is not None - assert "*.py" not in parsed_query.ignore_patterns - assert "*.txt" in parsed_query.ignore_patterns + assert query.include_patterns == {"*.py"} + assert query.ignore_patterns is not None + assert "*.py" not in query.ignore_patterns + assert "*.txt" in query.ignore_patterns @pytest.mark.asyncio @@ -272,12 +272,12 @@ async def test_parse_query_local_path() -> None: Then the local path should be set, id generated, and slug formed accordingly. """ path = "/home/user/project" - parsed_query = await parse_query(path, max_file_size=100, from_web=False) + query = await parse_query(path, max_file_size=100, from_web=False) tail = Path("home/user/project") - assert parsed_query.local_path.parts[-len(tail.parts) :] == tail.parts - assert parsed_query.id is not None - assert parsed_query.slug == "home/user/project" + assert query.local_path.parts[-len(tail.parts) :] == tail.parts + assert query.id is not None + assert query.slug == "home/user/project" @pytest.mark.asyncio @@ -290,11 +290,11 @@ async def test_parse_query_relative_path() -> None: Then local_path resolves relatively, and slug ends with "project". """ path = "./project" - parsed_query = await parse_query(path, max_file_size=100, from_web=False) + query = await parse_query(path, max_file_size=100, from_web=False) tail = Path("project") - assert parsed_query.local_path.parts[-len(tail.parts) :] == tail.parts - assert parsed_query.slug.endswith("project") + assert query.local_path.parts[-len(tail.parts) :] == tail.parts + assert query.slug.endswith("project") @pytest.mark.asyncio @@ -336,11 +336,11 @@ async def test_parse_url_branch_and_commit_distinction(url: str, expected_branch with patch("gitingest.cloning.fetch_remote_branch_list", new_callable=AsyncMock) as mock_fetch_branches: mock_fetch_branches.return_value = ["main", "dev", "feature-branch"] - parsed_query = await _parse_remote_repo(url) + query = await _parse_remote_repo(url) # Verify that `branch` and `commit` match our expectations - assert parsed_query.branch == expected_branch - assert parsed_query.commit == expected_commit + assert query.branch == expected_branch + assert query.commit == expected_commit @pytest.mark.asyncio @@ -353,10 +353,10 @@ async def test_parse_query_uuid_uniqueness() -> None: Then each call should produce a different query id. """ path = "/home/user/project" - parsed_query_1 = await parse_query(path, max_file_size=100, from_web=False) - parsed_query_2 = await parse_query(path, max_file_size=100, from_web=False) + query_1 = await parse_query(path, max_file_size=100, from_web=False) + query_2 = await parse_query(path, max_file_size=100, from_web=False) - assert parsed_query_1.id != parsed_query_2.id + assert query_1.id != query_2.id @pytest.mark.asyncio @@ -369,11 +369,11 @@ async def test_parse_url_with_query_and_fragment() -> None: Then those parts should be stripped, leaving a clean user/repo URL. 
""" url = "https://github.com/user/repo?arg=value#fragment" - parsed_query = await _parse_remote_repo(url) + query = await _parse_remote_repo(url) - assert parsed_query.user_name == "user" - assert parsed_query.repo_name == "repo" - assert parsed_query.url == "https://github.com/user/repo" # URL should be cleaned + assert query.user_name == "user" + assert query.repo_name == "repo" + assert query.url == "https://github.com/user/repo" # URL should be cleaned @pytest.mark.asyncio @@ -400,17 +400,17 @@ async def test_parse_query_with_branch() -> None: Then the branch should be identified, subpath set, and commit remain None. """ url = "https://github.com/pandas-dev/pandas/blob/2.2.x/.github/ISSUE_TEMPLATE/documentation_improvement.yaml" - parsed_query = await parse_query(url, max_file_size=10**9, from_web=True) + query = await parse_query(url, max_file_size=10**9, from_web=True) - assert parsed_query.user_name == "pandas-dev" - assert parsed_query.repo_name == "pandas" - assert parsed_query.url == "https://github.com/pandas-dev/pandas" - assert parsed_query.slug == "pandas-dev-pandas" - assert parsed_query.id is not None - assert parsed_query.subpath == "/.github/ISSUE_TEMPLATE/documentation_improvement.yaml" - assert parsed_query.branch == "2.2.x" - assert parsed_query.commit is None - assert parsed_query.type == "blob" + assert query.user_name == "pandas-dev" + assert query.repo_name == "pandas" + assert query.url == "https://github.com/pandas-dev/pandas" + assert query.slug == "pandas-dev-pandas" + assert query.id is not None + assert query.subpath == "/.github/ISSUE_TEMPLATE/documentation_improvement.yaml" + assert query.branch == "2.2.x" + assert query.commit is None + assert query.type == "blob" @pytest.mark.asyncio @@ -439,10 +439,10 @@ async def test_parse_repo_source_with_failed_git_command(url, expected_branch, e "git ls-remote --heads https://github.com/user/repo", ): - parsed_query = await _parse_remote_repo(url) + query = await _parse_remote_repo(url) - assert parsed_query.branch == expected_branch - assert parsed_query.subpath == expected_subpath + assert query.branch == expected_branch + assert query.subpath == expected_subpath @pytest.mark.asyncio @@ -473,7 +473,7 @@ async def test_parse_repo_source_with_various_url_patterns(url, expected_branch, ) mock_fetch_branches.return_value = ["feature/fix1", "main", "feature-branch"] - parsed_query = await _parse_remote_repo(url) + query = await _parse_remote_repo(url) - assert parsed_query.branch == expected_branch - assert parsed_query.subpath == expected_subpath + assert query.branch == expected_branch + assert query.subpath == expected_subpath diff --git a/tests/test_ingestion.py b/tests/test_ingestion.py index 901646d1..3e991f8f 100644 --- a/tests/test_ingestion.py +++ b/tests/test_ingestion.py @@ -8,10 +8,10 @@ from pathlib import Path from gitingest.ingestion import ingest_query -from gitingest.query_parsing import ParsedQuery +from gitingest.query_parsing import IngestionQuery -def test_run_ingest_query(temp_directory: Path, sample_query: ParsedQuery) -> None: +def test_run_ingest_query(temp_directory: Path, sample_query: IngestionQuery) -> None: """ Test `ingest_query` to ensure it processes the directory and returns expected results. 
diff --git a/tests/test_repository_clone.py b/tests/test_repository_clone.py index fcf61631..54f9f986 100644 --- a/tests/test_repository_clone.py +++ b/tests/test_repository_clone.py @@ -12,17 +12,17 @@ import pytest -from gitingest.cloning import CloneConfig, _check_repo_exists, clone_repo +from gitingest.cloning import CloneConfig, _check_repo_exists, clone from gitingest.exceptions import AsyncTimeoutError @pytest.mark.asyncio -async def test_clone_repo_with_commit() -> None: +async def test_clone_with_commit() -> None: """ Test cloning a repository with a specific commit hash. Given a valid URL and a commit hash: - When `clone_repo` is called, + When `clone` is called, Then the repository should be cloned and checked out at that commit. """ clone_config = CloneConfig( @@ -38,19 +38,19 @@ async def test_clone_repo_with_commit() -> None: mock_process.communicate.return_value = (b"output", b"error") mock_exec.return_value = mock_process - await clone_repo(clone_config) + await clone(clone_config) mock_check.assert_called_once_with(clone_config.url) assert mock_exec.call_count == 2 # Clone and checkout calls @pytest.mark.asyncio -async def test_clone_repo_without_commit() -> None: +async def test_clone_without_commit() -> None: """ Test cloning a repository when no commit hash is provided. Given a valid URL and no commit hash: - When `clone_repo` is called, + When `clone` is called, Then only the clone operation should be performed (no checkout). """ query = CloneConfig( @@ -66,19 +66,19 @@ async def test_clone_repo_without_commit() -> None: mock_process.communicate.return_value = (b"output", b"error") mock_exec.return_value = mock_process - await clone_repo(query) + await clone(query) mock_check.assert_called_once_with(query.url) assert mock_exec.call_count == 1 # Only clone call @pytest.mark.asyncio -async def test_clone_repo_nonexistent_repository() -> None: +async def test_clone_nonexistent_repository() -> None: """ Test cloning a nonexistent repository URL. Given an invalid or nonexistent URL: - When `clone_repo` is called, + When `clone` is called, Then a ValueError should be raised with an appropriate error message. """ clone_config = CloneConfig( @@ -89,7 +89,7 @@ async def test_clone_repo_nonexistent_repository() -> None: ) with patch("gitingest.cloning._check_repo_exists", return_value=False) as mock_check: with pytest.raises(ValueError, match="Repository not found"): - await clone_repo(clone_config) + await clone(clone_config) mock_check.assert_called_once_with(clone_config.url) @@ -126,18 +126,18 @@ async def test_check_repo_exists(mock_stdout: bytes, return_code: int, expected: @pytest.mark.asyncio -async def test_clone_repo_with_custom_branch() -> None: +async def test_clone_with_custom_branch() -> None: """ Test cloning a repository with a specified custom branch. Given a valid URL and a branch: - When `clone_repo` is called, + When `clone` is called, Then the repository should be cloned shallowly to that branch. """ clone_config = CloneConfig(url="https://github.com/user/repo", local_path="/tmp/repo", branch="feature-branch") with patch("gitingest.cloning._check_repo_exists", return_value=True): with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) + await clone(clone_config) mock_exec.assert_called_once_with( "git", @@ -157,7 +157,7 @@ async def test_git_command_failure() -> None: Test cloning when the Git command fails during execution. 
Given a valid URL, but `_run_command` raises a RuntimeError: - When `clone_repo` is called, + When `clone` is called, Then a RuntimeError should be raised with the correct message. """ clone_config = CloneConfig( @@ -167,16 +167,16 @@ async def test_git_command_failure() -> None: with patch("gitingest.cloning._check_repo_exists", return_value=True): with patch("gitingest.cloning._run_command", side_effect=RuntimeError("Git command failed")): with pytest.raises(RuntimeError, match="Git command failed"): - await clone_repo(clone_config) + await clone(clone_config) @pytest.mark.asyncio -async def test_clone_repo_default_shallow_clone() -> None: +async def test_clone_default_shallow_clone() -> None: """ Test cloning a repository with the default shallow clone options. Given a valid URL and no branch or commit: - When `clone_repo` is called, + When `clone` is called, Then the repository should be cloned with `--depth=1` and `--single-branch`. """ clone_config = CloneConfig( @@ -186,7 +186,7 @@ async def test_clone_repo_default_shallow_clone() -> None: with patch("gitingest.cloning._check_repo_exists", return_value=True): with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) + await clone(clone_config) mock_exec.assert_called_once_with( "git", @@ -199,12 +199,12 @@ async def test_clone_repo_default_shallow_clone() -> None: @pytest.mark.asyncio -async def test_clone_repo_commit_without_branch() -> None: +async def test_clone_commit_without_branch() -> None: """ Test cloning when a commit hash is provided but no branch is specified. Given a valid URL and a commit hash (but no branch): - When `clone_repo` is called, + When `clone` is called, Then the repository should be cloned and checked out at that commit. """ clone_config = CloneConfig( @@ -214,7 +214,7 @@ async def test_clone_repo_commit_without_branch() -> None: ) with patch("gitingest.cloning._check_repo_exists", return_value=True): with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) + await clone(clone_config) assert mock_exec.call_count == 2 # Clone and checkout calls mock_exec.assert_any_call("git", "clone", "--single-branch", clone_config.url, clone_config.local_path) @@ -264,12 +264,12 @@ async def test_check_repo_exists_with_permanent_redirect() -> None: @pytest.mark.asyncio -async def test_clone_repo_with_timeout() -> None: +async def test_clone_with_timeout() -> None: """ Test cloning a repository when a timeout occurs. Given a valid URL, but `_run_command` times out: - When `clone_repo` is called, + When `clone` is called, Then an `AsyncTimeoutError` should be raised to indicate the operation exceeded time limits. """ clone_config = CloneConfig(url="https://github.com/user/repo", local_path="/tmp/repo") @@ -278,7 +278,7 @@ async def test_clone_repo_with_timeout() -> None: with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: mock_exec.side_effect = asyncio.TimeoutError with pytest.raises(AsyncTimeoutError, match="Operation timed out after"): - await clone_repo(clone_config) + await clone(clone_config) @pytest.mark.asyncio @@ -287,7 +287,7 @@ async def test_clone_specific_branch(tmp_path): Test cloning a specific branch of a repository. Given a valid repository URL and a branch name: - When `clone_repo` is called, + When `clone` is called, Then the repository should be cloned and checked out at that branch. 
""" repo_url = "https://github.com/cyclotruc/gitingest.git" @@ -295,7 +295,7 @@ async def test_clone_specific_branch(tmp_path): local_path = tmp_path / "gitingest" config = CloneConfig(url=repo_url, local_path=str(local_path), branch=branch_name) - await clone_repo(config) + await clone(config) # Assertions assert local_path.exists(), "The repository was not cloned successfully." @@ -312,7 +312,7 @@ async def test_clone_branch_with_slashes(tmp_path): Test cloning a branch with slashes in the name. Given a valid repository URL and a branch name with slashes: - When `clone_repo` is called, + When `clone` is called, Then the repository should be cloned and checked out at that branch. """ repo_url = "https://github.com/user/repo" @@ -322,7 +322,7 @@ async def test_clone_branch_with_slashes(tmp_path): clone_config = CloneConfig(url=repo_url, local_path=str(local_path), branch=branch_name) with patch("gitingest.cloning._check_repo_exists", return_value=True): with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) + await clone(clone_config) mock_exec.assert_called_once_with( "git", @@ -337,12 +337,12 @@ async def test_clone_branch_with_slashes(tmp_path): @pytest.mark.asyncio -async def test_clone_repo_creates_parent_directory(tmp_path: Path) -> None: +async def test_clone_creates_parent_directory(tmp_path: Path) -> None: """ - Test that clone_repo creates parent directories if they don't exist. + Test that clone creates parent directories if they don't exist. Given a local path with non-existent parent directories: - When `clone_repo` is called, + When `clone` is called, Then it should create the parent directories before attempting to clone. """ nested_path = tmp_path / "deep" / "nested" / "path" / "repo" @@ -353,7 +353,7 @@ async def test_clone_repo_creates_parent_directory(tmp_path: Path) -> None: with patch("gitingest.cloning._check_repo_exists", return_value=True): with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) + await clone(clone_config) # Verify parent directory was created assert nested_path.parent.exists() @@ -375,14 +375,14 @@ async def test_clone_with_specific_subpath() -> None: Test cloning a repository with a specific subpath. Given a valid repository URL and a specific subpath: - When `clone_repo` is called, + When `clone` is called, Then the repository should be cloned with sparse checkout enabled and the specified subpath. """ clone_config = CloneConfig(url="https://github.com/user/repo", local_path="/tmp/repo", subpath="src/docs") with patch("gitingest.cloning._check_repo_exists", return_value=True): with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) + await clone(clone_config) # Verify the clone command includes sparse checkout flags mock_exec.assert_any_call( @@ -408,7 +408,7 @@ async def test_clone_with_commit_and_subpath() -> None: Test cloning a repository with both a specific commit and subpath. Given a valid repository URL, commit hash, and subpath: - When `clone_repo` is called, + When `clone` is called, Then the repository should be cloned with sparse checkout enabled, checked out at the specific commit, and only include the specified subpath. 
""" @@ -421,7 +421,7 @@ async def test_clone_with_commit_and_subpath() -> None: with patch("gitingest.cloning._check_repo_exists", return_value=True): with patch("gitingest.cloning._run_command", new_callable=AsyncMock) as mock_exec: - await clone_repo(clone_config) + await clone(clone_config) # Verify the clone command includes sparse checkout flags mock_exec.assert_any_call(