diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6e7c3b1b..710b2561 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,7 +13,7 @@ jobs: fail-fast: true matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ["3.10", "3.11", "3.12", "3.13"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v4 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 43c196dd..42b98e34 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -48,7 +48,7 @@ repos: hooks: - id: pyupgrade description: "Automatically upgrade syntax for newer versions." - args: [--py3-plus, --py36-plus, --py38-plus, --py39-plus, --py310-plus] + args: [--py3-plus, --py36-plus] - repo: https://github.com/pre-commit/pygrep-hooks rev: v1.10.0 diff --git a/README.md b/README.md index 7a92e864..5c815847 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,10 @@ You can also replace `hub` with `ingest` in any GitHub URL to access the corespo - **CLI tool**: Run it as a shell command - **Python package**: Import it in your code +## 📚 Requirements + +- Python 3.8+ + ## 📦 Installation ``` bash @@ -61,7 +65,7 @@ gitingest --help This will write the digest in a text file (default `digest.txt`) in your current working directory. -## 🐛 Python package usage +## 🐍 Python package usage ```python # Synchronous usage @@ -81,7 +85,7 @@ result = asyncio.run(ingest_async("path/to/directory")) By default, this won't write a file but can be enabled with the `output` argument. -## 🌐 Self-host +## 🐳 Self-host 1. 
Build the image: @@ -104,7 +108,7 @@ If you are hosting it on a domain, you can specify the allowed hostnames via env ALLOWED_HOSTS="example.com, localhost, 127.0.0.1" ``` -## ✔️ Contributing to Gitingest +## 🤝 Contributing ### Non-technical ways to contribute @@ -128,6 +132,6 @@ Gitingest aims to be friendly for first time contributors, with a simple python Check out the NPM alternative 📦 Repomix: -## Project Growth +## 🚀 Project Growth [![Star History Chart](https://api.star-history.com/svg?repos=cyclotruc/gitingest&type=Date)](https://star-history.com/#cyclotruc/gitingest&Date) diff --git a/pyproject.toml b/pyproject.toml index 41df465d..45e9d844 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,22 +3,22 @@ name = "gitingest" version = "0.1.3" description="CLI tool to analyze and create text dumps of codebases for LLMs" readme = {file = "README.md", content-type = "text/markdown" } -requires-python = ">= 3.10" +requires-python = ">= 3.8" dependencies = [ "click>=8.0.0", - "fastapi[standard]", - "python-dotenv", - "slowapi", - "starlette", "tiktoken", - "uvicorn", + "typing_extensions; python_version < '3.10'", ] + license = {file = "LICENSE"} authors = [{name = "Romain Courtois", email = "romain@coderamp.io"}] classifiers=[ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", diff --git a/setup.py b/setup.py index 49895bdd..d2704914 100644 --- a/setup.py +++ b/setup.py @@ -14,13 +14,14 @@ install_requires=[ "click>=8.0.0", "tiktoken", + "typing_extensions; python_version < '3.10'", ], entry_points={ "console_scripts": [ "gitingest=gitingest.cli:main", ], }, -    python_requires=">=3.6", +    python_requires=">=3.8", author="Romain Courtois", 
author_email="romain@coderamp.io", description="CLI tool to analyze and create text dumps of codebases for LLMs", diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index c06bd269..34dbcbf6 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -3,6 +3,7 @@ # pylint: disable=no-value-for-parameter import asyncio +from typing import Optional, Tuple import click @@ -19,14 +20,14 @@ @click.option("--branch", "-b", default=None, help="Branch to clone and ingest") def main( source: str, - output: str | None, + output: Optional[str], max_size: int, - exclude_pattern: tuple[str, ...], - include_pattern: tuple[str, ...], - branch: str | None, + exclude_pattern: Tuple[str, ...], + include_pattern: Tuple[str, ...], + branch: Optional[str], ): """ - Main entry point for the CLI. This function is called when the CLI is run as a script. + Main entry point for the CLI. This function is called when the CLI is run as a script. It calls the async main function to run the command. @@ -34,16 +35,16 @@ def main( ---------- source : str The source directory or repository to analyze. - output : str | None + output : str, optional The path where the output file will be written. If not specified, the output will be written to a file named `.txt` in the current directory. max_size : int The maximum file size to process, in bytes. Files larger than this size will be ignored. - exclude_pattern : tuple[str, ...] + exclude_pattern : Tuple[str, ...] A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored. - include_pattern : tuple[str, ...] + include_pattern : Tuple[str, ...] A tuple of patterns to include during the analysis. Only files matching these patterns will be processed. - branch : str | None + branch : str, optional The branch to clone (optional). """ # Main entry point for the CLI. This function is called when the CLI is run as a script. 
@@ -52,11 +53,11 @@ def main( async def _async_main( source: str, - output: str | None, + output: Optional[str], max_size: int, - exclude_pattern: tuple[str, ...], - include_pattern: tuple[str, ...], - branch: str | None, + exclude_pattern: Tuple[str, ...], + include_pattern: Tuple[str, ...], + branch: Optional[str], ) -> None: """ Analyze a directory or repository and create a text dump of its contents. @@ -68,16 +69,16 @@ async def _async_main( ---------- source : str The source directory or repository to analyze. - output : str | None + output : str, optional The path where the output file will be written. If not specified, the output will be written to a file named `.txt` in the current directory. max_size : int The maximum file size to process, in bytes. Files larger than this size will be ignored. - exclude_pattern : tuple[str, ...] + exclude_pattern : Tuple[str, ...] A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored. - include_pattern : tuple[str, ...] + include_pattern : Tuple[str, ...] A tuple of patterns to include during the analysis. Only files matching these patterns will be processed. - branch : str | None + branch : str, optional The branch to clone (optional). Raises diff --git a/src/gitingest/ignore_patterns.py b/src/gitingest/ignore_patterns.py index 5741ab15..633cbc46 100644 --- a/src/gitingest/ignore_patterns.py +++ b/src/gitingest/ignore_patterns.py @@ -1,6 +1,8 @@ """ Default ignore patterns for Gitingest. 
""" -DEFAULT_IGNORE_PATTERNS: set[str] = { +from typing import Set + +DEFAULT_IGNORE_PATTERNS: Set[str] = { # Python "*.pyc", "*.pyo", diff --git a/src/gitingest/notebook_utils.py b/src/gitingest/notebook_utils.py index 1a385ca4..a2b8bacb 100644 --- a/src/gitingest/notebook_utils.py +++ b/src/gitingest/notebook_utils.py @@ -4,7 +4,7 @@ import warnings from itertools import chain from pathlib import Path -from typing import Any +from typing import Any, Dict, List, Optional from gitingest.exceptions import InvalidNotebookError @@ -32,12 +32,13 @@ def process_notebook(file: Path, include_output: bool = True) -> str: """ try: with file.open(encoding="utf-8") as f: - notebook: dict[str, Any] = json.load(f) + notebook: Dict[str, Any] = json.load(f) except json.JSONDecodeError as e: raise InvalidNotebookError(f"Invalid JSON in notebook: {file}") from e # Check if the notebook contains worksheets - if worksheets := notebook.get("worksheets"): + worksheets = notebook.get("worksheets") + if worksheets: warnings.warn( "Worksheets are deprecated as of IPEP-17. Consider updating the notebook. " "(See: https://github.com/jupyter/nbformat and " @@ -57,26 +58,27 @@ def process_notebook(file: Path, include_output: bool = True) -> str: result = ["# Jupyter notebook converted to Python script."] for cell in cells: - if cell_str := _process_cell(cell, include_output=include_output): + cell_str = _process_cell(cell, include_output=include_output) + if cell_str: result.append(cell_str) return "\n\n".join(result) + "\n" -def _process_cell(cell: dict[str, Any], include_output: bool) -> str | None: +def _process_cell(cell: Dict[str, Any], include_output: bool) -> Optional[str]: """ Process a Jupyter notebook cell and return the cell content as a string. Parameters ---------- - cell : dict[str, Any] + cell : Dict[str, Any] The cell dictionary from a Jupyter notebook. 
include_output : bool Whether to include cell outputs in the generated script Returns ------- - str | None + str, optional The cell content as a string, or None if the cell is empty. Raises @@ -101,7 +103,8 @@ def _process_cell(cell: dict[str, Any], include_output: bool) -> str | None: return f'"""\n{cell_str}\n"""' # Add cell output as comments - if include_output and (outputs := cell.get("outputs")): + outputs = cell.get("outputs") + if include_output and outputs: # Include cell outputs as comments output_lines = [] @@ -118,18 +121,18 @@ def _process_cell(cell: dict[str, Any], include_output: bool) -> str | None: return cell_str -def _extract_output(output: dict[str, Any]) -> list[str]: +def _extract_output(output: Dict[str, Any]) -> List[str]: """ Extract the output from a Jupyter notebook cell. Parameters ---------- - output : dict[str, Any] + output : Dict[str, Any] The output dictionary from a Jupyter notebook cell. Returns ------- - list[str] + List[str] The output as a list of strings. 
Raises @@ -139,15 +142,13 @@ def _extract_output(output: dict[str, Any]) -> list[str]: """ output_type = output["output_type"] - match output_type: - case "stream": - return output["text"] + if output_type == "stream": + return output["text"] - case "execute_result" | "display_data": - return output["data"]["text/plain"] + if output_type in ("execute_result", "display_data"): + return output["data"]["text/plain"] - case "error": - return [f"Error: {output['ename']}: {output['evalue']}"] + if output_type == "error": + return [f"Error: {output['ename']}: {output['evalue']}"] - case _: - raise ValueError(f"Unknown output type: {output_type}") + raise ValueError(f"Unknown output type: {output_type}") diff --git a/src/gitingest/query_ingestion.py b/src/gitingest/query_ingestion.py index b912ee54..11e2151a 100644 --- a/src/gitingest/query_ingestion.py +++ b/src/gitingest/query_ingestion.py @@ -5,7 +5,7 @@ import platform from fnmatch import fnmatch from pathlib import Path -from typing import Any +from typing import Any, Dict, List, Optional, Set, Tuple, Union import tiktoken @@ -42,7 +42,7 @@ def _normalize_path(path: Path) -> Path: return Path(os.path.normpath(str(path))) -def _normalize_path_str(path: str | Path) -> str: +def _normalize_path_str(path: Union[Path, str]) -> str: """ Convert path to string with forward slashes for consistent output. @@ -59,13 +59,13 @@ def _normalize_path_str(path: str | Path) -> str: return str(path).replace(os.sep, "/") -def _get_encoding_list() -> list[str]: +def _get_encoding_list() -> List[str]: """ Get list of encodings to try, prioritized for the current platform. Returns ------- - list[str] + List[str] List of encoding names to try in priority order, starting with the platform's default encoding followed by common fallback encodings. 
""" @@ -75,7 +75,7 @@ def _get_encoding_list() -> list[str]: return encodings + [locale.getpreferredencoding()] -def _should_include(path: Path, base_path: Path, include_patterns: set[str]) -> bool: +def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) -> bool: """ Determine if the given file or directory path matches any of the include patterns. @@ -88,7 +88,7 @@ def _should_include(path: Path, base_path: Path, include_patterns: set[str]) -> The absolute path of the file or directory to check. base_path : Path The base directory from which the relative path is calculated. - include_patterns : set[str] + include_patterns : Set[str] A set of patterns to check against the relative path. Returns @@ -109,7 +109,7 @@ def _should_include(path: Path, base_path: Path, include_patterns: set[str]) -> return False -def _should_exclude(path: Path, base_path: Path, ignore_patterns: set[str]) -> bool: +def _should_exclude(path: Path, base_path: Path, ignore_patterns: Set[str]) -> bool: """ Determine if the given file or directory path matches any of the ignore patterns. @@ -123,7 +123,7 @@ def _should_exclude(path: Path, base_path: Path, ignore_patterns: set[str]) -> b The absolute path of the file or directory to check. base_path : Path The base directory from which the relative path is calculated. - ignore_patterns : set[str] + ignore_patterns : Set[str] A set of patterns to check against the relative path. Returns @@ -244,7 +244,7 @@ def _read_file_content(file_path: Path) -> str: return f"Error reading file: {e}" -def _sort_children(children: list[dict[str, Any]]) -> list[dict[str, Any]]: +def _sort_children(children: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Sort the children nodes of a directory according to a specific order. 
@@ -258,12 +258,12 @@ def _sort_children(children: list[dict[str, Any]]) -> list[dict[str, Any]]: Parameters ---------- - children : list[dict[str, Any]] + children : List[Dict[str, Any]] List of file and directory nodes to sort. Returns ------- - list[dict[str, Any]] + List[Dict[str, Any]] Sorted list according to the specified order. """ # Separate files and directories @@ -293,10 +293,10 @@ def _sort_children(children: list[dict[str, Any]]) -> list[dict[str, Any]]: def _scan_directory( path: Path, query: ParsedQuery, - seen_paths: set[Path] | None = None, + seen_paths: Optional[Set[Path]] = None, depth: int = 0, - stats: dict[str, int] | None = None, -) -> dict[str, Any] | None: + stats: Optional[Dict[str, int]] = None, +) -> Optional[Dict[str, Any]]: """ Recursively analyze a directory and its contents with safety limits. @@ -310,16 +310,16 @@ def _scan_directory( The path of the directory to scan. query : ParsedQuery The parsed query object containing information about the repository and query parameters. - seen_paths : set[Path] | None, optional + seen_paths : Set[Path] | None, optional A set to track already visited paths, by default None. depth : int The current depth of directory traversal, by default 0. - stats : dict[str, int] | None, optional + stats : Dict[str, int] | None, optional A dictionary to track statistics such as total file count and size, by default None. Returns ------- - dict[str, Any] | None + Dict[str, Any] | None A dictionary representing the directory structure and contents, or `None` if limits are reached. """ if seen_paths is None: @@ -373,9 +373,9 @@ def _scan_directory( def _process_symlink( item: Path, query: ParsedQuery, - result: dict[str, Any], - seen_paths: set[Path], - stats: dict[str, int], + result: Dict[str, Any], + seen_paths: Set[Path], + stats: Dict[str, int], depth: int, ) -> None: """ @@ -390,11 +390,11 @@ def _process_symlink( The full path of the symlink. 
query : ParsedQuery The parsed query object containing information about the repository and query parameters. - result : dict[str, Any] + result : Dict[str, Any] The dictionary to accumulate the results. - seen_paths : set[str] + seen_paths : Set[str] A set of already visited paths. - stats : dict[str, int] + stats : Dict[str, int] The dictionary to track statistics such as file count and size. depth : int The current depth in the directory traversal. @@ -460,7 +460,7 @@ def _process_symlink( result["dir_count"] += 1 + subdir["dir_count"] -def _process_file(item: Path, result: dict[str, Any], stats: dict[str, int]) -> None: +def _process_file(item: Path, result: Dict[str, Any], stats: Dict[str, int]) -> None: """ Process a file in the file system. @@ -471,9 +471,9 @@ def _process_file(item: Path, result: dict[str, Any], stats: dict[str, int]) -> ---------- item : Path The full path of the file. - result : dict[str, Any] + result : Dict[str, Any] The dictionary to accumulate the results. - stats : dict[str, int] + stats : Dict[str, int] The dictionary to track statistics such as file count and size. Raises @@ -513,9 +513,9 @@ def _process_file(item: Path, result: dict[str, Any], stats: dict[str, int]) -> def _process_item( item: Path, query: ParsedQuery, - result: dict[str, Any], - seen_paths: set[Path], - stats: dict[str, int], + result: Dict[str, Any], + seen_paths: Set[Path], + stats: Dict[str, int], depth: int, ) -> None: """ @@ -530,11 +530,11 @@ def _process_item( The full path of the file or directory to process. query : ParsedQuery The parsed query object containing information about the repository and query parameters. - result : dict[str, Any] + result : Dict[str, Any] The result dictionary to accumulate processed file/directory data. - seen_paths : set[Path] + seen_paths : Set[Path] A set of paths that have already been visited. - stats : dict[str, int] + stats : Dict[str, int] A dictionary of statistics like the total file count and size. 
depth : int The current depth of directory traversal. @@ -572,9 +572,9 @@ def _process_item( def _extract_files_content( query: ParsedQuery, - node: dict[str, Any], - files: list[dict[str, Any]] | None = None, -) -> list[dict[str, Any]]: + node: Dict[str, Any], + files: Optional[List[Dict[str, Any]]] = None, +) -> List[Dict[str, Any]]: """ Recursively collect all text files with their contents. @@ -585,14 +585,14 @@ def _extract_files_content( ---------- query : ParsedQuery The parsed query object containing information about the repository and query parameters. - node : dict[str, Any] + node : Dict[str, Any] The current directory or file node being processed. - files : list[dict[str, Any]] | None, optional + files : List[Dict[str, Any]] | None, optional A list to collect the extracted files' information, by default None. Returns ------- - list[dict[str, Any]] + List[Dict[str, Any]] A list of dictionaries, each containing the path, content (or `None` if too large), and size of each file. """ if files is None: @@ -620,7 +620,7 @@ def _extract_files_content( return files -def _create_file_content_string(files: list[dict[str, Any]]) -> str: +def _create_file_content_string(files: List[Dict[str, Any]]) -> str: """ Create a formatted string of file contents with separators. @@ -629,7 +629,7 @@ def _create_file_content_string(files: list[dict[str, Any]]) -> str: Parameters ---------- - files : list[dict[str, Any]] + files : List[Dict[str, Any]] A list of dictionaries containing file information, including the path and content. Returns @@ -654,7 +654,7 @@ def _create_file_content_string(files: list[dict[str, Any]]) -> str: return output -def _create_summary_string(query: ParsedQuery, nodes: dict[str, Any]) -> str: +def _create_summary_string(query: ParsedQuery, nodes: Dict[str, Any]) -> str: """ Create a summary string with file counts and content size. 
@@ -665,7 +665,7 @@ def _create_summary_string(query: ParsedQuery, nodes: dict[str, Any]) -> str: ---------- query : ParsedQuery The parsed query object containing information about the repository and query parameters. - nodes : dict[str, Any] + nodes : Dict[str, Any] Dictionary representing the directory structure, including file and directory counts. Returns @@ -690,7 +690,7 @@ def _create_summary_string(query: ParsedQuery, nodes: dict[str, Any]) -> str: return summary -def _create_tree_structure(query: ParsedQuery, node: dict[str, Any], prefix: str = "", is_last: bool = True) -> str: +def _create_tree_structure(query: ParsedQuery, node: Dict[str, Any], prefix: str = "", is_last: bool = True) -> str: """ Create a tree-like string representation of the file structure. @@ -701,7 +701,7 @@ def _create_tree_structure(query: ParsedQuery, node: dict[str, Any], prefix: str ---------- query : ParsedQuery The parsed query object containing information about the repository and query parameters. - node : dict[str, Any] + node : Dict[str, Any] The current directory or file node being processed. prefix : str A string used for indentation and formatting of the tree structure, by default "". @@ -733,7 +733,7 @@ def _create_tree_structure(query: ParsedQuery, node: dict[str, Any], prefix: str return tree -def _generate_token_string(context_string: str) -> str | None: +def _generate_token_string(context_string: str) -> Optional[str]: """ Return the number of tokens in a text string. @@ -747,7 +747,7 @@ def _generate_token_string(context_string: str) -> str | None: Returns ------- - str | None + str, optional The formatted number of tokens as a string (e.g., '1.2k', '1.2M'), or `None` if an error occurs. 
""" try: @@ -766,7 +766,7 @@ def _generate_token_string(context_string: str) -> str | None: return str(total_tokens) -def _ingest_single_file(path: Path, query: ParsedQuery) -> tuple[str, str, str]: +def _ingest_single_file(path: Path, query: ParsedQuery) -> Tuple[str, str, str]: """ Ingest a single file and return its summary, directory structure, and content. @@ -782,7 +782,7 @@ def _ingest_single_file(path: Path, query: ParsedQuery) -> tuple[str, str, str]: Returns ------- - tuple[str, str, str] + Tuple[str, str, str] A tuple containing the summary, directory structure, and file content. Raises @@ -827,7 +827,7 @@ def _ingest_single_file(path: Path, query: ParsedQuery) -> tuple[str, str, str]: return summary, tree, files_content -def _ingest_directory(path: Path, query: ParsedQuery) -> tuple[str, str, str]: +def _ingest_directory(path: Path, query: ParsedQuery) -> Tuple[str, str, str]: """ Ingest an entire directory and return its summary, directory structure, and file contents. @@ -843,7 +843,7 @@ def _ingest_directory(path: Path, query: ParsedQuery) -> tuple[str, str, str]: Returns ------- - tuple[str, str, str] + Tuple[str, str, str] A tuple containing the summary, directory structure, and file contents. Raises @@ -867,7 +867,7 @@ def _ingest_directory(path: Path, query: ParsedQuery) -> tuple[str, str, str]: return summary, tree, files_content -def run_ingest_query(query: ParsedQuery) -> tuple[str, str, str]: +def run_ingest_query(query: ParsedQuery) -> Tuple[str, str, str]: """ Run the ingestion process for a parsed query. @@ -882,7 +882,7 @@ def run_ingest_query(query: ParsedQuery) -> tuple[str, str, str]: Returns ------- - tuple[str, str, str] + Tuple[str, str, str] A tuple containing the summary, directory structure, and file contents. 
Raises diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py index 8fa1648e..2346c6a0 100644 --- a/src/gitingest/query_parser.py +++ b/src/gitingest/query_parser.py @@ -7,6 +7,7 @@ import warnings from dataclasses import dataclass from pathlib import Path +from typing import List, Optional, Set, Tuple, Union from urllib.parse import unquote, urlparse from gitingest.config import MAX_FILE_SIZE, TMP_BASE_PATH @@ -14,9 +15,9 @@ from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS from gitingest.repository_clone import _check_repo_exists, fetch_remote_branch_list -HEX_DIGITS: set[str] = set(string.hexdigits) +HEX_DIGITS: Set[str] = set(string.hexdigits) -KNOWN_GIT_HOSTS: list[str] = [ +KNOWN_GIT_HOSTS: List[str] = [ "github.com", "gitlab.com", "bitbucket.org", @@ -32,28 +33,28 @@ class ParsedQuery: # pylint: disable=too-many-instance-attributes Dataclass to store the parsed details of the repository or file path. """ - user_name: str | None - repo_name: str | None + user_name: Optional[str] + repo_name: Optional[str] subpath: str local_path: Path - url: str | None + url: Optional[str] slug: str id: str - type: str | None = None - branch: str | None = None - commit: str | None = None + type: Optional[str] = None + branch: Optional[str] = None + commit: Optional[str] = None max_file_size: int = MAX_FILE_SIZE - ignore_patterns: set[str] | None = None - include_patterns: set[str] | None = None - pattern_type: str | None = None + ignore_patterns: Optional[Set[str]] = None + include_patterns: Optional[Set[str]] = None + pattern_type: Optional[str] = None async def parse_query( source: str, max_file_size: int, from_web: bool, - include_patterns: set[str] | str | None = None, - ignore_patterns: set[str] | str | None = None, + include_patterns: Optional[Union[str, Set[str]]] = None, + ignore_patterns: Optional[Union[str, Set[str]]] = None, ) -> ParsedQuery: """ Parse the input source (URL or path) to extract relevant details for the query. 
@@ -70,9 +71,9 @@ async def parse_query( The maximum file size in bytes to include. from_web : bool Flag indicating whether the source is a web URL. - include_patterns : set[str] | str | None, optional + include_patterns : Union[str, Set[str]], optional Patterns to include, by default None. Can be a set of strings or a single string. - ignore_patterns : set[str] | str | None, optional + ignore_patterns : Union[str, Set[str]], optional Patterns to ignore, by default None. Can be a set of strings or a single string. Returns @@ -208,24 +209,24 @@ async def _parse_repo_source(source: str) -> ParsedQuery: return parsed -async def _configure_branch_and_subpath(remaining_parts: list[str], url: str) -> str | None: +async def _configure_branch_and_subpath(remaining_parts: List[str], url: str) -> Optional[str]: """ Configure the branch and subpath based on the remaining parts of the URL. Parameters ---------- - remaining_parts : list[str] + remaining_parts : List[str] The remaining parts of the URL path. url : str The URL of the repository. Returns ------- - str | None + str, optional The branch name if found, otherwise None. """ try: # Fetch the list of branches from the remote repository - branches: list[str] = await fetch_remote_branch_list(url) + branches: List[str] = await fetch_remote_branch_list(url) except RuntimeError as e: warnings.warn(f"Warning: Failed to fetch branch list: {e}", RuntimeWarning) return remaining_parts.pop(0) @@ -283,7 +284,7 @@ def _normalize_pattern(pattern: str) -> str: return pattern -def _parse_patterns(pattern: set[str] | str) -> set[str]: +def _parse_patterns(pattern: Union[str, Set[str]]) -> Set[str]: """ Parse and validate file/directory patterns for inclusion or exclusion. 
@@ -292,12 +293,12 @@ def _parse_patterns(pattern: set[str] | str) -> set[str]: Parameters ---------- - pattern : set[str] | str + pattern : Set[str] | str Pattern(s) to parse - either a single string or set of strings Returns ------- - set[str] + Set[str] A set of normalized patterns. Raises @@ -309,7 +310,7 @@ def _parse_patterns(pattern: set[str] | str) -> set[str]: """ patterns = pattern if isinstance(pattern, set) else {pattern} - parsed_patterns: set[str] = set() + parsed_patterns: Set[str] = set() for p in patterns: parsed_patterns = parsed_patterns.union(set(re.split(",| ", p))) @@ -324,20 +325,20 @@ def _parse_patterns(pattern: set[str] | str) -> set[str]: return {_normalize_pattern(p) for p in parsed_patterns} -def _override_ignore_patterns(ignore_patterns: set[str], include_patterns: set[str]) -> set[str]: +def _override_ignore_patterns(ignore_patterns: Set[str], include_patterns: Set[str]) -> Set[str]: """ Remove patterns from ignore_patterns that are present in include_patterns using set difference. Parameters ---------- - ignore_patterns : set[str] + ignore_patterns : Set[str] The set of ignore patterns to filter. - include_patterns : set[str] + include_patterns : Set[str] The set of include patterns to remove from ignore_patterns. Returns ------- - set[str] + Set[str] The filtered set of ignore patterns. """ return set(ignore_patterns) - set(include_patterns) @@ -418,7 +419,7 @@ async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str: raise ValueError(f"Could not find a valid repository host for '{user_name}/{repo_name}'.") -def _get_user_and_repo_from_path(path: str) -> tuple[str, str]: +def _get_user_and_repo_from_path(path: str) -> Tuple[str, str]: """ Extract the user and repository names from a given path. @@ -429,7 +430,7 @@ def _get_user_and_repo_from_path(path: str) -> tuple[str, str]: Returns ------- - tuple[str, str] + Tuple[str, str] A tuple containing the user and repository names. 
Raises diff --git a/src/gitingest/repository_clone.py b/src/gitingest/repository_clone.py index ad5fc75e..c6fbe9f0 100644 --- a/src/gitingest/repository_clone.py +++ b/src/gitingest/repository_clone.py @@ -4,6 +4,7 @@ import os from dataclasses import dataclass from pathlib import Path +from typing import List, Optional, Tuple from gitingest.utils import async_timeout @@ -24,20 +25,20 @@ class CloneConfig: The URL of the Git repository to clone. local_path : str The local directory where the repository will be cloned. - commit : str | None, optional + commit : str, optional The specific commit hash to check out after cloning (default is None). - branch : str | None, optional + branch : str, optional The branch to clone (default is None). """ url: str local_path: str - commit: str | None = None - branch: str | None = None + commit: Optional[str] = None + branch: Optional[str] = None @async_timeout(TIMEOUT) -async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: +async def clone_repo(config: CloneConfig) -> Tuple[bytes, bytes]: """ Clone a repository to a local path based on the provided configuration. @@ -51,12 +52,12 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: A dictionary containing the following keys: - url (str): The URL of the repository. - local_path (str): The local path to clone the repository to. - - commit (Optional[str]): The specific commit hash to checkout. - - branch (Optional[str]): The branch to clone. Defaults to 'main' or 'master' if not provided. + - commit (str, optional): The specific commit hash to checkout. + - branch (str, optional): The branch to clone. Defaults to 'main' or 'master' if not provided. Returns ------- - tuple[bytes, bytes] + Tuple[bytes, bytes] A tuple containing the stdout and stderr of the Git commands executed. 
Raises @@ -69,8 +70,8 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: # Extract and validate query parameters url: str = config.url local_path: str = config.local_path - commit: str | None = config.commit - branch: str | None = config.branch + commit: Optional[str] = config.commit + branch: Optional[str] = config.branch if not url: raise ValueError("The 'url' parameter is required.") @@ -162,7 +163,7 @@ async def _check_repo_exists(url: str) -> bool: @async_timeout(TIMEOUT) -async def fetch_remote_branch_list(url: str) -> list[str]: +async def fetch_remote_branch_list(url: str) -> List[str]: """ Fetch the list of branches from a remote Git repository. Parameters @@ -171,7 +172,7 @@ async def fetch_remote_branch_list(url: str) -> list[str]: The URL of the Git repository to fetch branches from. Returns ------- - list[str] + List[str] A list of branch names available in the remote repository. """ fetch_branches_command = ["git", "ls-remote", "--heads", url] @@ -185,7 +186,7 @@ async def fetch_remote_branch_list(url: str) -> list[str]: ] -async def _run_git_command(*args: str) -> tuple[bytes, bytes]: +async def _run_git_command(*args: str) -> Tuple[bytes, bytes]: """ Execute a Git command asynchronously and captures its output. @@ -196,7 +197,7 @@ async def _run_git_command(*args: str) -> tuple[bytes, bytes]: Returns ------- - tuple[bytes, bytes] + Tuple[bytes, bytes] A tuple containing the stdout and stderr of the Git command. 
Raises diff --git a/src/gitingest/repository_ingest.py b/src/gitingest/repository_ingest.py index 590351b4..b91950e0 100644 --- a/src/gitingest/repository_ingest.py +++ b/src/gitingest/repository_ingest.py @@ -3,6 +3,7 @@ import asyncio import inspect import shutil +from typing import Optional, Set, Tuple, Union from gitingest.config import TMP_BASE_PATH from gitingest.query_ingestion import run_ingest_query @@ -13,11 +14,11 @@ async def ingest_async( source: str, max_file_size: int = 10 * 1024 * 1024, # 10 MB - include_patterns: set[str] | str | None = None, - exclude_patterns: set[str] | str | None = None, - branch: str | None = None, - output: str | None = None, -) -> tuple[str, str, str]: + include_patterns: Optional[Union[str, Set[str]]] = None, + exclude_patterns: Optional[Union[str, Set[str]]] = None, + branch: Optional[str] = None, + output: Optional[str] = None, +) -> Tuple[str, str, str]: """ Main entry point for ingesting a source and processing its contents. @@ -32,18 +33,18 @@ async def ingest_async( max_file_size : int Maximum allowed file size for file ingestion. Files larger than this size are ignored, by default 10*1024*1024 (10 MB). - include_patterns : set[str] | str | None, optional + include_patterns : Union[str, Set[str]], optional Pattern or set of patterns specifying which files to include. If `None`, all files are included. - exclude_patterns : set[str] | str | None, optional + exclude_patterns : Union[str, Set[str]], optional Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded. - branch : str | None, optional + branch : str, optional The branch to clone and ingest. If `None`, the default branch is used. - output : str | None, optional + output : str, optional File path where the summary and content should be written. If `None`, the results are not written to a file. 
Returns ------- - tuple[str, str, str] + Tuple[str, str, str] A tuple containing: - A summary string of the analyzed repository or directory. - A tree-like string representation of the file structure. @@ -101,11 +102,11 @@ async def ingest_async( def ingest( source: str, max_file_size: int = 10 * 1024 * 1024, # 10 MB - include_patterns: set[str] | str | None = None, - exclude_patterns: set[str] | str | None = None, - branch: str | None = None, - output: str | None = None, -) -> tuple[str, str, str]: + include_patterns: Optional[Union[str, Set[str]]] = None, + exclude_patterns: Optional[Union[str, Set[str]]] = None, + branch: Optional[str] = None, + output: Optional[str] = None, +) -> Tuple[str, str, str]: """ Synchronous version of ingest_async. @@ -120,18 +121,18 @@ def ingest( max_file_size : int Maximum allowed file size for file ingestion. Files larger than this size are ignored, by default 10*1024*1024 (10 MB). - include_patterns : set[str] | str | None, optional + include_patterns : Union[str, Set[str]], optional Pattern or set of patterns specifying which files to include. If `None`, all files are included. - exclude_patterns : set[str] | str | None, optional + exclude_patterns : Union[str, Set[str]], optional Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded. - branch : str | None, optional + branch : str, optional The branch to clone and ingest. If `None`, the default branch is used. - output : str | None, optional + output : str, optional File path where the summary and content should be written. If `None`, the results are not written to a file. Returns ------- - tuple[str, str, str] + Tuple[str, str, str] A tuple containing: - A summary string of the analyzed repository or directory. - A tree-like string representation of the file structure. 
diff --git a/src/gitingest/utils.py b/src/gitingest/utils.py index 3c28da8a..3af58c41 100644 --- a/src/gitingest/utils.py +++ b/src/gitingest/utils.py @@ -2,16 +2,14 @@ import asyncio import functools -from collections.abc import Awaitable, Callable -from typing import ParamSpec, TypeVar +from typing import Any, Awaitable, Callable, TypeVar from gitingest.exceptions import AsyncTimeoutError T = TypeVar("T") -P = ParamSpec("P") -def async_timeout(seconds: int = 10) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]: +def async_timeout(seconds: int = 10) -> Callable[..., Callable[..., Awaitable[T]]]: """ Async Timeout decorator. @@ -33,9 +31,9 @@ def async_timeout(seconds: int = 10) -> Callable[[Callable[P, Awaitable[T]]], Ca an `AsyncTimeoutError` is raised. """ - def decorator(func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]: + def decorator(func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]: @functools.wraps(func) - async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: + async def wrapper(*args: Any, **kwargs: Any) -> T: try: return await asyncio.wait_for(func(*args, **kwargs), timeout=seconds) except asyncio.TimeoutError as exc: diff --git a/src/server/main.py b/src/server/main.py index bcdd601d..a71f5391 100644 --- a/src/server/main.py +++ b/src/server/main.py @@ -2,6 +2,7 @@ import os from pathlib import Path +from typing import Dict from dotenv import load_dotenv from fastapi import FastAPI, Request @@ -44,13 +45,13 @@ @app.get("/health") -async def health_check() -> dict[str, str]: +async def health_check() -> Dict[str, str]: """ Health check endpoint to verify that the server is running. Returns ------- - dict[str, str] + Dict[str, str] A JSON object with a "status" key indicating the server's health status. 
""" return {"status": "healthy"} diff --git a/src/server/server_config.py b/src/server/server_config.py index 081e534b..1f9d22d9 100644 --- a/src/server/server_config.py +++ b/src/server/server_config.py @@ -1,12 +1,14 @@ """ Configuration for the server. """ +from typing import Dict, List + from fastapi.templating import Jinja2Templates MAX_DISPLAY_SIZE: int = 300_000 DELETE_REPO_AFTER: int = 60 * 60 # In seconds -EXAMPLE_REPOS: list[dict[str, str]] = [ +EXAMPLE_REPOS: List[Dict[str, str]] = [ {"name": "Gitingest", "url": "https://github.com/cyclotruc/gitingest"}, {"name": "FastAPI", "url": "https://github.com/tiangolo/fastapi"}, {"name": "Flask", "url": "https://github.com/pallets/flask"}, diff --git a/src/server/server_utils.py b/src/server/server_utils.py index 4eb89e99..d5da43b0 100644 --- a/src/server/server_utils.py +++ b/src/server/server_utils.py @@ -124,7 +124,8 @@ async def _process_folder(folder: Path) -> None: txt_files = [f for f in folder.iterdir() if f.suffix == ".txt"] # Extract owner and repository name from the filename - if txt_files and "-" in (filename := txt_files[0].stem): + filename = txt_files[0].stem + if txt_files and "-" in filename: owner, repo = filename.split("-", 1) repo_url = f"{owner}/{repo}" diff --git a/tests/conftest.py b/tests/conftest.py index 507d1f51..43e0859c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,15 +6,14 @@ """ import json -from collections.abc import Callable from pathlib import Path -from typing import Any +from typing import Any, Callable, Dict import pytest from gitingest.query_parser import ParsedQuery -WriteNotebookFunc = Callable[[str, dict[str, Any]], Path] +WriteNotebookFunc = Callable[[str, Dict[str, Any]], Path] @pytest.fixture @@ -124,7 +123,7 @@ def write_notebook(tmp_path: Path) -> WriteNotebookFunc: file, and returns the path to the file. 
""" - def _write_notebook(name: str, content: dict[str, Any]) -> Path: + def _write_notebook(name: str, content: Dict[str, Any]) -> Path: notebook_path = tmp_path / name with notebook_path.open(mode="w", encoding="utf-8") as f: json.dump(content, f) diff --git a/tests/query_parser/test_git_host_agnostic.py b/tests/query_parser/test_git_host_agnostic.py index b35d9184..a824970d 100644 --- a/tests/query_parser/test_git_host_agnostic.py +++ b/tests/query_parser/test_git_host_agnostic.py @@ -5,6 +5,8 @@ Bitbucket, Gitea, and Codeberg, even if the host is omitted. """ +from typing import List + import pytest from gitingest.query_parser import parse_query @@ -67,7 +69,7 @@ ) @pytest.mark.asyncio async def test_parse_query_without_host( - urls: list[str], + urls: List[str], expected_user: str, expected_repo: str, expected_url: str, diff --git a/tests/query_parser/test_query_parser.py b/tests/query_parser/test_query_parser.py index 8b828909..3c3097fe 100644 --- a/tests/query_parser/test_query_parser.py +++ b/tests/query_parser/test_query_parser.py @@ -469,18 +469,17 @@ async def test_parse_repo_source_with_various_url_patterns(url, expected_branch, When `_parse_repo_source` is called with remote branch fetching, Then the correct branch/subpath should be set or None if unmatched. 
""" - with ( - patch("gitingest.repository_clone._run_git_command", new_callable=AsyncMock) as mock_run_git_command, - patch("gitingest.repository_clone.fetch_remote_branch_list", new_callable=AsyncMock) as mock_fetch_branches, - ): - - mock_run_git_command.return_value = ( - b"refs/heads/feature/fix1\nrefs/heads/main\nrefs/heads/feature-branch\nrefs/heads/fix\n", - b"", - ) - mock_fetch_branches.return_value = ["feature/fix1", "main", "feature-branch"] + with patch("gitingest.repository_clone._run_git_command", new_callable=AsyncMock) as mock_run_git_command: + with patch( + "gitingest.repository_clone.fetch_remote_branch_list", new_callable=AsyncMock + ) as mock_fetch_branches: + mock_run_git_command.return_value = ( + b"refs/heads/feature/fix1\nrefs/heads/main\nrefs/heads/feature-branch\nrefs/heads/fix\n", + b"", + ) + mock_fetch_branches.return_value = ["feature/fix1", "main", "feature-branch"] - parsed_query = await _parse_repo_source(url) + parsed_query = await _parse_repo_source(url) - assert parsed_query.branch == expected_branch - assert parsed_query.subpath == expected_subpath + assert parsed_query.branch == expected_branch + assert parsed_query.subpath == expected_subpath diff --git a/tests/test_cli.py b/tests/test_cli.py index 0b652390..827c5224 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -25,9 +25,9 @@ def test_cli_with_options(): [ "./", "--output", - OUTPUT_FILE_PATH, + str(OUTPUT_FILE_PATH), "--max-size", - MAX_FILE_SIZE, + str(MAX_FILE_SIZE), "--exclude-pattern", "tests/", "--include-pattern",