From 8b8ad9708f0d6255b95471b48c2b089535f32b2e Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sun, 29 Dec 2024 10:37:48 +0100 Subject: [PATCH 1/3] refactor: prefix helper functions with an underscore --- src/gitingest/exceptions.py | 6 ++ src/gitingest/parse_query.py | 124 ++++++++++++++++++++--------------- 2 files changed, 76 insertions(+), 54 deletions(-) create mode 100644 src/gitingest/exceptions.py diff --git a/src/gitingest/exceptions.py b/src/gitingest/exceptions.py new file mode 100644 index 00000000..f02829ba --- /dev/null +++ b/src/gitingest/exceptions.py @@ -0,0 +1,6 @@ +class InvalidPatternError(ValueError): + def __init__(self, pattern: str) -> None: + super().__init__( + f"Pattern '{pattern}' contains invalid characters. Only alphanumeric characters, dash (-), " + "underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed." + ) diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index 0d41e757..27330eb1 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -5,12 +5,71 @@ from typing import Any from urllib.parse import unquote +from gitingest.exceptions import InvalidPatternError from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS TMP_BASE_PATH: str = "../tmp" HEX_DIGITS = set(string.hexdigits) +def parse_query( + source: str, + max_file_size: int, + from_web: bool, + include_patterns: list[str] | str | None = None, + ignore_patterns: list[str] | str | None = None, +) -> dict[str, Any]: + """ + Parses the input source to construct a query dictionary with specified parameters. + + Parameters + ---------- + source : str + The source URL or file path to parse. + max_file_size : int + The maximum file size in bytes to include. + from_web : bool + Flag indicating whether the source is a web URL. + include_patterns : Optional[Union[List[str], str]], optional + Patterns to include, by default None. 
Can be a list of strings or a single string. + ignore_patterns : Optional[Union[List[str], str]], optional + Patterns to ignore, by default None. Can be a list of strings or a single string. + + Returns + ------- + Dict[str, Any] + A dictionary containing the parsed query parameters, including 'max_file_size', + 'ignore_patterns', and 'include_patterns'. + """ + # Determine the parsing method based on the source type + if from_web or source.startswith("https://") or "github.com" in source: + query = _parse_url(source) + else: + query = _parse_path(source) + + # Process ignore patterns + ignore_patterns_list = DEFAULT_IGNORE_PATTERNS.copy() + if ignore_patterns: + ignore_patterns_list += _parse_patterns(ignore_patterns) + + # Process include patterns and override ignore patterns accordingly + if include_patterns: + parsed_include = _parse_patterns(include_patterns) + ignore_patterns_list = _override_ignore_patterns(ignore_patterns_list, include_patterns=parsed_include) + else: + parsed_include = None + + # Update the query dictionary with max_file_size and processed patterns + query.update( + { + "max_file_size": max_file_size, + "ignore_patterns": ignore_patterns_list, + "include_patterns": parsed_include, + } + ) + return query + + def _parse_url(url: str) -> dict[str, Any]: url = url.split(" ")[0] url = unquote(url) # Decode URL-encoded characters @@ -96,12 +155,13 @@ def _parse_patterns(pattern: list[str] | str) -> list[str]: Raises ------ - ValueError + InvalidPatternError If any pattern contains invalid characters. Only alphanumeric characters, dash (-), underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed. 
""" patterns = pattern if isinstance(pattern, list) else [pattern] + patterns = [p.strip() for p in patterns] parsed_patterns = [] for p in patterns: @@ -110,11 +170,8 @@ def _parse_patterns(pattern: list[str] | str) -> list[str]: parsed_patterns = [p for p in parsed_patterns if p != ""] for p in parsed_patterns: - if not all(c.isalnum() or c in "-_./+*" for c in p): - raise ValueError( - f"Pattern '{p}' contains invalid characters. Only alphanumeric characters, dash (-), " - "underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed." - ) + if not _is_valid_pattern(p): + raise InvalidPatternError(p) return [_normalize_pattern(p) for p in parsed_patterns] @@ -149,59 +206,18 @@ def _parse_path(path: str) -> dict[str, Any]: return query -def parse_query( - source: str, - max_file_size: int, - from_web: bool, - include_patterns: list[str] | str | None = None, - ignore_patterns: list[str] | str | None = None, -) -> dict[str, Any]: +def _is_valid_pattern(pattern: str) -> bool: """ - Parses the input source to construct a query dictionary with specified parameters. + _summary_ Parameters ---------- - source : str - The source URL or file path to parse. - max_file_size : int - The maximum file size in bytes to include. - from_web : bool - Flag indicating whether the source is a web URL. - include_patterns : Optional[Union[List[str], str]], optional - Patterns to include, by default None. Can be a list of strings or a single string. - ignore_patterns : Optional[Union[List[str], str]], optional - Patterns to ignore, by default None. Can be a list of strings or a single string. + pattern : str + _description_ Returns ------- - Dict[str, Any] - A dictionary containing the parsed query parameters, including 'max_file_size', - 'ignore_patterns', and 'include_patterns'. 
+ bool + _description_ """ - # Determine the parsing method based on the source type - if from_web or source.startswith("https://") or "github.com" in source: - query = _parse_url(source) - else: - query = _parse_path(source) - - # Process ignore patterns - ignore_patterns_list = DEFAULT_IGNORE_PATTERNS.copy() - if ignore_patterns: - ignore_patterns_list += _parse_patterns(ignore_patterns) - - # Process include patterns and override ignore patterns accordingly - if include_patterns: - parsed_include = _parse_patterns(include_patterns) - ignore_patterns_list = _override_ignore_patterns(ignore_patterns_list, include_patterns=parsed_include) - else: - parsed_include = None - - # Update the query dictionary with max_file_size and processed patterns - query.update( - { - "max_file_size": max_file_size, - "ignore_patterns": ignore_patterns_list, - "include_patterns": parsed_include, - } - ) - return query + return all(c.isalnum() or c in "-_./+*" for c in pattern) From 856a8228125cb930aaf10e7c3b8074dfaa0c6ea6 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Mon, 30 Dec 2024 10:31:17 +0100 Subject: [PATCH 2/3] Add docstrings to functions and move AsyncTimeoutError to gitingest.exceptions --- src/gitingest/cli.py | 27 ++- src/gitingest/clone.py | 31 ++- src/gitingest/exceptions.py | 23 +++ src/gitingest/ingest.py | 36 +++- src/gitingest/ingest_from_query.py | 290 ++++++++++++++++++++++++++++- src/gitingest/parse_query.py | 98 +++++++++- src/gitingest/utils.py | 31 ++- src/routers/download.py | 27 +++ src/routers/dynamic.py | 44 +++++ src/routers/index.py | 43 +++++ 10 files changed, 620 insertions(+), 30 deletions(-) diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index f275efac..57d9f2c5 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -17,7 +17,32 @@ def main( exclude_pattern: tuple[str, ...], include_pattern: tuple[str, ...], ) -> None: - """Analyze a directory and create a text dump of 
its contents.""" + """ + Analyze a directory or repository and create a text dump of its contents. + + This command analyzes the contents of a specified source directory or repository, + applies custom include and exclude patterns, and generates a text summary of the analysis + which is then written to an output file. + + Parameters + ---------- + source : str + The source directory or repository to analyze. + output : str | None + The path where the output file will be written. If not specified, the output will be written + to a file named `<repo_name>.txt` in the current directory. + max_size : int + The maximum file size to process, in bytes. Files larger than this size will be ignored. + exclude_pattern : tuple[str, ...] + A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored. + include_pattern : tuple[str, ...] + A tuple of patterns to include during the analysis. Only files matching these patterns will be processed. + + Raises + ------ + click.Abort + If there is an error during the execution of the command, this exception is raised to abort the process. + """ try: # Combine default and custom ignore patterns exclude_patterns = list(exclude_pattern) diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py index a91b6a99..da6550f1 100644 --- a/src/gitingest/clone.py +++ b/src/gitingest/clone.py @@ -1,13 +1,32 @@ import asyncio from dataclasses import dataclass -from gitingest.utils import AsyncTimeoutError, async_timeout +from gitingest.exceptions import AsyncTimeoutError +from gitingest.utils import async_timeout CLONE_TIMEOUT: int = 20 @dataclass class CloneConfig: + """ + Configuration for cloning a Git repository. + + This class holds the necessary parameters for cloning a repository to a local path, including + the repository's URL, the target local path, and optional parameters for a specific commit or branch. + + Attributes + ---------- + url : str + The URL of the Git repository to clone. 
+ local_path : str + The local directory where the repository will be cloned. + commit : str | None, optional + The specific commit hash to check out after cloning (default is None). + branch : str | None, optional + The branch to clone (default is None). + """ + url: str local_path: str commit: str | None = None @@ -17,7 +36,11 @@ class CloneConfig: @async_timeout(CLONE_TIMEOUT) async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: """ - Clones a repository to a local path based on the provided query parameters. + Clones a repository to a local path based on the provided configuration. + + This function handles the process of cloning a Git repository to the local file system. + It can clone a specific branch or commit if provided, and it raises exceptions if + any errors occur during the cloning process. Parameters ---------- @@ -30,7 +53,7 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: Returns ------- - Tuple[bytes, bytes] + tuple[bytes, bytes] A tuple containing the stdout and stderr of the git commands executed. Raises @@ -123,7 +146,7 @@ async def _run_git_command(*args: str) -> tuple[bytes, bytes]: Returns ------- - Tuple[bytes, bytes] + tuple[bytes, bytes] A tuple containing the stdout and stderr of the git command. Raises diff --git a/src/gitingest/exceptions.py b/src/gitingest/exceptions.py index f02829ba..34263e41 100644 --- a/src/gitingest/exceptions.py +++ b/src/gitingest/exceptions.py @@ -1,6 +1,29 @@ class InvalidPatternError(ValueError): + """ + Exception raised when a pattern contains invalid characters. + + This exception is used to signal that a pattern provided for some operation + contains characters that are not allowed. The valid characters for the pattern + include alphanumeric characters, dash (-), underscore (_), dot (.), forward slash (/), + plus (+), and asterisk (*). + + Parameters + ---------- + pattern : str + The invalid pattern that caused the error. 
+ """ + def __init__(self, pattern: str) -> None: super().__init__( f"Pattern '{pattern}' contains invalid characters. Only alphanumeric characters, dash (-), " "underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed." ) + + +class AsyncTimeoutError(Exception): + """ + Raised when an async operation exceeds its timeout limit. + + This exception is used by the `async_timeout` decorator to signal that the wrapped + asynchronous function has exceeded the specified time limit for execution. + """ diff --git a/src/gitingest/ingest.py b/src/gitingest/ingest.py index e4c673de..4bb329fa 100644 --- a/src/gitingest/ingest.py +++ b/src/gitingest/ingest.py @@ -15,7 +15,39 @@ def ingest( exclude_patterns: list[str] | str | None = None, output: str | None = None, ) -> tuple[str, str, str]: + """ + Main entry point for ingesting a source and processing its contents. + This function analyzes a source (URL or local path), clones the corresponding repository (if applicable), + and processes its files according to the specified query parameters. It returns a summary, a tree-like + structure of the files, and the content of the files. The results can optionally be written to an output file. + + Parameters + ---------- + source : str + The source to analyze, which can be a URL (for a GitHub repository) or a local directory path. + max_file_size : int, optional + The maximum allowed file size for file ingestion. Files larger than this size are ignored, by default 10*1024*1024 (10 MB). + include_patterns : list[str] | str | None, optional + A pattern or list of patterns specifying which files to include in the analysis. If `None`, all files are included. + exclude_patterns : list[str] | str | None, optional + A pattern or list of patterns specifying which files to exclude from the analysis. If `None`, no files are excluded. + output : str | None, optional + The file path where the summary and content should be written. 
If `None`, the results are not written to a file. + + Returns + ------- + tuple[str, str, str] + A tuple containing: + - A summary string of the analyzed repository or directory. + - A tree-like string representation of the file structure. + - The content of the files in the repository or directory. + + Raises + ------ + TypeError + If `clone_repo` does not return a coroutine, or if the `source` is of an unsupported type. + """ try: query = parse_query( source=source, @@ -42,8 +74,8 @@ def ingest( summary, tree, content = ingest_from_query(query) - if output: - with open(f"{output}", "w") as f: + if output is not None: + with open(output, "w") as f: f.write(tree + "\n" + content) return summary, tree, content diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py index 886afa26..d8f57b71 100644 --- a/src/gitingest/ingest_from_query.py +++ b/src/gitingest/ingest_from_query.py @@ -11,6 +11,27 @@ def _should_include(path: str, base_path: str, include_patterns: list[str]) -> bool: + """ + Determines if the given file or directory path matches any of the include patterns. + + This function checks whether the relative path of a file or directory matches + any of the specified patterns. If a match is found, it returns `True`, indicating + that the file or directory should be included in further processing. + + Parameters + ---------- + path : str + The absolute path of the file or directory to check. + base_path : str + The base directory from which the relative path is calculated. + include_patterns : list[str] + A list of patterns to check against the relative path. + + Returns + ------- + bool + `True` if the path matches any of the include patterns, `False` otherwise. 
+ """ rel_path = path.replace(base_path, "").lstrip(os.sep) include = False for pattern in include_patterns: @@ -20,6 +41,27 @@ def _should_include(path: str, base_path: str, include_patterns: list[str]) -> b def _should_exclude(path: str, base_path: str, ignore_patterns: list[str]) -> bool: + """ + Determines if the given file or directory path matches any of the ignore patterns. + + This function checks whether the relative path of a file or directory matches + any of the specified ignore patterns. If a match is found, it returns `True`, indicating + that the file or directory should be excluded from further processing. + + Parameters + ---------- + path : str + The absolute path of the file or directory to check. + base_path : str + The base directory from which the relative path is calculated. + ignore_patterns : list[str] + A list of patterns to check against the relative path. + + Returns + ------- + bool + `True` if the path matches any of the ignore patterns, `False` otherwise. + """ rel_path = path.replace(base_path, "").lstrip(os.sep) for pattern in ignore_patterns: if pattern and fnmatch(rel_path, pattern): @@ -28,7 +70,25 @@ def _should_exclude(path: str, base_path: str, ignore_patterns: list[str]) -> bo def _is_safe_symlink(symlink_path: str, base_path: str) -> bool: - """Check if a symlink points to a location within the base directory.""" + """ + Check if a symlink points to a location within the base directory. + + This function resolves the target of a symlink and ensures it is within the specified + base directory, returning `True` if it is safe, or `False` if the symlink points outside + the base directory. + + Parameters + ---------- + symlink_path : str + The path of the symlink to check. + base_path : str + The base directory to ensure the symlink points within. + + Returns + ------- + bool + `True` if the symlink points within the base directory, `False` otherwise. 
+ """ try: target_path = os.path.realpath(symlink_path) base_path = os.path.realpath(base_path) @@ -39,7 +99,23 @@ def _is_safe_symlink(symlink_path: str, base_path: str) -> bool: def _is_text_file(file_path: str) -> bool: - """Determines if a file is likely a text file based on its content.""" + """ + Determine if a file is likely a text file based on its content. + + This function attempts to read the first 1024 bytes of a file and checks for the presence + of non-text characters. It returns `True` if the file is determined to be a text file, + otherwise returns `False`. + + Parameters + ---------- + file_path : str + The path to the file to check. + + Returns + ------- + bool + `True` if the file is likely a text file, `False` otherwise. + """ try: with open(file_path, "rb") as file: chunk = file.read(1024) @@ -49,6 +125,23 @@ def _is_text_file(file_path: str) -> bool: def _read_file_content(file_path: str) -> str: + """ + Reads the content of a file. + + This function attempts to open a file and read its contents using UTF-8 encoding. + If an error occurs during reading (e.g., file is not found or permission error), + it returns an error message. + + Parameters + ---------- + file_path : str + The path to the file to read. + + Returns + ------- + str + The content of the file, or an error message if the file could not be read. + """ try: with open(file_path, encoding="utf-8", errors="ignore") as f: return f.read() @@ -63,7 +156,31 @@ def _scan_directory( depth: int = 0, stats: dict[str, int] | None = None, ) -> dict[str, Any] | None: - """Recursively analyzes a directory and its contents with safety limits.""" + """ + Recursively analyze a directory and its contents with safety limits. + + This function scans a directory and its subdirectories up to a specified depth. It checks + for any file or directory that should be included or excluded based on the provided patterns + and limits. It also tracks the number of files and total size processed. 
+ + Parameters + ---------- + path : str + The path of the directory to scan. + query : dict[str, Any] + A dictionary containing the query parameters, such as include and ignore patterns. + seen_paths : set[str] | None, optional + A set to track already visited paths, by default None. + depth : int, optional + The current depth of directory traversal, by default 0. + stats : dict[str, int] | None, optional + A dictionary to track statistics such as total file count and size, by default None. + + Returns + ------- + dict[str, Any] | None + A dictionary representing the directory structure and contents, or `None` if limits are reached. + """ if seen_paths is None: seen_paths = set() @@ -224,7 +341,28 @@ def _extract_files_content( max_file_size: int, files: list[dict[str, Any]] | None = None, ) -> list[dict[str, Any]]: - """Recursively collects all text files with their contents.""" + """ + Recursively collect all text files with their contents. + + This function traverses the directory tree and extracts the contents of all text files + into a list, ignoring non-text files or files that exceed the specified size limit. + + Parameters + ---------- + query : dict[str, Any] + A dictionary containing the query parameters, including the base path of the repository. + node : dict[str, Any] + The current directory or file node being processed. + max_file_size : int + The maximum file size in bytes for which content should be extracted. + files : list[dict[str, Any]] | None, optional + A list to collect the extracted files' information, by default None. + + Returns + ------- + list[dict[str, Any]] + A list of dictionaries, each containing the path, content (or `None` if too large), and size of each file. 
+ """ if files is None: files = [] @@ -248,7 +386,22 @@ def _extract_files_content( def _create_file_content_string(files: list[dict[str, Any]]) -> str: - """Creates a formatted string of file contents with separators.""" + """ + Create a formatted string of file contents with separators. + + This function takes a list of files and generates a formatted string where each file’s + content is separated by a divider. If a README.md file is found, it is placed at the top. + + Parameters + ---------- + files : list[dict[str, Any]] + A list of dictionaries containing file information, including the path and content. + + Returns + ------- + str + A formatted string representing the contents of all the files with appropriate separators. + """ output = "" separator = "=" * 48 + "\n" @@ -278,7 +431,24 @@ def _create_file_content_string(files: list[dict[str, Any]]) -> str: def _create_summary_string(query: dict[str, Any], nodes: dict[str, Any]) -> str: - """Creates a summary string with file counts and content size.""" + """ + Create a summary string with file counts and content size. + + This function generates a summary of the repository's contents, including the number + of files analyzed, the total content size, and other relevant details based on the query parameters. + + Parameters + ---------- + query : dict[str, Any] + A dictionary containing query parameters like repository name, commit, branch, and subpath. + nodes : dict[str, Any] + A dictionary representing the directory structure, including file and directory counts. + + Returns + ------- + str + A summary string containing details such as the repository name, file count, and other query-specific information. 
+ """ if "user_name" in query: summary = f"Repository: {query['user_name']}/{query['repo_name']}\n" else: @@ -297,7 +467,28 @@ def _create_summary_string(query: dict[str, Any], nodes: dict[str, Any]) -> str: def _create_tree_structure(query: dict[str, Any], node: dict[str, Any], prefix: str = "", is_last: bool = True) -> str: - """Creates a tree-like string representation of the file structure.""" + """ + Create a tree-like string representation of the file structure. + + This function generates a string representation of the directory structure, formatted + as a tree with appropriate indentation for nested directories and files. + + Parameters + ---------- + query : dict[str, Any] + A dictionary containing query parameters like repository name and subpath. + node : dict[str, Any] + The current directory or file node being processed. + prefix : str, optional + A string used for indentation and formatting of the tree structure, by default "". + is_last : bool, optional + A flag indicating whether the current node is the last in its directory, by default True. + + Returns + ------- + str + A string representing the directory structure formatted as a tree. + """ tree = "" if not node["name"]: @@ -319,7 +510,22 @@ def _create_tree_structure(query: dict[str, Any], node: dict[str, Any], prefix: def _generate_token_string(context_string: str) -> str | None: - """Returns the number of tokens in a text string.""" + """ + Return the number of tokens in a text string. + + This function estimates the number of tokens in a given text string using the `tiktoken` + library. It returns the number of tokens in a human-readable format (e.g., '1.2k', '1.2M'). + + Parameters + ---------- + context_string : str + The text string for which the token count is to be estimated. + + Returns + ------- + str | None + The formatted number of tokens as a string (e.g., '1.2k', '1.2M'), or `None` if an error occurs. 
+ """ formatted_tokens = "" try: encoding = tiktoken.get_encoding("cl100k_base") @@ -340,6 +546,29 @@ def _generate_token_string(context_string: str) -> str | None: def _ingest_single_file(path: str, query: dict[str, Any]) -> tuple[str, str, str]: + """ + Ingest a single file and return its summary, directory structure, and content. + + This function reads a file, generates a summary of its contents, and returns the content + along with its directory structure and token estimation. + + Parameters + ---------- + path : str + The path of the file to ingest. + query : dict[str, Any] + A dictionary containing query parameters, such as the maximum file size. + + Returns + ------- + tuple[str, str, str] + A tuple containing the summary, directory structure, and file content. + + Raises + ------ + ValueError + If the specified path is not a file or if the file is not a text file. + """ if not os.path.isfile(path): raise ValueError(f"Path {path} is not a file") @@ -376,6 +605,29 @@ def _ingest_single_file(path: str, query: dict[str, Any]) -> tuple[str, str, str def _ingest_directory(path: str, query: dict[str, Any]) -> tuple[str, str, str]: + """ + Ingest an entire directory and return its summary, directory structure, and file contents. + + This function processes a directory, extracts its contents, and generates a summary, + directory structure, and file content. It recursively processes subdirectories as well. + + Parameters + ---------- + path : str + The path of the directory to ingest. + query : dict[str, Any] + A dictionary containing query parameters, including maximum file size. + + Returns + ------- + tuple[str, str, str] + A tuple containing the summary, directory structure, and file contents. + + Raises + ------ + ValueError + If no files are found in the directory. 
+ """ nodes = _scan_directory(path=path, query=query) if not nodes: raise ValueError(f"No files found in {path}") @@ -392,7 +644,27 @@ def _ingest_directory(path: str, query: dict[str, Any]) -> tuple[str, str, str]: def ingest_from_query(query: dict[str, Any]) -> tuple[str, str, str]: - """Main entry point for analyzing a codebase directory or single file.""" + """ + Main entry point for analyzing a codebase directory or single file. + + This function processes a file or directory based on the provided query, extracting its contents + and generating a summary, directory structure, and file content, along with token estimations. + + Parameters + ---------- + query : dict[str, Any] + A dictionary containing parameters like local path, subpath, file type, etc. + + Returns + ------- + tuple[str, str, str] + A tuple containing the summary, directory structure, and file contents. + + Raises + ------ + ValueError + If the specified path cannot be found or if the file is not a text file. + """ path = f"{query['local_path']}{query['subpath']}" if not os.path.exists(path): raise ValueError(f"{query['slug']} cannot be found") diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index 27330eb1..18a78e9a 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -22,6 +22,10 @@ def parse_query( """ Parses the input source to construct a query dictionary with specified parameters. + This function processes the provided source (either a URL or file path) and builds a + query dictionary that includes information such as the source URL, maximum file size, + and any patterns to include or ignore. It handles both web and file-based sources. + Parameters ---------- source : str @@ -30,14 +34,14 @@ def parse_query( The maximum file size in bytes to include. from_web : bool Flag indicating whether the source is a web URL. 
- include_patterns : Optional[Union[List[str], str]], optional + include_patterns : list[str] | str | None, optional Patterns to include, by default None. Can be a list of strings or a single string. - ignore_patterns : Optional[Union[List[str], str]], optional + ignore_patterns : list[str] | str | None, optional Patterns to ignore, by default None. Can be a list of strings or a single string. Returns ------- - Dict[str, Any] + dict[str, Any] A dictionary containing the parsed query parameters, including 'max_file_size', 'ignore_patterns', and 'include_patterns'. """ @@ -71,6 +75,28 @@ def parse_query( def _parse_url(url: str) -> dict[str, Any]: + """ + Parses a GitHub repository URL into a structured query dictionary. + + This function extracts relevant information from a GitHub URL, such as the username, + repository name, commit, branch, and subpath, and returns them in a structured format. + + Parameters + ---------- + url : str + The GitHub URL to parse. + + Returns + ------- + dict[str, Any] + A dictionary containing the parsed details of the GitHub repository, including + the username, repository name, commit, branch, and other relevant information. + + Raises + ------ + ValueError + If the URL is invalid or does not correspond to a valid Git repository. + """ url = url.split(" ")[0] url = unquote(url) # Decode URL-encoded characters @@ -126,10 +152,42 @@ def _parse_url(url: str) -> dict[str, Any]: def _is_valid_git_commit_hash(commit: str) -> bool: + """ + Validates if the provided string is a valid Git commit hash. + + This function checks if the commit hash is a 40-character string consisting only + of hexadecimal digits, which is the standard format for Git commit hashes. + + Parameters + ---------- + commit : str + The string to validate as a Git commit hash. + + Returns + ------- + bool + True if the string is a valid 40-character Git commit hash, otherwise False. 
+ """ return len(commit) == 40 and all(c in HEX_DIGITS for c in commit) def _normalize_pattern(pattern: str) -> str: + """ + Normalizes the given pattern by removing leading separators and appending a wildcard. + + This function processes the pattern string by stripping leading directory separators + and appending a wildcard (`*`) if the pattern ends with a separator. + + Parameters + ---------- + pattern : str + The pattern to normalize. + + Returns + ------- + str + The normalized pattern. + """ pattern = pattern.lstrip(os.sep) if pattern.endswith(os.sep): pattern += "*" @@ -161,7 +219,6 @@ def _parse_patterns(pattern: list[str] | str) -> list[str]: asterisk (*) are allowed. """ patterns = pattern if isinstance(pattern, list) else [pattern] - patterns = [p.strip() for p in patterns] parsed_patterns = [] for p in patterns: @@ -182,20 +239,37 @@ def _override_ignore_patterns(ignore_patterns: list[str], include_patterns: list Parameters ---------- - ignore_patterns : List[str] + ignore_patterns : list[str] The list of patterns to potentially remove. - include_patterns : List[str] + include_patterns : list[str] The list of patterns to exclude from ignore_patterns. Returns ------- - List[str] + list[str] A new list of ignore_patterns with specified patterns removed. """ return list(set(ignore_patterns) - set(include_patterns)) def _parse_path(path: str) -> dict[str, Any]: + """ + Parses a file path into a structured query dictionary. + + This function takes a file path and constructs a query dictionary that includes + relevant details such as the absolute path and the slug (a combination of the + directory and file names). + + Parameters + ---------- + path : str + The file path to parse. + + Returns + ------- + dict[str, Any] + A dictionary containing parsed details such as the local file path and slug. 
+ """ query = { "url": None, "local_path": os.path.abspath(path), @@ -208,16 +282,20 @@ def _parse_path(path: str) -> dict[str, Any]: def _is_valid_pattern(pattern: str) -> bool: """ - _summary_ + Validates if the given pattern contains only valid characters. + + This function checks if the pattern contains only alphanumeric characters or one + of the following allowed characters: dash (`-`), underscore (`_`), dot (`.`), + forward slash (`/`), plus (`+`), or asterisk (`*`). Parameters ---------- pattern : str - _description_ + The pattern to validate. Returns ------- bool - _description_ + True if the pattern is valid, otherwise False. """ return all(c.isalnum() or c in "-_./+*" for c in pattern) diff --git a/src/gitingest/utils.py b/src/gitingest/utils.py index 82b8e303..bc95bfcc 100644 --- a/src/gitingest/utils.py +++ b/src/gitingest/utils.py @@ -3,16 +3,39 @@ from collections.abc import Awaitable, Callable from typing import ParamSpec, TypeVar +from gitingest.exceptions import AsyncTimeoutError + T = TypeVar("T") P = ParamSpec("P") -class AsyncTimeoutError(Exception): - """Raised when an async operation exceeds its timeout limit.""" +def async_timeout(seconds: int = 10) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]: + """ + Async Timeout decorator. + This decorator wraps an asynchronous function and ensures it does not run for + longer than the specified number of seconds. If the function execution exceeds + this limit, it raises an `AsyncTimeoutError`. + + Parameters + ---------- + seconds : int, optional + The maximum allowed time (in seconds) for the asynchronous function to complete. + The default is 10 seconds. + + Returns + ------- + Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]] + A decorator that, when applied to an async function, ensures the function + completes within the specified time limit. If the function takes too long, + an `AsyncTimeoutError` is raised. 
+ + Raises + ------ + AsyncTimeoutError + If the wrapped asynchronous function does not complete within the specified time limit. + """ -def async_timeout(seconds: int = 10) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]: - # Async Timeout decorator def decorator(func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]: @functools.wraps(func) async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: diff --git a/src/routers/download.py b/src/routers/download.py index 2dc10224..fdbf2bbe 100644 --- a/src/routers/download.py +++ b/src/routers/download.py @@ -10,6 +10,33 @@ @router.get("/download/{digest_id}") async def download_ingest(digest_id: str) -> Response: + """ + Downloads a .txt file associated with a given digest ID. + + This function searches for a `.txt` file in a directory corresponding to the provided + digest ID. If a file is found, it is read and returned as a downloadable attachment. + If no `.txt` file is found, an error is raised. + + Parameters + ---------- + digest_id : str + The unique identifier for the digest. It is used to find the corresponding directory + and locate the .txt file within that directory. + + Returns + ------- + Response + A FastAPI Response object containing the content of the found `.txt` file. The file is + sent with the appropriate media type (`text/plain`) and the correct `Content-Disposition` + header to prompt a file download. + + Raises + ------ + FileNotFoundError + If no `.txt` file is found in the directory corresponding to the given `digest_id`. + HTTPException + If the digest directory is not found or if no `.txt` file exists in the directory. 
+ """ try: # Find the first .txt file in the directory directory = f"{TMP_BASE_PATH}/{digest_id}" diff --git a/src/routers/dynamic.py b/src/routers/dynamic.py index 12216f15..bfd6d44a 100644 --- a/src/routers/dynamic.py +++ b/src/routers/dynamic.py @@ -11,6 +11,25 @@ @router.get("/{full_path:path}") async def catch_all(request: Request, full_path: str) -> HTMLResponse: + """ + Renders a page with a GitHub URL based on the provided path. + + This endpoint catches all GET requests with a dynamic path, constructs a GitHub URL + using the `full_path` parameter, and renders the `github.jinja` template with that URL. + + Parameters + ---------- + request : Request + The incoming request object, which provides context for rendering the response. + full_path : str + The full path extracted from the URL, which is used to build the GitHub URL. + + Returns + ------- + HTMLResponse + An HTML response containing the rendered template, with the GitHub URL + and other default parameters such as loading state and file size. + """ return templates.TemplateResponse( "github.jinja", { @@ -31,6 +50,31 @@ async def process_catch_all( pattern_type: str = Form(...), pattern: str = Form(...), ) -> HTMLResponse: + """ + Processes the form submission with user input for query parameters. + + This endpoint handles POST requests, processes the input parameters (e.g., text, file size, pattern), + and calls the `process_query` function to handle the query logic, returning the result as an HTML response. + + Parameters + ---------- + request : Request + The incoming request object, which provides context for rendering the response. + input_text : str, optional + The input text provided by the user for processing, by default taken from the form. + max_file_size : int, optional + The maximum allowed file size for the input, specified by the user. + pattern_type : str, optional + The type of pattern used for the query, specified by the user. 
+ pattern : str, optional + The pattern string used in the query, specified by the user. + + Returns + ------- + HTMLResponse + An HTML response generated after processing the form input and query logic, + which will be rendered and returned to the user. + """ return await process_query( request, input_text, diff --git a/src/routers/index.py b/src/routers/index.py index f2728805..9665bd08 100644 --- a/src/routers/index.py +++ b/src/routers/index.py @@ -12,6 +12,23 @@ @router.get("/", response_class=HTMLResponse) async def home(request: Request) -> HTMLResponse: + """ + Renders the home page with example repositories and default parameters. + + This endpoint serves the home page of the application, rendering the `index.jinja` template + and providing it with a list of example repositories and default file size values. + + Parameters + ---------- + request : Request + The incoming request object, which provides context for rendering the response. + + Returns + ------- + HTMLResponse + An HTML response containing the rendered home page template, with example repositories + and other default parameters such as file size. + """ return templates.TemplateResponse( "index.jinja", { @@ -31,6 +48,32 @@ async def index_post( pattern_type: str = Form(...), pattern: str = Form(...), ) -> HTMLResponse: + """ + Processes the form submission with user input for query parameters. + + This endpoint handles POST requests from the home page form. It processes the user-submitted + input (e.g., text, file size, pattern type) and invokes the `process_query` function to handle + the query logic, returning the result as an HTML response. + + Parameters + ---------- + request : Request + The incoming request object, which provides context for rendering the response. + input_text : str, optional + The input text provided by the user for processing, by default taken from the form. + max_file_size : int, optional + The maximum allowed file size for the input, specified by the user. 
+ pattern_type : str, optional + The type of pattern used for the query, specified by the user. + pattern : str, optional + The pattern string used in the query, specified by the user. + + Returns + ------- + HTMLResponse + An HTML response containing the results of processing the form input and query logic, + which will be rendered and returned to the user. + """ return await process_query( request, input_text, From d3f69d1742255e32e7ebb647549f12e6ba4ffc4f Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Mon, 30 Dec 2024 10:37:52 +0100 Subject: [PATCH 3/3] Refactor: Move process_query to top and prefix helper functions with an underscore --- src/process_query.py | 148 +++++++++++++++++++++---------------------- 1 file changed, 74 insertions(+), 74 deletions(-) diff --git a/src/process_query.py b/src/process_query.py index 470b675b..4053e45c 100644 --- a/src/process_query.py +++ b/src/process_query.py @@ -11,78 +11,6 @@ templates = Jinja2Templates(directory="templates") -def print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None: - """ - Print a formatted summary of the query details, including the URL, file size, - and pattern information, for easier debugging or logging. - - Parameters - ---------- - url : str - The URL associated with the query. - max_file_size : int - The maximum file size allowed for the query, in bytes. - pattern_type : str - Specifies the type of pattern to use, either "include" or "exclude". - pattern : str - The actual pattern string to include or exclude in the query. 
- """ - print(f"{Colors.WHITE}{url:<20}{Colors.END}", end="") - if int(max_file_size / 1024) != 50: - print(f" | {Colors.YELLOW}Size: {int(max_file_size/1024)}kb{Colors.END}", end="") - if pattern_type == "include" and pattern != "": - print(f" | {Colors.YELLOW}Include {pattern}{Colors.END}", end="") - elif pattern_type == "exclude" and pattern != "": - print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="") - - -def print_error(url: str, e: Exception, max_file_size: int, pattern_type: str, pattern: str) -> None: - """ - Print a formatted error message including the URL, file size, pattern details, and the exception encountered, - for debugging or logging purposes. - - Parameters - ---------- - url : str - The URL associated with the query that caused the error. - e : Exception - The exception raised during the query or process. - max_file_size : int - The maximum file size allowed for the query, in bytes. - pattern_type : str - Specifies the type of pattern to use, either "include" or "exclude". - pattern : str - The actual pattern string to include or exclude in the query. - """ - print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") - print_query(url, max_file_size, pattern_type, pattern) - print(f" | {Colors.RED}{e}{Colors.END}") - - -def print_success(url: str, max_file_size: int, pattern_type: str, pattern: str, summary: str) -> None: - """ - Print a formatted success message, including the URL, file size, pattern details, and a summary with estimated - tokens, for debugging or logging purposes. - - Parameters - ---------- - url : str - The URL associated with the successful query. - max_file_size : int - The maximum file size allowed for the query, in bytes. - pattern_type : str - Specifies the type of pattern to use, either "include" or "exclude". - pattern : str - The actual pattern string to include or exclude in the query. - summary : str - A summary of the query result, including details like estimated tokens. 
- """ - estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] - print(f"{Colors.GREEN}INFO{Colors.END}: {Colors.GREEN}<- {Colors.END}", end="") - print_query(url, max_file_size, pattern_type, pattern) - print(f" | {Colors.PURPLE}{estimated_tokens}{Colors.END}") - - async def process_query( request: Request, input_text: str, @@ -149,7 +77,7 @@ async def process_query( except Exception as e: # hack to print error message when query is not defined if "query" in locals() and query is not None and isinstance(query, dict): - print_error(query["url"], e, max_file_size, pattern_type, pattern) + _print_error(query["url"], e, max_file_size, pattern_type, pattern) else: print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") print(f"{Colors.RED}{e}{Colors.END}") @@ -173,7 +101,7 @@ async def process_query( "download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] ) - print_success( + _print_success( url=query["url"], max_file_size=max_file_size, pattern_type=pattern_type, @@ -197,3 +125,75 @@ async def process_query( "pattern": pattern, }, ) + + +def _print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None: + """ + Print a formatted summary of the query details, including the URL, file size, + and pattern information, for easier debugging or logging. + + Parameters + ---------- + url : str + The URL associated with the query. + max_file_size : int + The maximum file size allowed for the query, in bytes. + pattern_type : str + Specifies the type of pattern to use, either "include" or "exclude". + pattern : str + The actual pattern string to include or exclude in the query. 
+ """ + print(f"{Colors.WHITE}{url:<20}{Colors.END}", end="") + if int(max_file_size / 1024) != 50: + print(f" | {Colors.YELLOW}Size: {int(max_file_size/1024)}kb{Colors.END}", end="") + if pattern_type == "include" and pattern != "": + print(f" | {Colors.YELLOW}Include {pattern}{Colors.END}", end="") + elif pattern_type == "exclude" and pattern != "": + print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="") + + +def _print_error(url: str, e: Exception, max_file_size: int, pattern_type: str, pattern: str) -> None: + """ + Print a formatted error message including the URL, file size, pattern details, and the exception encountered, + for debugging or logging purposes. + + Parameters + ---------- + url : str + The URL associated with the query that caused the error. + e : Exception + The exception raised during the query or process. + max_file_size : int + The maximum file size allowed for the query, in bytes. + pattern_type : str + Specifies the type of pattern to use, either "include" or "exclude". + pattern : str + The actual pattern string to include or exclude in the query. + """ + print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") + _print_query(url, max_file_size, pattern_type, pattern) + print(f" | {Colors.RED}{e}{Colors.END}") + + +def _print_success(url: str, max_file_size: int, pattern_type: str, pattern: str, summary: str) -> None: + """ + Print a formatted success message, including the URL, file size, pattern details, and a summary with estimated + tokens, for debugging or logging purposes. + + Parameters + ---------- + url : str + The URL associated with the successful query. + max_file_size : int + The maximum file size allowed for the query, in bytes. + pattern_type : str + Specifies the type of pattern to use, either "include" or "exclude". + pattern : str + The actual pattern string to include or exclude in the query. + summary : str + A summary of the query result, including details like estimated tokens. 
+ """ + estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] + print(f"{Colors.GREEN}INFO{Colors.END}: {Colors.GREEN}<- {Colors.END}", end="") + _print_query(url, max_file_size, pattern_type, pattern) + print(f" | {Colors.PURPLE}{estimated_tokens}{Colors.END}")