From 8b8ad9708f0d6255b95471b48c2b089535f32b2e Mon Sep 17 00:00:00 2001
From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Date: Sun, 29 Dec 2024 10:37:48 +0100
Subject: [PATCH 1/3] refactor: prefix helper functions with an underscore

---
 src/gitingest/exceptions.py  |   6 ++
 src/gitingest/parse_query.py | 124 ++++++++++++++++++++---------------
 2 files changed, 76 insertions(+), 54 deletions(-)
 create mode 100644 src/gitingest/exceptions.py

diff --git a/src/gitingest/exceptions.py b/src/gitingest/exceptions.py
new file mode 100644
index 00000000..f02829ba
--- /dev/null
+++ b/src/gitingest/exceptions.py
@@ -0,0 +1,6 @@
+class InvalidPatternError(ValueError):
+    def __init__(self, pattern: str) -> None:
+        super().__init__(
+            f"Pattern '{pattern}' contains invalid characters. Only alphanumeric characters, dash (-), "
+            "underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed."
+        )
diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py
index 0d41e757..27330eb1 100644
--- a/src/gitingest/parse_query.py
+++ b/src/gitingest/parse_query.py
@@ -5,12 +5,71 @@
 from typing import Any
 from urllib.parse import unquote
 
+from gitingest.exceptions import InvalidPatternError
 from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS
 
 TMP_BASE_PATH: str = "../tmp"
 HEX_DIGITS = set(string.hexdigits)
 
 
+def parse_query(
+    source: str,
+    max_file_size: int,
+    from_web: bool,
+    include_patterns: list[str] | str | None = None,
+    ignore_patterns: list[str] | str | None = None,
+) -> dict[str, Any]:
+    """
+    Parses the input source to construct a query dictionary with specified parameters.
+
+    Parameters
+    ----------
+    source : str
+        The source URL or file path to parse.
+    max_file_size : int
+        The maximum file size in bytes to include.
+    from_web : bool
+        Flag indicating whether the source is a web URL.
+    include_patterns : Optional[Union[List[str], str]], optional
+        Patterns to include, by default None. Can be a list of strings or a single string.
+    ignore_patterns : Optional[Union[List[str], str]], optional
+        Patterns to ignore, by default None. Can be a list of strings or a single string.
+
+    Returns
+    -------
+    Dict[str, Any]
+        A dictionary containing the parsed query parameters, including 'max_file_size',
+        'ignore_patterns', and 'include_patterns'.
+    """
+    # Determine the parsing method based on the source type
+    if from_web or source.startswith("https://") or "github.com" in source:
+        query = _parse_url(source)
+    else:
+        query = _parse_path(source)
+
+    # Process ignore patterns
+    ignore_patterns_list = DEFAULT_IGNORE_PATTERNS.copy()
+    if ignore_patterns:
+        ignore_patterns_list += _parse_patterns(ignore_patterns)
+
+    # Process include patterns and override ignore patterns accordingly
+    if include_patterns:
+        parsed_include = _parse_patterns(include_patterns)
+        ignore_patterns_list = _override_ignore_patterns(ignore_patterns_list, include_patterns=parsed_include)
+    else:
+        parsed_include = None
+
+    # Update the query dictionary with max_file_size and processed patterns
+    query.update(
+        {
+            "max_file_size": max_file_size,
+            "ignore_patterns": ignore_patterns_list,
+            "include_patterns": parsed_include,
+        }
+    )
+    return query
+
+
 def _parse_url(url: str) -> dict[str, Any]:
     url = url.split(" ")[0]
     url = unquote(url)  # Decode URL-encoded characters
@@ -96,12 +155,13 @@ def _parse_patterns(pattern: list[str] | str) -> list[str]:
 
     Raises
     ------
-    ValueError
+    InvalidPatternError
         If any pattern contains invalid characters. Only alphanumeric characters,
         dash (-), underscore (_), dot (.), forward slash (/), plus (+), and
         asterisk (*) are allowed.
     """
     patterns = pattern if isinstance(pattern, list) else [pattern]
+    patterns = [p.strip() for p in patterns]
 
     parsed_patterns = []
     for p in patterns:
@@ -110,11 +170,8 @@ def _parse_patterns(pattern: list[str] | str) -> list[str]:
     parsed_patterns = [p for p in parsed_patterns if p != ""]
 
     for p in parsed_patterns:
-        if not all(c.isalnum() or c in "-_./+*" for c in p):
-            raise ValueError(
-                f"Pattern '{p}' contains invalid characters. Only alphanumeric characters, dash (-), "
-                "underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed."
-            )
+        if not _is_valid_pattern(p):
+            raise InvalidPatternError(p)
 
     return [_normalize_pattern(p) for p in parsed_patterns]
 
@@ -149,59 +206,18 @@ def _parse_path(path: str) -> dict[str, Any]:
     return query
 
 
-def parse_query(
-    source: str,
-    max_file_size: int,
-    from_web: bool,
-    include_patterns: list[str] | str | None = None,
-    ignore_patterns: list[str] | str | None = None,
-) -> dict[str, Any]:
+def _is_valid_pattern(pattern: str) -> bool:
     """
-    Parses the input source to construct a query dictionary with specified parameters.
+    _summary_
 
     Parameters
     ----------
-    source : str
-        The source URL or file path to parse.
-    max_file_size : int
-        The maximum file size in bytes to include.
-    from_web : bool
-        Flag indicating whether the source is a web URL.
-    include_patterns : Optional[Union[List[str], str]], optional
-        Patterns to include, by default None. Can be a list of strings or a single string.
-    ignore_patterns : Optional[Union[List[str], str]], optional
-        Patterns to ignore, by default None. Can be a list of strings or a single string.
+    pattern : str
+        _description_
 
     Returns
     -------
-    Dict[str, Any]
-        A dictionary containing the parsed query parameters, including 'max_file_size',
-        'ignore_patterns', and 'include_patterns'.
+    bool
+        _description_
     """
-    # Determine the parsing method based on the source type
-    if from_web or source.startswith("https://") or "github.com" in source:
-        query = _parse_url(source)
-    else:
-        query = _parse_path(source)
-
-    # Process ignore patterns
-    ignore_patterns_list = DEFAULT_IGNORE_PATTERNS.copy()
-    if ignore_patterns:
-        ignore_patterns_list += _parse_patterns(ignore_patterns)
-
-    # Process include patterns and override ignore patterns accordingly
-    if include_patterns:
-        parsed_include = _parse_patterns(include_patterns)
-        ignore_patterns_list = _override_ignore_patterns(ignore_patterns_list, include_patterns=parsed_include)
-    else:
-        parsed_include = None
-
-    # Update the query dictionary with max_file_size and processed patterns
-    query.update(
-        {
-            "max_file_size": max_file_size,
-            "ignore_patterns": ignore_patterns_list,
-            "include_patterns": parsed_include,
-        }
-    )
-    return query
+    return all(c.isalnum() or c in "-_./+*" for c in pattern)

From 856a8228125cb930aaf10e7c3b8074dfaa0c6ea6 Mon Sep 17 00:00:00 2001
From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Date: Mon, 30 Dec 2024 10:31:17 +0100
Subject: [PATCH 2/3] Add docstrings to functions and move AsyncTimeoutError to
 gitingest.exceptions

---
 src/gitingest/cli.py               |  27 ++-
 src/gitingest/clone.py             |  31 ++-
 src/gitingest/exceptions.py        |  23 +++
 src/gitingest/ingest.py            |  36 +++-
 src/gitingest/ingest_from_query.py | 290 ++++++++++++++++++++++++++++-
 src/gitingest/parse_query.py       |  98 +++++++++-
 src/gitingest/utils.py             |  31 ++-
 src/routers/download.py            |  27 +++
 src/routers/dynamic.py             |  44 +++++
 src/routers/index.py               |  43 +++++
 10 files changed, 620 insertions(+), 30 deletions(-)

diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py
index f275efac..57d9f2c5 100644
--- a/src/gitingest/cli.py
+++ b/src/gitingest/cli.py
@@ -17,7 +17,32 @@ def main(
     exclude_pattern: tuple[str, ...],
     include_pattern: tuple[str, ...],
 ) -> None:
-    """Analyze a directory and create a text dump of its contents."""
+    """
+    Analyze a directory or repository and create a text dump of its contents.
+
+    This command analyzes the contents of a specified source directory or repository,
+    applies custom include and exclude patterns, and generates a text summary of the analysis
+    which is then written to an output file.
+
+    Parameters
+    ----------
+    source : str
+        The source directory or repository to analyze.
+    output : str | None
+        The path where the output file will be written. If not specified, the output will be written
+        to a file named `<repo_name>.txt` in the current directory.
+    max_size : int
+        The maximum file size to process, in bytes. Files larger than this size will be ignored.
+    exclude_pattern : tuple[str, ...]
+        A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored.
+    include_pattern : tuple[str, ...]
+        A tuple of patterns to include during the analysis. Only files matching these patterns will be processed.
+
+    Raises
+    ------
+    click.Abort
+        If there is an error during the execution of the command, this exception is raised to abort the process.
+    """
     try:
         # Combine default and custom ignore patterns
         exclude_patterns = list(exclude_pattern)
diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py
index a91b6a99..da6550f1 100644
--- a/src/gitingest/clone.py
+++ b/src/gitingest/clone.py
@@ -1,13 +1,32 @@
 import asyncio
 from dataclasses import dataclass
 
-from gitingest.utils import AsyncTimeoutError, async_timeout
+from gitingest.exceptions import AsyncTimeoutError
+from gitingest.utils import async_timeout
 
 CLONE_TIMEOUT: int = 20
 
 
 @dataclass
 class CloneConfig:
+    """
+    Configuration for cloning a Git repository.
+
+    This class holds the necessary parameters for cloning a repository to a local path, including
+    the repository's URL, the target local path, and optional parameters for a specific commit or branch.
+
+    Attributes
+    ----------
+    url : str
+        The URL of the Git repository to clone.
+    local_path : str
+        The local directory where the repository will be cloned.
+    commit : str | None, optional
+        The specific commit hash to check out after cloning (default is None).
+    branch : str | None, optional
+        The branch to clone (default is None).
+    """
+
     url: str
     local_path: str
     commit: str | None = None
@@ -17,7 +36,11 @@ class CloneConfig:
 @async_timeout(CLONE_TIMEOUT)
 async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]:
     """
-    Clones a repository to a local path based on the provided query parameters.
+    Clones a repository to a local path based on the provided configuration.
+
+    This function handles the process of cloning a Git repository to the local file system.
+    It can clone a specific branch or commit if provided, and it raises exceptions if
+    any errors occur during the cloning process.
 
     Parameters
     ----------
@@ -30,7 +53,7 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]:
 
     Returns
     -------
-    Tuple[bytes, bytes]
+    tuple[bytes, bytes]
         A tuple containing the stdout and stderr of the git commands executed.
 
     Raises
@@ -123,7 +146,7 @@ async def _run_git_command(*args: str) -> tuple[bytes, bytes]:
 
     Returns
     -------
-    Tuple[bytes, bytes]
+    tuple[bytes, bytes]
         A tuple containing the stdout and stderr of the git command.
 
     Raises
diff --git a/src/gitingest/exceptions.py b/src/gitingest/exceptions.py
index f02829ba..34263e41 100644
--- a/src/gitingest/exceptions.py
+++ b/src/gitingest/exceptions.py
@@ -1,6 +1,29 @@
 class InvalidPatternError(ValueError):
+    """
+    Exception raised when a pattern contains invalid characters.
+
+    This exception is used to signal that a pattern provided for some operation
+    contains characters that are not allowed. The valid characters for the pattern
+    include alphanumeric characters, dash (-), underscore (_), dot (.), forward slash (/),
+    plus (+), and asterisk (*).
+
+    Parameters
+    ----------
+    pattern : str
+        The invalid pattern that caused the error.
+    """
+
     def __init__(self, pattern: str) -> None:
         super().__init__(
             f"Pattern '{pattern}' contains invalid characters. Only alphanumeric characters, dash (-), "
             "underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed."
         )
+
+
+class AsyncTimeoutError(Exception):
+    """
+    Raised when an async operation exceeds its timeout limit.
+
+    This exception is used by the `async_timeout` decorator to signal that the wrapped
+    asynchronous function has exceeded the specified time limit for execution.
+    """
diff --git a/src/gitingest/ingest.py b/src/gitingest/ingest.py
index e4c673de..4bb329fa 100644
--- a/src/gitingest/ingest.py
+++ b/src/gitingest/ingest.py
@@ -15,7 +15,39 @@ def ingest(
     exclude_patterns: list[str] | str | None = None,
     output: str | None = None,
 ) -> tuple[str, str, str]:
+    """
+    Main entry point for ingesting a source and processing its contents.
 
+    This function analyzes a source (URL or local path), clones the corresponding repository (if applicable),
+    and processes its files according to the specified query parameters. It returns a summary, a tree-like
+    structure of the files, and the content of the files. The results can optionally be written to an output file.
+
+    Parameters
+    ----------
+    source : str
+        The source to analyze, which can be a URL (for a GitHub repository) or a local directory path.
+    max_file_size : int, optional
+        The maximum allowed file size for file ingestion. Files larger than this size are ignored, by default 10*1024*1024 (10 MB).
+    include_patterns : list[str] | str | None, optional
+        A pattern or list of patterns specifying which files to include in the analysis. If `None`, all files are included.
+    exclude_patterns : list[str] | str | None, optional
+        A pattern or list of patterns specifying which files to exclude from the analysis. If `None`, no files are excluded.
+    output : str | None, optional
+        The file path where the summary and content should be written. If `None`, the results are not written to a file.
+
+    Returns
+    -------
+    tuple[str, str, str]
+        A tuple containing:
+        - A summary string of the analyzed repository or directory.
+        - A tree-like string representation of the file structure.
+        - The content of the files in the repository or directory.
+
+    Raises
+    ------
+    TypeError
+        If `clone_repo` does not return a coroutine, or if the `source` is of an unsupported type.
+    """
     try:
         query = parse_query(
             source=source,
@@ -42,8 +74,8 @@ def ingest(
 
         summary, tree, content = ingest_from_query(query)
 
-        if output:
-            with open(f"{output}", "w") as f:
+        if output is not None:
+            with open(output, "w") as f:
                 f.write(tree + "\n" + content)
 
         return summary, tree, content
diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py
index 886afa26..d8f57b71 100644
--- a/src/gitingest/ingest_from_query.py
+++ b/src/gitingest/ingest_from_query.py
@@ -11,6 +11,27 @@
 
 
 def _should_include(path: str, base_path: str, include_patterns: list[str]) -> bool:
+    """
+    Determines if the given file or directory path matches any of the include patterns.
+
+    This function checks whether the relative path of a file or directory matches
+    any of the specified patterns. If a match is found, it returns `True`, indicating
+    that the file or directory should be included in further processing.
+
+    Parameters
+    ----------
+    path : str
+        The absolute path of the file or directory to check.
+    base_path : str
+        The base directory from which the relative path is calculated.
+    include_patterns : list[str]
+        A list of patterns to check against the relative path.
+
+    Returns
+    -------
+    bool
+        `True` if the path matches any of the include patterns, `False` otherwise.
+    """
     rel_path = path.replace(base_path, "").lstrip(os.sep)
     include = False
     for pattern in include_patterns:
@@ -20,6 +41,27 @@ def _should_include(path: str, base_path: str, include_patterns: list[str]) -> b
 
 
 def _should_exclude(path: str, base_path: str, ignore_patterns: list[str]) -> bool:
+    """
+    Determines if the given file or directory path matches any of the ignore patterns.
+
+    This function checks whether the relative path of a file or directory matches
+    any of the specified ignore patterns. If a match is found, it returns `True`, indicating
+    that the file or directory should be excluded from further processing.
+
+    Parameters
+    ----------
+    path : str
+        The absolute path of the file or directory to check.
+    base_path : str
+        The base directory from which the relative path is calculated.
+    ignore_patterns : list[str]
+        A list of patterns to check against the relative path.
+
+    Returns
+    -------
+    bool
+        `True` if the path matches any of the ignore patterns, `False` otherwise.
+    """
     rel_path = path.replace(base_path, "").lstrip(os.sep)
     for pattern in ignore_patterns:
         if pattern and fnmatch(rel_path, pattern):
@@ -28,7 +70,25 @@ def _should_exclude(path: str, base_path: str, ignore_patterns: list[str]) -> bo
 
 
 def _is_safe_symlink(symlink_path: str, base_path: str) -> bool:
-    """Check if a symlink points to a location within the base directory."""
+    """
+    Check if a symlink points to a location within the base directory.
+
+    This function resolves the target of a symlink and ensures it is within the specified
+    base directory, returning `True` if it is safe, or `False` if the symlink points outside
+    the base directory.
+
+    Parameters
+    ----------
+    symlink_path : str
+        The path of the symlink to check.
+    base_path : str
+        The base directory to ensure the symlink points within.
+
+    Returns
+    -------
+    bool
+        `True` if the symlink points within the base directory, `False` otherwise.
+    """
     try:
         target_path = os.path.realpath(symlink_path)
         base_path = os.path.realpath(base_path)
@@ -39,7 +99,23 @@ def _is_safe_symlink(symlink_path: str, base_path: str) -> bool:
 
 
 def _is_text_file(file_path: str) -> bool:
-    """Determines if a file is likely a text file based on its content."""
+    """
+    Determine if a file is likely a text file based on its content.
+
+    This function attempts to read the first 1024 bytes of a file and checks for the presence
+    of non-text characters. It returns `True` if the file is determined to be a text file,
+    otherwise returns `False`.
+
+    Parameters
+    ----------
+    file_path : str
+        The path to the file to check.
+
+    Returns
+    -------
+    bool
+        `True` if the file is likely a text file, `False` otherwise.
+    """
     try:
         with open(file_path, "rb") as file:
             chunk = file.read(1024)
@@ -49,6 +125,23 @@ def _is_text_file(file_path: str) -> bool:
 
 
 def _read_file_content(file_path: str) -> str:
+    """
+    Reads the content of a file.
+
+    This function attempts to open a file and read its contents as UTF-8, ignoring undecodable bytes.
+    If an error occurs during reading (e.g., file is not found or permission error),
+    it returns an error message.
+
+    Parameters
+    ----------
+    file_path : str
+        The path to the file to read.
+
+    Returns
+    -------
+    str
+        The content of the file, or an error message if the file could not be read.
+    """
     try:
         with open(file_path, encoding="utf-8", errors="ignore") as f:
             return f.read()
@@ -63,7 +156,31 @@ def _scan_directory(
     depth: int = 0,
     stats: dict[str, int] | None = None,
 ) -> dict[str, Any] | None:
-    """Recursively analyzes a directory and its contents with safety limits."""
+    """
+    Recursively analyze a directory and its contents with safety limits.
+
+    This function scans a directory and its subdirectories up to a specified depth. It checks
+    for any file or directory that should be included or excluded based on the provided patterns
+    and limits. It also tracks the number of files and total size processed.
+
+    Parameters
+    ----------
+    path : str
+        The path of the directory to scan.
+    query : dict[str, Any]
+        A dictionary containing the query parameters, such as include and ignore patterns.
+    seen_paths : set[str] | None, optional
+        A set to track already visited paths, by default None.
+    depth : int, optional
+        The current depth of directory traversal, by default 0.
+    stats : dict[str, int] | None, optional
+        A dictionary to track statistics such as total file count and size, by default None.
+
+    Returns
+    -------
+    dict[str, Any] | None
+        A dictionary representing the directory structure and contents, or `None` if limits are reached.
+    """
     if seen_paths is None:
         seen_paths = set()
 
@@ -224,7 +341,28 @@ def _extract_files_content(
     max_file_size: int,
     files: list[dict[str, Any]] | None = None,
 ) -> list[dict[str, Any]]:
-    """Recursively collects all text files with their contents."""
+    """
+    Recursively collect all text files with their contents.
+
+    This function traverses the directory tree and extracts the contents of all text files
+    into a list, ignoring non-text files or files that exceed the specified size limit.
+
+    Parameters
+    ----------
+    query : dict[str, Any]
+        A dictionary containing the query parameters, including the base path of the repository.
+    node : dict[str, Any]
+        The current directory or file node being processed.
+    max_file_size : int
+        The maximum file size in bytes for which content should be extracted.
+    files : list[dict[str, Any]] | None, optional
+        A list to collect the extracted files' information, by default None.
+
+    Returns
+    -------
+    list[dict[str, Any]]
+        A list of dictionaries, each containing the path, content (or `None` if too large), and size of each file.
+    """
     if files is None:
         files = []
 
@@ -248,7 +386,22 @@ def _extract_files_content(
 
 
 def _create_file_content_string(files: list[dict[str, Any]]) -> str:
-    """Creates a formatted string of file contents with separators."""
+    """
+    Create a formatted string of file contents with separators.
+
+    This function takes a list of files and generates a formatted string where each file's
+    content is separated by a divider. If a README.md file is found, it is placed at the top.
+
+    Parameters
+    ----------
+    files : list[dict[str, Any]]
+        A list of dictionaries containing file information, including the path and content.
+
+    Returns
+    -------
+    str
+        A formatted string representing the contents of all the files with appropriate separators.
+    """
     output = ""
     separator = "=" * 48 + "\n"
 
@@ -278,7 +431,24 @@ def _create_file_content_string(files: list[dict[str, Any]]) -> str:
 
 
 def _create_summary_string(query: dict[str, Any], nodes: dict[str, Any]) -> str:
-    """Creates a summary string with file counts and content size."""
+    """
+    Create a summary string with file counts and content size.
+
+    This function generates a summary of the repository's contents, including the number
+    of files analyzed, the total content size, and other relevant details based on the query parameters.
+
+    Parameters
+    ----------
+    query : dict[str, Any]
+        A dictionary containing query parameters like repository name, commit, branch, and subpath.
+    nodes : dict[str, Any]
+        A dictionary representing the directory structure, including file and directory counts.
+
+    Returns
+    -------
+    str
+        A summary string containing details such as the repository name, file count, and other query-specific information.
+    """
     if "user_name" in query:
         summary = f"Repository: {query['user_name']}/{query['repo_name']}\n"
     else:
@@ -297,7 +467,28 @@ def _create_summary_string(query: dict[str, Any], nodes: dict[str, Any]) -> str:
 
 
 def _create_tree_structure(query: dict[str, Any], node: dict[str, Any], prefix: str = "", is_last: bool = True) -> str:
-    """Creates a tree-like string representation of the file structure."""
+    """
+    Create a tree-like string representation of the file structure.
+
+    This function generates a string representation of the directory structure, formatted
+    as a tree with appropriate indentation for nested directories and files.
+
+    Parameters
+    ----------
+    query : dict[str, Any]
+        A dictionary containing query parameters like repository name and subpath.
+    node : dict[str, Any]
+        The current directory or file node being processed.
+    prefix : str, optional
+        A string used for indentation and formatting of the tree structure, by default "".
+    is_last : bool, optional
+        A flag indicating whether the current node is the last in its directory, by default True.
+
+    Returns
+    -------
+    str
+        A string representing the directory structure formatted as a tree.
+    """
     tree = ""
 
     if not node["name"]:
@@ -319,7 +510,22 @@ def _create_tree_structure(query: dict[str, Any], node: dict[str, Any], prefix:
 
 
 def _generate_token_string(context_string: str) -> str | None:
-    """Returns the number of tokens in a text string."""
+    """
+    Return the estimated token count of a text string in human-readable form.
+
+    This function estimates the number of tokens in a given text string using the `tiktoken`
+    library. It returns the number of tokens in a human-readable format (e.g., '1.2k', '1.2M').
+
+    Parameters
+    ----------
+    context_string : str
+        The text string for which the token count is to be estimated.
+
+    Returns
+    -------
+    str | None
+        The formatted number of tokens as a string (e.g., '1.2k', '1.2M'), or `None` if an error occurs.
+    """
     formatted_tokens = ""
     try:
         encoding = tiktoken.get_encoding("cl100k_base")
@@ -340,6 +546,29 @@ def _generate_token_string(context_string: str) -> str | None:
 
 
 def _ingest_single_file(path: str, query: dict[str, Any]) -> tuple[str, str, str]:
+    """
+    Ingest a single file and return its summary, directory structure, and content.
+
+    This function reads a file, generates a summary of its contents, and returns the content
+    along with its directory structure and token estimation.
+
+    Parameters
+    ----------
+    path : str
+        The path of the file to ingest.
+    query : dict[str, Any]
+        A dictionary containing query parameters, such as the maximum file size.
+
+    Returns
+    -------
+    tuple[str, str, str]
+        A tuple containing the summary, directory structure, and file content.
+
+    Raises
+    ------
+    ValueError
+        If the specified path is not a file or if the file is not a text file.
+    """
     if not os.path.isfile(path):
         raise ValueError(f"Path {path} is not a file")
 
@@ -376,6 +605,29 @@ def _ingest_single_file(path: str, query: dict[str, Any]) -> tuple[str, str, str
 
 
 def _ingest_directory(path: str, query: dict[str, Any]) -> tuple[str, str, str]:
+    """
+    Ingest an entire directory and return its summary, directory structure, and file contents.
+
+    This function processes a directory, extracts its contents, and generates a summary,
+    directory structure, and file content. It recursively processes subdirectories as well.
+
+    Parameters
+    ----------
+    path : str
+        The path of the directory to ingest.
+    query : dict[str, Any]
+        A dictionary containing query parameters, including maximum file size.
+
+    Returns
+    -------
+    tuple[str, str, str]
+        A tuple containing the summary, directory structure, and file contents.
+
+    Raises
+    ------
+    ValueError
+        If no files are found in the directory.
+    """
     nodes = _scan_directory(path=path, query=query)
     if not nodes:
         raise ValueError(f"No files found in {path}")
@@ -392,7 +644,27 @@ def _ingest_directory(path: str, query: dict[str, Any]) -> tuple[str, str, str]:
 
 
 def ingest_from_query(query: dict[str, Any]) -> tuple[str, str, str]:
-    """Main entry point for analyzing a codebase directory or single file."""
+    """
+    Main entry point for analyzing a codebase directory or single file.
+
+    This function processes a file or directory based on the provided query, extracting its contents
+    and generating a summary, directory structure, and file content, along with token estimations.
+
+    Parameters
+    ----------
+    query : dict[str, Any]
+        A dictionary containing parameters like local path, subpath, file type, etc.
+
+    Returns
+    -------
+    tuple[str, str, str]
+        A tuple containing the summary, directory structure, and file contents.
+
+    Raises
+    ------
+    ValueError
+        If the specified path cannot be found or if the file is not a text file.
+    """
     path = f"{query['local_path']}{query['subpath']}"
     if not os.path.exists(path):
         raise ValueError(f"{query['slug']} cannot be found")
diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py
index 27330eb1..18a78e9a 100644
--- a/src/gitingest/parse_query.py
+++ b/src/gitingest/parse_query.py
@@ -22,6 +22,10 @@ def parse_query(
     """
     Parses the input source to construct a query dictionary with specified parameters.
 
+    This function processes the provided source (either a URL or file path) and builds a
+    query dictionary that includes information such as the source URL, maximum file size,
+    and any patterns to include or ignore. It handles both web and file-based sources.
+
     Parameters
     ----------
     source : str
@@ -30,14 +34,14 @@ def parse_query(
         The maximum file size in bytes to include.
     from_web : bool
         Flag indicating whether the source is a web URL.
-    include_patterns : Optional[Union[List[str], str]], optional
+    include_patterns : list[str] | str | None, optional
         Patterns to include, by default None. Can be a list of strings or a single string.
-    ignore_patterns : Optional[Union[List[str], str]], optional
+    ignore_patterns : list[str] | str | None, optional
         Patterns to ignore, by default None. Can be a list of strings or a single string.
 
     Returns
     -------
-    Dict[str, Any]
+    dict[str, Any]
         A dictionary containing the parsed query parameters, including 'max_file_size',
         'ignore_patterns', and 'include_patterns'.
     """
@@ -71,6 +75,28 @@ def parse_query(
 
 
 def _parse_url(url: str) -> dict[str, Any]:
+    """
+    Parses a GitHub repository URL into a structured query dictionary.
+
+    This function extracts relevant information from a GitHub URL, such as the username,
+    repository name, commit, branch, and subpath, and returns them in a structured format.
+
+    Parameters
+    ----------
+    url : str
+        The GitHub URL to parse.
+
+    Returns
+    -------
+    dict[str, Any]
+        A dictionary containing the parsed details of the GitHub repository, including
+        the username, repository name, commit, branch, and other relevant information.
+
+    Raises
+    ------
+    ValueError
+        If the URL is invalid or does not correspond to a valid Git repository.
+    """
     url = url.split(" ")[0]
     url = unquote(url)  # Decode URL-encoded characters
 
@@ -126,10 +152,42 @@ def _parse_url(url: str) -> dict[str, Any]:
 
 
 def _is_valid_git_commit_hash(commit: str) -> bool:
+    """
+    Validates if the provided string is a valid Git commit hash.
+
+    This function checks if the commit hash is a 40-character string consisting only
+    of hexadecimal digits, which is the standard format for Git commit hashes.
+
+    Parameters
+    ----------
+    commit : str
+        The string to validate as a Git commit hash.
+
+    Returns
+    -------
+    bool
+        True if the string is a valid 40-character Git commit hash, otherwise False.
+    """
     return len(commit) == 40 and all(c in HEX_DIGITS for c in commit)
 
 
 def _normalize_pattern(pattern: str) -> str:
+    """
+    Normalizes a pattern by stripping leading separators and appending a wildcard after a trailing separator.
+
+    This function processes the pattern string by stripping leading directory separators
+    and appending a wildcard (`*`) if the pattern ends with a separator.
+
+    Parameters
+    ----------
+    pattern : str
+        The pattern to normalize.
+
+    Returns
+    -------
+    str
+        The normalized pattern.
+    """
     pattern = pattern.lstrip(os.sep)
     if pattern.endswith(os.sep):
         pattern += "*"
@@ -161,7 +219,6 @@ def _parse_patterns(pattern: list[str] | str) -> list[str]:
         asterisk (*) are allowed.
     """
     patterns = pattern if isinstance(pattern, list) else [pattern]
-    patterns = [p.strip() for p in patterns]
 
     parsed_patterns = []
     for p in patterns:
@@ -182,20 +239,37 @@ def _override_ignore_patterns(ignore_patterns: list[str], include_patterns: list
 
     Parameters
     ----------
-    ignore_patterns : List[str]
+    ignore_patterns : list[str]
         The list of patterns to potentially remove.
-    include_patterns : List[str]
+    include_patterns : list[str]
         The list of patterns to exclude from ignore_patterns.
 
     Returns
     -------
-    List[str]
+    list[str]
         A new list of ignore_patterns with specified patterns removed.
     """
     return list(set(ignore_patterns) - set(include_patterns))
 
 
 def _parse_path(path: str) -> dict[str, Any]:
+    """
+    Parses a file path into a structured query dictionary.
+
+    This function takes a file path and constructs a query dictionary that includes
+    relevant details such as the absolute path and the slug (a combination of the
+    directory and file names).
+
+    Parameters
+    ----------
+    path : str
+        The file path to parse.
+
+    Returns
+    -------
+    dict[str, Any]
+        A dictionary containing parsed details such as the local file path and slug.
+    """
     query = {
         "url": None,
         "local_path": os.path.abspath(path),
@@ -208,16 +282,20 @@ def _parse_path(path: str) -> dict[str, Any]:
 
 def _is_valid_pattern(pattern: str) -> bool:
     """
-    _summary_
+    Validates if the given pattern contains only valid characters.
+
+    This function checks if the pattern contains only alphanumeric characters or one
+    of the following allowed characters: dash (`-`), underscore (`_`), dot (`.`),
+    forward slash (`/`), plus (`+`), or asterisk (`*`).
 
     Parameters
     ----------
     pattern : str
-        _description_
+        The pattern to validate.
 
     Returns
     -------
     bool
-        _description_
+        True if the pattern is valid, otherwise False.
     """
     return all(c.isalnum() or c in "-_./+*" for c in pattern)
diff --git a/src/gitingest/utils.py b/src/gitingest/utils.py
index 82b8e303..bc95bfcc 100644
--- a/src/gitingest/utils.py
+++ b/src/gitingest/utils.py
@@ -3,16 +3,39 @@
 from collections.abc import Awaitable, Callable
 from typing import ParamSpec, TypeVar
 
+from gitingest.exceptions import AsyncTimeoutError
+
 T = TypeVar("T")
 P = ParamSpec("P")
 
 
-class AsyncTimeoutError(Exception):
-    """Raised when an async operation exceeds its timeout limit."""
+def async_timeout(seconds: int = 10) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]:
+    """
+    Async Timeout decorator.
 
+    This decorator wraps an asynchronous function and ensures it does not run for
+    longer than the specified number of seconds. If the function execution exceeds
+    this limit, it raises an `AsyncTimeoutError`.
+
+    Parameters
+    ----------
+    seconds : int, optional
+        The maximum allowed time (in seconds) for the asynchronous function to complete.
+        The default is 10 seconds.
+
+    Returns
+    -------
+    Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]
+        A decorator that, when applied to an async function, ensures the function
+        completes within the specified time limit. If the function takes too long,
+        an `AsyncTimeoutError` is raised.
+
+    Raises
+    ------
+    AsyncTimeoutError
+        If the wrapped asynchronous function does not complete within the specified time limit.
+    """
 
-def async_timeout(seconds: int = 10) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]:
-    # Async Timeout decorator
     def decorator(func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]:
         @functools.wraps(func)
         async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
diff --git a/src/routers/download.py b/src/routers/download.py
index 2dc10224..fdbf2bbe 100644
--- a/src/routers/download.py
+++ b/src/routers/download.py
@@ -10,6 +10,33 @@
 
 @router.get("/download/{digest_id}")
 async def download_ingest(digest_id: str) -> Response:
+    """
+    Downloads a .txt file associated with a given digest ID.
+
+    This function searches for a `.txt` file in a directory corresponding to the provided
+    digest ID. If a file is found, it is read and returned as a downloadable attachment.
+    If no `.txt` file is found, an error is raised.
+
+    Parameters
+    ----------
+    digest_id : str
+        The unique identifier for the digest. It is used to find the corresponding directory
+        and locate the .txt file within that directory.
+
+    Returns
+    -------
+    Response
+        A FastAPI Response object containing the content of the found `.txt` file. The file is
+        sent with the appropriate media type (`text/plain`) and the correct `Content-Disposition`
+        header to prompt a file download.
+
+    Raises
+    ------
+    FileNotFoundError
+        If no `.txt` file is found in the directory corresponding to the given `digest_id`.
+    HTTPException
+        If the digest directory is not found or if no `.txt` file exists in the directory.
+    """
     try:
         # Find the first .txt file in the directory
         directory = f"{TMP_BASE_PATH}/{digest_id}"
diff --git a/src/routers/dynamic.py b/src/routers/dynamic.py
index 12216f15..bfd6d44a 100644
--- a/src/routers/dynamic.py
+++ b/src/routers/dynamic.py
@@ -11,6 +11,25 @@
 
 @router.get("/{full_path:path}")
 async def catch_all(request: Request, full_path: str) -> HTMLResponse:
+    """
+    Renders a page with a GitHub URL based on the provided path.
+
+    This endpoint catches all GET requests with a dynamic path, constructs a GitHub URL
+    using the `full_path` parameter, and renders the `github.jinja` template with that URL.
+
+    Parameters
+    ----------
+    request : Request
+        The incoming request object, which provides context for rendering the response.
+    full_path : str
+        The full path extracted from the URL, which is used to build the GitHub URL.
+
+    Returns
+    -------
+    HTMLResponse
+        An HTML response containing the rendered template, with the GitHub URL
+        and other default parameters such as loading state and file size.
+    """
     return templates.TemplateResponse(
         "github.jinja",
         {
@@ -31,6 +50,31 @@ async def process_catch_all(
     pattern_type: str = Form(...),
     pattern: str = Form(...),
 ) -> HTMLResponse:
+    """
+    Processes the form submission with user input for query parameters.
+
+    This endpoint handles POST requests, processes the input parameters (e.g., text, file size, pattern),
+    and calls the `process_query` function to handle the query logic, returning the result as an HTML response.
+
+    Parameters
+    ----------
+    request : Request
+        The incoming request object, which provides context for rendering the response.
+    input_text : str, optional
+        The input text provided by the user for processing, by default taken from the form.
+    max_file_size : int, optional
+        The maximum allowed file size for the input, specified by the user.
+    pattern_type : str
+        The type of pattern used for the query (required form field).
+    pattern : str
+        The pattern string used in the query (required form field).
+
+    Returns
+    -------
+    HTMLResponse
+        An HTML response generated after processing the form input and query logic,
+        which will be rendered and returned to the user.
+    """
     return await process_query(
         request,
         input_text,
diff --git a/src/routers/index.py b/src/routers/index.py
index f2728805..9665bd08 100644
--- a/src/routers/index.py
+++ b/src/routers/index.py
@@ -12,6 +12,23 @@
 
 @router.get("/", response_class=HTMLResponse)
 async def home(request: Request) -> HTMLResponse:
+    """
+    Renders the home page with example repositories and default parameters.
+
+    This endpoint serves the home page of the application, rendering the `index.jinja` template
+    and providing it with a list of example repositories and default file size values.
+
+    Parameters
+    ----------
+    request : Request
+        The incoming request object, which provides context for rendering the response.
+
+    Returns
+    -------
+    HTMLResponse
+        An HTML response containing the rendered home page template, with example repositories
+        and other default parameters such as file size.
+    """
     return templates.TemplateResponse(
         "index.jinja",
         {
@@ -31,6 +48,32 @@ async def index_post(
     pattern_type: str = Form(...),
     pattern: str = Form(...),
 ) -> HTMLResponse:
+    """
+    Processes the form submission with user input for query parameters.
+
+    This endpoint handles POST requests from the home page form. It processes the user-submitted
+    input (e.g., text, file size, pattern type) and invokes the `process_query` function to handle
+    the query logic, returning the result as an HTML response.
+
+    Parameters
+    ----------
+    request : Request
+        The incoming request object, which provides context for rendering the response.
+    input_text : str, optional
+        The input text provided by the user for processing, by default taken from the form.
+    max_file_size : int, optional
+        The maximum allowed file size for the input, specified by the user.
+    pattern_type : str
+        The type of pattern used for the query (required form field).
+    pattern : str
+        The pattern string used in the query (required form field).
+
+    Returns
+    -------
+    HTMLResponse
+        An HTML response containing the results of processing the form input and query logic,
+        which will be rendered and returned to the user.
+    """
     return await process_query(
         request,
         input_text,

From d3f69d1742255e32e7ebb647549f12e6ba4ffc4f Mon Sep 17 00:00:00 2001
From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Date: Mon, 30 Dec 2024 10:37:52 +0100
Subject: [PATCH 3/3] Refactor: Move process_query to top and prefix helper
 functions with an underscore

---
 src/process_query.py | 148 +++++++++++++++++++++----------------------
 1 file changed, 74 insertions(+), 74 deletions(-)

diff --git a/src/process_query.py b/src/process_query.py
index 470b675b..4053e45c 100644
--- a/src/process_query.py
+++ b/src/process_query.py
@@ -11,78 +11,6 @@
 templates = Jinja2Templates(directory="templates")
 
 
-def print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None:
-    """
-    Print a formatted summary of the query details, including the URL, file size,
-    and pattern information, for easier debugging or logging.
-
-    Parameters
-    ----------
-    url : str
-        The URL associated with the query.
-    max_file_size : int
-        The maximum file size allowed for the query, in bytes.
-    pattern_type : str
-        Specifies the type of pattern to use, either "include" or "exclude".
-    pattern : str
-        The actual pattern string to include or exclude in the query.
-    """
-    print(f"{Colors.WHITE}{url:<20}{Colors.END}", end="")
-    if int(max_file_size / 1024) != 50:
-        print(f" | {Colors.YELLOW}Size: {int(max_file_size/1024)}kb{Colors.END}", end="")
-    if pattern_type == "include" and pattern != "":
-        print(f" | {Colors.YELLOW}Include {pattern}{Colors.END}", end="")
-    elif pattern_type == "exclude" and pattern != "":
-        print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="")
-
-
-def print_error(url: str, e: Exception, max_file_size: int, pattern_type: str, pattern: str) -> None:
-    """
-    Print a formatted error message including the URL, file size, pattern details, and the exception encountered,
-    for debugging or logging purposes.
-
-    Parameters
-    ----------
-    url : str
-        The URL associated with the query that caused the error.
-    e : Exception
-        The exception raised during the query or process.
-    max_file_size : int
-        The maximum file size allowed for the query, in bytes.
-    pattern_type : str
-        Specifies the type of pattern to use, either "include" or "exclude".
-    pattern : str
-        The actual pattern string to include or exclude in the query.
-    """
-    print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<-  {Colors.END}", end="")
-    print_query(url, max_file_size, pattern_type, pattern)
-    print(f" | {Colors.RED}{e}{Colors.END}")
-
-
-def print_success(url: str, max_file_size: int, pattern_type: str, pattern: str, summary: str) -> None:
-    """
-    Print a formatted success message, including the URL, file size, pattern details, and a summary with estimated
-    tokens, for debugging or logging purposes.
-
-    Parameters
-    ----------
-    url : str
-        The URL associated with the successful query.
-    max_file_size : int
-        The maximum file size allowed for the query, in bytes.
-    pattern_type : str
-        Specifies the type of pattern to use, either "include" or "exclude".
-    pattern : str
-        The actual pattern string to include or exclude in the query.
-    summary : str
-        A summary of the query result, including details like estimated tokens.
-    """
-    estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :]
-    print(f"{Colors.GREEN}INFO{Colors.END}: {Colors.GREEN}<-  {Colors.END}", end="")
-    print_query(url, max_file_size, pattern_type, pattern)
-    print(f" | {Colors.PURPLE}{estimated_tokens}{Colors.END}")
-
-
 async def process_query(
     request: Request,
     input_text: str,
@@ -149,7 +77,7 @@ async def process_query(
     except Exception as e:
         # hack to print error message when query is not defined
         if "query" in locals() and query is not None and isinstance(query, dict):
-            print_error(query["url"], e, max_file_size, pattern_type, pattern)
+            _print_error(query["url"], e, max_file_size, pattern_type, pattern)
         else:
             print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<-  {Colors.END}", end="")
             print(f"{Colors.RED}{e}{Colors.END}")
@@ -173,7 +101,7 @@ async def process_query(
             "download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE]
         )
 
-    print_success(
+    _print_success(
         url=query["url"],
         max_file_size=max_file_size,
         pattern_type=pattern_type,
@@ -197,3 +125,75 @@ async def process_query(
             "pattern": pattern,
         },
     )
+
+
+def _print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None:
+    """
+    Print a formatted summary of the query details, including the URL, file size,
+    and pattern information, for easier debugging or logging.
+
+    Parameters
+    ----------
+    url : str
+        The URL associated with the query.
+    max_file_size : int
+        The maximum file size allowed for the query, in bytes.
+    pattern_type : str
+        Specifies the type of pattern to use, either "include" or "exclude".
+    pattern : str
+        The actual pattern string to include or exclude in the query.
+    """
+    print(f"{Colors.WHITE}{url:<20}{Colors.END}", end="")
+    if int(max_file_size / 1024) != 50:
+        print(f" | {Colors.YELLOW}Size: {int(max_file_size/1024)}kb{Colors.END}", end="")
+    if pattern_type == "include" and pattern != "":
+        print(f" | {Colors.YELLOW}Include {pattern}{Colors.END}", end="")
+    elif pattern_type == "exclude" and pattern != "":
+        print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="")
+
+
+def _print_error(url: str, e: Exception, max_file_size: int, pattern_type: str, pattern: str) -> None:
+    """
+    Print a formatted error message including the URL, file size, pattern details, and the exception encountered,
+    for debugging or logging purposes.
+
+    Parameters
+    ----------
+    url : str
+        The URL associated with the query that caused the error.
+    e : Exception
+        The exception raised during the query or process.
+    max_file_size : int
+        The maximum file size allowed for the query, in bytes.
+    pattern_type : str
+        Specifies the type of pattern to use, either "include" or "exclude".
+    pattern : str
+        The actual pattern string to include or exclude in the query.
+    """
+    print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<-  {Colors.END}", end="")
+    _print_query(url, max_file_size, pattern_type, pattern)
+    print(f" | {Colors.RED}{e}{Colors.END}")
+
+
+def _print_success(url: str, max_file_size: int, pattern_type: str, pattern: str, summary: str) -> None:
+    """
+    Print a formatted success message, including the URL, file size, pattern details, and a summary with estimated
+    tokens, for debugging or logging purposes.
+
+    Parameters
+    ----------
+    url : str
+        The URL associated with the successful query.
+    max_file_size : int
+        The maximum file size allowed for the query, in bytes.
+    pattern_type : str
+        Specifies the type of pattern to use, either "include" or "exclude".
+    pattern : str
+        The actual pattern string to include or exclude in the query.
+    summary : str
+        A summary of the query result; it must contain the text "Estimated tokens:", which is extracted and printed.
+    """
+    estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :]
+    print(f"{Colors.GREEN}INFO{Colors.END}: {Colors.GREEN}<-  {Colors.END}", end="")
+    _print_query(url, max_file_size, pattern_type, pattern)
+    print(f" | {Colors.PURPLE}{estimated_tokens}{Colors.END}")