From 880c57b2178bd9e1434ee188cfdbf7b4b567fb67 Mon Sep 17 00:00:00 2001
From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Date: Fri, 10 Jan 2025 11:26:23 +0100
Subject: [PATCH 1/7] refactor: standardize terminology and documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Standardized capitalization of 'Git', 'GitHub', and 'URL'
- Removed trailing slashes in links and added missing sentence periods in `README.md`
- Adjusted docstrings to adhere to PEP 257 by using the imperative mood
- Standardized docstrings in `exceptions.py`
- Replaced 'GitHub' with 'Git' when referring to the broader context
- Renamed templates: `github.jinja` → `git.jinja`, `github_form.jinja` → `git_form.jinja`
- Renamed variables: `github_url` → `repo_url`
---
 Dockerfile                                    |  2 +-
 README.md                                     | 30 +++++++++----------
 src/gitingest/__init__.py                     |  2 +-
 src/gitingest/exceptions.py                   |  4 +--
 src/gitingest/query_ingestion.py              |  4 ++-
 src/gitingest/repository_clone.py             | 17 +++++------
 src/gitingest/repository_ingest.py            |  2 +-
 src/query_processor.py                        |  8 ++---
 src/routers/dynamic.py                        | 16 +++++-----
 src/routers/index.py                          |  4 +--
 src/templates/api.jinja                       |  2 +-
 src/templates/base.jinja                      |  4 +--
 src/templates/components/footer.jinja         |  2 +-
 .../{github_form.jinja => git_form.jinja}     |  2 +-
 src/templates/{github.jinja => git.jinja}     |  2 +-
 src/templates/index.jinja                     |  4 +--
 16 files changed, 53 insertions(+), 52 deletions(-)
 rename src/templates/components/{github_form.jinja => git_form.jinja} (98%)
 rename src/templates/{github.jinja => git.jinja} (97%)
diff --git a/Dockerfile b/Dockerfile
index 564a5abb..cb0eab80 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -20,7 +20,7 @@ FROM python:3.12-slim
 ENV PYTHONUNBUFFERED=1
 ENV PYTHONDONTWRITEBYTECODE=1
 
-# Install git
+# Install Git
 RUN apt-get update \
     && apt-get install -y --no-install-recommends git curl\
     && rm -rf /var/lib/apt/lists/*
diff --git a/README.md b/README.md
index d5fe3079..049f6402 100644
--- a/README.md
+++ b/README.md
@@ -11,13 +11,13 @@
 
 Turn any Git repository into a prompt-friendly text ingest for LLMs.
 
-You can also replace `hub` with `ingest` in any GitHub URL to access the coresponding digest
+You can also replace `hub` with `ingest` in any GitHub URL to access the corresponding digest.
 
-[gitingest.com](https://gitingest.com/) · [Chrome Extension](https://chromewebstore.google.com/detail/adfjahbijlkjfoicpjkhjicpjpjfaood) · [Firefox Add-on](https://addons.mozilla.org/firefox/addon/gitingest/)
+[gitingest.com](https://gitingest.com) · [Chrome Extension](https://chromewebstore.google.com/detail/adfjahbijlkjfoicpjkhjicpjpjfaood) · [Firefox Add-on](https://addons.mozilla.org/firefox/addon/gitingest)
 
 ## 🚀 Features
 
-- **Easy code context**: Get a text digest from a git repository URL or a directory
+- **Easy code context**: Get a text digest from a Git repository URL or a directory
 - **Smart Formatting**: Optimized output format for LLM prompts
 - **Statistics about**:
   - File and directory structure
@@ -36,11 +36,12 @@ pip install gitingest
 
 <!-- markdownlint-disable MD033 -->
 <a href="https://chromewebstore.google.com/detail/adfjahbijlkjfoicpjkhjicpjpjfaood" target="_blank" title="Get Gitingest Extension from Chrome Web Store"><img height="48" src="https://github.com/user-attachments/assets/20a6e44b-fd46-4e6c-8ea6-aad436035753" alt="Available in the Chrome Web Store" /></a>
-<a href="https://addons.mozilla.org/firefox/addon/gitingest/" target="_blank" title="Get Gitingest Extension from Firefox Add-ons"><img height="48" src="https://github.com/user-attachments/assets/c0e99e6b-97cf-4af2-9737-099db7d3538b" alt="Get The Add-on for Firefox" /></a>
+<a href="https://addons.mozilla.org/firefox/addon/gitingest" target="_blank" title="Get Gitingest Extension from Firefox Add-ons"><img height="48" src="https://github.com/user-attachments/assets/c0e99e6b-97cf-4af2-9737-099db7d3538b" alt="Get The Add-on for Firefox" /></a>
 <a href="https://microsoftedge.microsoft.com/addons/detail/nfobhllgcekbmpifkjlopfdfdmljmipf" target="_blank" title="Get Gitingest Extension from Firefox Add-ons"><img height="48" src="https://github.com/user-attachments/assets/204157eb-4cae-4c0e-b2cb-db514419fd9e" alt="Get from the Edge Add-ons" /></a>
 <!-- markdownlint-enable MD033 -->
 
 The extension is open source at [lcandy2/gitingest-extension](https://github.com/lcandy2/gitingest-extension).
+
 Issues and feature requests are welcome to the repo.
 
 ## 💡 Command line usage
@@ -71,7 +72,7 @@ summary, tree, content = ingest("path/to/directory")
 summary, tree, content = ingest("https://github.com/cyclotruc/gitingest")
 ```
 
-By default, this won't write a file but can be enabled with the `output` argument
+By default, this won't write a file but can be enabled with the `output` argument.
 
 ## 🌐 Self-host
 
@@ -87,31 +88,30 @@ By default, this won't write a file but can be enabled with the `output` argumen
    docker run -d --name gitingest -p 8000:8000 gitingest
    ```
 
-The application will be available at `http://localhost:8000`
+The application will be available at `http://localhost:8000`.
 
 If you are hosting it on a domain, you can specify the allowed hostnames via env variable `ALLOWED_HOSTS`.
 
    ```bash
-   #Default: "gitingest.com,*.gitingest.com,localhost, 127.0.0.1".
+   # Default: "gitingest.com, *.gitingest.com, localhost, 127.0.0.1".
    ALLOWED_HOSTS="example.com, localhost, 127.0.0.1"
    ```
 
 ## 🛠️ Stack
 
-- [Tailwind CSS](https://tailwindcss.com/) - Frontend
+- [Tailwind CSS](https://tailwindcss.com) - Frontend
 - [FastAPI](https://github.com/fastapi/fastapi) - Backend framework
-- [Jinja2](https://jinja.palletsprojects.com/) - HTML templating
+- [Jinja2](https://jinja.palletsprojects.com) - HTML templating
 - [tiktoken](https://github.com/openai/tiktoken) - Token estimation
-- [apianalytics.dev](https://www.apianalytics.dev/) - Simple Analytics
+- [apianalytics.dev](https://www.apianalytics.dev) - Simple Analytics
 
-### Looking for a javascript/node package?
+### Looking for a JavaScript/Node package?
 
 Check out the NPM alternative 📦 Repomix: <https://github.com/yamadashy/repomix>
 
 ## ✔️ Contributing to Gitingest
 
-Gitingest aims to be friendly for first time contributors, with a simple python and html codebase.
- If you need any help while working with the code, reach out to us on [discord](https://discord.com/invite/zerRaGK9EC)
+Gitingest aims to be friendly for first time contributors, with a simple python and html codebase. If you need any help while working with the code, reach out to us on [Discord](https://discord.com/invite/zerRaGK9EC).
 
 ### Ways to help (non-technical)
 
@@ -125,7 +125,7 @@ Gitingest aims to be friendly for first time contributors, with a simple python
 2. Setup the dev environment (see Development section bellow)
 3. Run unit tests with `pytest`
 4. Commit your changes and run `pre-commit`
-5. Open a pull request on Github for review and feedback
+5. Open a pull request on GitHub for review and feedback
 6. (Optionnal) Invite project maintainer to your branch for easier collaboration
 
 ## 🔧 Development
@@ -161,7 +161,7 @@ Gitingest aims to be friendly for first time contributors, with a simple python
    pytest
    ```
 
-The application should be available at `http://localhost:8000`
+The application should be available at `http://localhost:8000`.
 
 ### Working on the CLI
 
diff --git a/src/gitingest/__init__.py b/src/gitingest/__init__.py
index c592350b..692de607 100644
--- a/src/gitingest/__init__.py
+++ b/src/gitingest/__init__.py
@@ -1,4 +1,4 @@
-""" Gitingest: A package for ingesting data from git repositories. """
+""" Gitingest: A package for ingesting data from Git repositories. """
 
 from gitingest.query_ingestion import run_ingest_query
 from gitingest.query_parser import parse_query
diff --git a/src/gitingest/exceptions.py b/src/gitingest/exceptions.py
index bfb3888b..8808cf77 100644
--- a/src/gitingest/exceptions.py
+++ b/src/gitingest/exceptions.py
@@ -23,7 +23,7 @@ def __init__(self, pattern: str) -> None:
 
 class AsyncTimeoutError(Exception):
     """
-    Raised when an async operation exceeds its timeout limit.
+    Exception raised when an async operation exceeds its timeout limit.
 
     This exception is used by the `async_timeout` decorator to signal that the wrapped
     asynchronous function has exceeded the specified time limit for execution.
@@ -38,7 +38,7 @@ def __init__(self, max_files: int) -> None:
 
 
 class MaxFileSizeReachedError(Exception):
-    """Raised when the maximum file size is reached."""
+    """Exception raised when the maximum file size is reached."""
 
     def __init__(self, max_size: int):
         super().__init__(f"Maximum file size limit ({max_size/1024/1024:.1f}MB) reached.")
diff --git a/src/gitingest/query_ingestion.py b/src/gitingest/query_ingestion.py
index c58ea810..3396ca6e 100644
--- a/src/gitingest/query_ingestion.py
+++ b/src/gitingest/query_ingestion.py
@@ -170,7 +170,9 @@ def _read_file_content(file_path: Path) -> str:
 
 def _sort_children(children: list[dict[str, Any]]) -> list[dict[str, Any]]:
     """
-    Sort children nodes with:
+    Sort the children nodes of a directory according to a specific order.
+
+    Order of sorting:
     1. README.md first
     2. Regular files (not starting with dot)
     3. Hidden files (starting with dot)
diff --git a/src/gitingest/repository_clone.py b/src/gitingest/repository_clone.py
index 01ba3877..57374ada 100644
--- a/src/gitingest/repository_clone.py
+++ b/src/gitingest/repository_clone.py
@@ -37,7 +37,7 @@ class CloneConfig:
 @async_timeout(CLONE_TIMEOUT)
 async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]:
     """
-    Clones a repository to a local path based on the provided configuration.
+    Clone a repository to a local path based on the provided configuration.
 
     This function handles the process of cloning a Git repository to the local file system.
     It can clone a specific branch or commit if provided, and it raises exceptions if
@@ -55,7 +55,7 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]:
     Returns
     -------
     tuple[bytes, bytes]
-        A tuple containing the stdout and stderr of the git commands executed.
+        A tuple containing the stdout and stderr of the Git commands executed.
 
     Raises
     ------
@@ -101,13 +101,12 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]:
 
 async def _check_repo_exists(url: str) -> bool:
     """
-    Check if a repository exists at the given URL using an HTTP HEAD request.
+    Check if a Git repository exists at the provided URL.
 
     Parameters
     ----------
     url : str
-        The URL of the repository.
-
+        The URL of the Git repository to check.
     Returns
     -------
     bool
@@ -130,22 +129,22 @@ async def _check_repo_exists(url: str) -> bool:
 
 async def _run_git_command(*args: str) -> tuple[bytes, bytes]:
     """
-    Executes a git command asynchronously and captures its output.
+    Execute a Git command asynchronously and capture its output.
 
     Parameters
     ----------
     *args : str
-        The git command and its arguments to execute.
+        The Git command and its arguments to execute.
 
     Returns
     -------
     tuple[bytes, bytes]
-        A tuple containing the stdout and stderr of the git command.
+        A tuple containing the stdout and stderr of the Git command.
 
     Raises
     ------
     RuntimeError
-        If the git command exits with a non-zero status.
+        If the Git command exits with a non-zero status.
     """
     proc = await asyncio.create_subprocess_exec(
         *args,
diff --git a/src/gitingest/repository_ingest.py b/src/gitingest/repository_ingest.py
index e2cecaa3..a1149847 100644
--- a/src/gitingest/repository_ingest.py
+++ b/src/gitingest/repository_ingest.py
@@ -27,7 +27,7 @@ def ingest(
     Parameters
     ----------
     source : str
-        The source to analyze, which can be a URL (for a GitHub repository) or a local directory path.
+        The source to analyze, which can be a URL (for a Git repository) or a local directory path.
     max_file_size : int
         Maximum allowed file size for file ingestion. Files larger than this size are ignored, by default
         10*1024*1024 (10 MB).
diff --git a/src/query_processor.py b/src/query_processor.py
index f6c7df85..544a2eea 100644
--- a/src/query_processor.py
+++ b/src/query_processor.py
@@ -26,7 +26,7 @@ async def process_query(
     """
     Process a query by parsing input, cloning a repository, and generating a summary.
 
-    Handle user input, process GitHub repository data, and prepare
+    Handle user input, process Git repository data, and prepare
     a response for rendering a template with the processed results or an error message.
 
     Parameters
@@ -34,7 +34,7 @@ async def process_query(
     request : Request
         The HTTP request object.
     input_text : str
-        Input text provided by the user, typically a GitHub repository URL or slug.
+        Input text provided by the user, typically a Git repository URL or slug.
     slider_position : int
         Position of the slider, representing the maximum file size in the query.
     pattern_type : str
@@ -63,13 +63,13 @@ async def process_query(
     else:
         raise ValueError(f"Invalid pattern type: {pattern_type}")
 
-    template = "index.jinja" if is_index else "github.jinja"
+    template = "index.jinja" if is_index else "git.jinja"
     template_response = partial(templates.TemplateResponse, name=template)
     max_file_size = log_slider_to_size(slider_position)
 
     context = {
         "request": request,
-        "github_url": input_text,
+        "repo_url": input_text,
         "examples": EXAMPLE_REPOS if is_index else [],
         "default_file_size": slider_position,
         "pattern_type": pattern_type,
diff --git a/src/routers/dynamic.py b/src/routers/dynamic.py
index add89c4f..0787fbfa 100644
--- a/src/routers/dynamic.py
+++ b/src/routers/dynamic.py
@@ -14,29 +14,29 @@
 @router.get("/{full_path:path}")
 async def catch_all(request: Request, full_path: str) -> HTMLResponse:
     """
-    Renders a page with a GitHub URL based on the provided path.
+    Render a page with a Git URL based on the provided path.
 
-    This endpoint catches all GET requests with a dynamic path, constructs a GitHub URL
-    using the `full_path` parameter, and renders the `github.jinja` template with that URL.
+    This endpoint catches all GET requests with a dynamic path, constructs a Git URL
+    using the `full_path` parameter, and renders the `git.jinja` template with that URL.
 
     Parameters
     ----------
     request : Request
         The incoming request object, which provides context for rendering the response.
     full_path : str
-        The full path extracted from the URL, which is used to build the GitHub URL.
+        The full path extracted from the URL, which is used to build the Git URL.
 
     Returns
     -------
     HTMLResponse
-        An HTML response containing the rendered template, with the GitHub URL
+        An HTML response containing the rendered template, with the Git URL
         and other default parameters such as loading state and file size.
     """
     return templates.TemplateResponse(
-        "github.jinja",
+        "git.jinja",
         {
             "request": request,
-            "github_url": f"https://github.com/{full_path}",
+            "repo_url": full_path,
             "loading": True,
             "default_file_size": 243,
         },
@@ -53,7 +53,7 @@ async def process_catch_all(
     pattern: str = Form(...),
 ) -> HTMLResponse:
     """
-    Processes the form submission with user input for query parameters.
+    Process the form submission with user input for query parameters.
 
     This endpoint handles POST requests, processes the input parameters (e.g., text, file size, pattern),
     and calls the `process_query` function to handle the query logic, returning the result as an HTML response.
diff --git a/src/routers/index.py b/src/routers/index.py
index 70a3f6d2..b338c301 100644
--- a/src/routers/index.py
+++ b/src/routers/index.py
@@ -15,7 +15,7 @@
 @router.get("/", response_class=HTMLResponse)
 async def home(request: Request) -> HTMLResponse:
     """
-    Renders the home page with example repositories and default parameters.
+    Render the home page with example repositories and default parameters.
 
     This endpoint serves the home page of the application, rendering the `index.jinja` template
     and providing it with a list of example repositories and default file size values.
@@ -51,7 +51,7 @@ async def index_post(
     pattern: str = Form(...),
 ) -> HTMLResponse:
     """
-    Processes the form submission with user input for query parameters.
+    Process the form submission with user input for query parameters.
 
     This endpoint handles POST requests from the home page form. It processes the user-submitted
     input (e.g., text, file size, pattern type) and invokes the `process_query` function to handle
diff --git a/src/templates/api.jinja b/src/templates/api.jinja
index 85fa0c3b..9bad379a 100644
--- a/src/templates/api.jinja
+++ b/src/templates/api.jinja
@@ -26,7 +26,7 @@
                     <a href="https://github.com/cyclotruc/gitingest/issues/new"
                        target="_blank"
                        rel="noopener noreferrer"
-                       class="text-[#6e5000] hover:underline">open an issue on github</a>
+                       class="text-[#6e5000] hover:underline">Open an issue on GitHub</a>
                     to suggest features.
                 </p>
             </div>
diff --git a/src/templates/base.jinja b/src/templates/base.jinja
index 7c8359cf..a6e30bf5 100644
--- a/src/templates/base.jinja
+++ b/src/templates/base.jinja
@@ -6,7 +6,7 @@
         <link rel="icon" type="image/x-icon" href="/static/favicon.ico">
         <!-- Search Engine Meta Tags -->
         <meta name="description"
-              content="Replace 'hub' with 'ingest' in any Github Url for a prompt-friendly text">
+              content="Replace 'hub' with 'ingest' in any GitHub URL for a prompt-friendly text.">
         <meta name="keywords"
               content="Gitingest, AI tools, LLM integration, Ingest, Digest, Context, Prompt, Git workflow, codebase extraction, Git repository, Git automation, Summarize, prompt-friendly">
         <meta name="robots" content="index, follow">
@@ -28,7 +28,7 @@
         <!-- OpenGraph Meta Tags -->
         <meta property="og:title" content="Gitingest">
         <meta property="og:description"
-              content="Replace 'hub' with 'ingest' in any Github Url for a prompt-friendly text">
+              content="Replace 'hub' with 'ingest' in any GitHub URL for a prompt-friendly text.">
         <meta property="og:type" content="website">
         <meta property="og:url" content="{{ request.url }}">
         <meta property="og:image" content="/static/og-image.png">
diff --git a/src/templates/components/footer.jinja b/src/templates/components/footer.jinja
index 61fadb28..1a8f3e6e 100644
--- a/src/templates/components/footer.jinja
+++ b/src/templates/components/footer.jinja
@@ -1,7 +1,7 @@
 <footer class="w-full border-t-[3px] border-gray-900 mt-auto">
     <div class="max-w-4xl mx-auto px-4 py-4">
         <div class="grid grid-cols-3 items-center text-gray-900 text-sm">
-            <!-- Left column - Github links -->
+            <!-- Left column - GitHub links -->
             <div class="flex items-center space-x-4">
                 <a href="https://github.com/cyclotruc/gitingest"
                    target="_blank"
diff --git a/src/templates/components/github_form.jinja b/src/templates/components/git_form.jinja
similarity index 98%
rename from src/templates/components/github_form.jinja
rename to src/templates/components/git_form.jinja
index 7be65aee..0d1d8047 100644
--- a/src/templates/components/github_form.jinja
+++ b/src/templates/components/git_form.jinja
@@ -12,7 +12,7 @@
                        name="input_text"
                        id="input_text"
                        placeholder="https://github.com/..."
-                       value="{{ github_url if github_url else '' }}"
+                       value="{{ repo_url if repo_url else '' }}"
                        required
                        class="border-[3px] w-full relative z-20 border-gray-900 placeholder-gray-600 text-lg font-medium focus:outline-none py-3.5 px-6 rounded">
             </div>
diff --git a/src/templates/github.jinja b/src/templates/git.jinja
similarity index 97%
rename from src/templates/github.jinja
rename to src/templates/git.jinja
index c373367c..62def5c1 100644
--- a/src/templates/github.jinja
+++ b/src/templates/git.jinja
@@ -6,7 +6,7 @@
              data-message="{{ error_message }}">{{ error_message }}</div>
     {% endif %}
     {% with is_index=true, show_examples=false %}
-        {% include 'components/github_form.jinja' %}
+        {% include 'components/git_form.jinja' %}
     {% endwith %}
     {% if loading %}
         <div class="relative mt-10">
diff --git a/src/templates/index.jinja b/src/templates/index.jinja
index 467b2f3f..f5beac08 100644
--- a/src/templates/index.jinja
+++ b/src/templates/index.jinja
@@ -73,10 +73,10 @@
              data-message="{{ error_message }}">{{ error_message }}</div>
     {% endif %}
     {% with is_index=true, show_examples=true %}
-        {% include 'components/github_form.jinja' %}
+        {% include 'components/git_form.jinja' %}
     {% endwith %}
     <p class="text-gray-600 text-sm max-w-2xl mx-auto text-center mt-4">
-        You can also replace 'hub' with 'ingest' in any Github URL
+        You can also replace 'hub' with 'ingest' in any GitHub URL.
     </p>
     {% include 'components/result.jinja' %}
 {% endblock %}

From 95b5e27bbcd8225be9db93dc11f7bfdf28eab8f8 Mon Sep 17 00:00:00 2001
From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Date: Fri, 10 Jan 2025 12:41:22 +0100
Subject: [PATCH 2/7] refactor: convert key functions and tests to asynchronous

- Made `parse_query` in query_parser.py asynchronous
- Made `main` in cli.py asynchronous
- Made `ingest` in repository_ingest.py asynchronous
- Updated test functions in test_query_parser.py to support async
---
 src/gitingest/cli.py               |  4 +-
 src/gitingest/query_parser.py      |  2 +-
 src/gitingest/repository_ingest.py |  4 +-
 src/query_processor.py             |  2 +-
 tests/test_query_parser.py         | 80 +++++++++++++++---------------
 5 files changed, 46 insertions(+), 46 deletions(-)

diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py
index ada231a4..371263a7 100644
--- a/src/gitingest/cli.py
+++ b/src/gitingest/cli.py
@@ -14,7 +14,7 @@
 @click.option("--max-size", "-s", default=MAX_FILE_SIZE, help="Maximum file size to process in bytes")
 @click.option("--exclude-pattern", "-e", multiple=True, help="Patterns to exclude")
 @click.option("--include-pattern", "-i", multiple=True, help="Patterns to include")
-def main(
+async def main(
     source: str,
     output: str | None,
     max_size: int,
@@ -54,7 +54,7 @@ def main(
 
         if not output:
             output = "digest.txt"
-        summary, _, _ = ingest(source, max_size, include_patterns, exclude_patterns, output=output)
+        summary, _, _ = await ingest(source, max_size, include_patterns, exclude_patterns, output=output)
 
         click.echo(f"Analysis complete! Output written to: {output}")
         click.echo("\nSummary:")
diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py
index f232e63e..9b796ac9 100644
--- a/src/gitingest/query_parser.py
+++ b/src/gitingest/query_parser.py
@@ -15,7 +15,7 @@
 HEX_DIGITS = set(string.hexdigits)
 
 
-def parse_query(
+async def parse_query(
     source: str,
     max_file_size: int,
     from_web: bool,
diff --git a/src/gitingest/repository_ingest.py b/src/gitingest/repository_ingest.py
index a1149847..c7efa942 100644
--- a/src/gitingest/repository_ingest.py
+++ b/src/gitingest/repository_ingest.py
@@ -10,7 +10,7 @@
 from gitingest.repository_clone import CloneConfig, clone_repo
 
 
-def ingest(
+async def ingest(
     source: str,
     max_file_size: int = 10 * 1024 * 1024,  # 10 MB
     include_patterns: list[str] | str | None = None,
@@ -52,7 +52,7 @@ def ingest(
         If `clone_repo` does not return a coroutine, or if the `source` is of an unsupported type.
     """
     try:
-        query = parse_query(
+        query = await parse_query(
             source=source,
             max_file_size=max_file_size,
             from_web=False,
diff --git a/src/query_processor.py b/src/query_processor.py
index 544a2eea..a66bdd3c 100644
--- a/src/query_processor.py
+++ b/src/query_processor.py
@@ -77,7 +77,7 @@ async def process_query(
     }
 
     try:
-        query = parse_query(
+        query = await parse_query(
             source=input_text,
             max_file_size=max_file_size,
             from_web=True,
diff --git a/tests/test_query_parser.py b/tests/test_query_parser.py
index 97a829d9..472875f7 100644
--- a/tests/test_query_parser.py
+++ b/tests/test_query_parser.py
@@ -8,9 +8,9 @@
 from gitingest.query_parser import _parse_patterns, _parse_url, parse_query
 
 
-def test_parse_url_valid_https() -> None:
+async def test_parse_url_valid_https() -> None:
     """
-    Test `_parse_url` with valid HTTPS URLs from supported platforms (GitHub, GitLab, Bitbucket).
+    Test `_parse_url` with valid HTTPS URLs from supported platforms (GitHub, GitLab, Bitbucket, Gitea).
     Verifies that user and repository names are correctly extracted.
     """
     test_cases = [
@@ -19,13 +19,13 @@ def test_parse_url_valid_https() -> None:
         "https://bitbucket.org/user/repo",
     ]
     for url in test_cases:
-        result = _parse_url(url)
+        result = await _parse_url(url)
         assert result["user_name"] == "user"
         assert result["repo_name"] == "repo"
         assert result["url"] == url
 
 
-def test_parse_url_valid_http() -> None:
+async def test_parse_url_valid_http() -> None:
     """
     Test `_parse_url` with valid HTTP URLs from supported platforms.
     Verifies that user and repository names, as well as the slug, are correctly extracted.
@@ -36,88 +36,88 @@ def test_parse_url_valid_http() -> None:
         "http://bitbucket.org/user/repo",
     ]
     for url in test_cases:
-        result = _parse_url(url)
+        result = await _parse_url(url)
         assert result["user_name"] == "user"
         assert result["repo_name"] == "repo"
         assert result["slug"] == "user-repo"
 
 
-def test_parse_url_invalid() -> None:
+async def test_parse_url_invalid() -> None:
     """
     Test `_parse_url` with an invalid URL that does not include a repository structure.
     Verifies that a ValueError is raised with an appropriate error message.
     """
-    url = "https://only-domain.com"
+    url = "https://github.com"
     with pytest.raises(ValueError, match="Invalid repository URL"):
-        _parse_url(url)
+        await _parse_url(url)
 
 
-def test_parse_query_basic() -> None:
+async def test_parse_query_basic() -> None:
     """
     Test `parse_query` with basic inputs including valid repository URLs.
     Verifies that user and repository names, URL, and ignore patterns are correctly parsed.
     """
     test_cases = ["https://github.com/user/repo", "https://gitlab.com/user/repo"]
     for url in test_cases:
-        result = parse_query(url, max_file_size=50, from_web=True, ignore_patterns="*.txt")
+        result = await parse_query(url, max_file_size=50, from_web=True, ignore_patterns="*.txt")
         assert result["user_name"] == "user"
         assert result["repo_name"] == "repo"
         assert result["url"] == url
         assert "*.txt" in result["ignore_patterns"]
 
 
-def test_parse_query_mixed_case() -> None:
+async def test_parse_query_mixed_case() -> None:
     """
     Test `parse_query` with mixed case URLs.
     """
     url = "Https://GitHub.COM/UsEr/rEpO"
-    result = parse_query(url, max_file_size=50, from_web=True)
+    result = await parse_query(url, max_file_size=50, from_web=True)
     assert result["user_name"] == "user"
     assert result["repo_name"] == "repo"
 
 
-def test_parse_query_include_pattern() -> None:
+async def test_parse_query_include_pattern() -> None:
     """
     Test `parse_query` with an include pattern.
     Verifies that the include pattern is set correctly and default ignore patterns are applied.
     """
     url = "https://github.com/user/repo"
-    result = parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py")
+    result = await parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py")
     assert result["include_patterns"] == ["*.py"]
     assert set(result["ignore_patterns"]) == set(DEFAULT_IGNORE_PATTERNS)
 
 
-def test_parse_query_invalid_pattern() -> None:
+async def test_parse_query_invalid_pattern() -> None:
     """
     Test `parse_query` with an invalid pattern containing special characters.
     Verifies that a ValueError is raised with an appropriate error message.
     """
     url = "https://github.com/user/repo"
     with pytest.raises(ValueError, match="Pattern.*contains invalid characters"):
-        parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py;rm -rf")
+        await parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py;rm -rf")
 
 
-def test_parse_url_with_subpaths() -> None:
+async def test_parse_url_with_subpaths() -> None:
     """
     Test `_parse_url` with a URL containing a branch and subpath.
     Verifies that user name, repository name, branch, and subpath are correctly extracted.
     """
     url = "https://github.com/user/repo/tree/main/subdir/file"
-    result = _parse_url(url)
+    result = await _parse_url(url)
     assert result["user_name"] == "user"
     assert result["repo_name"] == "repo"
     assert result["branch"] == "main"
     assert result["subpath"] == "/subdir/file"
 
 
-def test_parse_url_invalid_repo_structure() -> None:
+async def test_parse_url_invalid_repo_structure() -> None:
     """
     Test `_parse_url` with an invalid repository structure in the URL.
     Verifies that a ValueError is raised with an appropriate error message.
     """
     url = "https://github.com/user"
     with pytest.raises(ValueError, match="Invalid repository URL"):
-        _parse_url(url)
+        await _parse_url(url)
 
 
 def test_parse_patterns_valid() -> None:
@@ -140,35 +140,35 @@ def test_parse_patterns_invalid_characters() -> None:
         _parse_patterns(patterns)
 
 
-def test_parse_query_with_large_file_size() -> None:
+async def test_parse_query_with_large_file_size() -> None:
     """
     Test `parse_query` with a very large file size limit.
     Verifies that the file size limit and default ignore patterns are set correctly.
     """
     url = "https://github.com/user/repo"
-    result = parse_query(url, max_file_size=10**9, from_web=True)
+    result = await parse_query(url, max_file_size=10**9, from_web=True)
     assert result["max_file_size"] == 10**9
     assert result["ignore_patterns"] == DEFAULT_IGNORE_PATTERNS
 
 
-def test_parse_query_empty_patterns() -> None:
+async def test_parse_query_empty_patterns() -> None:
     """
     Test `parse_query` with empty include and ignore patterns.
     Verifies that the include patterns are set to None and default ignore patterns are applied.
     """
     url = "https://github.com/user/repo"
-    result = parse_query(url, max_file_size=50, from_web=True, include_patterns="", ignore_patterns="")
+    result = await parse_query(url, max_file_size=50, from_web=True, include_patterns="", ignore_patterns="")
     assert result["include_patterns"] is None
     assert result["ignore_patterns"] == DEFAULT_IGNORE_PATTERNS
 
 
-def test_parse_query_include_and_ignore_overlap() -> None:
+async def test_parse_query_include_and_ignore_overlap() -> None:
     """
     Test `parse_query` with overlapping include and ignore patterns.
     Verifies that overlapping patterns are removed from the ignore patterns.
     """
     url = "https://github.com/user/repo"
-    result = parse_query(
+    result = await parse_query(
         url,
         max_file_size=50,
         from_web=True,
@@ -180,41 +180,41 @@ def test_parse_query_include_and_ignore_overlap() -> None:
     assert "*.txt" in result["ignore_patterns"]
 
 
-def test_parse_query_local_path() -> None:
+async def test_parse_query_local_path() -> None:
     """
     Test `parse_query` with a local file path.
     Verifies that the local path is set, a unique ID is generated, and the slug is correctly created.
     """
     path = "/home/user/project"
-    result = parse_query(path, max_file_size=100, from_web=False)
+    result = await parse_query(path, max_file_size=100, from_web=False)
     tail = Path("home/user/project")
     assert result["local_path"].parts[-len(tail.parts) :] == tail.parts
     assert result["id"] is not None
     assert result["slug"] == "user/project"
 
 
-def test_parse_query_relative_path() -> None:
+async def test_parse_query_relative_path() -> None:
     """
     Test `parse_query` with a relative file path.
     Verifies that the local path and slug are correctly resolved.
     """
     path = "./project"
-    result = parse_query(path, max_file_size=100, from_web=False)
+    result = await parse_query(path, max_file_size=100, from_web=False)
     tail = Path("project")
     assert result["local_path"].parts[-len(tail.parts) :] == tail.parts
     assert result["slug"].endswith("project")
 
 
-def test_parse_query_empty_source() -> None:
+async def test_parse_query_empty_source() -> None:
     """
     Test `parse_query` with an empty source input.
     Verifies that a ValueError is raised with an appropriate error message.
     """
     with pytest.raises(ValueError, match="Invalid repository URL"):
-        parse_query("", max_file_size=100, from_web=True)
+        await parse_query("", max_file_size=100, from_web=True)
 
 
-def test_parse_url_branch_and_commit_distinction() -> None:
+async def test_parse_url_branch_and_commit_distinction() -> None:
     """
     Test `_parse_url` with URLs containing either a branch name or a commit hash.
     Verifies that the branch and commit are correctly distinguished.
@@ -222,8 +222,8 @@ def test_parse_url_branch_and_commit_distinction() -> None:
     url_branch = "https://github.com/user/repo/tree/main"
     url_commit = "https://github.com/user/repo/tree/abcd1234abcd1234abcd1234abcd1234abcd1234"
 
-    result_branch = _parse_url(url_branch)
-    result_commit = _parse_url(url_commit)
+    result_branch = await _parse_url(url_branch)
+    result_commit = await _parse_url(url_commit)
 
     assert result_branch["branch"] == "main"
     assert result_branch["commit"] is None
@@ -232,23 +232,23 @@ def test_parse_url_branch_and_commit_distinction() -> None:
     assert result_commit["commit"] == "abcd1234abcd1234abcd1234abcd1234abcd1234"
 
 
-def test_parse_query_uuid_uniqueness() -> None:
+async def test_parse_query_uuid_uniqueness() -> None:
     """
     Test `parse_query` to ensure that each call generates a unique UUID for the query result.
     """
     path = "/home/user/project"
-    result1 = parse_query(path, max_file_size=100, from_web=False)
-    result2 = parse_query(path, max_file_size=100, from_web=False)
+    result1 = await parse_query(path, max_file_size=100, from_web=False)
+    result2 = await parse_query(path, max_file_size=100, from_web=False)
     assert result1["id"] != result2["id"]
 
 
-def test_parse_url_with_query_and_fragment() -> None:
+async def test_parse_url_with_query_and_fragment() -> None:
     """
     Test `_parse_url` with a URL containing query parameters and a fragment.
     Verifies that the URL is cleaned and other fields are correctly extracted.
     """
     url = "https://github.com/user/repo?arg=value#fragment"
-    result = _parse_url(url)
+    result = await _parse_url(url)
     assert result["user_name"] == "user"
     assert result["repo_name"] == "repo"
     assert result["url"] == "https://github.com/user/repo"  # URL should be cleaned

From 9a19c92142625b20d4428b4dff3b08e42fac1ba6 Mon Sep 17 00:00:00 2001
From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Date: Fri, 10 Jan 2025 13:17:48 +0100
Subject: [PATCH 3/7] refactor: rename _parse_url and standardize docstrings

- Renamed `_parse_url` to `_parse_repo_source` in query_parser.py
- Adjusted docstrings to adhere to PEP 257 by using imperative tense
---
 src/gitingest/query_parser.py | 14 +++++++-------
 tests/test_query_parser.py    | 32 ++++++++++++++++----------------
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py
index 9b796ac9..bc356988 100644
--- a/src/gitingest/query_parser.py
+++ b/src/gitingest/query_parser.py
@@ -53,7 +53,7 @@ async def parse_query(
 
     # Determine the parsing method based on the source type
     if from_web or source.startswith("https://") or "github.com" in source:
-        query = _parse_url(source)
+        query = _parse_repo_source(source)
     else:
         query = _parse_path(source)
 
@@ -80,7 +80,7 @@ async def parse_query(
     return query
 
 
-def _parse_url(url: str) -> dict[str, Any]:
+def _parse_repo_source(url: str) -> dict[str, Any]:
     """
     Parse a GitHub repository URL into a structured query dictionary.
 
@@ -165,7 +165,7 @@ def _parse_url(url: str) -> dict[str, Any]:
 
 def _is_valid_git_commit_hash(commit: str) -> bool:
     """
-    Validates if the provided string is a valid Git commit hash.
+    Validate if the provided string is a valid Git commit hash.
 
     This function checks if the commit hash is a 40-character string consisting only
     of hexadecimal digits, which is the standard format for Git commit hashes.
@@ -185,7 +185,7 @@ def _is_valid_git_commit_hash(commit: str) -> bool:
 
 def _normalize_pattern(pattern: str) -> str:
     """
-    Normalizes the given pattern by removing leading separators and appending a wildcard.
+    Normalize the given pattern by removing leading separators and appending a wildcard.
 
     This function processes the pattern string by stripping leading directory separators
     and appending a wildcard (`*`) if the pattern ends with a separator.
@@ -249,7 +249,7 @@ def _parse_patterns(pattern: list[str] | str) -> list[str]:
 
 def _override_ignore_patterns(ignore_patterns: list[str], include_patterns: list[str]) -> list[str]:
     """
-    Removes patterns from ignore_patterns that are present in include_patterns using set difference.
+    Remove patterns from ignore_patterns that are present in include_patterns using set difference.
 
     Parameters
     ----------
@@ -268,7 +268,7 @@ def _override_ignore_patterns(ignore_patterns: list[str], include_patterns: list
 
 def _parse_path(path_str: str) -> dict[str, Any]:
     """
-    Parses a file path into a structured query dictionary.
+    Parse a file path into a structured query dictionary.
 
     This function takes a file path and constructs a query dictionary that includes
     relevant details such as the absolute path and the slug (a combination of the
@@ -297,7 +297,7 @@ def _parse_path(path_str: str) -> dict[str, Any]:
 
 def _is_valid_pattern(pattern: str) -> bool:
     """
-    Validates if the given pattern contains only valid characters.
+    Validate if the given pattern contains only valid characters.
 
     This function checks if the pattern contains only alphanumeric characters or one
     of the following allowed characters: dash (`-`), underscore (`_`), dot (`.`),
diff --git a/tests/test_query_parser.py b/tests/test_query_parser.py
index 472875f7..1fe666b6 100644
--- a/tests/test_query_parser.py
+++ b/tests/test_query_parser.py
@@ -5,12 +5,12 @@
 import pytest
 
 from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS
-from gitingest.query_parser import _parse_patterns, _parse_url, parse_query
+from gitingest.query_parser import _parse_patterns, _parse_repo_source, parse_query
 
 
 async def test_parse_url_valid_https() -> None:
     """
-    Test `_parse_url` with valid HTTPS URLs from supported platforms (GitHub, GitLab, Bitbucket, Gitea).
+    Test `_parse_repo_source` with valid HTTPS URLs from supported platforms (GitHub, GitLab, Bitbucket, Gitea).
     Verifies that user and repository names are correctly extracted.
     """
     test_cases = [
@@ -19,7 +19,7 @@ async def test_parse_url_valid_https() -> None:
         "https://bitbucket.org/user/repo",
     ]
     for url in test_cases:
-        result = await _parse_url(url)
+        result = await _parse_repo_source(url)
         assert result["user_name"] == "user"
         assert result["repo_name"] == "repo"
         assert result["url"] == url
@@ -27,7 +27,7 @@ async def test_parse_url_valid_https() -> None:
 
 async def test_parse_url_valid_http() -> None:
     """
-    Test `_parse_url` with valid HTTP URLs from supported platforms.
+    Test `_parse_repo_source` with valid HTTP URLs from supported platforms.
     Verifies that user and repository names, as well as the slug, are correctly extracted.
     """
     test_cases = [
@@ -36,7 +36,7 @@ async def test_parse_url_valid_http() -> None:
         "http://bitbucket.org/user/repo",
     ]
     for url in test_cases:
-        result = await _parse_url(url)
+        result = await _parse_repo_source(url)
         assert result["user_name"] == "user"
         assert result["repo_name"] == "repo"
         assert result["slug"] == "user-repo"
@@ -44,12 +44,12 @@ async def test_parse_url_valid_http() -> None:
 
 async def test_parse_url_invalid() -> None:
     """
-    Test `_parse_url` with an invalid URL that does not include a repository structure.
+    Test `_parse_repo_source` with an invalid URL that does not include a repository structure.
     Verifies that a ValueError is raised with an appropriate error message.
     """
     url = "https://github.com"
     with pytest.raises(ValueError, match="Invalid repository URL"):
-        await _parse_url(url)
+        await _parse_repo_source(url)
 
 
 async def test_parse_query_basic() -> None:
@@ -99,11 +99,11 @@ async def test_parse_query_invalid_pattern() -> None:
 
 async def test_parse_url_with_subpaths() -> None:
     """
-    Test `_parse_url` with a URL containing a branch and subpath.
+    Test `_parse_repo_source` with a URL containing a branch and subpath.
     Verifies that user name, repository name, branch, and subpath are correctly extracted.
     """
     url = "https://github.com/user/repo/tree/main/subdir/file"
-    result = await _parse_url(url)
+    result = await _parse_repo_source(url)
     assert result["user_name"] == "user"
     assert result["repo_name"] == "repo"
     assert result["branch"] == "main"
@@ -112,12 +112,12 @@ async def test_parse_url_with_subpaths() -> None:
 
 async def test_parse_url_invalid_repo_structure() -> None:
     """
-    Test `_parse_url` with an invalid repository structure in the URL.
+    Test `_parse_repo_source` with an invalid repository structure in the URL.
     Verifies that a ValueError is raised with an appropriate error message.
     """
     url = "https://github.com/user"
     with pytest.raises(ValueError, match="Invalid repository URL"):
-        await _parse_url(url)
+        await _parse_repo_source(url)
 
 
 def test_parse_patterns_valid() -> None:
@@ -216,14 +216,14 @@ async def test_parse_query_empty_source() -> None:
 
 async def test_parse_url_branch_and_commit_distinction() -> None:
     """
-    Test `_parse_url` with URLs containing either a branch name or a commit hash.
+    Test `_parse_repo_source` with URLs containing either a branch name or a commit hash.
     Verifies that the branch and commit are correctly distinguished.
     """
     url_branch = "https://github.com/user/repo/tree/main"
     url_commit = "https://github.com/user/repo/tree/abcd1234abcd1234abcd1234abcd1234abcd1234"
 
-    result_branch = await _parse_url(url_branch)
-    result_commit = await _parse_url(url_commit)
+    result_branch = await _parse_repo_source(url_branch)
+    result_commit = await _parse_repo_source(url_commit)
 
     assert result_branch["branch"] == "main"
     assert result_branch["commit"] is None
@@ -244,11 +244,11 @@ async def test_parse_query_uuid_uniqueness() -> None:
 
 async def test_parse_url_with_query_and_fragment() -> None:
     """
-    Test `_parse_url` with a URL containing query parameters and a fragment.
+    Test `_parse_repo_source` with a URL containing query parameters and a fragment.
     Verifies that the URL is cleaned and other fields are correctly extracted.
     """
     url = "https://github.com/user/repo?arg=value#fragment"
-    result = await _parse_url(url)
+    result = await _parse_repo_source(url)
     assert result["user_name"] == "user"
     assert result["repo_name"] == "repo"
     assert result["url"] == "https://github.com/user/repo"  # URL should be cleaned

From a57f614987746058b23a2b72380b0af607b9607a Mon Sep 17 00:00:00 2001
From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Date: Fri, 10 Jan 2025 13:28:42 +0100
Subject: [PATCH 4/7] refactor: implement _get_status_code, adjust
 _check_repo_exists, and update tests

- Implemented function `_get_status_code` in repository_clone.py to extract the status code from an HTTP response
- Adjusted `_check_repo_exists` in repository_clone.py to utilize the new `_get_status_code` function
- Modified `_check_repo_exists` to return True for status codes 200 and 301, False for 404 and 302, and to raise `RuntimeError` for any other status code
- Updated `test_check_repo_exists_with_redirect` in test_repository_clone.py to verify that `_check_repo_exists` returns False for status code 302
- Implemented test `test_check_repo_exists_with_permanent_redirect` in test_repository_clone.py to verify that `_check_repo_exists` returns True for status code 301
---
 src/gitingest/repository_clone.py | 39 ++++++++++++++++++++++++++++---
 tests/test_repository_clone.py    | 22 +++++++++++++++--
 2 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/src/gitingest/repository_clone.py b/src/gitingest/repository_clone.py
index 57374ada..d251a6f1 100644
--- a/src/gitingest/repository_clone.py
+++ b/src/gitingest/repository_clone.py
@@ -111,6 +111,11 @@ async def _check_repo_exists(url: str) -> bool:
     -------
     bool
         True if the repository exists, False otherwise.
+
+    Raises
+    ------
+    RuntimeError
+        If the curl command returns an unexpected status code.
     """
     proc = await asyncio.create_subprocess_exec(
         "curl",
@@ -120,11 +125,20 @@ async def _check_repo_exists(url: str) -> bool:
         stderr=asyncio.subprocess.PIPE,
     )
     stdout, _ = await proc.communicate()
+
     if proc.returncode != 0:
         return False
-    # Check if stdout contains "404" status code
-    stdout_str = stdout.decode()
-    return "HTTP/1.1 404" not in stdout_str and "HTTP/2 404" not in stdout_str
+
+    response = stdout.decode()
+    status_code = _get_status_code(response)
+
+    if status_code in (200, 301):
+        return True
+
+    if status_code in (404, 302):
+        return False
+
+    raise RuntimeError(f"Unexpected status code: {status_code}")
 
 
 async def _run_git_command(*args: str) -> tuple[bytes, bytes]:
@@ -157,3 +171,22 @@ async def _run_git_command(*args: str) -> tuple[bytes, bytes]:
         raise RuntimeError(f"Git command failed: {' '.join(args)}\nError: {error_message}")
 
     return stdout, stderr
+
+
+def _get_status_code(response: str) -> int:
+    """
+    Extract the status code from an HTTP response.
+
+    Parameters
+    ----------
+    response : str
+        The HTTP response string.
+
+    Returns
+    -------
+    int
+        The status code of the response.
+    """
+    status_line = response.splitlines()[0].strip()
+    status_code = int(status_line.split(" ", 2)[1])
+    return status_code
diff --git a/tests/test_repository_clone.py b/tests/test_repository_clone.py
index 892bd04e..3bfa3b2f 100644
--- a/tests/test_repository_clone.py
+++ b/tests/test_repository_clone.py
@@ -204,8 +204,9 @@ async def test_clone_repo_commit_without_branch() -> None:
 @pytest.mark.asyncio
 async def test_check_repo_exists_with_redirect() -> None:
     """
-    Test the `_check_repo_exists` function for handling HTTP redirects (302 Found).
-    Verifies that it correctly identifies the repository's existence.
+    Test the `_check_repo_exists` function when the repository URL returns a redirect response.
+
+    Verifies that the function returns False when a 302 Found response is received.
     """
     url = "https://github.com/user/repo"
     with patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) as mock_exec:
@@ -214,4 +215,21 @@ async def test_check_repo_exists_with_redirect() -> None:
         mock_process.returncode = 0  # Simulate successful request
         mock_exec.return_value = mock_process
 
+        assert await _check_repo_exists(url) is False
+
+
+@pytest.mark.asyncio
+async def test_check_repo_exists_with_permanent_redirect() -> None:
+    """
+    Test the `_check_repo_exists` function when the repository URL returns a permanent redirect response.
+
+    Verifies that the function returns True when a 301 Moved Permanently response is received.
+    """
+    url = "https://github.com/user/repo"
+    with patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) as mock_exec:
+        mock_process = AsyncMock()
+        mock_process.communicate.return_value = (b"HTTP/1.1 301 Found\n", b"")
+        mock_process.returncode = 0  # Simulate successful request
+        mock_exec.return_value = mock_process
+
         assert await _check_repo_exists(url)

From 9bdee8f7012cdf42485387eb11c09d741b37435d Mon Sep 17 00:00:00 2001
From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Date: Fri, 10 Jan 2025 13:50:05 +0100
Subject: [PATCH 5/7] feat: make parser domain-agnostic to support multiple Git
 hosts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- added list of known domains/Git hosts in `query_parser.py`
- fixed bug from [#115](https://github.com/cyclotruc/gitingest/pull/115): corrected case handling for URL components—scheme, domain, username, and repository are case-insensitive, but path components beyond the repository (e.g., file names, branches) are case-sensitive
- implemented `try_domains_for_user_and_repo` in `query_parser.py` to iteratively guess the correct domain until success or supported hosts are exhausted
- added helper functions `_get_user_and_repo_from_path`, `_validate_host`, and `_validate_scheme` in `query_parser.py`
- extended `_parse_repo_source` in `query_parser.py` to be Git host agnostic by using `try_domains_for_user_and_repo`
- added tests `test_parse_url_unsupported_host` and `test_parse_query_with_branch` in `test_query_parser.py`
- created new file `test_git_host_agnostic.py` to verify domain/Git host agnostic behavior
---
 src/gitingest/query_parser.py                | 208 ++++++++++++++-----
 src/main.py                                  |   2 +-
 tests/query_parser/test_git_host_agnostic.py |  71 +++++++
 tests/test_query_parser.py                   |  22 ++
 4 files changed, 251 insertions(+), 52 deletions(-)
 create mode 100644 tests/query_parser/test_git_host_agnostic.py

diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py
index bc356988..2981f097 100644
--- a/src/gitingest/query_parser.py
+++ b/src/gitingest/query_parser.py
@@ -11,8 +11,16 @@
 from config import TMP_BASE_PATH
 from gitingest.exceptions import InvalidPatternError
 from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS
+from gitingest.repository_clone import _check_repo_exists
 
-HEX_DIGITS = set(string.hexdigits)
+HEX_DIGITS: set[str] = set(string.hexdigits)
+
+KNOWN_GIT_HOSTS: list[str] = [
+    "github.com",
+    "gitlab.com",
+    "bitbucket.org",
+    "gitea.com",
+]
 
 
 async def parse_query(
@@ -48,16 +56,16 @@ async def parse_query(
         A dictionary containing the parsed query parameters, including 'max_file_size',
         'ignore_patterns', and 'include_patterns'.
     """
-    # Normalize and clean up the source string to make it case-insensitive
-    source = source.lower().strip()
 
     # Determine the parsing method based on the source type
-    if from_web or source.startswith("https://") or "github.com" in source:
-        query = _parse_repo_source(source)
+    if from_web or urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS):
+        # We either have a full URL or a domain-less slug
+        query = await _parse_repo_source(source)
     else:
+        # Local path scenario
         query = _parse_path(source)
 
-    # Process ignore patterns
+    # Combine ignore patterns
     ignore_patterns_list = DEFAULT_IGNORE_PATTERNS.copy()
     if ignore_patterns:
         ignore_patterns_list += _parse_patterns(ignore_patterns)
@@ -69,7 +77,6 @@ async def parse_query(
     else:
         parsed_include = None
 
-    # Update the query dictionary with max_file_size and processed patterns
     query.update(
         {
             "max_file_size": max_file_size,
@@ -80,52 +87,54 @@ async def parse_query(
     return query
 
 
-def _parse_repo_source(url: str) -> dict[str, Any]:
+async def _parse_repo_source(source: str) -> dict[str, Any]:
     """
-    Parse a GitHub repository URL into a structured query dictionary.
+    Parse a repository URL into a structured query dictionary.
 
-    This function extracts relevant information from a GitHub URL, such as the username,
-    repository name, commit, branch, and subpath, and returns them in a structured format.
+    The source is handled according to its form:
+      - A fully qualified URL (https://gitlab.com/...): validate the scheme and host, then parse.
+      - A URL missing the scheme (gitlab.com/...): validate the host, prepend 'https://', then parse.
+      - A slug (like 'pandas-dev/pandas'): try each known host until one reports that the repository exists.
 
     Parameters
     ----------
-    url : str
-        The GitHub URL to parse.
+    source : str
+        The URL or domain-less slug to parse.
 
     Returns
     -------
     dict[str, Any]
-        A dictionary containing the parsed details of the GitHub repository, including
-        the username, repository name, commit, branch, and other relevant information.
-
-    Raises
-    ------
-    ValueError
-        If the URL is invalid or does not correspond to a valid Git repository.
+        A dictionary containing the parsed details of the repository, including the username,
+        repository name, commit, branch, and other relevant information.
     """
-    # Clean up the URL
-    url = url.split(" ")[0]  # remove trailing text
-    url = unquote(url)  # decode URL-encoded characters
+    source = unquote(source)
 
-    if not url.startswith(("https://", "http://")):
-        url = "https://" + url
+    # Attempt to parse
+    parsed_url = urlparse(source)
 
-    # Parse URL and reconstruct it without query parameters and fragments
-    parsed_url = urlparse(url)
-    url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
+    if parsed_url.scheme:
+        _validate_scheme(parsed_url.scheme)
+        _validate_host(parsed_url.netloc.lower())
 
-    # Extract domain and path
-    url_parts = url.split("/")
-    domain = url_parts[2]
-    path_parts = url_parts[3:]
+    else:  # Will be of the form 'host/user/repo' or 'user/repo'
+        tmp_host = source.split("/")[0].lower()
+        if "." in tmp_host:
+            _validate_host(tmp_host)
+        else:
+            # No scheme, no domain => user typed "user/repo", so we'll guess the domain.
+            host = await try_domains_for_user_and_repo(*_get_user_and_repo_from_path(source))
+            source = f"{host}/{source}"
 
-    if len(path_parts) < 2:
-        raise ValueError("Invalid repository URL. Please provide a valid Git repository URL.")
+        source = "https://" + source
+        parsed_url = urlparse(source)
+
+    host = parsed_url.netloc.lower()
+    user_name, repo_name = _get_user_and_repo_from_path(parsed_url.path)
 
-    user_name = path_parts[0]
-    repo_name = path_parts[1]
     _id = str(uuid.uuid4())
     slug = f"{user_name}-{repo_name}"
+    local_path = Path(TMP_BASE_PATH) / _id / slug
+    url = f"https://{host}/{user_name}/{repo_name}"
 
     parsed = {
         "user_name": user_name,
@@ -134,31 +143,39 @@ def _parse_repo_source(url: str) -> dict[str, Any]:
         "branch": None,
         "commit": None,
         "subpath": "/",
-        "local_path": Path(TMP_BASE_PATH) / _id / slug,
-        "url": f"https://{domain}/{user_name}/{repo_name}",
-        "slug": slug,
+        "local_path": local_path,
+        "url": url,
+        "slug": slug,  # e.g. "pandas-dev-pandas"
         "id": _id,
     }
 
-    # If this is an issues page or pull requests, return early without processing subpath
-    if len(path_parts) > 2 and (path_parts[2] == "issues" or path_parts[2] == "pull"):
+    remaining_parts = parsed_url.path.strip("/").split("/")[2:]
+
+    if not remaining_parts:
         return parsed
 
+    possible_type = remaining_parts.pop(0)  # e.g. 'issues', 'pull', 'tree', 'blob'
+
     # If no extra path parts, just return
-    if len(path_parts) < 4:
+    if not remaining_parts:
+        return parsed
+
+    # If this is an issues page or pull requests, return early without processing subpath
+    if remaining_parts and possible_type in ("issues", "pull"):
         return parsed
 
-    parsed["type"] = path_parts[2]  # Usually 'tree' or 'blob'
-    commit = path_parts[3]
+    parsed["type"] = possible_type
 
-    if _is_valid_git_commit_hash(commit):
-        parsed["commit"] = commit
-        if len(path_parts) > 4:
-            parsed["subpath"] += "/".join(path_parts[4:])
+    # Commit or branch
+    commit_or_branch = remaining_parts.pop(0)
+    if _is_valid_git_commit_hash(commit_or_branch):
+        parsed["commit"] = commit_or_branch
     else:
-        parsed["branch"] = commit
-        if len(path_parts) > 4:
-            parsed["subpath"] += "/".join(path_parts[4:])
+        parsed["branch"] = commit_or_branch
+
+    # Subpath if anything left
+    if remaining_parts:
+        parsed["subpath"] += "/".join(remaining_parts)
 
     return parsed
 
@@ -314,3 +331,92 @@ def _is_valid_pattern(pattern: str) -> bool:
         True if the pattern is valid, otherwise False.
     """
     return all(c.isalnum() or c in "-_./+*" for c in pattern)
+
+
+async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str:
+    """
+    Attempt to find a valid repository host for the given user_name and repo_name.
+
+    Parameters
+    ----------
+    user_name : str
+        The username or owner of the repository.
+    repo_name : str
+        The name of the repository.
+
+    Returns
+    -------
+    str
+        The domain of the valid repository host.
+
+    Raises
+    ------
+    ValueError
+        If no valid repository host is found for the given user_name and repo_name.
+    """
+    for domain in KNOWN_GIT_HOSTS:
+        candidate = f"https://{domain}/{user_name}/{repo_name}"
+        if await _check_repo_exists(candidate):
+            return domain
+    raise ValueError(f"Could not find a valid repository host for '{user_name}/{repo_name}'.")
+
+
+def _get_user_and_repo_from_path(path: str) -> tuple[str, str]:
+    """
+    Extract the user and repository names from a given path.
+
+    Parameters
+    ----------
+    path : str
+        The path to extract the user and repository names from.
+
+    Returns
+    -------
+    tuple[str, str]
+        A tuple containing the user and repository names.
+
+    Raises
+    ------
+    ValueError
+        If the path does not contain at least two parts.
+    """
+    path_parts = path.lower().strip("/").split("/")
+    if len(path_parts) < 2:
+        raise ValueError(f"Invalid repository URL '{path}'")
+    return path_parts[0], path_parts[1]
+
+
+def _validate_host(host: str) -> None:
+    """
+    Validate the given host against the known Git hosts.
+
+    Parameters
+    ----------
+    host : str
+        The host to validate.
+
+    Raises
+    ------
+    ValueError
+        If the host is not a known Git host.
+    """
+    if host not in KNOWN_GIT_HOSTS:
+        raise ValueError(f"Unknown domain '{host}' in URL")
+
+
+def _validate_scheme(scheme: str) -> None:
+    """
+    Validate the given scheme against the known schemes.
+
+    Parameters
+    ----------
+    scheme : str
+        The scheme to validate.
+
+    Raises
+    ------
+    ValueError
+        If the scheme is not 'http' or 'https'.
+    """
+    if scheme not in ("https", "http"):
+        raise ValueError(f"Invalid URL scheme '{scheme}' in URL")
diff --git a/src/main.py b/src/main.py
index 7ba36a83..f2b63fdd 100644
--- a/src/main.py
+++ b/src/main.py
@@ -78,7 +78,7 @@ async def process_folder(folder: Path) -> None:
         # Extract owner and repository name from the filename
         if txt_files and "-" in (filename := txt_files[0].stem):
             owner, repo = filename.split("-", 1)
-            repo_url = f"https://github.com/{owner}/{repo}"
+            repo_url = f"{owner}/{repo}"
             with open("history.txt", mode="a", encoding="utf-8") as history:
                 history.write(f"{repo_url}\n")
 
diff --git a/tests/query_parser/test_git_host_agnostic.py b/tests/query_parser/test_git_host_agnostic.py
new file mode 100644
index 00000000..18308111
--- /dev/null
+++ b/tests/query_parser/test_git_host_agnostic.py
@@ -0,0 +1,71 @@
+""" Tests to verify that the query parser is Git host agnostic. """
+
+import pytest
+
+from gitingest.query_parser import parse_query
+
+
+@pytest.mark.parametrize(
+    "urls, expected_user, expected_repo, expected_url",
+    [
+        (
+            [
+                "https://github.com/tiangolo/fastapi",
+                "github.com/tiangolo/fastapi",
+                "tiangolo/fastapi",
+            ],
+            "tiangolo",
+            "fastapi",
+            "https://github.com/tiangolo/fastapi",
+        ),
+        (
+            [
+                "https://gitlab.com/gitlab-org/gitlab-runner",
+                "gitlab.com/gitlab-org/gitlab-runner",
+                "gitlab-org/gitlab-runner",
+            ],
+            "gitlab-org",
+            "gitlab-runner",
+            "https://gitlab.com/gitlab-org/gitlab-runner",
+        ),
+        (
+            [
+                "https://bitbucket.org/na-dna/llm-knowledge-share",
+                "bitbucket.org/na-dna/llm-knowledge-share",
+                "na-dna/llm-knowledge-share",
+            ],
+            "na-dna",
+            "llm-knowledge-share",
+            "https://bitbucket.org/na-dna/llm-knowledge-share",
+        ),
+        (
+            [
+                "https://gitea.com/xorm/xorm",
+                "gitea.com/xorm/xorm",
+                "xorm/xorm",
+            ],
+            "xorm",
+            "xorm",
+            "https://gitea.com/xorm/xorm",
+        ),
+    ],
+)
+@pytest.mark.asyncio
+async def test_parse_query_without_host(
+    urls: list[str],
+    expected_user: str,
+    expected_repo: str,
+    expected_url: str,
+) -> None:
+    for url in urls:
+        result = await parse_query(url, max_file_size=50, from_web=True)
+        # Common assertions for all cases
+        assert result["user_name"] == expected_user
+        assert result["repo_name"] == expected_repo
+        assert result["url"] == expected_url
+        assert result["slug"] == f"{expected_user}-{expected_repo}"
+        assert result["id"] is not None
+        assert result["subpath"] == "/"
+        assert result["branch"] is None
+        assert result["commit"] is None
+        assert result["type"] is None
diff --git a/tests/test_query_parser.py b/tests/test_query_parser.py
index 1fe666b6..0db65d3b 100644
--- a/tests/test_query_parser.py
+++ b/tests/test_query_parser.py
@@ -252,3 +252,25 @@ async def test_parse_url_with_query_and_fragment() -> None:
     assert result["user_name"] == "user"
     assert result["repo_name"] == "repo"
     assert result["url"] == "https://github.com/user/repo"  # URL should be cleaned
+
+
+async def test_parse_url_unsupported_host() -> None:
+    """Expect a ValueError naming the domain when the URL's host is unknown."""
+    with pytest.raises(ValueError, match="Unknown domain 'only-domain.com' in URL"):
+        await _parse_repo_source("https://only-domain.com")
+
+
+async def test_parse_query_with_branch() -> None:
+    """Check that a blob URL yields the repository, branch, subpath and type."""
+    url = "https://github.com/pandas-dev/pandas/blob/2.2.x/.github/ISSUE_TEMPLATE/documentation_improvement.yaml"
+    result = await parse_query(url, max_file_size=10**9, from_web=True)
+    # The canonical URL and slug must cover only the repository, not the blob path.
+    assert result["user_name"] == "pandas-dev"
+    assert result["repo_name"] == "pandas"
+    assert result["url"] == "https://github.com/pandas-dev/pandas"
+    assert result["slug"] == "pandas-dev-pandas"
+    assert result["id"] is not None
+    assert result["subpath"] == "/.github/ISSUE_TEMPLATE/documentation_improvement.yaml"
+    assert result["branch"] == "2.2.x"
+    assert result["commit"] is None
+    assert result["type"] == "blob"

From cd1b14ef5689d5779099029d290e03618416ff04 Mon Sep 17 00:00:00 2001
From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Date: Fri, 10 Jan 2025 13:52:26 +0100
Subject: [PATCH 6/7] chore: move test_query_parser.py from tests/ to
 tests/query_parser/

---
 tests/{ => query_parser}/test_query_parser.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/{ => query_parser}/test_query_parser.py (100%)

diff --git a/tests/test_query_parser.py b/tests/query_parser/test_query_parser.py
similarity index 100%
rename from tests/test_query_parser.py
rename to tests/query_parser/test_query_parser.py

From 31c695de35f30260337af621bbf488b159de3431 Mon Sep 17 00:00:00 2001
From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Date: Sat, 11 Jan 2025 23:45:40 +0100
Subject: [PATCH 7/7] add codeberg.org to supported git hosts

---
 src/gitingest/query_parser.py                |  1 +
 tests/query_parser/test_git_host_agnostic.py | 10 ++++++++++
 2 files changed, 11 insertions(+)

diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py
index 2981f097..78dd6cff 100644
--- a/src/gitingest/query_parser.py
+++ b/src/gitingest/query_parser.py
@@ -20,6 +20,7 @@
     "gitlab.com",
     "bitbucket.org",
     "gitea.com",
+    "codeberg.org",
 ]
 
 
diff --git a/tests/query_parser/test_git_host_agnostic.py b/tests/query_parser/test_git_host_agnostic.py
index 18308111..8e863555 100644
--- a/tests/query_parser/test_git_host_agnostic.py
+++ b/tests/query_parser/test_git_host_agnostic.py
@@ -48,6 +48,16 @@
             "xorm",
             "https://gitea.com/xorm/xorm",
         ),
+        (
+            [
+                "https://codeberg.org/forgejo/forgejo",
+                "codeberg.org/forgejo/forgejo",
+                "forgejo/forgejo",
+            ],
+            "forgejo",
+            "forgejo",
+            "https://codeberg.org/forgejo/forgejo",
+        ),
     ],
 )
 @pytest.mark.asyncio