From 32808ddd70f2a0df8a1dc2efd2bd23c5cfefe8f5 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Thu, 26 Dec 2024 10:02:06 +0100 Subject: [PATCH 1/9] chore: add pre-commit config, type hints, badges, and lint codebase - Add .pre-commit-config.yaml and pyproject.toml for Black and isort - Add missing type hints throughout the code (Dict[...] for Python 3.8 compatibility) - Added badges and convert existing badges to use format - Lint Markdown files - Lint Jinja templates with djlint --- .github/workflows/unitest.yml | 8 +- .gitignore | 1 + .pre-commit-config.yaml | 78 +++++++++ CODE_OF_CONDUCT.md | 12 +- README.md | 80 ++++++--- SECURITY.md | 2 +- pyproject.toml | 17 ++ pytest.ini | 3 +- requirements.txt | 13 +- setup.py | 4 +- src/gitingest/__init__.py | 10 +- src/gitingest/cli.py | 23 ++- src/gitingest/clone.py | 22 +-- src/gitingest/ignore_patterns.py | 102 +++++++++++ src/gitingest/ingest.py | 24 ++- src/gitingest/ingest_from_query.py | 126 ++++++++++---- src/gitingest/parse_query.py | 96 ++++------- src/gitingest/tests/conftest.py | 2 +- src/gitingest/tests/test_clone.py | 41 +++-- src/gitingest/tests/test_ingest.py | 68 ++++---- src/gitingest/tests/test_parse_query.py | 29 ++-- src/gitingest/utils.py | 12 +- src/main.py | 42 +++-- src/process_query.py | 100 ++++++++--- src/routers/__init__.py | 8 +- src/routers/download.py | 23 +-- src/routers/dynamic.py | 28 +-- src/routers/index.py | 37 ++-- src/server_utils.py | 13 +- src/static/favicon.svg | 2 +- src/static/js/snow.js | 2 +- src/static/robots.txt | 3 +- src/templates/api.jinja | 60 +++---- src/templates/base.jinja | 111 ++++++------ src/templates/components/footer.jinja | 22 ++- src/templates/components/github_form.jinja | 108 ++++++------ src/templates/components/navbar.jinja | 30 ++-- src/templates/components/result.jinja | 191 +++++++++------------ src/templates/github.jinja | 50 +++--- src/templates/index.jinja | 104 +++++------ 40 files 
changed, 996 insertions(+), 711 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 pyproject.toml create mode 100644 src/gitingest/ignore_patterns.py diff --git a/.github/workflows/unitest.yml b/.github/workflows/unitest.yml index e1fb7ebb..61e834e6 100644 --- a/.github/workflows/unitest.yml +++ b/.github/workflows/unitest.yml @@ -15,19 +15,19 @@ jobs: steps: - uses: actions/checkout@v4 - + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - + - name: Install dependencies run: | python -m pip install --upgrade pip pip install pytest pytest-asyncio if [ -f requirements.txt ]; then pip install -r requirements.txt; fi pip install -e . - + - name: Run tests run: | - pytest \ No newline at end of file + pytest diff --git a/.gitignore b/.gitignore index e98f538f..09c9945b 100644 --- a/.gitignore +++ b/.gitignore @@ -131,6 +131,7 @@ venv/ ENV/ env.bak/ venv.bak/ +.python-version # Spyder project settings .spyderproject diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..1b3eabd5 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,78 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + # Files + - id: check-added-large-files + description: 'Prevent large files from being committed.' + args: ['--maxkb=10000'] + - id: check-case-conflict + description: 'Check for files that would conflict in case-insensitive filesystems.' + - id: fix-byte-order-marker + description: 'Remove utf-8 byte order marker.' + - id: mixed-line-ending + description: 'Replace mixed line ending.' + + # Links + - id: destroyed-symlinks + description: 'Detect symlinks which are changed to regular files with a content of a path which that symlink was pointing to.' 
+ + # Check files for parseable syntax: python + - id: check-ast + + # File and line endings + - id: end-of-file-fixer + description: 'Ensure that a file is either empty, or ends with one newline.' + - id: trailing-whitespace + description: 'Trim trailing whitespace.' + + # Python + - id: check-docstring-first + description: 'Check a common error of defining a docstring after code.' + - id: requirements-txt-fixer + description: 'Sort entries in requirements.txt.' + + - repo: https://github.com/MarcoGorelli/absolufy-imports + rev: v0.3.1 + hooks: + - id: absolufy-imports + description: 'Automatically convert relative imports to absolute. (Use `args: [--never]` to revert.)' + + - repo: https://github.com/psf/black + rev: 24.10.0 + hooks: + - id: black + + - repo: https://github.com/asottile/pyupgrade + rev: v3.19.1 + hooks: + - id: pyupgrade + description: 'Automatically upgrade syntax for newer versions.' + args: [--py3-plus, --py36-plus, --py38-plus] + + - repo: https://github.com/pre-commit/pygrep-hooks + rev: v1.10.0 + hooks: + - id: python-check-blanket-noqa + description: 'Enforce that `noqa` annotations always occur with specific codes. Sample annotations: `# noqa: F401`, `# noqa: F401,W203`.' + - id: python-check-blanket-type-ignore + description: 'Enforce that `# type: ignore` annotations always occur with specific codes. Sample annotations: `# type: ignore[attr-defined]`, `# type: ignore[attr-defined, name-defined]`.' + - id: python-use-type-annotations + description: 'Enforce that python3.6+ type annotations are used instead of type comments.' + + - repo: https://github.com/PyCQA/isort + rev: 5.13.2 + hooks: + - id: isort + description: 'Sort imports alphabetically, and automatically separated into sections and by type.' + + - repo: https://github.com/hadialqattan/pycln + rev: v2.4.0 + hooks: + - id: pycln + description: 'Remove unused import statements.' 
+ + - repo: https://github.com/djlint/djLint + rev: v1.36.4 + hooks: + - id: djlint-reformat-jinja diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 48ba75f2..2293c260 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -60,7 +60,7 @@ representative at an online or offline event. Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at -romain@coderamp.io. +<romain@coderamp.io>. All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the @@ -114,15 +114,13 @@ the community. ## Attribution -This Code of Conduct is adapted from the [Contributor Covenant][homepage], +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 2.0, available at -https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. +<https://www.contributor-covenant.org/version/2/0/code_of_conduct.html>. Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). -[homepage]: https://www.contributor-covenant.org - For answers to common questions about this code of conduct, see the FAQ at -https://www.contributor-covenant.org/faq. Translations are available at -https://www.contributor-covenant.org/translations. +<https://www.contributor-covenant.org/faq>. Translations are available at +<https://www.contributor-covenant.org/translations>. diff --git a/README.md b/README.md index 991aeafe..6d0747a2 100644 --- a/README.md +++ b/README.md @@ -1,34 +1,56 @@ -[![Image](./docs/frontpage.png "GitIngest main page")](https://gitingest.com/) +[![Image](./docs/frontpage.png "GitIngest main page")](https://gitingest.com) + + + + License + + + + PyPI version + + + + Downloads + + + + GitHub issues + + + + Code style: black + + + + + Discord + + +# GitIngest -![License](https://img.shields.io/badge/license-MIT-blue.svg) - -# GitIngest 🔍 Turn any Git repository into a prompt-friendly text ingest for LLMs. 
You can also replace `hub` with `ingest` in any github url to access the coresponding digest -[gitingest.com](https://gitingest.com/) - +[gitingest.com](https://gitingest.com) ## 🚀 Features - **Easy code context**: Get a text digest from a git repository URL or a directory - **Smart Formatting**: Optimized output format for LLM prompts -- **Statistics about**: : +- **Statistics about**: - File and directory structure - Size of the extract - - Token count + - Token count - **CLI tool**: Run it as a command (Currently on Linux only) - **Python package**: Import it in your code - ## 📦 Installation -``` +``` bash pip install gitingest ``` - ## 💡 Command Line usage The `gitingest` command line tool allows you to analyze codebases and create a text dump of their contents. @@ -46,60 +68,62 @@ gitingest --help This will write the digest in a text file (default `digest.txt`) in your current working directory. - ## 🐛 Python package usage - ```python from gitingest import ingest summary, tree, content = ingest("path/to/directory") -#or from URL +# or from URL summary, tree, content = ingest("https://github.com/cyclotruc/gitingest") ``` By default, this won't write a file but can be enabled with the `output` argument - ## 🛠️ Using + - Tailwind CSS - Frontend - [FastAPI](https://github.com/fastapi/fastapi) - Backend framework - [tiktoken](https://github.com/openai/tiktoken) - Token estimation - [apianalytics.dev](https://www.apianalytics.dev/) - Simple Analytics +## 🌐 Self-host -## 🌐 Self-host 1. Build the image: -``` + +``` bash docker build -t gitingest . ``` 2. Run the container: -``` + +``` bash docker run -d --name gitingest -p 8000:8000 gitingest ``` + The application will be available at `http://localhost:8000` Ensure environment variables are set before running the application or deploying it via Docker. ## ✔️ Contributing -Contributions are welcome! +Contributions are welcome! 
Gitingest aims to be friendly for first time contributors, with a simple python and html codebase. If you need any help while working with the code, reach out to us on [discord](https://discord.com/invite/zerRaGK9EC) -### Ways to contribute +### Ways to contribute 1. Provide your feedback and ideas on discord -2. Open an Issue on github to report a bug -2. Create a Pull request +2. Open an Issue on github to report a bug +3. Create a Pull request - Fork the repository - Make your changes and test them locally - Open a pull request for review and feedback ### 🔧 Local dev -#### Environment Configuration +#### Environment Configuration + - **`ALLOWED_HOSTS`**: Specify allowed hostnames for the application. Default: `"gitingest.com,*.gitingest.com,gitdigest.dev,localhost"`. You can configure the application using the following environment variables: @@ -108,23 +132,25 @@ ALLOWED_HOSTS="gitingest.local,localhost" ``` #### Run locally -1. Clone the repository + +1. Clone the repository + ```bash git clone https://github.com/cyclotruc/gitingest.git cd gitingest ``` 2. Install dependencies + ```bash pip install -r requirements.txt ``` 3. Run the application: + ```bash cd src uvicorn main:app --reload ``` -The frontend will be available at `localhost:8000` - - +The frontend will be available at `localhost:8000` diff --git a/SECURITY.md b/SECURITY.md index cf4a494c..90a6d689 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -2,4 +2,4 @@ ## Reporting a Vulnerability -If you have discovered a vulnerability inside the project, report it privately at romain@coderamp.io. This way the maintainer can work on a proper fix without disclosing the problem to the public before it has been solved. +If you have discovered a vulnerability inside the project, report it privately at <romain@coderamp.io>. This way the maintainer can work on a proper fix without disclosing the problem to the public before it has been solved. 
diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..f7d6c65f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,17 @@ +[tool.pylint.format] +max-line-length = 119 + +[tool.pycln] +all = true + +[tool.isort] +profile = "black" +line_length = 119 +remove_redundant_aliases = true +float_to_top = true +order_by_type = true +filter_files = true + +[tool.black] +line-length = 119 +skip-string-normalization = true diff --git a/pytest.ini b/pytest.ini index 7444d64d..2a155008 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,7 +3,6 @@ pythonpath = src testpaths = src/gitingest/tests asyncio_mode = auto - python_files = test_*.py python_classes = Test* -python_functions = test_* \ No newline at end of file +python_functions = test_* diff --git a/requirements.txt b/requirements.txt index 6848603b..505955b0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,11 @@ -fastapi[standard] -uvicorn +black +click>=8.0.0 +djlint fastapi-analytics -slowapi -tiktoken +fastapi[standard] +pre-commit pytest pytest-asyncio -click>=8.0.0 +slowapi +tiktoken +uvicorn diff --git a/setup.py b/setup.py index 8afe6b73..6778a92c 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -from setuptools import setup, find_packages +from setuptools import find_packages, setup setup( name="gitingest", @@ -28,4 +28,4 @@ "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", ], -) \ No newline at end of file +) diff --git a/src/gitingest/__init__.py b/src/gitingest/__init__.py index ed84b214..212fefcb 100644 --- a/src/gitingest/__init__.py +++ b/src/gitingest/__init__.py @@ -1,6 +1,6 @@ -from .ingest_from_query import ingest_from_query -from .clone import clone_repo -from .parse_query import parse_query -from .ingest import ingest +from gitingest.clone import clone_repo +from gitingest.ingest import ingest +from gitingest.ingest_from_query import ingest_from_query +from gitingest.parse_query import parse_query -__all__ = ["ingest_from_query", 
"clone_repo", "parse_query", "ingest"] \ No newline at end of file +__all__ = ["ingest_from_query", "clone_repo", "parse_query", "ingest"] diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 81823e63..d19626d0 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -1,10 +1,11 @@ import os -import pathlib + import click +from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS from gitingest.ingest import ingest from gitingest.ingest_from_query import MAX_FILE_SIZE -from gitingest.parse_query import DEFAULT_IGNORE_PATTERNS + def normalize_pattern(pattern: str) -> str: pattern = pattern.strip() @@ -13,30 +14,38 @@ def normalize_pattern(pattern: str) -> str: pattern += "*" return pattern + @click.command() @click.argument('source', type=str, required=True) @click.option('--output', '-o', default=None, help='Output file path (default: .txt in current directory)') @click.option('--max-size', '-s', default=MAX_FILE_SIZE, help='Maximum file size to process in bytes') @click.option('--exclude-pattern', '-e', multiple=True, help='Patterns to exclude') @click.option('--include-pattern', '-i', multiple=True, help='Patterns to include') -def main(source, output, max_size, exclude_pattern, include_pattern): +def main( + source: str, + output: Optional[str], + max_size: int, + exclude_pattern: Tuple[str, ...], + include_pattern: Tuple[str, ...], +) -> None: """Analyze a directory and create a text dump of its contents.""" try: # Combine default and custom ignore patterns exclude_patterns = list(exclude_pattern) include_patterns = list(set(include_pattern)) - + if not output: output = "digest.txt" summary, tree, content = ingest(source, max_size, include_patterns, exclude_patterns, output=output) - + click.echo(f"Analysis complete! 
Output written to: {output}") click.echo("\nSummary:") click.echo(summary) - + except Exception as e: click.echo(f"Error: {str(e)}", err=True) raise click.Abort() + if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py index 4b69bc76..5932b47e 100644 --- a/src/gitingest/clone.py +++ b/src/gitingest/clone.py @@ -1,10 +1,11 @@ import asyncio -from typing import Tuple +from typing import Dict, Tuple from gitingest.utils import async_timeout CLONE_TIMEOUT = 20 + async def check_repo_exists(url: str) -> bool: proc = await asyncio.create_subprocess_exec( "curl", @@ -20,14 +21,15 @@ async def check_repo_exists(url: str) -> bool: stdout_str = stdout.decode() return "HTTP/1.1 404" not in stdout_str and "HTTP/2 404" not in stdout_str + @async_timeout(CLONE_TIMEOUT) -async def clone_repo(query: dict) -> str: +async def clone_repo(query: Dict[str, Any]) -> Tuple[bytes, bytes]: if not await check_repo_exists(query['url']): raise ValueError("Repository not found, make sure it is public") - + if query['commit']: proc = await asyncio.create_subprocess_exec( - "git", + "git", "clone", "--single-branch", query['url'], @@ -36,21 +38,21 @@ async def clone_repo(query: dict) -> str: stderr=asyncio.subprocess.PIPE, ) stdout, stderr = await proc.communicate() - + proc = await asyncio.create_subprocess_exec( "git", "-C", query['local_path'], "checkout", query['branch'], - stdout=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) stdout, stderr = await proc.communicate() elif query['branch'] != 'main' and query['branch'] != 'master' and query['branch']: proc = await asyncio.create_subprocess_exec( "git", - "clone", + "clone", "--depth=1", "--single-branch", "--branch", @@ -71,7 +73,7 @@ async def clone_repo(query: dict) -> str: stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) - + stdout, stderr = await proc.communicate() - - return stdout, stderr \ No 
newline at end of file + + return stdout, stderr diff --git a/src/gitingest/ignore_patterns.py b/src/gitingest/ignore_patterns.py new file mode 100644 index 00000000..803c6edc --- /dev/null +++ b/src/gitingest/ignore_patterns.py @@ -0,0 +1,102 @@ +from typing import List + +DEFAULT_IGNORE_PATTERNS: List[str] = [ + # Python + '*.pyc', + '*.pyo', + '*.pyd', + '__pycache__', + '.pytest_cache', + '.coverage', + '.tox', + '.nox', + '.mypy_cache', + '.ruff_cache', + '.hypothesis', + 'poetry.lock', + 'Pipfile.lock', + # JavaScript/Node + 'node_modules', + 'bower_components', + 'package-lock.json', + 'yarn.lock', + '.npm', + '.yarn', + '.pnpm-store', + # Version control + '.git', + '.svn', + '.hg', + '.gitignore', + '.gitattributes', + '.gitmodules', + # Images and media + '*.svg', + '*.png', + '*.jpg', + '*.jpeg', + '*.gif', + '*.ico', + '*.pdf', + '*.mov', + '*.mp4', + '*.mp3', + '*.wav', + # Virtual environments + 'venv', + '.venv', + 'env', + '.env', + 'virtualenv', + # IDEs and editors + '.idea', + '.vscode', + '.vs', + '*.swp', + '*.swo', + '*.swn', + '.settings', + '.project', + '.classpath', + '*.sublime-*', + # Temporary and cache files + '*.log', + '*.bak', + '*.swp', + '*.tmp', + '*.temp', + '.cache', + '.sass-cache', + '.eslintcache', + '.DS_Store', + 'Thumbs.db', + 'desktop.ini', + # Build directories and artifacts + 'build', + 'dist', + 'target', + 'out', + '*.egg-info', + '*.egg', + '*.whl', + '*.so', + '*.dylib', + '*.dll', + '*.class', + # Documentation + 'site-packages', + '.docusaurus', + '.next', + '.nuxt', + # Other common patterns + ## Minified files + '*.min.js', + '*.min.css', + ## Source maps + '*.map', + ## Terraform + '.terraform', + '*.tfstate*', + ## Dependencies in various languages + 'vendor/', +] diff --git a/src/gitingest/ingest.py b/src/gitingest/ingest.py index eac20818..8a1a54f7 100644 --- a/src/gitingest/ingest.py +++ b/src/gitingest/ingest.py @@ -1,18 +1,25 @@ import asyncio import shutil -from typing import Union, List from pathlib 
import Path +from typing import List, Union -from .ingest_from_query import ingest_from_query -from .clone import clone_repo -from .parse_query import parse_query +from gitingest.clone import clone_repo +from gitingest.ingest_from_query import ingest_from_query +from gitingest.parse_query import parse_query -def ingest(source: str, max_file_size: int = 10 * 1024 * 1024, include_patterns: Union[List[str], str] = None, exclude_patterns: Union[List[str], str] = None, output: str = None) -> str: + +def ingest( + source: str, + max_file_size: int = 10 * 1024 * 1024, + include_patterns: Union[List[str], str, None] = None, + exclude_patterns: Union[List[str], str, None] = None, + output: Optional[str] = None, +) -> Tuple[str, str, str]: try: - query = parse_query(source, max_file_size, False, include_patterns, exclude_patterns) + query = parse_query(source, max_file_size, False, include_patterns, exclude_patterns) if query['url']: asyncio.run(clone_repo(query)) - + summary, tree, content = ingest_from_query(query) if output: @@ -20,9 +27,10 @@ def ingest(source: str, max_file_size: int = 10 * 1024 * 1024, include_patterns: f.write(tree + "\n" + content) return summary, tree, content + finally: # Clean up the temporary directory if it was created if query['url']: # Get parent directory two levels up from local_path (../tmp) cleanup_path = str(Path(query['local_path']).parents[1]) - shutil.rmtree(cleanup_path, ignore_errors=True) \ No newline at end of file + shutil.rmtree(cleanup_path, ignore_errors=True) diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py index 4e7d5e78..905e6181 100644 --- a/src/gitingest/ingest_from_query.py +++ b/src/gitingest/ingest_from_query.py @@ -1,13 +1,13 @@ import os from fnmatch import fnmatch -from typing import Dict, List, Union -import tiktoken +from typing import Dict, List, Set +import tiktoken -MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB +MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB MAX_DIRECTORY_DEPTH = 20 # 
Maximum depth of directory traversal -MAX_FILES = 10000 # Maximum number of files to process -MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500MB +MAX_FILES = 10_000 # Maximum number of files to process +MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB def should_include(path: str, base_path: str, include_patterns: List[str]) -> bool: @@ -18,6 +18,7 @@ def should_include(path: str, base_path: str, include_patterns: List[str]) -> bo include = True return include + def should_exclude(path: str, base_path: str, ignore_patterns: List[str]) -> bool: rel_path = path.replace(base_path, "").lstrip(os.sep) for pattern in ignore_patterns: @@ -27,6 +28,7 @@ def should_exclude(path: str, base_path: str, ignore_patterns: List[str]) -> boo return True return False + def is_safe_symlink(symlink_path: str, base_path: str) -> bool: """Check if a symlink points to a location within the base directory.""" try: @@ -37,23 +39,32 @@ def is_safe_symlink(symlink_path: str, base_path: str) -> bool: # If there's any error resolving the paths, consider it unsafe return False + def is_text_file(file_path: str) -> bool: """Determines if a file is likely a text file based on its content.""" try: with open(file_path, 'rb') as file: chunk = file.read(1024) return not bool(chunk.translate(None, bytes([7, 8, 9, 10, 12, 13, 27] + list(range(0x20, 0x100))))) - except IOError: + except OSError: return False + def read_file_content(file_path: str) -> str: try: - with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + with open(file_path, encoding='utf-8', errors='ignore') as f: return f.read() except Exception as e: return f"Error reading file: {str(e)}" -def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = 0, stats: Dict = None) -> Dict: + +def scan_directory( + path: str, + query: Dict[str, Any], + seen_paths: Optional[Set[str]] = None, + depth: int = 0, + stats: Optional[Dict[str, int]] = None, +) -> Optional[Dict[str, Any]]: """Recursively analyzes a directory 
and its contents with safety limits.""" if seen_paths is None: seen_paths = set() @@ -76,6 +87,7 @@ def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = if real_path in seen_paths: print(f"Skipping already visited path: {path}") return None + seen_paths.add(real_path) result = { @@ -86,7 +98,7 @@ def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = "file_count": 0, "dir_count": 0, "path": path, - "ignore_content": False + "ignore_content": False, } ignore_patterns = query['ignore_patterns'] @@ -137,14 +149,20 @@ def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = "type": "file", "size": file_size, "content": content, - "path": item_path + "path": item_path, } result["children"].append(child) result["size"] += file_size result["file_count"] += 1 elif os.path.isdir(real_path): - subdir = scan_directory(real_path, query, seen_paths, depth + 1, stats) + subdir = scan_directory( + path=real_path, + query=query, + seen_paths=seen_paths, + depth=depth + 1, + stats=stats, + ) if subdir and (not include_patterns or subdir["file_count"] > 0): subdir["name"] = item subdir["path"] = item_path @@ -175,14 +193,20 @@ def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = "type": "file", "size": file_size, "content": content, - "path": item_path + "path": item_path, } result["children"].append(child) result["size"] += file_size result["file_count"] += 1 elif os.path.isdir(item_path): - subdir = scan_directory(item_path, query, seen_paths, depth + 1, stats) + subdir = scan_directory( + path=item_path, + query=query, + seen_paths=seen_paths, + depth=depth + 1, + stats=stats, + ) if subdir and (not include_patterns or subdir["file_count"] > 0): result["children"].append(subdir) result["size"] += subdir["size"] @@ -194,7 +218,13 @@ def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = return result -def extract_files_content(query: dict, node: Dict, 
max_file_size: int, files: List = None) -> List[Dict]: + +def extract_files_content( + query: Dict[str, Any], + node: Dict[str, Any], + max_file_size: int, + files: Optional[List[Dict[str, Any]]] = None, +) -> List[Dict[str, Any]]: """Recursively collects all text files with their contents.""" if files is None: files = [] @@ -204,17 +234,21 @@ def extract_files_content(query: dict, node: Dict, max_file_size: int, files: Li if node["size"] > max_file_size: content = None - files.append({ - "path": node["path"].replace(query['local_path'], ""), - "content": content, - "size": node["size"] - }) + files.append( + { + "path": node["path"].replace(query['local_path'], ""), + "content": content, + "size": node["size"], + }, + ) elif node["type"] == "directory": for child in node["children"]: - extract_files_content(query, child, max_file_size, files) + extract_files_content(query=query, node=child, max_file_size=max_file_size, files=files) + return files -def create_file_content_string(files: List[Dict]) -> str: + +def create_file_content_string(files: List[Dict[str, Any]]) -> str: """Creates a formatted string of file contents with separators.""" output = "" separator = "=" * 48 + "\n" @@ -223,6 +257,7 @@ def create_file_content_string(files: List[Dict]) -> str: for file in files: if not file['content']: continue + if file['path'].lower() == '/readme.md': output += separator output += f"File: {file['path']}\n" @@ -234,6 +269,7 @@ def create_file_content_string(files: List[Dict]) -> str: for file in files: if not file['content'] or file['path'].lower() == '/readme.md': continue + output += separator output += f"File: {file['path']}\n" output += separator @@ -241,12 +277,14 @@ def create_file_content_string(files: List[Dict]) -> str: return output -def create_summary_string(query: dict, nodes: Dict, files: List[Dict]) -> str: + +def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any], files: List[Dict[str, Any]]) -> str: """Creates a summary string with file 
counts and content size.""" if "user_name" in query: summary = f"Repository: {query['user_name']}/{query['repo_name']}\n" else: summary = f"Repository: {query['slug']}\n" + summary += f"Files analyzed: {nodes['file_count']}\n" if 'subpath' in query and query['subpath'] != '/': @@ -255,11 +293,19 @@ def create_summary_string(query: dict, nodes: Dict, files: List[Dict]) -> str: summary += f"Commit: {query['commit']}\n" elif 'branch' in query and query['branch'] != 'main' and query['branch'] != 'master' and query['branch']: summary += f"Branch: {query['branch']}\n" + return summary -def create_tree_structure(query: dict, node: Dict, prefix: str = "", is_last: bool = True) -> str: + +def create_tree_structure( + query: Dict[str, Any], + node: Dict[str, Any], + prefix: str = "", + is_last: bool = True, +) -> str: """Creates a tree-like string representation of the file structure.""" tree = "" + if not node["name"]: node["name"] = query['slug'] @@ -267,6 +313,7 @@ def create_tree_structure(query: dict, node: Dict, prefix: str = "", is_last: bo current_prefix = "└── " if is_last else "├── " name = node["name"] + "/" if node["type"] == "directory" else node["name"] tree += prefix + current_prefix + name + "\n" + if node["type"] == "directory": # Adjust prefix only if we added a node name new_prefix = prefix + (" " if is_last else "│ ") if node["name"] else prefix @@ -276,25 +323,29 @@ def create_tree_structure(query: dict, node: Dict, prefix: str = "", is_last: bo return tree -def generate_token_string(context_string: str) -> str: + +def generate_token_string(context_string: str) -> Optional[str]: """Returns the number of tokens in a text string.""" formatted_tokens = "" try: - encoding = tiktoken.get_encoding("cl100k_base", ) + encoding = tiktoken.get_encoding("cl100k_base") total_tokens = len(encoding.encode(context_string, disallowed_special=())) - + except Exception as e: print(e) return None + if total_tokens > 1000000: formatted_tokens = 
f"{total_tokens/1000000:.1f}M" elif total_tokens > 1000: formatted_tokens = f"{total_tokens/1000:.1f}k" else: formatted_tokens = f"{total_tokens}" + return formatted_tokens -def ingest_single_file(path: str, query: dict) -> Dict: + +def ingest_single_file(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]: if not os.path.isfile(path): raise ValueError(f"Path {path} is not a file") @@ -310,7 +361,7 @@ def ingest_single_file(path: str, query: dict) -> Dict: file_info = { "path": path.replace(query['local_path'], ""), "content": content, - "size": file_size + "size": file_size, } summary = ( @@ -326,11 +377,13 @@ def ingest_single_file(path: str, query: dict) -> Dict: formatted_tokens = generate_token_string(files_content) if formatted_tokens: summary += f"\nEstimated tokens: {formatted_tokens}" - return (summary, tree, files_content) -def ingest_directory(path: str, query: dict) -> Dict: - nodes = scan_directory(path, query) - files = extract_files_content(query, nodes, query['max_file_size']) + return summary, tree, files_content + + +def ingest_directory(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]: + nodes = scan_directory(path=path, query=query) + files = extract_files_content(query=query, node=nodes, max_file_size=query['max_file_size']) summary = create_summary_string(query, nodes, files) tree = "Directory structure:\n" + create_tree_structure(query, nodes) files_content = create_file_content_string(files) @@ -338,9 +391,11 @@ def ingest_directory(path: str, query: dict) -> Dict: formatted_tokens = generate_token_string(tree + files_content) if formatted_tokens: summary += f"\nEstimated tokens: {formatted_tokens}" - return (summary, tree, files_content) -def ingest_from_query(query: dict) -> Dict: + return summary, tree, files_content + + +def ingest_from_query(query: Dict[str, Any]) -> Tuple[str, str, str]: """Main entry point for analyzing a codebase directory or single file.""" path = f"{query['local_path']}{query['subpath']}" if not 
os.path.exists(path): @@ -348,6 +403,5 @@ def ingest_from_query(query: dict) -> Dict: if query.get('type') == 'blob': return ingest_single_file(path, query) - else: - return ingest_directory(path, query) + return ingest_directory(path, query) diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index 8b8f97a8..d8d4a320 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -1,55 +1,13 @@ -from typing import List, Union +import os import uuid -import os - - -DEFAULT_IGNORE_PATTERNS = [ - # Python - '*.pyc', '*.pyo', '*.pyd', '__pycache__', '.pytest_cache', '.coverage', - '.tox', '.nox', '.mypy_cache', '.ruff_cache', '.hypothesis', - 'poetry.lock', 'Pipfile.lock', - - # JavaScript/Node - 'node_modules', 'bower_components', 'package-lock.json', 'yarn.lock', - '.npm', '.yarn', '.pnpm-store', - - # Version control - '.git', '.svn', '.hg', '.gitignore', '.gitattributes', '.gitmodules', - - # Images and media - '*.svg', '*.png', '*.jpg', '*.jpeg', '*.gif', '*.ico', '*.pdf', - '*.mov', '*.mp4', '*.mp3', '*.wav', - - # Virtual environments - 'venv', '.venv', 'env', '.env', 'virtualenv', - - # IDEs and editors - '.idea', '.vscode', '.vs', '*.swp', '*.swo', '*.swn', - '.settings', '.project', '.classpath', '*.sublime-*', - - # Temporary and cache files - '*.log', '*.bak', '*.swp', '*.tmp', '*.temp', - '.cache', '.sass-cache', '.eslintcache', - '.DS_Store', 'Thumbs.db', 'desktop.ini', - - # Build directories and artifacts - 'build', 'dist', 'target', 'out', - '*.egg-info', '*.egg', '*.whl', - '*.so', '*.dylib', '*.dll', '*.class', - - # Documentation - 'site-packages', '.docusaurus', '.next', '.nuxt', - - # Other common patterns - '*.min.js', '*.min.css', # Minified files - '*.map', # Source maps - '.terraform', '*.tfstate*', # Terraform - 'vendor/', # Dependencies in various languages -] +from typing import Any, Dict, List, Optional, Union + +from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS TMP_BASE_PATH = "../tmp" 
-def parse_url(url: str) -> dict: + +def parse_url(url: str) -> Dict[str, Optional[str]]: parsed = { "user_name": None, "repo_name": None, @@ -62,22 +20,22 @@ def parse_url(url: str) -> dict: "slug": None, "id": None, } - + url = url.split(" ")[0] if not url.startswith('https://'): url = 'https://' + url - + # Extract domain and path url_parts = url.split('/') domain = url_parts[2] path_parts = url_parts[3:] - + if len(path_parts) < 2: raise ValueError("Invalid repository URL. Please provide a valid Git repository URL.") - + parsed["user_name"] = path_parts[0] parsed["repo_name"] = path_parts[1] - + # Keep original URL format parsed["url"] = f"https://{domain}/{parsed['user_name']}/{parsed['repo_name']}" parsed['slug'] = f"{parsed['user_name']}-{parsed['repo_name']}" @@ -89,10 +47,12 @@ def parse_url(url: str) -> dict: parsed["branch"] = path_parts[3] if len(parsed['branch']) == 40 and all(c in '0123456789abcdefABCDEF' for c in parsed['branch']): parsed["commit"] = parsed['branch'] - + parsed["subpath"] = "/" + "/".join(path_parts[4:]) + return parsed + def normalize_pattern(pattern: str) -> str: pattern = pattern.strip() pattern = pattern.lstrip(os.sep) @@ -100,16 +60,21 @@ def normalize_pattern(pattern: str) -> str: pattern += "*" return pattern + def parse_patterns(pattern: Union[List[str], str]) -> List[str]: if isinstance(pattern, list): pattern = ",".join(pattern) for p in pattern.split(","): if not all(c.isalnum() or c in "-_./+*" for c in p.strip()): - raise ValueError(f"Pattern '{p}' contains invalid characters. Only alphanumeric characters, dash (-), underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed.") + raise ValueError( + f"Pattern '{p}' contains invalid characters. Only alphanumeric characters, dash (-), " + "underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed." 
+ ) patterns = [normalize_pattern(p) for p in pattern.split(",")] return patterns + def override_ignore_patterns(ignore_patterns: List[str], include_patterns: List[str]) -> List[str]: for pattern in include_patterns: if pattern in ignore_patterns: @@ -117,8 +82,7 @@ def override_ignore_patterns(ignore_patterns: List[str], include_patterns: List[ return ignore_patterns -def parse_path(path: str) -> dict: - +def parse_path(path: str) -> Dict[str, Any]: query = { "local_path": os.path.abspath(path), "slug": os.path.basename(os.path.dirname(path)) + "/" + os.path.basename(path), @@ -128,7 +92,14 @@ def parse_path(path: str) -> dict: } return query -def parse_query(source: str, max_file_size: int, from_web: bool, include_patterns: Union[List[str], str] = None, ignore_patterns: Union[List[str], str] = None) -> dict: + +def parse_query( + source: str, + max_file_size: int, + from_web: bool, + include_patterns: Union[List[str], str] = None, + ignore_patterns: Union[List[str], str] = None, +) -> Dict[str, Any]: if from_web: query = parse_url(source) else: @@ -142,15 +113,14 @@ def parse_query(source: str, max_file_size: int, from_web: bool, include_pattern ignore_patterns = DEFAULT_IGNORE_PATTERNS + parse_patterns(ignore_patterns) else: ignore_patterns = DEFAULT_IGNORE_PATTERNS - + if include_patterns and include_patterns != "": include_patterns = parse_patterns(include_patterns) ignore_patterns = override_ignore_patterns(ignore_patterns, include_patterns) - else: + else: include_patterns = None - + query['ignore_patterns'] = ignore_patterns query['include_patterns'] = include_patterns - - return query + return query diff --git a/src/gitingest/tests/conftest.py b/src/gitingest/tests/conftest.py index d31de7bb..31dba62d 100644 --- a/src/gitingest/tests/conftest.py +++ b/src/gitingest/tests/conftest.py @@ -6,4 +6,4 @@ # Add both the project root and src directory to PYTHONPATH sys.path.insert(0, project_root) -sys.path.insert(0, os.path.join(project_root, 'src')) \ No newline 
at end of file +sys.path.insert(0, os.path.join(project_root, 'src')) diff --git a/src/gitingest/tests/test_clone.py b/src/gitingest/tests/test_clone.py index 06579b6e..06e90e64 100644 --- a/src/gitingest/tests/test_clone.py +++ b/src/gitingest/tests/test_clone.py @@ -1,72 +1,77 @@ +from unittest.mock import AsyncMock, patch + import pytest -from clone import clone_repo, check_repo_exists -from unittest.mock import patch, AsyncMock +from clone import check_repo_exists, clone_repo + @pytest.mark.asyncio -async def test_clone_repo_with_commit(): +async def test_clone_repo_with_commit() -> None: query = { 'commit': 'a' * 40, # Simulating a valid commit hash 'branch': 'main', 'url': 'https://github.com/user/repo', - 'local_path': '/tmp/repo' + 'local_path': '/tmp/repo', } - + with patch('clone.check_repo_exists', return_value=True) as mock_check: with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() mock_process.communicate.return_value = (b'output', b'error') mock_exec.return_value = mock_process - + await clone_repo(query) mock_check.assert_called_once_with(query['url']) assert mock_exec.call_count == 2 # Clone and checkout calls + @pytest.mark.asyncio -async def test_clone_repo_without_commit(): +async def test_clone_repo_without_commit() -> None: query = { 'commit': None, 'branch': 'main', 'url': 'https://github.com/user/repo', - 'local_path': '/tmp/repo' + 'local_path': '/tmp/repo', } - + with patch('clone.check_repo_exists', return_value=True) as mock_check: with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() mock_process.communicate.return_value = (b'output', b'error') mock_exec.return_value = mock_process - + await clone_repo(query) mock_check.assert_called_once_with(query['url']) assert mock_exec.call_count == 1 # Only clone call + @pytest.mark.asyncio -async def test_clone_repo_nonexistent_repository(): +async def 
test_clone_repo_nonexistent_repository() -> None: query = { 'commit': None, 'branch': 'main', 'url': 'https://github.com/user/nonexistent-repo', - 'local_path': '/tmp/repo' + 'local_path': '/tmp/repo', } - + with patch('gitingest.clone.check_repo_exists', return_value=False) as mock_check: with pytest.raises(ValueError, match="Repository not found"): await clone_repo(query) mock_check.assert_called_once_with(query['url']) + @pytest.mark.asyncio -async def test_check_repo_exists(): +async def test_check_repo_exists() -> None: url = "https://github.com/user/repo" - + with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() mock_process.communicate.return_value = (b'HTTP/1.1 200 OK\n', b'') mock_exec.return_value = mock_process - + # Test existing repository mock_process.returncode = 0 assert await check_repo_exists(url) is True - + # Test non-existing repository (404 response) mock_process.communicate.return_value = (b'HTTP/1.1 404 Not Found\n', b'') mock_process.returncode = 0 @@ -74,4 +79,4 @@ async def test_check_repo_exists(): # Test failed request mock_process.returncode = 1 - assert await check_repo_exists(url) is False \ No newline at end of file + assert await check_repo_exists(url) is False diff --git a/src/gitingest/tests/test_ingest.py b/src/gitingest/tests/test_ingest.py index 19a57b58..3a7ac127 100644 --- a/src/gitingest/tests/test_ingest.py +++ b/src/gitingest/tests/test_ingest.py @@ -1,12 +1,14 @@ +from pathlib import Path +from typing import Any, Dict + import pytest -from src.gitingest.ingest_from_query import ( - scan_directory, - extract_files_content, -) + +from gitingest.ingest_from_query import extract_files_content, scan_directory + # Test fixtures @pytest.fixture -def sample_query(): +def sample_query() -> Dict[str, Any]: return { 'user_name': 'test_user', 'repo_name': 'test_repo', @@ -14,16 +16,16 @@ def sample_query(): 'subpath': '/', 'branch': 'main', 'commit': None, - 'max_file_size': 
1000000, + 'max_file_size': 1_000_000, 'slug': 'test_user/test_repo', 'ignore_patterns': ['*.pyc', '__pycache__', '.git'], 'include_patterns': None, - 'pattern_type': 'exclude' - + 'pattern_type': 'exclude', } + @pytest.fixture -def temp_directory(tmp_path): +def temp_directory(tmp_path: Path) -> Path: # Creates the following structure: # test_repo/ # ├── file1.txt @@ -38,58 +40,53 @@ def temp_directory(tmp_path): # | └── file_dir1.txt # └── dir2/ # └── file_dir2.txt - + test_dir = tmp_path / "test_repo" test_dir.mkdir() - + # Root files (test_dir / "file1.txt").write_text("Hello World") (test_dir / "file2.py").write_text("print('Hello')") - + # src directory and its files src_dir = test_dir / "src" src_dir.mkdir() (src_dir / "subfile1.txt").write_text("Hello from src") (src_dir / "subfile2.py").write_text("print('Hello from src')") - + # src/subdir and its files subdir = src_dir / "subdir" subdir.mkdir() (subdir / "file_subdir.txt").write_text("Hello from subdir") (subdir / "file_subdir.py").write_text("print('Hello from subdir')") - + # dir1 and its file dir1 = test_dir / "dir1" dir1.mkdir() (dir1 / "file_dir1.txt").write_text("Hello from dir1") - + # dir2 and its file dir2 = test_dir / "dir2" dir2.mkdir() (dir2 / "file_dir2.txt").write_text("Hello from dir2") - + return test_dir -def test_scan_directory(temp_directory, sample_query): - result = scan_directory( - str(temp_directory), - query=sample_query - ) - + +def test_scan_directory(temp_directory: Path, sample_query: Dict[str, Any]) -> None: + result = scan_directory(str(temp_directory), query=sample_query) + assert result['type'] == 'directory' assert result['file_count'] == 8 # All .txt and .py files - assert result['dir_count'] == 4 # src, src/subdir, dir1, dir2 + assert result['dir_count'] == 4 # src, src/subdir, dir1, dir2 assert len(result['children']) == 5 # file1.txt, file2.py, src, dir1, dir2 -def test_extract_files_content(temp_directory, sample_query): - nodes = scan_directory( - 
str(temp_directory), - query=sample_query - ) - - files = extract_files_content(sample_query, nodes, max_file_size=1000000) + +def test_extract_files_content(temp_directory: Path, sample_query: Dict[str, Any]) -> None: + nodes = scan_directory(str(temp_directory), query=sample_query) + files = extract_files_content(query=sample_query, node=nodes, max_file_size=1_000_000) assert len(files) == 8 # All .txt and .py files - + # Check for presence of key files paths = [f['path'] for f in files] assert any('file1.txt' in p for p in paths) @@ -101,22 +98,17 @@ def test_extract_files_content(temp_directory, sample_query): assert any('file_dir2.txt' in p for p in paths) - # TODO: test with include patterns: ['*.txt'] # TODO: test with wrong include patterns: ['*.qwerty'] -#single folder patterns +# single folder patterns # TODO: test with include patterns: ['src/*'] # TODO: test with include patterns: ['/src/*'] # TODO: test with include patterns: ['/src/'] # TODO: test with include patterns: ['/src*'] -#multiple patterns +# multiple patterns # TODO: test with multiple include patterns: ['*.txt', '*.py'] # TODO: test with multiple include patterns: ['/src/*', '*.txt'] # TODO: test with multiple include patterns: ['/src*', '*.txt'] - - - - diff --git a/src/gitingest/tests/test_parse_query.py b/src/gitingest/tests/test_parse_query.py index da614048..ae4c1659 100644 --- a/src/gitingest/tests/test_parse_query.py +++ b/src/gitingest/tests/test_parse_query.py @@ -1,12 +1,14 @@ import pytest -from gitingest.parse_query import parse_query, parse_url, DEFAULT_IGNORE_PATTERNS +from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS +from gitingest.parse_query import parse_query, parse_url -def test_parse_url_valid(): + +def test_parse_url_valid() -> None: test_cases = [ "https://github.com/user/repo", - "https://gitlab.com/user/repo", - "https://bitbucket.org/user/repo" + "https://gitlab.com/user/repo", + "https://bitbucket.org/user/repo", ] for url in test_cases: result = 
parse_url(url) @@ -14,16 +16,15 @@ def test_parse_url_valid(): assert result["repo_name"] == "repo" assert result["url"] == url -def test_parse_url_invalid(): + +def test_parse_url_invalid() -> None: url = "https://only-domain.com" with pytest.raises(ValueError, match="Invalid repository URL"): parse_url(url) -def test_parse_query_basic(): - test_cases = [ - "https://github.com/user/repo", - "https://gitlab.com/user/repo" - ] + +def test_parse_query_basic() -> None: + test_cases = ["https://github.com/user/repo", "https://gitlab.com/user/repo"] for url in test_cases: result = parse_query(url, max_file_size=50, from_web=True, ignore_patterns='*.txt') assert result["user_name"] == "user" @@ -31,13 +32,15 @@ def test_parse_query_basic(): assert result["url"] == url assert "*.txt" in result["ignore_patterns"] -def test_parse_query_include_pattern(): + +def test_parse_query_include_pattern() -> None: url = "https://github.com/user/repo" result = parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py') assert result["include_patterns"] == ["*.py"] assert result["ignore_patterns"] == DEFAULT_IGNORE_PATTERNS -def test_parse_query_invalid_pattern(): + +def test_parse_query_invalid_pattern() -> None: url = "https://github.com/user/repo" with pytest.raises(ValueError, match="Pattern.*contains invalid characters"): - parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py;rm -rf') \ No newline at end of file + parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py;rm -rf') diff --git a/src/gitingest/utils.py b/src/gitingest/utils.py index ebbb4090..bf64aecf 100644 --- a/src/gitingest/utils.py +++ b/src/gitingest/utils.py @@ -1,16 +1,18 @@ - ## Async Timeout decorator import asyncio import functools -from typing import TypeVar, Callable +from typing import Awaitable, Callable, TypeVar T = TypeVar("T") + class AsyncTimeoutError(Exception): """Raised when an async operation exceeds its timeout limit.""" + pass -def 
async_timeout(seconds: int = 10): + +def async_timeout(seconds: int = 10) -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]: def decorator(func: Callable[..., T]) -> Callable[..., T]: @functools.wraps(func) async def wrapper(*args, **kwargs) -> T: @@ -18,5 +20,7 @@ async def wrapper(*args, **kwargs) -> T: return await asyncio.wait_for(func(*args, **kwargs), timeout=seconds) except asyncio.TimeoutError: raise AsyncTimeoutError(f"Clone timed out after {seconds} seconds") + return wrapper - return decorator \ No newline at end of file + + return decorator diff --git a/src/main.py b/src/main.py index 3fe94233..e651528f 100644 --- a/src/main.py +++ b/src/main.py @@ -1,18 +1,18 @@ import os -from dotenv import load_dotenv +from typing import Dict +from api_analytics.fastapi import Analytics +from dotenv import load_dotenv from fastapi import FastAPI, Request -from fastapi.templating import Jinja2Templates -from fastapi.responses import HTMLResponse, FileResponse, Response +from fastapi.responses import FileResponse, HTMLResponse, Response from fastapi.staticfiles import StaticFiles -from starlette.middleware.trustedhost import TrustedHostMiddleware -from api_analytics.fastapi import Analytics +from fastapi.templating import Jinja2Templates from slowapi import _rate_limit_exceeded_handler from slowapi.errors import RateLimitExceeded +from starlette.middleware.trustedhost import TrustedHostMiddleware -from server_utils import limiter from routers import download, dynamic, index - +from server_utils import limiter load_dotenv() @@ -36,31 +36,29 @@ app.add_middleware(TrustedHostMiddleware, allowed_hosts=allowed_hosts) templates = Jinja2Templates(directory="templates") + @app.get("/health") -async def health_check(): +async def health_check() -> Dict[str, str]: return {"status": "healthy"} + @app.head("/") -async def head_root(): +async def head_root() -> HTMLResponse: """Mirror the headers and status code of the index page""" - return HTMLResponse( - 
content=None, - headers={ - "content-type": "text/html; charset=utf-8" - } - ) - + return HTMLResponse(content=None, headers={"content-type": "text/html; charset=utf-8"}) + + @app.get("/api/", response_class=HTMLResponse) @app.get("/api", response_class=HTMLResponse) -async def api_docs(request: Request): - return templates.TemplateResponse( - "api.jinja", {"request": request} - ) +async def api_docs(request: Request) -> HTMLResponse: + return templates.TemplateResponse("api.jinja", {"request": request}) + @app.get("/robots.txt") -async def robots(): +async def robots() -> FileResponse: return FileResponse('static/robots.txt') + app.include_router(index) app.include_router(download) -app.include_router(dynamic) \ No newline at end of file +app.include_router(dynamic) diff --git a/src/process_query.py b/src/process_query.py index 18d8d76c..bd7940fe 100644 --- a/src/process_query.py +++ b/src/process_query.py @@ -1,16 +1,24 @@ -from typing import List -from fastapi.templating import Jinja2Templates +from typing import Dict + from fastapi import Request +from fastapi.templating import Jinja2Templates -from config import MAX_DISPLAY_SIZE, EXAMPLE_REPOS -from gitingest import ingest_from_query, clone_repo, parse_query -from server_utils import logSliderToSize, Colors +from config import EXAMPLE_REPOS, MAX_DISPLAY_SIZE +from gitingest import clone_repo, ingest_from_query, parse_query +from server_utils import Colors, logSliderToSize templates = Jinja2Templates(directory="templates") -def print_query(query, request, max_file_size, pattern_type, pattern): + +def print_query( + query: Dict[str, Any], + request: Request, + max_file_size: int, + pattern_type: str, + pattern: str, +) -> None: print(f"{Colors.WHITE}{query['url']:<20}{Colors.END}", end="") - if int(max_file_size/1024) != 50: + if int(max_file_size / 1024) != 50: print(f" | {Colors.YELLOW}Size: {int(max_file_size/1024)}kb{Colors.END}", end="") if pattern_type == "include" and pattern != "": print(f" | 
{Colors.YELLOW}Include {pattern}{Colors.END}", end="") @@ -18,46 +26,74 @@ def print_query(query, request, max_file_size, pattern_type, pattern): print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="") -def print_error(query, request, e, max_file_size, pattern_type, pattern): +def print_error( + query: Dict[str, Any], + request: Request, + e: Exception, + max_file_size: int, + pattern_type: str, + pattern: str, +) -> None: print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") print_query(query, request, max_file_size, pattern_type, pattern) print(f" | {Colors.RED}{e}{Colors.END}") -def print_success(query, request, max_file_size, pattern_type, pattern, summary): + +def print_success( + query: Dict[str, Any], + request: Request, + max_file_size: int, + pattern_type: str, + pattern: str, + summary: str, +) -> None: estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] print(f"{Colors.GREEN}INFO{Colors.END}: {Colors.GREEN}<- {Colors.END}", end="") print_query(query, request, max_file_size, pattern_type, pattern) print(f" | {Colors.PURPLE}{estimated_tokens}{Colors.END}") - -async def process_query(request: Request, input_text: str, slider_position: int, pattern_type: str = "exclude", pattern: str = "", is_index: bool = False) -> str: +async def process_query( + request: Request, + input_text: str, + slider_position: int, + pattern_type: str = "exclude", + pattern: str = "", + is_index: bool = False, +) -> HTMLResponse: template = "index.jinja" if is_index else "github.jinja" max_file_size = logSliderToSize(slider_position) + if pattern_type == "include": include_patterns = pattern exclude_patterns = None elif pattern_type == "exclude": exclude_patterns = pattern include_patterns = None + try: - query = parse_query(input_text, max_file_size, True, include_patterns, exclude_patterns) + query = parse_query( + source=input_text, + max_file_size=max_file_size, + from_web=True, + 
include_patterns=include_patterns, + ignore_patterns=exclude_patterns, + ) await clone_repo(query) summary, tree, content = ingest_from_query(query) with open(f"{query['local_path']}.txt", "w") as f: f.write(tree + "\n" + content) - - except Exception as e: - #hack to print error message when query is not defined + # hack to print error message when query is not defined if 'query' in locals() and query is not None and isinstance(query, dict): - print_error(query, request, e, max_file_size, pattern_type, pattern) + print_error(query, request, e, max_file_size, pattern_type, pattern) else: print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") print(f"{Colors.RED}{e}{Colors.END}") + return templates.TemplateResponse( - template, + template, { "request": request, "github_url": input_text, @@ -66,25 +102,37 @@ async def process_query(request: Request, input_text: str, slider_position: int, "default_file_size": slider_position, "pattern_type": pattern_type, "pattern": pattern, - } + }, ) - + if len(content) > MAX_DISPLAY_SIZE: - content = f"(Files content cropped to {int(MAX_DISPLAY_SIZE/1000)}k characters, download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] - print_success(query, request, max_file_size, pattern_type, pattern, summary) + content = ( + f"(Files content cropped to {int(MAX_DISPLAY_SIZE/1000)}k characters, " + "download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] + ) + + print_success( + query=query, + request=request, + max_file_size=max_file_size, + pattern_type=pattern_type, + pattern=pattern, + summary=summary, + ) + return templates.TemplateResponse( - template, + template, { - "request": request, + "request": request, "github_url": input_text, - "result": True, + "result": True, "summary": summary, - "tree": tree, + "tree": tree, "content": content, "examples": EXAMPLE_REPOS if is_index else [], "ingest_id": query['id'], "default_file_size": slider_position, "pattern_type": pattern_type, "pattern": 
pattern, - } + }, ) diff --git a/src/routers/__init__.py b/src/routers/__init__.py index b1871c1a..ace7bd09 100644 --- a/src/routers/__init__.py +++ b/src/routers/__init__.py @@ -1,5 +1,5 @@ -from .download import router as download -from .dynamic import router as dynamic -from .index import router as index +from routers.download import router as download +from routers.dynamic import router as dynamic +from routers.index import router as index -__all__ = ["download", "dynamic", "index"] \ No newline at end of file +__all__ = ["download", "dynamic", "index"] diff --git a/src/routers/download.py b/src/routers/download.py index e26b2df2..95cec0fe 100644 --- a/src/routers/download.py +++ b/src/routers/download.py @@ -1,29 +1,30 @@ -from fastapi import HTTPException, APIRouter +import os + +from fastapi import APIRouter, HTTPException from fastapi.responses import Response + from config import TMP_BASE_PATH -import os router = APIRouter() + @router.get("/download/{digest_id}") -async def download_ingest(digest_id: str): +async def download_ingest(digest_id: str) -> Response: try: # Find the first .txt file in the directory directory = f"{TMP_BASE_PATH}/{digest_id}" txt_files = [f for f in os.listdir(directory) if f.endswith('.txt')] - + if not txt_files: raise FileNotFoundError("No .txt file found") - - with open(f"{directory}/{txt_files[0]}", "r") as f: + + with open(f"{directory}/{txt_files[0]}") as f: content = f.read() - + return Response( content=content, media_type="text/plain", - headers={ - "Content-Disposition": f"attachment; filename={txt_files[0]}" - } + headers={"Content-Disposition": f"attachment; filename={txt_files[0]}"}, ) except FileNotFoundError: - raise HTTPException(status_code=404, detail="Digest not found") \ No newline at end of file + raise HTTPException(status_code=404, detail="Digest not found") diff --git a/src/routers/dynamic.py b/src/routers/dynamic.py index 6a0a2f99..12216f15 100644 --- a/src/routers/dynamic.py +++ b/src/routers/dynamic.py 
@@ -1,4 +1,4 @@ -from fastapi import APIRouter, Request, Form +from fastapi import APIRouter, Form, Request from fastapi.responses import HTMLResponse from fastapi.templating import Jinja2Templates @@ -8,26 +8,34 @@ router = APIRouter() templates = Jinja2Templates(directory="templates") + @router.get("/{full_path:path}") -async def catch_all(request: Request, full_path: str): +async def catch_all(request: Request, full_path: str) -> HTMLResponse: return templates.TemplateResponse( "github.jinja", { "request": request, "github_url": f"https://github.com/{full_path}", "loading": True, - "default_file_size": 243 - } + "default_file_size": 243, + }, ) + @router.post("/{full_path:path}", response_class=HTMLResponse) -@limiter.limit("10/minute") +@limiter.limit("10/minute") async def process_catch_all( - request: Request, + request: Request, input_text: str = Form(...), max_file_size: int = Form(...), pattern_type: str = Form(...), - pattern: str = Form(...) -): - return await process_query(request, input_text, max_file_size, pattern_type, pattern, is_index=False) - \ No newline at end of file + pattern: str = Form(...), +) -> HTMLResponse: + return await process_query( + request, + input_text, + max_file_size, + pattern_type, + pattern, + is_index=False, + ) diff --git a/src/routers/index.py b/src/routers/index.py index 610d87ce..f2728805 100644 --- a/src/routers/index.py +++ b/src/routers/index.py @@ -1,40 +1,41 @@ -from fastapi import APIRouter, Request, Form +from fastapi import APIRouter, Form, Request from fastapi.responses import HTMLResponse from fastapi.templating import Jinja2Templates -from server_utils import limiter -from process_query import process_query from config import EXAMPLE_REPOS - +from process_query import process_query +from server_utils import limiter router = APIRouter() templates = Jinja2Templates(directory="templates") @router.get("/", response_class=HTMLResponse) -async def home(request: Request): +async def home(request: Request) -> 
HTMLResponse: return templates.TemplateResponse( - "index.jinja", + "index.jinja", { "request": request, "examples": EXAMPLE_REPOS, - "default_file_size": 243 - } + "default_file_size": 243, + }, ) @router.post("/", response_class=HTMLResponse) -@limiter.limit("10/minute") +@limiter.limit("10/minute") async def index_post( - request: Request, + request: Request, input_text: str = Form(...), max_file_size: int = Form(...), pattern_type: str = Form(...), - pattern: str = Form(...) -): - return await process_query(request, input_text, max_file_size, pattern_type, pattern, is_index=True) - - - - - + pattern: str = Form(...), +) -> HTMLResponse: + return await process_query( + request, + input_text, + max_file_size, + pattern_type, + pattern, + is_index=True, + ) diff --git a/src/server_utils.py b/src/server_utils.py index 584041bd..2a6e186f 100644 --- a/src/server_utils.py +++ b/src/server_utils.py @@ -1,21 +1,26 @@ +import math + ## Rate Limiter from slowapi import Limiter from slowapi.util import get_remote_address + limiter = Limiter(key_func=get_remote_address) -## Logarithmic slider to file size -import math -def logSliderToSize(position): + +## Logarithmic slider to file size conversion +def logSliderToSize(position: int) -> int: """Convert slider position to file size in KB""" maxp = 500 minv = math.log(1) maxv = math.log(102400) - + return round(math.exp(minv + (maxv - minv) * pow(position / maxp, 1.5))) * 1024 + ## Color printing utility class Colors: """ANSI color codes""" + BLACK = "\033[0;30m" RED = "\033[0;31m" GREEN = "\033[0;32m" diff --git a/src/static/favicon.svg b/src/static/favicon.svg index f9b0ae4c..dc5a443a 100644 --- a/src/static/favicon.svg +++ b/src/static/favicon.svg @@ -1 +1 @@ -1 \ No newline at end of file +1 diff --git a/src/static/js/snow.js b/src/static/js/snow.js index 0576bff1..a5e1d87f 100644 --- a/src/static/js/snow.js +++ b/src/static/js/snow.js @@ -88,4 +88,4 @@ function initSnow() { document.addEventListener('DOMContentLoaded', 
initSnow); // Also initialize when the HTMX content is swapped -document.addEventListener('htmx:afterSettle', initSnow); \ No newline at end of file +document.addEventListener('htmx:afterSettle', initSnow); diff --git a/src/static/robots.txt b/src/static/robots.txt index 49e4f2d9..b757ab6a 100644 --- a/src/static/robots.txt +++ b/src/static/robots.txt @@ -1,5 +1,4 @@ User-agent: * -Allow: / +Allow: / Allow: /api/ Allow: /cyclotruc/gitingest/ - diff --git a/src/templates/api.jinja b/src/templates/api.jinja index 41f0e836..c5e57bdb 100644 --- a/src/templates/api.jinja +++ b/src/templates/api.jinja @@ -1,41 +1,35 @@ {% extends "base.jinja" %} - {% block title %}Git ingest API{% endblock %} - {% block content %} -
-
-
-

API Documentation

- - -
-
-
-
- - - -
-
-

- The API is currently under development.. -

+
+
+
+

API Documentation

+
+
+
+
+ + + +
+
+

The API is currently under development..

+
+

+ We're working on making our API available to the public. + In the meantime, you can + open an issue on github + to suggest features. +

-

- We're working on making our API available to the public. - In the meantime, you can - - open an issue on github - - to suggest features. -

-
-{% endblock %} \ No newline at end of file +{% endblock %} diff --git a/src/templates/base.jinja b/src/templates/base.jinja index 3ef8bd70..e6c3fcda 100644 --- a/src/templates/base.jinja +++ b/src/templates/base.jinja @@ -1,41 +1,44 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - {% block title %}Git ingest{% endblock %} - - - - - - {% block extra_head %}{% endblock %} - - - - - {% include 'components/navbar.jinja' %} - - -
-
- {% block content %}{% endblock %} -
-
- - {% include 'components/footer.jinja' %} - - {% block extra_scripts %}{% endblock %} - - \ No newline at end of file + + {% block extra_head %}{% endblock %} + + + + {% include 'components/navbar.jinja' %} + +
+
+ {% block content %}{% endblock %} +
+
+ {% include 'components/footer.jinja' %} + {% block extra_scripts %}{% endblock %} + + diff --git a/src/templates/components/footer.jinja b/src/templates/components/footer.jinja index a0820416..e8ffa9ee 100644 --- a/src/templates/components/footer.jinja +++ b/src/templates/components/footer.jinja @@ -4,19 +4,23 @@
- \ No newline at end of file + diff --git a/src/templates/components/github_form.jinja b/src/templates/components/github_form.jinja index ec6054ef..7be65aee 100644 --- a/src/templates/components/github_form.jinja +++ b/src/templates/components/github_form.jinja @@ -2,28 +2,30 @@
- + class="absolute md:block hidden left-0 h-[4.5rem] w-[4.5rem] bottom-0 -translate-x-full ml-3">
+ id="ingestForm" + onsubmit="handleSubmit(event{% if is_index %}, true{% endif %})">
- +
-
@@ -31,74 +33,62 @@
- - - + +
-
-
- - + +
- {% if show_examples %} - -
-

Try these example repositories:

-
- {% for example in examples %} - + +
+

Try these example repositories:

+
+ {% for example in examples %} + {% endfor %} +
-
{% endif %}
-
\ No newline at end of file +
diff --git a/src/templates/components/navbar.jinja b/src/templates/components/navbar.jinja index 6275cb87..6f4b2ce0 100644 --- a/src/templates/components/navbar.jinja +++ b/src/templates/components/navbar.jinja @@ -21,7 +21,6 @@ fetchGitHubStars(); -
- \ No newline at end of file + diff --git a/src/templates/components/result.jinja b/src/templates/components/result.jinja index 00b6f934..cd0a9783 100644 --- a/src/templates/components/result.jinja +++ b/src/templates/components/result.jinja @@ -1,115 +1,94 @@ {% if result %} -
-
-
-
- -
- -
-
-

Summary

-
- - -
-
-
- -
- {% if ingest_id %} -
-
-
-
- -
- {% endif %} - - +
+
+
+
+ +
+ +
+
+

Summary

+
+
+
+ +
+ {% if ingest_id %} + - - -
-
-

Directory Structure

-
-
-
- -
-
-
-
-
- -
+
+
+
-
- - -
-
-

Files Content

-
-
-
- -
-
-
-
-
- + + + + Copy +
+
+
+ +
+
+
+ +
+
+

Files Content

+
+
+ +
+
+
+
+
+
+
-{% endif %} \ No newline at end of file +{% endif %} diff --git a/src/templates/github.jinja b/src/templates/github.jinja index fdedcce7..c373367c 100644 --- a/src/templates/github.jinja +++ b/src/templates/github.jinja @@ -1,39 +1,33 @@ {% extends "base.jinja" %} - {% block content %} -{% if error_message %} -
- {{ error_message }} -
-{% endif %} - -{% with is_index=true, show_examples=false %} - {% include 'components/github_form.jinja' %} -{% endwith %} - -{% if loading %} -
-
-
-
-

Loading...

-
-
-{% endif %} - -{% include 'components/result.jinja' %} + {% if error_message %} +
{{ error_message }}
+ {% endif %} + {% with is_index=true, show_examples=false %} + {% include 'components/github_form.jinja' %} + {% endwith %} + {% if loading %} +
+
+
+
+

Loading...

+
+
+ {% endif %} + {% include 'components/result.jinja' %} {% endblock content %} - {% block extra_scripts %} - -{% endblock extra_scripts %} \ No newline at end of file + +{% endblock extra_scripts %} diff --git a/src/templates/index.jinja b/src/templates/index.jinja index 80015ade..e29066f6 100644 --- a/src/templates/index.jinja +++ b/src/templates/index.jinja @@ -1,67 +1,57 @@ {% extends "base.jinja" %} - {% block extra_head %} - + {% endblock %} - {% block content %} -
-
- - - - -

- Prompt-friendly
codebase  -

- +
+
+ + + + + + +

+ Prompt-friendly +
+ codebase  +

+ +
+

+ Turn any Git repository into a simple text ingest of its codebase. +

+

+ This is useful for feeding a codebase into any LLM. +

+

+ You can also replace 'hub' with 'ingest' in any Github URL +

-

- Turn any Git repository into a simple text ingest of its codebase. -

-

- This is useful for feeding a codebase into any LLM. -

-

- You can also replace 'hub' with 'ingest' in any Github URL -

-
- -{% if error_message %} -
- {{ error_message }} -
-{% endif %} - -{% with is_index=true, show_examples=true %} - {% include 'components/github_form.jinja' %} -{% endwith %} - -{% include 'components/result.jinja' %} - - - - -{% endblock %} \ No newline at end of file + {% if error_message %} +
{{ error_message }}
+ {% endif %} + {% with is_index=true, show_examples=true %} + {% include 'components/github_form.jinja' %} + {% endwith %} + {% include 'components/result.jinja' %} +{% endblock %} From d0a320fd300a09d33bba1e35d9b18bcd3cf4ba22 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Fri, 27 Dec 2024 21:35:26 +0100 Subject: [PATCH 2/9] Resolve error and fix remaining type hint violations --- requirements.txt | 2 ++ src/config.py | 2 +- src/gitingest/cli.py | 1 + src/gitingest/clone.py | 2 +- src/gitingest/ingest.py | 17 ++++++++++++++--- src/gitingest/ingest_from_query.py | 12 +++++++----- src/gitingest/parse_query.py | 7 ++++--- src/gitingest/tests/test_clone.py | 3 ++- src/gitingest/tests/test_ingest.py | 4 ++++ src/gitingest/utils.py | 11 ++++++----- src/main.py | 18 ++++++++++++++++-- src/process_query.py | 7 ++++--- 12 files changed, 62 insertions(+), 24 deletions(-) diff --git a/requirements.txt b/requirements.txt index 505955b0..7d3680e2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,13 @@ black click>=8.0.0 djlint +dotenv fastapi-analytics fastapi[standard] pre-commit pytest pytest-asyncio slowapi +starlette tiktoken uvicorn diff --git a/src/config.py b/src/config.py index cdf2849b..b918fb2a 100644 --- a/src/config.py +++ b/src/config.py @@ -1,4 +1,4 @@ -MAX_DISPLAY_SIZE = 300000 +MAX_DISPLAY_SIZE = 300_000 TMP_BASE_PATH = "../tmp" EXAMPLE_REPOS = [ diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index d19626d0..14df2190 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -1,4 +1,5 @@ import os +from typing import Optional, Tuple import click diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py index 5932b47e..e7994c14 100644 --- a/src/gitingest/clone.py +++ b/src/gitingest/clone.py @@ -1,5 +1,5 @@ import asyncio -from typing import Dict, Tuple +from typing import Any, Dict, Tuple from gitingest.utils import async_timeout diff --git a/src/gitingest/ingest.py 
b/src/gitingest/ingest.py index 8a1a54f7..22fae6d2 100644 --- a/src/gitingest/ingest.py +++ b/src/gitingest/ingest.py @@ -1,7 +1,8 @@ import asyncio +import inspect import shutil from pathlib import Path -from typing import List, Union +from typing import List, Optional, Tuple, Union from gitingest.clone import clone_repo from gitingest.ingest_from_query import ingest_from_query @@ -16,9 +17,19 @@ def ingest( output: Optional[str] = None, ) -> Tuple[str, str, str]: try: - query = parse_query(source, max_file_size, False, include_patterns, exclude_patterns) + query = parse_query( + source=source, + max_file_size=max_file_size, + from_web=False, + include_patterns=include_patterns, + ignore_patterns=exclude_patterns, + ) if query['url']: - asyncio.run(clone_repo(query)) + clone_result = clone_repo(query) + if inspect.iscoroutine(clone_result): + asyncio.run(clone_result) + else: + raise TypeError("clone_repo did not return a coroutine as expected.") summary, tree, content = ingest_from_query(query) diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py index 905e6181..0080c25b 100644 --- a/src/gitingest/ingest_from_query.py +++ b/src/gitingest/ingest_from_query.py @@ -1,6 +1,6 @@ import os from fnmatch import fnmatch -from typing import Dict, List, Set +from typing import Any, Dict, List, Optional, Set, Tuple import tiktoken @@ -335,10 +335,10 @@ def generate_token_string(context_string: str) -> Optional[str]: print(e) return None - if total_tokens > 1000000: - formatted_tokens = f"{total_tokens/1000000:.1f}M" - elif total_tokens > 1000: - formatted_tokens = f"{total_tokens/1000:.1f}k" + if total_tokens > 1_000_000: + formatted_tokens = f"{total_tokens / 1_000_000:.1f}M" + elif total_tokens > 1_000: + formatted_tokens = f"{total_tokens / 1_000:.1f}k" else: formatted_tokens = f"{total_tokens}" @@ -383,6 +383,8 @@ def ingest_single_file(path: str, query: Dict[str, Any]) -> Tuple[str, str, str] def ingest_directory(path: str, query: 
Dict[str, Any]) -> Tuple[str, str, str]: nodes = scan_directory(path=path, query=query) + if not nodes: + raise ValueError(f"No files found in {path}") files = extract_files_content(query=query, node=nodes, max_file_size=query['max_file_size']) summary = create_summary_string(query, nodes, files) tree = "Directory structure:\n" + create_tree_structure(query, nodes) diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index d8d4a320..669f28f3 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -7,7 +7,7 @@ TMP_BASE_PATH = "../tmp" -def parse_url(url: str) -> Dict[str, Optional[str]]: +def parse_url(url: str) -> Dict[str, Any]: parsed = { "user_name": None, "repo_name": None, @@ -97,8 +97,8 @@ def parse_query( source: str, max_file_size: int, from_web: bool, - include_patterns: Union[List[str], str] = None, - ignore_patterns: Union[List[str], str] = None, + include_patterns: Optional[Union[List[str], str]] = None, + ignore_patterns: Optional[Union[List[str], str]] = None, ) -> Dict[str, Any]: if from_web: query = parse_url(source) @@ -107,6 +107,7 @@ def parse_query( query = parse_url(source) else: query = parse_path(source) + query['max_file_size'] = max_file_size if ignore_patterns and ignore_patterns != "": diff --git a/src/gitingest/tests/test_clone.py b/src/gitingest/tests/test_clone.py index 06e90e64..950c9908 100644 --- a/src/gitingest/tests/test_clone.py +++ b/src/gitingest/tests/test_clone.py @@ -1,7 +1,8 @@ from unittest.mock import AsyncMock, patch import pytest -from clone import check_repo_exists, clone_repo + +from gitingest.clone import check_repo_exists, clone_repo @pytest.mark.asyncio diff --git a/src/gitingest/tests/test_ingest.py b/src/gitingest/tests/test_ingest.py index 3a7ac127..33b174b1 100644 --- a/src/gitingest/tests/test_ingest.py +++ b/src/gitingest/tests/test_ingest.py @@ -75,6 +75,8 @@ def temp_directory(tmp_path: Path) -> Path: def test_scan_directory(temp_directory: Path, sample_query: 
Dict[str, Any]) -> None: result = scan_directory(str(temp_directory), query=sample_query) + if result is None: + assert False, "Result is None" assert result['type'] == 'directory' assert result['file_count'] == 8 # All .txt and .py files @@ -84,6 +86,8 @@ def test_scan_directory(temp_directory: Path, sample_query: Dict[str, Any]) -> N def test_extract_files_content(temp_directory: Path, sample_query: Dict[str, Any]) -> None: nodes = scan_directory(str(temp_directory), query=sample_query) + if nodes is None: + assert False, "Nodes is None" files = extract_files_content(query=sample_query, node=nodes, max_file_size=1_000_000) assert len(files) == 8 # All .txt and .py files diff --git a/src/gitingest/utils.py b/src/gitingest/utils.py index bf64aecf..1f07b533 100644 --- a/src/gitingest/utils.py +++ b/src/gitingest/utils.py @@ -1,9 +1,10 @@ ## Async Timeout decorator import asyncio import functools -from typing import Awaitable, Callable, TypeVar +from typing import Awaitable, Callable, ParamSpec, TypeVar T = TypeVar("T") +P = ParamSpec("P") class AsyncTimeoutError(Exception): @@ -12,14 +13,14 @@ class AsyncTimeoutError(Exception): pass -def async_timeout(seconds: int = 10) -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]: - def decorator(func: Callable[..., T]) -> Callable[..., T]: +def async_timeout(seconds: int = 10) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]: + def decorator(func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]: @functools.wraps(func) - async def wrapper(*args, **kwargs) -> T: + async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: try: return await asyncio.wait_for(func(*args, **kwargs), timeout=seconds) except asyncio.TimeoutError: - raise AsyncTimeoutError(f"Clone timed out after {seconds} seconds") + raise AsyncTimeoutError(f"Operation timed out after {seconds} seconds") return wrapper diff --git a/src/main.py b/src/main.py index e651528f..a50a1c5f 100644 --- a/src/main.py +++ 
b/src/main.py @@ -18,10 +18,24 @@ app = FastAPI() app.state.limiter = limiter -app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler) + + +# Define a wrapper handler with the correct signature +async def rate_limit_exception_handler(request: Request, exc: Exception) -> Response: + if isinstance(exc, RateLimitExceeded): + # Delegate to the actual handler + return _rate_limit_exceeded_handler(request, exc) + # Optionally, handle other exceptions or re-raise + raise exc + + +# Register the wrapper handler +app.add_exception_handler(RateLimitExceeded, rate_limit_exception_handler) app.mount("/static", StaticFiles(directory="static"), name="static") -app.add_middleware(Analytics, api_key=os.getenv('API_ANALYTICS_KEY')) +app_analytics_key = os.getenv('API_ANALYTICS_KEY') +if app_analytics_key: + app.add_middleware(Analytics, api_key=app_analytics_key) # Define the default allowed hosts default_allowed_hosts = ["gitingest.com", "*.gitingest.com", "localhost", "127.0.0.1"] diff --git a/src/process_query.py b/src/process_query.py index bd7940fe..f2af1bbc 100644 --- a/src/process_query.py +++ b/src/process_query.py @@ -1,7 +1,8 @@ -from typing import Dict +from typing import Any, Dict from fastapi import Request from fastapi.templating import Jinja2Templates +from starlette.templating import _TemplateResponse from config import EXAMPLE_REPOS, MAX_DISPLAY_SIZE from gitingest import clone_repo, ingest_from_query, parse_query @@ -60,7 +61,7 @@ async def process_query( pattern_type: str = "exclude", pattern: str = "", is_index: bool = False, -) -> HTMLResponse: +) -> _TemplateResponse: template = "index.jinja" if is_index else "github.jinja" max_file_size = logSliderToSize(slider_position) @@ -107,7 +108,7 @@ async def process_query( if len(content) > MAX_DISPLAY_SIZE: content = ( - f"(Files content cropped to {int(MAX_DISPLAY_SIZE/1000)}k characters, " + f"(Files content cropped to {int(MAX_DISPLAY_SIZE / 1_000)}k characters, " "download full ingest to see 
more)\n" + content[:MAX_DISPLAY_SIZE] ) From 6268899caa1eeaacc74a05b50213d276668754dc Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Fri, 27 Dec 2024 22:35:24 +0100 Subject: [PATCH 3/9] Fix absolute imports and mock paths in test_clone.py to resolve test failures. --- repomix-output.txt | 3335 +++++++++++++++++++++++++++++ src/gitingest/tests/test_clone.py | 4 +- src/process_query.py | 4 +- 3 files changed, 3340 insertions(+), 3 deletions(-) create mode 100644 repomix-output.txt diff --git a/repomix-output.txt b/repomix-output.txt new file mode 100644 index 00000000..23663041 --- /dev/null +++ b/repomix-output.txt @@ -0,0 +1,3335 @@ +This file is a merged representation of the entire codebase, combining all repository files into a single document. +Generated by Repomix on: 2024-12-27T21:21:09.225Z + +================================================================ +File Summary +================================================================ + +Purpose: +-------- +This file contains a packed representation of the entire repository's contents. +It is designed to be easily consumable by AI systems for analysis, code review, +or other automated processes. + +File Format: +------------ +The content is organized as follows: +1. This summary section +2. Repository information +3. Repository structure +4. Multiple file entries, each consisting of: + a. A separator line (================) + b. The file path (File: path/to/file) + c. Another separator line + d. The full contents of the file + e. A blank line + +Usage Guidelines: +----------------- +- This file should be treated as read-only. Any changes should be made to the + original repository files, not this packed version. +- When processing this file, use the file path to distinguish + between different files in the repository. +- Be aware that this file may contain sensitive information. 
Handle it with + the same level of security as you would the original repository. + +Notes: +------ +- Some files may have been excluded based on .gitignore rules and Repomix's + configuration. +- Binary files are not included in this packed representation. Please refer to + the Repository Structure section for a complete list of file paths, including + binary files. + +Additional Info: +---------------- + +For more information about Repomix, visit: https://github.com/yamadashy/repomix + +================================================================ +Repository Structure +================================================================ +.github/ + workflows/ + unitest.yml +src/ + gitingest/ + tests/ + conftest.py + test_clone.py + test_ingest.py + test_parse_query.py + __init__.py + cli.py + clone.py + ignore_patterns.py + ingest_from_query.py + ingest.py + parse_query.py + utils.py + routers/ + __init__.py + download.py + dynamic.py + index.py + static/ + js/ + snow.js + utils.js + favicon.svg + robots.txt + templates/ + components/ + footer.jinja + github_form.jinja + navbar.jinja + result.jinja + api.jinja + base.jinja + github.jinja + index.jinja + config.py + main.py + process_query.py + server_utils.py +.dockerignore +.gitignore +.pre-commit-config.yaml +CODE_OF_CONDUCT.md +Dockerfile +LICENSE +pyproject.toml +pytest.ini +README.md +requirements.txt +SECURITY.md +setup.py + +================================================================ +Repository Files +================================================================ + +================ +File: .github/workflows/unitest.yml +================ +name: Unit Tests + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ 
matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest pytest-asyncio + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pip install -e . + + - name: Run tests + run: | + pytest + +================ +File: src/gitingest/tests/conftest.py +================ +import os +import sys + +# Get the absolute path of the project root directory (one level up from tests) +project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + +# Add both the project root and src directory to PYTHONPATH +sys.path.insert(0, project_root) +sys.path.insert(0, os.path.join(project_root, 'src')) + +================ +File: src/gitingest/tests/test_clone.py +================ +from unittest.mock import AsyncMock, patch + +import pytest + +from clone import clone_repo +from gitingest.clone import check_repo_exists + + +@pytest.mark.asyncio +async def test_clone_repo_with_commit() -> None: + query = { + 'commit': 'a' * 40, # Simulating a valid commit hash + 'branch': 'main', + 'url': 'https://github.com/user/repo', + 'local_path': '/tmp/repo', + } + + with patch('clone.check_repo_exists', return_value=True) as mock_check: + with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: + mock_process = AsyncMock() + mock_process.communicate.return_value = (b'output', b'error') + mock_exec.return_value = mock_process + + await clone_repo(query) + mock_check.assert_called_once_with(query['url']) + assert mock_exec.call_count == 2 # Clone and checkout calls + + +@pytest.mark.asyncio +async def test_clone_repo_without_commit() -> None: + query = { + 'commit': None, + 'branch': 'main', + 'url': 'https://github.com/user/repo', + 'local_path': '/tmp/repo', + } + + with patch('clone.check_repo_exists', return_value=True) as mock_check: + with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: + mock_process = AsyncMock() + 
mock_process.communicate.return_value = (b'output', b'error') + mock_exec.return_value = mock_process + + await clone_repo(query) + mock_check.assert_called_once_with(query['url']) + assert mock_exec.call_count == 1 # Only clone call + + +@pytest.mark.asyncio +async def test_clone_repo_nonexistent_repository() -> None: + query = { + 'commit': None, + 'branch': 'main', + 'url': 'https://github.com/user/nonexistent-repo', + 'local_path': '/tmp/repo', + } + + with patch('gitingest.clone.check_repo_exists', return_value=False) as mock_check: + with pytest.raises(ValueError, match="Repository not found"): + await clone_repo(query) + mock_check.assert_called_once_with(query['url']) + + +@pytest.mark.asyncio +async def test_check_repo_exists() -> None: + url = "https://github.com/user/repo" + + with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: + mock_process = AsyncMock() + mock_process.communicate.return_value = (b'HTTP/1.1 200 OK\n', b'') + mock_exec.return_value = mock_process + + # Test existing repository + mock_process.returncode = 0 + assert await check_repo_exists(url) is True + + # Test non-existing repository (404 response) + mock_process.communicate.return_value = (b'HTTP/1.1 404 Not Found\n', b'') + mock_process.returncode = 0 + assert await check_repo_exists(url) is False + + # Test failed request + mock_process.returncode = 1 + assert await check_repo_exists(url) is False + +================ +File: src/gitingest/tests/test_ingest.py +================ +from pathlib import Path +from typing import Any, Dict + +import pytest + +from gitingest.ingest_from_query import extract_files_content, scan_directory + + +# Test fixtures +@pytest.fixture +def sample_query() -> Dict[str, Any]: + return { + 'user_name': 'test_user', + 'repo_name': 'test_repo', + 'local_path': '/tmp/test_repo', + 'subpath': '/', + 'branch': 'main', + 'commit': None, + 'max_file_size': 1_000_000, + 'slug': 'test_user/test_repo', + 'ignore_patterns': ['*.pyc', 
'__pycache__', '.git'], + 'include_patterns': None, + 'pattern_type': 'exclude', + } + + +@pytest.fixture +def temp_directory(tmp_path: Path) -> Path: + # Creates the following structure: + # test_repo/ + # ├── file1.txt + # ├── file2.py + # └── src/ + # | ├── subfile1.txt + # | └── subfile2.py + # | └── subdir/ + # | └── file_subdir.txt + # | └── file_subdir.py + # └── dir1/ + # | └── file_dir1.txt + # └── dir2/ + # └── file_dir2.txt + + test_dir = tmp_path / "test_repo" + test_dir.mkdir() + + # Root files + (test_dir / "file1.txt").write_text("Hello World") + (test_dir / "file2.py").write_text("print('Hello')") + + # src directory and its files + src_dir = test_dir / "src" + src_dir.mkdir() + (src_dir / "subfile1.txt").write_text("Hello from src") + (src_dir / "subfile2.py").write_text("print('Hello from src')") + + # src/subdir and its files + subdir = src_dir / "subdir" + subdir.mkdir() + (subdir / "file_subdir.txt").write_text("Hello from subdir") + (subdir / "file_subdir.py").write_text("print('Hello from subdir')") + + # dir1 and its file + dir1 = test_dir / "dir1" + dir1.mkdir() + (dir1 / "file_dir1.txt").write_text("Hello from dir1") + + # dir2 and its file + dir2 = test_dir / "dir2" + dir2.mkdir() + (dir2 / "file_dir2.txt").write_text("Hello from dir2") + + return test_dir + + +def test_scan_directory(temp_directory: Path, sample_query: Dict[str, Any]) -> None: + result = scan_directory(str(temp_directory), query=sample_query) + if result is None: + assert False, "Result is None" + + assert result['type'] == 'directory' + assert result['file_count'] == 8 # All .txt and .py files + assert result['dir_count'] == 4 # src, src/subdir, dir1, dir2 + assert len(result['children']) == 5 # file1.txt, file2.py, src, dir1, dir2 + + +def test_extract_files_content(temp_directory: Path, sample_query: Dict[str, Any]) -> None: + nodes = scan_directory(str(temp_directory), query=sample_query) + if nodes is None: + assert False, "Nodes is None" + files = 
extract_files_content(query=sample_query, node=nodes, max_file_size=1_000_000) + assert len(files) == 8 # All .txt and .py files + + # Check for presence of key files + paths = [f['path'] for f in files] + assert any('file1.txt' in p for p in paths) + assert any('subfile1.txt' in p for p in paths) + assert any('file2.py' in p for p in paths) + assert any('subfile2.py' in p for p in paths) + assert any('file_subdir.txt' in p for p in paths) + assert any('file_dir1.txt' in p for p in paths) + assert any('file_dir2.txt' in p for p in paths) + + +# TODO: test with include patterns: ['*.txt'] +# TODO: test with wrong include patterns: ['*.qwerty'] + + +# single folder patterns +# TODO: test with include patterns: ['src/*'] +# TODO: test with include patterns: ['/src/*'] +# TODO: test with include patterns: ['/src/'] +# TODO: test with include patterns: ['/src*'] + +# multiple patterns +# TODO: test with multiple include patterns: ['*.txt', '*.py'] +# TODO: test with multiple include patterns: ['/src/*', '*.txt'] +# TODO: test with multiple include patterns: ['/src*', '*.txt'] + +================ +File: src/gitingest/tests/test_parse_query.py +================ +import pytest + +from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS +from gitingest.parse_query import parse_query, parse_url + + +def test_parse_url_valid() -> None: + test_cases = [ + "https://github.com/user/repo", + "https://gitlab.com/user/repo", + "https://bitbucket.org/user/repo", + ] + for url in test_cases: + result = parse_url(url) + assert result["user_name"] == "user" + assert result["repo_name"] == "repo" + assert result["url"] == url + + +def test_parse_url_invalid() -> None: + url = "https://only-domain.com" + with pytest.raises(ValueError, match="Invalid repository URL"): + parse_url(url) + + +def test_parse_query_basic() -> None: + test_cases = ["https://github.com/user/repo", "https://gitlab.com/user/repo"] + for url in test_cases: + result = parse_query(url, max_file_size=50, 
from_web=True, ignore_patterns='*.txt') + assert result["user_name"] == "user" + assert result["repo_name"] == "repo" + assert result["url"] == url + assert "*.txt" in result["ignore_patterns"] + + +def test_parse_query_include_pattern() -> None: + url = "https://github.com/user/repo" + result = parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py') + assert result["include_patterns"] == ["*.py"] + assert result["ignore_patterns"] == DEFAULT_IGNORE_PATTERNS + + +def test_parse_query_invalid_pattern() -> None: + url = "https://github.com/user/repo" + with pytest.raises(ValueError, match="Pattern.*contains invalid characters"): + parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py;rm -rf') + +================ +File: src/gitingest/__init__.py +================ +from gitingest.clone import clone_repo +from gitingest.ingest import ingest +from gitingest.ingest_from_query import ingest_from_query +from gitingest.parse_query import parse_query + +__all__ = ["ingest_from_query", "clone_repo", "parse_query", "ingest"] + +================ +File: src/gitingest/cli.py +================ +import os +from typing import Optional, Tuple + +import click + +from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS +from gitingest.ingest import ingest +from gitingest.ingest_from_query import MAX_FILE_SIZE + + +def normalize_pattern(pattern: str) -> str: + pattern = pattern.strip() + pattern = pattern.lstrip(os.sep) + if pattern.endswith(os.sep): + pattern += "*" + return pattern + + +@click.command() +@click.argument('source', type=str, required=True) +@click.option('--output', '-o', default=None, help='Output file path (default: .txt in current directory)') +@click.option('--max-size', '-s', default=MAX_FILE_SIZE, help='Maximum file size to process in bytes') +@click.option('--exclude-pattern', '-e', multiple=True, help='Patterns to exclude') +@click.option('--include-pattern', '-i', multiple=True, help='Patterns to include') +def main( + 
source: str, + output: Optional[str], + max_size: int, + exclude_pattern: Tuple[str, ...], + include_pattern: Tuple[str, ...], +) -> None: + """Analyze a directory and create a text dump of its contents.""" + try: + # Combine default and custom ignore patterns + exclude_patterns = list(exclude_pattern) + include_patterns = list(set(include_pattern)) + + if not output: + output = "digest.txt" + summary, tree, content = ingest(source, max_size, include_patterns, exclude_patterns, output=output) + + click.echo(f"Analysis complete! Output written to: {output}") + click.echo("\nSummary:") + click.echo(summary) + + except Exception as e: + click.echo(f"Error: {str(e)}", err=True) + raise click.Abort() + + +if __name__ == '__main__': + main() + +================ +File: src/gitingest/clone.py +================ +import asyncio +from typing import Any, Dict, Tuple + +from gitingest.utils import async_timeout + +CLONE_TIMEOUT = 20 + + +async def check_repo_exists(url: str) -> bool: + proc = await asyncio.create_subprocess_exec( + "curl", + "-I", + url, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + if proc.returncode != 0: + return False + # Check if stdout contains "404" status code + stdout_str = stdout.decode() + return "HTTP/1.1 404" not in stdout_str and "HTTP/2 404" not in stdout_str + + +@async_timeout(CLONE_TIMEOUT) +async def clone_repo(query: Dict[str, Any]) -> Tuple[bytes, bytes]: + if not await check_repo_exists(query['url']): + raise ValueError("Repository not found, make sure it is public") + + if query['commit']: + proc = await asyncio.create_subprocess_exec( + "git", + "clone", + "--single-branch", + query['url'], + query['local_path'], + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + + proc = await asyncio.create_subprocess_exec( + "git", + "-C", + query['local_path'], + "checkout", + query['branch'], + 
stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + elif query['branch'] != 'main' and query['branch'] != 'master' and query['branch']: + proc = await asyncio.create_subprocess_exec( + "git", + "clone", + "--depth=1", + "--single-branch", + "--branch", + query['branch'], + query['url'], + query['local_path'], + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + else: + proc = await asyncio.create_subprocess_exec( + "git", + "clone", + "--depth=1", + "--single-branch", + query['url'], + query['local_path'], + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + + stdout, stderr = await proc.communicate() + + return stdout, stderr + +================ +File: src/gitingest/ignore_patterns.py +================ +from typing import List + +DEFAULT_IGNORE_PATTERNS: List[str] = [ + # Python + '*.pyc', + '*.pyo', + '*.pyd', + '__pycache__', + '.pytest_cache', + '.coverage', + '.tox', + '.nox', + '.mypy_cache', + '.ruff_cache', + '.hypothesis', + 'poetry.lock', + 'Pipfile.lock', + # JavaScript/Node + 'node_modules', + 'bower_components', + 'package-lock.json', + 'yarn.lock', + '.npm', + '.yarn', + '.pnpm-store', + # Version control + '.git', + '.svn', + '.hg', + '.gitignore', + '.gitattributes', + '.gitmodules', + # Images and media + '*.svg', + '*.png', + '*.jpg', + '*.jpeg', + '*.gif', + '*.ico', + '*.pdf', + '*.mov', + '*.mp4', + '*.mp3', + '*.wav', + # Virtual environments + 'venv', + '.venv', + 'env', + '.env', + 'virtualenv', + # IDEs and editors + '.idea', + '.vscode', + '.vs', + '*.swp', + '*.swo', + '*.swn', + '.settings', + '.project', + '.classpath', + '*.sublime-*', + # Temporary and cache files + '*.log', + '*.bak', + '*.swp', + '*.tmp', + '*.temp', + '.cache', + '.sass-cache', + '.eslintcache', + '.DS_Store', + 'Thumbs.db', + 'desktop.ini', + # Build directories and artifacts + 'build', + 'dist', + 'target', + 'out', + '*.egg-info', + '*.egg', + '*.whl', + 
'*.so', + '*.dylib', + '*.dll', + '*.class', + # Documentation + 'site-packages', + '.docusaurus', + '.next', + '.nuxt', + # Other common patterns + ## Minified files + '*.min.js', + '*.min.css', + ## Source maps + '*.map', + ## Terraform + '.terraform', + '*.tfstate*', + ## Dependencies in various languages + 'vendor/', +] + +================ +File: src/gitingest/ingest_from_query.py +================ +import os +from fnmatch import fnmatch +from typing import Any, Dict, List, Optional, Set, Tuple + +import tiktoken + +MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB +MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal +MAX_FILES = 10_000 # Maximum number of files to process +MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB + + +def should_include(path: str, base_path: str, include_patterns: List[str]) -> bool: + rel_path = path.replace(base_path, "").lstrip(os.sep) + include = False + for pattern in include_patterns: + if fnmatch(rel_path, pattern): + include = True + return include + + +def should_exclude(path: str, base_path: str, ignore_patterns: List[str]) -> bool: + rel_path = path.replace(base_path, "").lstrip(os.sep) + for pattern in ignore_patterns: + if pattern == '': + continue + if fnmatch(rel_path, pattern): + return True + return False + + +def is_safe_symlink(symlink_path: str, base_path: str) -> bool: + """Check if a symlink points to a location within the base directory.""" + try: + target_path = os.path.realpath(symlink_path) + base_path = os.path.realpath(base_path) + return os.path.commonpath([target_path, base_path]) == base_path + except (OSError, ValueError): + # If there's any error resolving the paths, consider it unsafe + return False + + +def is_text_file(file_path: str) -> bool: + """Determines if a file is likely a text file based on its content.""" + try: + with open(file_path, 'rb') as file: + chunk = file.read(1024) + return not bool(chunk.translate(None, bytes([7, 8, 9, 10, 12, 13, 27] + list(range(0x20, 0x100))))) + except 
OSError: + return False + + +def read_file_content(file_path: str) -> str: + try: + with open(file_path, encoding='utf-8', errors='ignore') as f: + return f.read() + except Exception as e: + return f"Error reading file: {str(e)}" + + +def scan_directory( + path: str, + query: Dict[str, Any], + seen_paths: Optional[Set[str]] = None, + depth: int = 0, + stats: Optional[Dict[str, int]] = None, +) -> Optional[Dict[str, Any]]: + """Recursively analyzes a directory and its contents with safety limits.""" + if seen_paths is None: + seen_paths = set() + if stats is None: + stats = {"total_files": 0, "total_size": 0} + + if depth > MAX_DIRECTORY_DEPTH: + print(f"Skipping deep directory: {path} (max depth {MAX_DIRECTORY_DEPTH} reached)") + return None + + if stats["total_files"] >= MAX_FILES: + print(f"Skipping further processing: maximum file limit ({MAX_FILES}) reached") + return None + + if stats["total_size"] >= MAX_TOTAL_SIZE_BYTES: + print(f"Skipping further processing: maximum total size ({MAX_TOTAL_SIZE_BYTES/1024/1024:.1f}MB) reached") + return None + + real_path = os.path.realpath(path) + if real_path in seen_paths: + print(f"Skipping already visited path: {path}") + return None + + seen_paths.add(real_path) + + result = { + "name": os.path.basename(path), + "type": "directory", + "size": 0, + "children": [], + "file_count": 0, + "dir_count": 0, + "path": path, + "ignore_content": False, + } + + ignore_patterns = query['ignore_patterns'] + base_path = query['local_path'] + include_patterns = query['include_patterns'] + + try: + for item in os.listdir(path): + item_path = os.path.join(path, item) + + if should_exclude(item_path, base_path, ignore_patterns): + continue + + is_file = os.path.isfile(item_path) + if is_file and query['include_patterns']: + if not should_include(item_path, base_path, include_patterns): + result["ignore_content"] = True + continue + + # Handle symlinks + if os.path.islink(item_path): + if not is_safe_symlink(item_path, base_path): + 
print(f"Skipping symlink that points outside base directory: {item_path}") + continue + real_path = os.path.realpath(item_path) + if real_path in seen_paths: + print(f"Skipping already visited symlink target: {item_path}") + continue + + if os.path.isfile(real_path): + file_size = os.path.getsize(real_path) + if stats["total_size"] + file_size > MAX_TOTAL_SIZE_BYTES: + print(f"Skipping file {item_path}: would exceed total size limit") + continue + + stats["total_files"] += 1 + stats["total_size"] += file_size + + if stats["total_files"] > MAX_FILES: + print(f"Maximum file limit ({MAX_FILES}) reached") + return result + + is_text = is_text_file(real_path) + content = read_file_content(real_path) if is_text else "[Non-text file]" + + child = { + "name": item, + "type": "file", + "size": file_size, + "content": content, + "path": item_path, + } + result["children"].append(child) + result["size"] += file_size + result["file_count"] += 1 + + elif os.path.isdir(real_path): + subdir = scan_directory( + path=real_path, + query=query, + seen_paths=seen_paths, + depth=depth + 1, + stats=stats, + ) + if subdir and (not include_patterns or subdir["file_count"] > 0): + subdir["name"] = item + subdir["path"] = item_path + result["children"].append(subdir) + result["size"] += subdir["size"] + result["file_count"] += subdir["file_count"] + result["dir_count"] += 1 + subdir["dir_count"] + continue + + if os.path.isfile(item_path): + file_size = os.path.getsize(item_path) + if stats["total_size"] + file_size > MAX_TOTAL_SIZE_BYTES: + print(f"Skipping file {item_path}: would exceed total size limit") + continue + + stats["total_files"] += 1 + stats["total_size"] += file_size + + if stats["total_files"] > MAX_FILES: + print(f"Maximum file limit ({MAX_FILES}) reached") + return result + + is_text = is_text_file(item_path) + content = read_file_content(item_path) if is_text else "[Non-text file]" + + child = { + "name": item, + "type": "file", + "size": file_size, + "content": content, 
+ "path": item_path, + } + result["children"].append(child) + result["size"] += file_size + result["file_count"] += 1 + + elif os.path.isdir(item_path): + subdir = scan_directory( + path=item_path, + query=query, + seen_paths=seen_paths, + depth=depth + 1, + stats=stats, + ) + if subdir and (not include_patterns or subdir["file_count"] > 0): + result["children"].append(subdir) + result["size"] += subdir["size"] + result["file_count"] += subdir["file_count"] + result["dir_count"] += 1 + subdir["dir_count"] + + except PermissionError: + print(f"Permission denied: {path}") + + return result + + +def extract_files_content( + query: Dict[str, Any], + node: Dict[str, Any], + max_file_size: int, + files: Optional[List[Dict[str, Any]]] = None, +) -> List[Dict[str, Any]]: + """Recursively collects all text files with their contents.""" + if files is None: + files = [] + + if node["type"] == "file" and node["content"] != "[Non-text file]": + content = node["content"] + if node["size"] > max_file_size: + content = None + + files.append( + { + "path": node["path"].replace(query['local_path'], ""), + "content": content, + "size": node["size"], + }, + ) + elif node["type"] == "directory": + for child in node["children"]: + extract_files_content(query=query, node=child, max_file_size=max_file_size, files=files) + + return files + + +def create_file_content_string(files: List[Dict[str, Any]]) -> str: + """Creates a formatted string of file contents with separators.""" + output = "" + separator = "=" * 48 + "\n" + + # First add README.md if it exists + for file in files: + if not file['content']: + continue + + if file['path'].lower() == '/readme.md': + output += separator + output += f"File: {file['path']}\n" + output += separator + output += f"{file['content']}\n\n" + break + + # Then add all other files in their original order + for file in files: + if not file['content'] or file['path'].lower() == '/readme.md': + continue + + output += separator + output += f"File: 
{file['path']}\n" + output += separator + output += f"{file['content']}\n\n" + + return output + + +def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any], files: List[Dict[str, Any]]) -> str: + """Creates a summary string with file counts and content size.""" + if "user_name" in query: + summary = f"Repository: {query['user_name']}/{query['repo_name']}\n" + else: + summary = f"Repository: {query['slug']}\n" + + summary += f"Files analyzed: {nodes['file_count']}\n" + + if 'subpath' in query and query['subpath'] != '/': + summary += f"Subpath: {query['subpath']}\n" + if 'commit' in query and query['commit']: + summary += f"Commit: {query['commit']}\n" + elif 'branch' in query and query['branch'] != 'main' and query['branch'] != 'master' and query['branch']: + summary += f"Branch: {query['branch']}\n" + + return summary + + +def create_tree_structure( + query: Dict[str, Any], + node: Dict[str, Any], + prefix: str = "", + is_last: bool = True, +) -> str: + """Creates a tree-like string representation of the file structure.""" + tree = "" + + if not node["name"]: + node["name"] = query['slug'] + + if node["name"]: + current_prefix = "└── " if is_last else "├── " + name = node["name"] + "/" if node["type"] == "directory" else node["name"] + tree += prefix + current_prefix + name + "\n" + + if node["type"] == "directory": + # Adjust prefix only if we added a node name + new_prefix = prefix + (" " if is_last else "│ ") if node["name"] else prefix + children = node["children"] + for i, child in enumerate(children): + tree += create_tree_structure(query, child, new_prefix, i == len(children) - 1) + + return tree + + +def generate_token_string(context_string: str) -> Optional[str]: + """Returns the number of tokens in a text string.""" + formatted_tokens = "" + try: + encoding = tiktoken.get_encoding("cl100k_base") + total_tokens = len(encoding.encode(context_string, disallowed_special=())) + + except Exception as e: + print(e) + return None + + if 
total_tokens > 1_000_000: + formatted_tokens = f"{total_tokens / 1_000_000:.1f}M" + elif total_tokens > 1_000: + formatted_tokens = f"{total_tokens / 1_000:.1f}k" + else: + formatted_tokens = f"{total_tokens}" + + return formatted_tokens + + +def ingest_single_file(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]: + if not os.path.isfile(path): + raise ValueError(f"Path {path} is not a file") + + file_size = os.path.getsize(path) + is_text = is_text_file(path) + if not is_text: + raise ValueError(f"File {path} is not a text file") + + content = read_file_content(path) + if file_size > query['max_file_size']: + content = "[Content ignored: file too large]" + + file_info = { + "path": path.replace(query['local_path'], ""), + "content": content, + "size": file_size, + } + + summary = ( + f"Repository: {query['user_name']}/{query['repo_name']}\n" + f"File: {os.path.basename(path)}\n" + f"Size: {file_size:,} bytes\n" + f"Lines: {len(content.splitlines()):,}\n" + ) + + files_content = create_file_content_string([file_info]) + tree = "Directory structure:\n└── " + os.path.basename(path) + + formatted_tokens = generate_token_string(files_content) + if formatted_tokens: + summary += f"\nEstimated tokens: {formatted_tokens}" + + return summary, tree, files_content + + +def ingest_directory(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]: + nodes = scan_directory(path=path, query=query) + if not nodes: + raise ValueError(f"No files found in {path}") + files = extract_files_content(query=query, node=nodes, max_file_size=query['max_file_size']) + summary = create_summary_string(query, nodes, files) + tree = "Directory structure:\n" + create_tree_structure(query, nodes) + files_content = create_file_content_string(files) + + formatted_tokens = generate_token_string(tree + files_content) + if formatted_tokens: + summary += f"\nEstimated tokens: {formatted_tokens}" + + return summary, tree, files_content + + +def ingest_from_query(query: Dict[str, Any]) -> 
Tuple[str, str, str]: + """Main entry point for analyzing a codebase directory or single file.""" + path = f"{query['local_path']}{query['subpath']}" + if not os.path.exists(path): + raise ValueError(f"{query['slug']} cannot be found") + + if query.get('type') == 'blob': + return ingest_single_file(path, query) + + return ingest_directory(path, query) + +================ +File: src/gitingest/ingest.py +================ +import asyncio +import inspect +import shutil +from pathlib import Path +from typing import List, Optional, Tuple, Union + +from gitingest.clone import clone_repo +from gitingest.ingest_from_query import ingest_from_query +from gitingest.parse_query import parse_query + + +def ingest( + source: str, + max_file_size: int = 10 * 1024 * 1024, + include_patterns: Union[List[str], str, None] = None, + exclude_patterns: Union[List[str], str, None] = None, + output: Optional[str] = None, +) -> Tuple[str, str, str]: + try: + query = parse_query( + source=source, + max_file_size=max_file_size, + from_web=False, + include_patterns=include_patterns, + ignore_patterns=exclude_patterns, + ) + if query['url']: + clone_result = clone_repo(query) + if inspect.iscoroutine(clone_result): + asyncio.run(clone_result) + else: + raise TypeError("clone_repo did not return a coroutine as expected.") + + summary, tree, content = ingest_from_query(query) + + if output: + with open(f"{output}", "w") as f: + f.write(tree + "\n" + content) + + return summary, tree, content + + finally: + # Clean up the temporary directory if it was created + if query['url']: + # Get parent directory two levels up from local_path (../tmp) + cleanup_path = str(Path(query['local_path']).parents[1]) + shutil.rmtree(cleanup_path, ignore_errors=True) + +================ +File: src/gitingest/parse_query.py +================ +import os +import uuid +from typing import Any, Dict, List, Optional, Union + +from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS + +TMP_BASE_PATH = "../tmp" + + +def 
parse_url(url: str) -> Dict[str, Any]: + parsed = { + "user_name": None, + "repo_name": None, + "type": None, + "branch": None, + "commit": None, + "subpath": "/", + "local_path": None, + "url": None, + "slug": None, + "id": None, + } + + url = url.split(" ")[0] + if not url.startswith('https://'): + url = 'https://' + url + + # Extract domain and path + url_parts = url.split('/') + domain = url_parts[2] + path_parts = url_parts[3:] + + if len(path_parts) < 2: + raise ValueError("Invalid repository URL. Please provide a valid Git repository URL.") + + parsed["user_name"] = path_parts[0] + parsed["repo_name"] = path_parts[1] + + # Keep original URL format + parsed["url"] = f"https://{domain}/{parsed['user_name']}/{parsed['repo_name']}" + parsed['slug'] = f"{parsed['user_name']}-{parsed['repo_name']}" + parsed["id"] = str(uuid.uuid4()) + parsed["local_path"] = f"{TMP_BASE_PATH}/{parsed['id']}/{parsed['slug']}" + + if len(path_parts) > 3: + parsed["type"] = path_parts[2] + parsed["branch"] = path_parts[3] + if len(parsed['branch']) == 40 and all(c in '0123456789abcdefABCDEF' for c in parsed['branch']): + parsed["commit"] = parsed['branch'] + + parsed["subpath"] = "/" + "/".join(path_parts[4:]) + + return parsed + + +def normalize_pattern(pattern: str) -> str: + pattern = pattern.strip() + pattern = pattern.lstrip(os.sep) + if pattern.endswith(os.sep): + pattern += "*" + return pattern + + +def parse_patterns(pattern: Union[List[str], str]) -> List[str]: + if isinstance(pattern, list): + pattern = ",".join(pattern) + + for p in pattern.split(","): + if not all(c.isalnum() or c in "-_./+*" for c in p.strip()): + raise ValueError( + f"Pattern '{p}' contains invalid characters. Only alphanumeric characters, dash (-), " + "underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed." 
+ ) + patterns = [normalize_pattern(p) for p in pattern.split(",")] + return patterns + + +def override_ignore_patterns(ignore_patterns: List[str], include_patterns: List[str]) -> List[str]: + for pattern in include_patterns: + if pattern in ignore_patterns: + ignore_patterns.remove(pattern) + return ignore_patterns + + +def parse_path(path: str) -> Dict[str, Any]: + query = { + "local_path": os.path.abspath(path), + "slug": os.path.basename(os.path.dirname(path)) + "/" + os.path.basename(path), + "subpath": "/", + "id": str(uuid.uuid4()), + "url": None, + } + return query + + +def parse_query( + source: str, + max_file_size: int, + from_web: bool, + include_patterns: Optional[Union[List[str], str]] = None, + ignore_patterns: Optional[Union[List[str], str]] = None, +) -> Dict[str, Any]: + if from_web: + query = parse_url(source) + else: + if source.startswith("https://") or "github.com" in source: + query = parse_url(source) + else: + query = parse_path(source) + + query['max_file_size'] = max_file_size + + if ignore_patterns and ignore_patterns != "": + ignore_patterns = DEFAULT_IGNORE_PATTERNS + parse_patterns(ignore_patterns) + else: + ignore_patterns = DEFAULT_IGNORE_PATTERNS + + if include_patterns and include_patterns != "": + include_patterns = parse_patterns(include_patterns) + ignore_patterns = override_ignore_patterns(ignore_patterns, include_patterns) + else: + include_patterns = None + + query['ignore_patterns'] = ignore_patterns + query['include_patterns'] = include_patterns + + return query + +================ +File: src/gitingest/utils.py +================ +## Async Timeout decorator +import asyncio +import functools +from typing import Awaitable, Callable, ParamSpec, TypeVar + +T = TypeVar("T") +P = ParamSpec("P") + + +class AsyncTimeoutError(Exception): + """Raised when an async operation exceeds its timeout limit.""" + + pass + + +def async_timeout(seconds: int = 10) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]: + def 
decorator(func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]: + @functools.wraps(func) + async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: + try: + return await asyncio.wait_for(func(*args, **kwargs), timeout=seconds) + except asyncio.TimeoutError: + raise AsyncTimeoutError(f"Operation timed out after {seconds} seconds") + + return wrapper + + return decorator + +================ +File: src/routers/__init__.py +================ +from routers.download import router as download +from routers.dynamic import router as dynamic +from routers.index import router as index + +__all__ = ["download", "dynamic", "index"] + +================ +File: src/routers/download.py +================ +import os + +from fastapi import APIRouter, HTTPException +from fastapi.responses import Response + +from config import TMP_BASE_PATH + +router = APIRouter() + + +@router.get("/download/{digest_id}") +async def download_ingest(digest_id: str) -> Response: + try: + # Find the first .txt file in the directory + directory = f"{TMP_BASE_PATH}/{digest_id}" + txt_files = [f for f in os.listdir(directory) if f.endswith('.txt')] + + if not txt_files: + raise FileNotFoundError("No .txt file found") + + with open(f"{directory}/{txt_files[0]}") as f: + content = f.read() + + return Response( + content=content, + media_type="text/plain", + headers={"Content-Disposition": f"attachment; filename={txt_files[0]}"}, + ) + except FileNotFoundError: + raise HTTPException(status_code=404, detail="Digest not found") + +================ +File: src/routers/dynamic.py +================ +from fastapi import APIRouter, Form, Request +from fastapi.responses import HTMLResponse +from fastapi.templating import Jinja2Templates + +from process_query import process_query +from server_utils import limiter + +router = APIRouter() +templates = Jinja2Templates(directory="templates") + + +@router.get("/{full_path:path}") +async def catch_all(request: Request, full_path: str) -> HTMLResponse: + return 
templates.TemplateResponse( + "github.jinja", + { + "request": request, + "github_url": f"https://github.com/{full_path}", + "loading": True, + "default_file_size": 243, + }, + ) + + +@router.post("/{full_path:path}", response_class=HTMLResponse) +@limiter.limit("10/minute") +async def process_catch_all( + request: Request, + input_text: str = Form(...), + max_file_size: int = Form(...), + pattern_type: str = Form(...), + pattern: str = Form(...), +) -> HTMLResponse: + return await process_query( + request, + input_text, + max_file_size, + pattern_type, + pattern, + is_index=False, + ) + +================ +File: src/routers/index.py +================ +from fastapi import APIRouter, Form, Request +from fastapi.responses import HTMLResponse +from fastapi.templating import Jinja2Templates + +from config import EXAMPLE_REPOS +from process_query import process_query +from server_utils import limiter + +router = APIRouter() +templates = Jinja2Templates(directory="templates") + + +@router.get("/", response_class=HTMLResponse) +async def home(request: Request) -> HTMLResponse: + return templates.TemplateResponse( + "index.jinja", + { + "request": request, + "examples": EXAMPLE_REPOS, + "default_file_size": 243, + }, + ) + + +@router.post("/", response_class=HTMLResponse) +@limiter.limit("10/minute") +async def index_post( + request: Request, + input_text: str = Form(...), + max_file_size: int = Form(...), + pattern_type: str = Form(...), + pattern: str = Form(...), +) -> HTMLResponse: + return await process_query( + request, + input_text, + max_file_size, + pattern_type, + pattern, + is_index=True, + ) + +================ +File: src/static/js/snow.js +================ +// Snow effect initialization +function initSnow() { + const snowCanvas = document.getElementById('snow-canvas'); + const ctx = snowCanvas.getContext('2d'); + + // Configure snow + const snowflakes = []; + const maxSnowflakes = 50; + const spawnInterval = 200; + let currentSnowflakes = 0; + let lastSpawnTime 
= 0; + + // Resize canvas to window size + function resizeCanvas() { + snowCanvas.width = window.innerWidth; + snowCanvas.height = window.innerHeight; + } + + // Initial setup + resizeCanvas(); + window.addEventListener('resize', resizeCanvas); + + // Snowflake class definition + class Snowflake { + constructor() { + this.reset(); + } + + reset() { + this.x = Math.random() * snowCanvas.width; + this.y = 0; + this.size = Math.random() * 3 + 2; + this.speed = Math.random() * 1 + 0.5; + this.wind = Math.random() * 0.5 - 0.25; + } + + update() { + this.y += this.speed; + this.x += this.wind; + + if (this.y > snowCanvas.height) { + this.reset(); + } + } + + draw() { + ctx.save(); + + ctx.shadowColor = 'rgba(0, 0, 0, 0.3)'; + ctx.shadowBlur = 5; + ctx.shadowOffsetX = 2; + ctx.shadowOffsetY = 2; + + ctx.beginPath(); + ctx.arc(this.x, this.y, this.size, 0, Math.PI * 2); + ctx.fillStyle = 'rgba(255, 255, 255, 1)'; + ctx.fill(); + + ctx.strokeStyle = 'rgba(200, 200, 200, 0.8)'; + ctx.lineWidth = 0.5; + ctx.stroke(); + + ctx.restore(); + } + } + + function animate(currentTime) { + ctx.clearRect(0, 0, snowCanvas.width, snowCanvas.height); + + if (currentSnowflakes < maxSnowflakes && currentTime - lastSpawnTime > spawnInterval) { + snowflakes.push(new Snowflake()); + currentSnowflakes++; + lastSpawnTime = currentTime; + } + + snowflakes.forEach(snowflake => { + snowflake.update(); + snowflake.draw(); + }); + + requestAnimationFrame(animate); + } + + requestAnimationFrame(animate); +} + +// Initialize snow when DOM content is loaded +document.addEventListener('DOMContentLoaded', initSnow); + +// Also initialize when the HTMX content is swapped +document.addEventListener('htmx:afterSettle', initSnow); + +================ +File: src/static/js/utils.js +================ +// Copy functionality +function copyText(className) { + const textarea = document.querySelector('.' 
+ className); + const button = document.querySelector(`button[onclick="copyText('${className}')"]`); + if (!textarea || !button) return; + + // Copy text + navigator.clipboard.writeText(textarea.value) + .then(() => { + // Store original content + const originalContent = button.innerHTML; + + // Change button content + button.innerHTML = 'Copied!'; + + // Reset after 1 second + setTimeout(() => { + button.innerHTML = originalContent; + }, 1000); + }) + .catch(err => { + // Show error in button + const originalContent = button.innerHTML; + button.innerHTML = 'Failed to copy'; + setTimeout(() => { + button.innerHTML = originalContent; + }, 1000); + }); +} + + +function handleSubmit(event, showLoading = false) { + event.preventDefault(); + const form = event.target || document.getElementById('ingestForm'); + if (!form) return; + + const submitButton = form.querySelector('button[type="submit"]'); + if (!submitButton) return; + + const formData = new FormData(form); + + // Update file size + const slider = document.getElementById('file_size'); + if (slider) { + formData.delete('max_file_size'); + formData.append('max_file_size', slider.value); + } + + // Update pattern type and pattern + const patternType = document.getElementById('pattern_type'); + const pattern = document.getElementById('pattern'); + if (patternType && pattern) { + formData.delete('pattern_type'); + formData.delete('pattern'); + formData.append('pattern_type', patternType.value); + formData.append('pattern', pattern.value); + } + + const originalContent = submitButton.innerHTML; + const currentStars = document.getElementById('github-stars')?.textContent; + + if (showLoading) { + submitButton.disabled = true; + submitButton.innerHTML = ` +
+ + + + + Processing... +
+ `; + submitButton.classList.add('bg-[#ffb14d]'); + } + + // Submit the form + fetch(form.action, { + method: 'POST', + body: formData + }) + .then(response => response.text()) + .then(html => { + // Store the star count before updating the DOM + const starCount = currentStars; + + + // TEMPORARY SNOW LOGIC // + const parser = new DOMParser(); + const newDoc = parser.parseFromString(html, 'text/html'); + + const existingCanvas = document.getElementById('snow-canvas'); + document.body.innerHTML = newDoc.body.innerHTML; + if (existingCanvas) { + document.body.insertBefore(existingCanvas, document.body.firstChild); + } + // END TEMPORARY SNOW LOGIC // + + // Wait for next tick to ensure DOM is updated + setTimeout(() => { + // Reinitialize slider functionality + initializeSlider(); + + const starsElement = document.getElementById('github-stars'); + if (starsElement && starCount) { + starsElement.textContent = starCount; + } + + // Scroll to results if they exist + const resultsSection = document.querySelector('[data-results]'); + if (resultsSection) { + resultsSection.scrollIntoView({ behavior: 'smooth', block: 'start' }); + } + }, 0); + }) + .catch(error => { + submitButton.disabled = false; + submitButton.innerHTML = originalContent; + }); +} + +function copyFullDigest() { + const directoryStructure = document.querySelector('.directory-structure').value; + const filesContent = document.querySelector('.result-text').value; + const fullDigest = `${directoryStructure}\n\nFiles Content:\n\n${filesContent}`; + const button = document.querySelector('[onclick="copyFullDigest()"]'); + const originalText = button.innerHTML; + + navigator.clipboard.writeText(fullDigest).then(() => { + button.innerHTML = ` + + + + Copied! 
+ `; + + setTimeout(() => { + button.innerHTML = originalText; + }, 2000); + }).catch(err => { + console.error('Failed to copy text: ', err); + }); +} + +// Add the logSliderToSize helper function +function logSliderToSize(position) { + const minp = 0; + const maxp = 500; + const minv = Math.log(1); + const maxv = Math.log(102400); + + const value = Math.exp(minv + (maxv - minv) * Math.pow(position / maxp, 1.5)); + return Math.round(value); +} + +// Move slider initialization to a separate function +function initializeSlider() { + const slider = document.getElementById('file_size'); + const sizeValue = document.getElementById('size_value'); + + if (!slider || !sizeValue) return; + + function updateSlider() { + const value = logSliderToSize(slider.value); + sizeValue.textContent = formatSize(value); + slider.style.backgroundSize = `${(slider.value / slider.max) * 100}% 100%`; + } + + // Update on slider change + slider.addEventListener('input', updateSlider); + + // Initialize slider position + updateSlider(); +} + +// Add helper function for formatting size +function formatSize(sizeInKB) { + if (sizeInKB >= 1024) { + return Math.round(sizeInKB / 1024) + 'mb'; + } + return Math.round(sizeInKB) + 'kb'; +} + +// Initialize slider on page load +document.addEventListener('DOMContentLoaded', initializeSlider); + +// Make sure these are available globally +window.copyText = copyText; + +window.handleSubmit = handleSubmit; +window.initializeSlider = initializeSlider; +window.formatSize = formatSize; + +// Add this new function +function setupGlobalEnterHandler() { + document.addEventListener('keydown', function (event) { + if (event.key === 'Enter' && !event.target.matches('textarea')) { + const form = document.getElementById('ingestForm'); + if (form) { + handleSubmit(new Event('submit'), true); + } + } + }); +} + +// Add to the DOMContentLoaded event listener +document.addEventListener('DOMContentLoaded', () => { + initializeSlider(); + setupGlobalEnterHandler(); +}); + 
+================ +File: src/static/favicon.svg +================ +1 + +================ +File: src/static/robots.txt +================ +User-agent: * +Allow: / +Allow: /api/ +Allow: /cyclotruc/gitingest/ + +================ +File: src/templates/components/footer.jinja +================ + + +================ +File: src/templates/components/github_form.jinja +================ +
+
+
+ +
+
+
+ +
+
+
+ +
+ + +
+
+ +
+
+
+
+
+ + + + +
+ +
+
+
+
+ + +
+
+ {% if show_examples %} + +
+

Try these example repositories:

+
+ {% for example in examples %} + + {% endfor %} +
+
+ {% endif %} +
+
+ +================ +File: src/templates/components/navbar.jinja +================ + +
+
+
+ + + + +
+
+
+ +================ +File: src/templates/components/result.jinja +================ +{% if result %} +
+
+
+
+ +
+ +
+
+

Summary

+
+
+
+ +
+ {% if ingest_id %} + +
+
+ +
+ {% endif %} +
+ +
+
+

Directory Structure

+
+
+ +
+
+
+
+ +
+
+
+ +
+
+

Files Content

+
+
+ +
+
+
+
+ +
+
+
+
+
+{% endif %} + +================ +File: src/templates/api.jinja +================ +{% extends "base.jinja" %} +{% block title %}Git ingest API{% endblock %} +{% block content %} +
+
+
+

API Documentation

+
+
+
+
+ + + +
+
+

The API is currently under development.

+
+
+
+

+ We're working on making our API available to the public. + In the meantime, you can + open an issue on GitHub + to suggest features. +

+
+
+
+{% endblock %} + +================ +File: src/templates/base.jinja +================ + + + + + + + + + + + + + + + + + + + + + + + + + + + + {% block title %}Git ingest{% endblock %} + + + + + + {% block extra_head %}{% endblock %} + + + + {% include 'components/navbar.jinja' %} + +
+
+ {% block content %}{% endblock %} +
+
+ {% include 'components/footer.jinja' %} + {% block extra_scripts %}{% endblock %} + + + +================ +File: src/templates/github.jinja +================ +{% extends "base.jinja" %} +{% block content %} + {% if error_message %} +
{{ error_message }}
+ {% endif %} + {% with is_index=true, show_examples=false %} + {% include 'components/github_form.jinja' %} + {% endwith %} + {% if loading %} +
+
+
+
+

Loading...

+
+
+ {% endif %} + {% include 'components/result.jinja' %} +{% endblock content %} +{% block extra_scripts %} + +{% endblock extra_scripts %} + +================ +File: src/templates/index.jinja +================ +{% extends "base.jinja" %} +{% block extra_head %} + +{% endblock %} +{% block content %} +
+
+ + + + + + +

+ Prompt-friendly +
+ codebase  +

+ +
+

+ Turn any Git repository into a simple text ingest of its codebase. +

+

+ This is useful for feeding a codebase into any LLM. +

+

+ You can also replace 'hub' with 'ingest' in any GitHub URL +

+
+ {% if error_message %} +
{{ error_message }}
+ {% endif %} + {% with is_index=true, show_examples=true %} + {% include 'components/github_form.jinja' %} + {% endwith %} + {% include 'components/result.jinja' %} +{% endblock %} + +================ +File: src/config.py +================ +MAX_DISPLAY_SIZE = 300_000 +TMP_BASE_PATH = "../tmp" + +EXAMPLE_REPOS = [ + {"name": "Gitingest", "url": "https://github.com/cyclotruc/gitingest"}, + {"name": "FastAPI", "url": "https://github.com/tiangolo/fastapi"}, + {"name": "Flask", "url": "https://github.com/pallets/flask"}, + {"name": "Tldraw", "url": "https://github.com/tldraw/tldraw"}, + {"name": "ApiAnalytics", "url": "https://github.com/tom-draper/api-analytics"}, +] + +================ +File: src/main.py +================ +import os +from typing import Dict + +from api_analytics.fastapi import Analytics +from dotenv import load_dotenv +from fastapi import FastAPI, Request +from fastapi.responses import FileResponse, HTMLResponse, Response +from fastapi.staticfiles import StaticFiles +from fastapi.templating import Jinja2Templates +from slowapi import _rate_limit_exceeded_handler +from slowapi.errors import RateLimitExceeded +from starlette.middleware.trustedhost import TrustedHostMiddleware + +from routers import download, dynamic, index +from server_utils import limiter + +load_dotenv() + +app = FastAPI() +app.state.limiter = limiter + + +# Define a wrapper handler with the correct signature +async def rate_limit_exception_handler(request: Request, exc: Exception) -> Response: + if isinstance(exc, RateLimitExceeded): + # Delegate to the actual handler + return _rate_limit_exceeded_handler(request, exc) + # Optionally, handle other exceptions or re-raise + raise exc + + +# Register the wrapper handler +app.add_exception_handler(RateLimitExceeded, rate_limit_exception_handler) + +app.mount("/static", StaticFiles(directory="static"), name="static") +app_analytics_key = os.getenv('API_ANALYTICS_KEY') +if app_analytics_key: + app.add_middleware(Analytics, 
api_key=app_analytics_key) + +# Define the default allowed hosts +default_allowed_hosts = ["gitingest.com", "*.gitingest.com", "localhost", "127.0.0.1"] + +# Fetch allowed hosts from the environment variable or use the default +allowed_hosts = os.getenv("ALLOWED_HOSTS") +if allowed_hosts: + allowed_hosts = allowed_hosts.split(",") +else: + allowed_hosts = default_allowed_hosts + +app.add_middleware(TrustedHostMiddleware, allowed_hosts=allowed_hosts) +templates = Jinja2Templates(directory="templates") + + +@app.get("/health") +async def health_check() -> Dict[str, str]: + return {"status": "healthy"} + + +@app.head("/") +async def head_root() -> HTMLResponse: + """Mirror the headers and status code of the index page""" + return HTMLResponse(content=None, headers={"content-type": "text/html; charset=utf-8"}) + + +@app.get("/api/", response_class=HTMLResponse) +@app.get("/api", response_class=HTMLResponse) +async def api_docs(request: Request) -> HTMLResponse: + return templates.TemplateResponse("api.jinja", {"request": request}) + + +@app.get("/robots.txt") +async def robots() -> FileResponse: + return FileResponse('static/robots.txt') + + +app.include_router(index) +app.include_router(download) +app.include_router(dynamic) + +================ +File: src/process_query.py +================ +from typing import Any, Dict + +from fastapi import Request +from fastapi.templating import Jinja2Templates +from starlette.templating import _TemplateResponse + +from config import EXAMPLE_REPOS, MAX_DISPLAY_SIZE +from gitingest import clone_repo, ingest_from_query, parse_query + +from server_utils import Colors, logSliderToSize + +templates = Jinja2Templates(directory="templates") + + +def print_query( + query: Dict[str, Any], + request: Request, + max_file_size: int, + pattern_type: str, + pattern: str, +) -> None: + print(f"{Colors.WHITE}{query['url']:<20}{Colors.END}", end="") + if int(max_file_size / 1024) != 50: + print(f" | {Colors.YELLOW}Size: 
{int(max_file_size/1024)}kb{Colors.END}", end="") + if pattern_type == "include" and pattern != "": + print(f" | {Colors.YELLOW}Include {pattern}{Colors.END}", end="") + elif pattern_type == "exclude" and pattern != "": + print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="") + + +def print_error( + query: Dict[str, Any], + request: Request, + e: Exception, + max_file_size: int, + pattern_type: str, + pattern: str, +) -> None: + print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") + print_query(query, request, max_file_size, pattern_type, pattern) + print(f" | {Colors.RED}{e}{Colors.END}") + + +def print_success( + query: Dict[str, Any], + request: Request, + max_file_size: int, + pattern_type: str, + pattern: str, + summary: str, +) -> None: + estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] + print(f"{Colors.GREEN}INFO{Colors.END}: {Colors.GREEN}<- {Colors.END}", end="") + print_query(query, request, max_file_size, pattern_type, pattern) + print(f" | {Colors.PURPLE}{estimated_tokens}{Colors.END}") + + +async def process_query( + request: Request, + input_text: str, + slider_position: int, + pattern_type: str = "exclude", + pattern: str = "", + is_index: bool = False, +) -> _TemplateResponse: + template = "index.jinja" if is_index else "github.jinja" + max_file_size = logSliderToSize(slider_position) + + if pattern_type == "include": + include_patterns = pattern + exclude_patterns = None + elif pattern_type == "exclude": + exclude_patterns = pattern + include_patterns = None + + try: + query = parse_query( + source=input_text, + max_file_size=max_file_size, + from_web=True, + include_patterns=include_patterns, + ignore_patterns=exclude_patterns, + ) + await clone_repo(query) + summary, tree, content = ingest_from_query(query) + with open(f"{query['local_path']}.txt", "w") as f: + f.write(tree + "\n" + content) + + except Exception as e: + # hack to print error message when query is not defined 
+ if 'query' in locals() and query is not None and isinstance(query, dict): + print_error(query, request, e, max_file_size, pattern_type, pattern) + else: + print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") + print(f"{Colors.RED}{e}{Colors.END}") + + return templates.TemplateResponse( + template, + { + "request": request, + "github_url": input_text, + "error_message": f"Error: {e}", + "examples": EXAMPLE_REPOS if is_index else [], + "default_file_size": slider_position, + "pattern_type": pattern_type, + "pattern": pattern, + }, + ) + + if len(content) > MAX_DISPLAY_SIZE: + content = ( + f"(Files content cropped to {int(MAX_DISPLAY_SIZE / 1_000)}k characters, " + "download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] + ) + + print_success( + query=query, + request=request, + max_file_size=max_file_size, + pattern_type=pattern_type, + pattern=pattern, + summary=summary, + ) + + return templates.TemplateResponse( + template, + { + "request": request, + "github_url": input_text, + "result": True, + "summary": summary, + "tree": tree, + "content": content, + "examples": EXAMPLE_REPOS if is_index else [], + "ingest_id": query['id'], + "default_file_size": slider_position, + "pattern_type": pattern_type, + "pattern": pattern, + }, + ) + +================ +File: src/server_utils.py +================ +import math + +## Rate Limiter +from slowapi import Limiter +from slowapi.util import get_remote_address + +limiter = Limiter(key_func=get_remote_address) + + +## Logarithmic slider to file size conversion +def logSliderToSize(position: int) -> int: + """Convert slider position to file size in KB""" + maxp = 500 + minv = math.log(1) + maxv = math.log(102400) + + return round(math.exp(minv + (maxv - minv) * pow(position / maxp, 1.5))) * 1024 + + +## Color printing utility +class Colors: + """ANSI color codes""" + + BLACK = "\033[0;30m" + RED = "\033[0;31m" + GREEN = "\033[0;32m" + BROWN = "\033[0;33m" + BLUE = "\033[0;34m" + PURPLE = 
"\033[0;35m" + CYAN = "\033[0;36m" + LIGHT_GRAY = "\033[0;37m" + DARK_GRAY = "\033[1;30m" + LIGHT_RED = "\033[1;31m" + LIGHT_GREEN = "\033[1;32m" + YELLOW = "\033[1;33m" + LIGHT_BLUE = "\033[1;34m" + LIGHT_PURPLE = "\033[1;35m" + LIGHT_CYAN = "\033[1;36m" + WHITE = "\033[1;37m" + BOLD = "\033[1m" + FAINT = "\033[2m" + ITALIC = "\033[3m" + UNDERLINE = "\033[4m" + BLINK = "\033[5m" + NEGATIVE = "\033[7m" + CROSSED = "\033[9m" + END = "\033[0m" + +================ +File: .dockerignore +================ +# Git +.git +.gitignore + +# Python +__pycache__ +*.pyc +*.pyo +*.pyd +.Python +env +pip-log.txt +pip-delete-this-directory.txt +.tox +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.log + +# Virtual environment +venv +.env +.venv +ENV + +# IDE +.idea +.vscode +*.swp +*.swo + +# Project specific +docs/ +tests/ +*.md +LICENSE +pytest.ini +setup.py + +================ +File: .gitignore +================ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +tmp/* + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +.python-version + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ +.vscode/settings.json +.DS_Store + +# Project specific +history.txt +cleanup.py +Caddyfile + +# ignore default output directory +tmp/* + +================ +File: .pre-commit-config.yaml +================ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + # Files + - id: check-added-large-files + description: 'Prevent large files from being committed.' + args: ['--maxkb=10000'] + - id: check-case-conflict + description: 'Check for files that would conflict in case-insensitive filesystems.' + - id: fix-byte-order-marker + description: 'Remove utf-8 byte order marker.' + - id: mixed-line-ending + description: 'Replace mixed line ending.' + + # Links + - id: destroyed-symlinks + description: 'Detect symlinks which are changed to regular files with a content of a path which that symlink was pointing to.' + + # File files for parseable syntax: python + - id: check-ast + + # File and line endings + - id: end-of-file-fixer + description: 'Ensure that a file is either empty, or ends with one newline.' 
+ - id: trailing-whitespace + description: 'Trim trailing whitespace.' + + # Python + - id: check-docstring-first + description: 'Check a common error of defining a docstring after code.' + - id: requirements-txt-fixer + description: 'Sort entries in requirements.txt.' + + - repo: https://github.com/MarcoGorelli/absolufy-imports + rev: v0.3.1 + hooks: + - id: absolufy-imports + description: 'Automatically convert relative imports to absolute. (Use `args: [--never]` to revert.)' + + - repo: https://github.com/psf/black + rev: 24.10.0 + hooks: + - id: black + + - repo: https://github.com/asottile/pyupgrade + rev: v3.19.1 + hooks: + - id: pyupgrade + description: 'Automatically upgrade syntax for newer versions.' + args: [--py3-plus, --py36-plus, --py38-plus] + + - repo: https://github.com/pre-commit/pygrep-hooks + rev: v1.10.0 + hooks: + - id: python-check-blanket-noqa + description: 'Enforce that `noqa` annotations always occur with specific codes. Sample annotations: `# noqa: F401`, `# noqa: F401,W203`.' + - id: python-check-blanket-type-ignore + description: 'Enforce that `# type: ignore` annotations always occur with specific codes. Sample annotations: `# type: ignore[attr-defined]`, `# type: ignore[attr-defined, name-defined]`.' + - id: python-use-type-annotations + description: 'Enforce that python3.6+ type annotations are used instead of type comments.' + + - repo: https://github.com/PyCQA/isort + rev: 5.13.2 + hooks: + - id: isort + description: 'Sort imports alphabetically, and automatically separated into sections and by type.' + + - repo: https://github.com/hadialqattan/pycln + rev: v2.4.0 + hooks: + - id: pycln + description: 'Remove unused import statements.' 
+ + - repo: https://github.com/djlint/djLint + rev: v1.36.4 + hooks: + - id: djlint-reformat-jinja + +================ +File: CODE_OF_CONDUCT.md +================ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, 
+or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. 
Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), +version 2.0, available at +. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +For answers to common questions about this code of conduct, see the FAQ at +. Translations are available at +. + +================ +File: Dockerfile +================ +# Build stage +FROM python:3.12-slim AS builder + +WORKDIR /build + +# Copy requirements first to leverage Docker cache +COPY requirements.txt . 
+ +# Install build dependencies and Python packages +RUN apt-get update \ + && apt-get install -y --no-install-recommends gcc python3-dev \ + && pip install --no-cache-dir --upgrade pip \ + && pip install --no-cache-dir --timeout 1000 -r requirements.txt \ + && rm -rf /var/lib/apt/lists/* + +# Runtime stage +FROM python:3.12-slim + +# Set Python environment variables +ENV PYTHONUNBUFFERED=1 +ENV PYTHONDONTWRITEBYTECODE=1 + +# Install git +RUN apt-get update \ + && apt-get install -y --no-install-recommends git \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Create a non-root user +RUN useradd -m -u 1000 appuser + +COPY --from=builder /usr/local/lib/python3.12/site-packages/ /usr/local/lib/python3.12/site-packages/ +COPY src/ ./ + +# Change ownership of the application files +RUN chown -R appuser:appuser /app + +# Switch to non-root user +USER appuser + +EXPOSE 8000 + +CMD ["python", "-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] + +================ +File: LICENSE +================ +MIT License + +Copyright (c) 2024 Romain Courtois + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +================ +File: pyproject.toml +================ +[tool.pylint.format] +max-line-length = 119 + +[tool.pycln] +all = true + +[tool.isort] +profile = "black" +line_length = 119 +remove_redundant_aliases = true +float_to_top = true +order_by_type = true +filter_files = true + +[tool.black] +line-length = 119 +skip-string-normalization = true + +================ +File: pytest.ini +================ +[pytest] +pythonpath = src +testpaths = src/gitingest/tests +asyncio_mode = auto + +python_files = test_*.py +python_classes = Test* +python_functions = test_* + +================ +File: README.md +================ +[![Image](./docs/frontpage.png "GitIngest main page")](https://gitingest.com) + + + + License + + + + PyPI version + + + + Downloads + + + + GitHub issues + + + + Code style: black + + + + + Discord + + +# GitIngest + +Turn any Git repository into a prompt-friendly text ingest for LLMs. + +You can also replace `hub` with `ingest` in any github url to access the coresponding digest + +[gitingest.com](https://gitingest.com) + +## 🚀 Features + +- **Easy code context**: Get a text digest from a git repository URL or a directory +- **Smart Formatting**: Optimized output format for LLM prompts +- **Statistics about**: + - File and directory structure + - Size of the extract + - Token count +- **CLI tool**: Run it as a command (Currently on Linux only) +- **Python package**: Import it in your code + +## 📦 Installation + +``` bash +pip install gitingest +``` + +## 💡 Command Line usage + +The `gitingest` command line tool allows you to analyze codebases and create a text dump of their contents. 
+ +```bash +# Basic usage +gitingest /path/to/directory + +# From url +gitingest https://github.com/cyclotruc/gitingest + +# See more options +gitingest --help +``` + +This will write the digest in a text file (default `digest.txt`) in your current working directory. + +## 🐛 Python package usage + +```python +from gitingest import ingest + +summary, tree, content = ingest("path/to/directory") + +# or from URL +summary, tree, content = ingest("https://github.com/cyclotruc/gitingest") +``` + +By default, this won't write a file but can be enabled with the `output` argument + +## 🛠️ Using + +- Tailwind CSS - Frontend +- [FastAPI](https://github.com/fastapi/fastapi) - Backend framework +- [tiktoken](https://github.com/openai/tiktoken) - Token estimation +- [apianalytics.dev](https://www.apianalytics.dev/) - Simple Analytics + +## 🌐 Self-host + +1. Build the image: + +``` bash +docker build -t gitingest . +``` + +2. Run the container: + +``` bash +docker run -d --name gitingest -p 8000:8000 gitingest +``` + +The application will be available at `http://localhost:8000` +Ensure environment variables are set before running the application or deploying it via Docker. + +## ✔️ Contributing + +Contributions are welcome! + +Gitingest aims to be friendly for first time contributors, with a simple python and html codebase. If you need any help while working with the code, reach out to us on [discord](https://discord.com/invite/zerRaGK9EC) + +### Ways to contribute + +1. Provide your feedback and ideas on discord +2. Open an Issue on github to report a bug +3. Create a Pull request + - Fork the repository + - Make your changes and test them locally + - Open a pull request for review and feedback + +### 🔧 Local dev + +#### Environment Configuration + +- **`ALLOWED_HOSTS`**: Specify allowed hostnames for the application. Default: `"gitingest.com,*.gitingest.com,gitdigest.dev,localhost"`. 
+You can configure the application using the following environment variables: + +```bash +ALLOWED_HOSTS="gitingest.local,localhost" +``` + +#### Run locally + +1. Clone the repository + +```bash +git clone https://github.com/cyclotruc/gitingest.git +cd gitingest +``` + +2. Install dependencies + +```bash +pip install -r requirements.txt +``` + +3. Run the application: + +```bash +cd src +uvicorn main:app --reload +``` + +The frontend will be available at `localhost:8000` + +================ +File: requirements.txt +================ +black +click>=8.0.0 +djlint +dotenv +fastapi-analytics +fastapi[standard] +pre-commit +pytest +pytest-asyncio +slowapi +starlette +tiktoken +uvicorn + +================ +File: SECURITY.md +================ +# Security Policy + +## Reporting a Vulnerability + +If you have discovered a vulnerability inside the project, report it privately at . This way the maintainer can work on a proper fix without disclosing the problem to the public before it has been solved. 
+ +================ +File: setup.py +================ +from setuptools import find_packages, setup + +setup( + name="gitingest", + version="0.1.2", + packages=find_packages(where="src"), + package_dir={"": "src"}, + include_package_data=True, + install_requires=[ + "click>=8.0.0", + "tiktoken", + ], + entry_points={ + "console_scripts": [ + "gitingest=gitingest.cli:main", + ], + }, + python_requires=">=3.6", + author="Romain Courtois", + author_email="romain@coderamp.io", + description="CLI tool to analyze and create text dumps of codebases for LLMs", + long_description=open("README.md").read(), + long_description_content_type="text/markdown", + url="https://github.com/cyclotruc/gitingest", + classifiers=[ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + ], +) diff --git a/src/gitingest/tests/test_clone.py b/src/gitingest/tests/test_clone.py index 950c9908..680181c8 100644 --- a/src/gitingest/tests/test_clone.py +++ b/src/gitingest/tests/test_clone.py @@ -14,7 +14,7 @@ async def test_clone_repo_with_commit() -> None: 'local_path': '/tmp/repo', } - with patch('clone.check_repo_exists', return_value=True) as mock_check: + with patch('gitingest.clone.check_repo_exists', return_value=True) as mock_check: with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() mock_process.communicate.return_value = (b'output', b'error') @@ -34,7 +34,7 @@ async def test_clone_repo_without_commit() -> None: 'local_path': '/tmp/repo', } - with patch('clone.check_repo_exists', return_value=True) as mock_check: + with patch('gitingest.clone.check_repo_exists', return_value=True) as mock_check: with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() mock_process.communicate.return_value = (b'output', b'error') diff --git a/src/process_query.py b/src/process_query.py index 
f2af1bbc..466b11d2 100644 --- a/src/process_query.py +++ b/src/process_query.py @@ -5,7 +5,9 @@ from starlette.templating import _TemplateResponse from config import EXAMPLE_REPOS, MAX_DISPLAY_SIZE -from gitingest import clone_repo, ingest_from_query, parse_query +from gitingest.clone import clone_repo +from gitingest.ingest_from_query import ingest_from_query +from gitingest.parse_query import parse_query from server_utils import Colors, logSliderToSize templates = Jinja2Templates(directory="templates") From 265527891db6606f4e3d69491470e20693ccf6ac Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Fri, 27 Dec 2024 23:06:17 +0100 Subject: [PATCH 4/9] Replace deprecated 'dotenv' with 'python-dotenv' in requirements.txt to resolve installation errors. --- repomix-output.txt | 3335 -------------------------------------------- requirements.txt | 2 +- 2 files changed, 1 insertion(+), 3336 deletions(-) delete mode 100644 repomix-output.txt diff --git a/repomix-output.txt b/repomix-output.txt deleted file mode 100644 index 23663041..00000000 --- a/repomix-output.txt +++ /dev/null @@ -1,3335 +0,0 @@ -This file is a merged representation of the entire codebase, combining all repository files into a single document. -Generated by Repomix on: 2024-12-27T21:21:09.225Z - -================================================================ -File Summary -================================================================ - -Purpose: --------- -This file contains a packed representation of the entire repository's contents. -It is designed to be easily consumable by AI systems for analysis, code review, -or other automated processes. - -File Format: ------------- -The content is organized as follows: -1. This summary section -2. Repository information -3. Repository structure -4. Multiple file entries, each consisting of: - a. A separator line (================) - b. The file path (File: path/to/file) - c. 
Another separator line - d. The full contents of the file - e. A blank line - -Usage Guidelines: ------------------ -- This file should be treated as read-only. Any changes should be made to the - original repository files, not this packed version. -- When processing this file, use the file path to distinguish - between different files in the repository. -- Be aware that this file may contain sensitive information. Handle it with - the same level of security as you would the original repository. - -Notes: ------- -- Some files may have been excluded based on .gitignore rules and Repomix's - configuration. -- Binary files are not included in this packed representation. Please refer to - the Repository Structure section for a complete list of file paths, including - binary files. - -Additional Info: ----------------- - -For more information about Repomix, visit: https://github.com/yamadashy/repomix - -================================================================ -Repository Structure -================================================================ -.github/ - workflows/ - unitest.yml -src/ - gitingest/ - tests/ - conftest.py - test_clone.py - test_ingest.py - test_parse_query.py - __init__.py - cli.py - clone.py - ignore_patterns.py - ingest_from_query.py - ingest.py - parse_query.py - utils.py - routers/ - __init__.py - download.py - dynamic.py - index.py - static/ - js/ - snow.js - utils.js - favicon.svg - robots.txt - templates/ - components/ - footer.jinja - github_form.jinja - navbar.jinja - result.jinja - api.jinja - base.jinja - github.jinja - index.jinja - config.py - main.py - process_query.py - server_utils.py -.dockerignore -.gitignore -.pre-commit-config.yaml -CODE_OF_CONDUCT.md -Dockerfile -LICENSE -pyproject.toml -pytest.ini -README.md -requirements.txt -SECURITY.md -setup.py - -================================================================ -Repository Files -================================================================ - -================ 
-File: .github/workflows/unitest.yml -================ -name: Unit Tests - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - -jobs: - test: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] - - steps: - - uses: actions/checkout@v4 - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pytest pytest-asyncio - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - pip install -e . - - - name: Run tests - run: | - pytest - -================ -File: src/gitingest/tests/conftest.py -================ -import os -import sys - -# Get the absolute path of the project root directory (one level up from tests) -project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - -# Add both the project root and src directory to PYTHONPATH -sys.path.insert(0, project_root) -sys.path.insert(0, os.path.join(project_root, 'src')) - -================ -File: src/gitingest/tests/test_clone.py -================ -from unittest.mock import AsyncMock, patch - -import pytest - -from clone import clone_repo -from gitingest.clone import check_repo_exists - - -@pytest.mark.asyncio -async def test_clone_repo_with_commit() -> None: - query = { - 'commit': 'a' * 40, # Simulating a valid commit hash - 'branch': 'main', - 'url': 'https://github.com/user/repo', - 'local_path': '/tmp/repo', - } - - with patch('clone.check_repo_exists', return_value=True) as mock_check: - with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: - mock_process = AsyncMock() - mock_process.communicate.return_value = (b'output', b'error') - mock_exec.return_value = mock_process - - await clone_repo(query) - mock_check.assert_called_once_with(query['url']) - assert mock_exec.call_count == 2 # Clone and checkout calls - - 
-@pytest.mark.asyncio -async def test_clone_repo_without_commit() -> None: - query = { - 'commit': None, - 'branch': 'main', - 'url': 'https://github.com/user/repo', - 'local_path': '/tmp/repo', - } - - with patch('clone.check_repo_exists', return_value=True) as mock_check: - with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: - mock_process = AsyncMock() - mock_process.communicate.return_value = (b'output', b'error') - mock_exec.return_value = mock_process - - await clone_repo(query) - mock_check.assert_called_once_with(query['url']) - assert mock_exec.call_count == 1 # Only clone call - - -@pytest.mark.asyncio -async def test_clone_repo_nonexistent_repository() -> None: - query = { - 'commit': None, - 'branch': 'main', - 'url': 'https://github.com/user/nonexistent-repo', - 'local_path': '/tmp/repo', - } - - with patch('gitingest.clone.check_repo_exists', return_value=False) as mock_check: - with pytest.raises(ValueError, match="Repository not found"): - await clone_repo(query) - mock_check.assert_called_once_with(query['url']) - - -@pytest.mark.asyncio -async def test_check_repo_exists() -> None: - url = "https://github.com/user/repo" - - with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: - mock_process = AsyncMock() - mock_process.communicate.return_value = (b'HTTP/1.1 200 OK\n', b'') - mock_exec.return_value = mock_process - - # Test existing repository - mock_process.returncode = 0 - assert await check_repo_exists(url) is True - - # Test non-existing repository (404 response) - mock_process.communicate.return_value = (b'HTTP/1.1 404 Not Found\n', b'') - mock_process.returncode = 0 - assert await check_repo_exists(url) is False - - # Test failed request - mock_process.returncode = 1 - assert await check_repo_exists(url) is False - -================ -File: src/gitingest/tests/test_ingest.py -================ -from pathlib import Path -from typing import Any, Dict - -import pytest - -from 
gitingest.ingest_from_query import extract_files_content, scan_directory - - -# Test fixtures -@pytest.fixture -def sample_query() -> Dict[str, Any]: - return { - 'user_name': 'test_user', - 'repo_name': 'test_repo', - 'local_path': '/tmp/test_repo', - 'subpath': '/', - 'branch': 'main', - 'commit': None, - 'max_file_size': 1_000_000, - 'slug': 'test_user/test_repo', - 'ignore_patterns': ['*.pyc', '__pycache__', '.git'], - 'include_patterns': None, - 'pattern_type': 'exclude', - } - - -@pytest.fixture -def temp_directory(tmp_path: Path) -> Path: - # Creates the following structure: - # test_repo/ - # ├── file1.txt - # ├── file2.py - # └── src/ - # | ├── subfile1.txt - # | └── subfile2.py - # | └── subdir/ - # | └── file_subdir.txt - # | └── file_subdir.py - # └── dir1/ - # | └── file_dir1.txt - # └── dir2/ - # └── file_dir2.txt - - test_dir = tmp_path / "test_repo" - test_dir.mkdir() - - # Root files - (test_dir / "file1.txt").write_text("Hello World") - (test_dir / "file2.py").write_text("print('Hello')") - - # src directory and its files - src_dir = test_dir / "src" - src_dir.mkdir() - (src_dir / "subfile1.txt").write_text("Hello from src") - (src_dir / "subfile2.py").write_text("print('Hello from src')") - - # src/subdir and its files - subdir = src_dir / "subdir" - subdir.mkdir() - (subdir / "file_subdir.txt").write_text("Hello from subdir") - (subdir / "file_subdir.py").write_text("print('Hello from subdir')") - - # dir1 and its file - dir1 = test_dir / "dir1" - dir1.mkdir() - (dir1 / "file_dir1.txt").write_text("Hello from dir1") - - # dir2 and its file - dir2 = test_dir / "dir2" - dir2.mkdir() - (dir2 / "file_dir2.txt").write_text("Hello from dir2") - - return test_dir - - -def test_scan_directory(temp_directory: Path, sample_query: Dict[str, Any]) -> None: - result = scan_directory(str(temp_directory), query=sample_query) - if result is None: - assert False, "Result is None" - - assert result['type'] == 'directory' - assert result['file_count'] == 8 # All 
.txt and .py files - assert result['dir_count'] == 4 # src, src/subdir, dir1, dir2 - assert len(result['children']) == 5 # file1.txt, file2.py, src, dir1, dir2 - - -def test_extract_files_content(temp_directory: Path, sample_query: Dict[str, Any]) -> None: - nodes = scan_directory(str(temp_directory), query=sample_query) - if nodes is None: - assert False, "Nodes is None" - files = extract_files_content(query=sample_query, node=nodes, max_file_size=1_000_000) - assert len(files) == 8 # All .txt and .py files - - # Check for presence of key files - paths = [f['path'] for f in files] - assert any('file1.txt' in p for p in paths) - assert any('subfile1.txt' in p for p in paths) - assert any('file2.py' in p for p in paths) - assert any('subfile2.py' in p for p in paths) - assert any('file_subdir.txt' in p for p in paths) - assert any('file_dir1.txt' in p for p in paths) - assert any('file_dir2.txt' in p for p in paths) - - -# TODO: test with include patterns: ['*.txt'] -# TODO: test with wrong include patterns: ['*.qwerty'] - - -# single folder patterns -# TODO: test with include patterns: ['src/*'] -# TODO: test with include patterns: ['/src/*'] -# TODO: test with include patterns: ['/src/'] -# TODO: test with include patterns: ['/src*'] - -# multiple patterns -# TODO: test with multiple include patterns: ['*.txt', '*.py'] -# TODO: test with multiple include patterns: ['/src/*', '*.txt'] -# TODO: test with multiple include patterns: ['/src*', '*.txt'] - -================ -File: src/gitingest/tests/test_parse_query.py -================ -import pytest - -from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS -from gitingest.parse_query import parse_query, parse_url - - -def test_parse_url_valid() -> None: - test_cases = [ - "https://github.com/user/repo", - "https://gitlab.com/user/repo", - "https://bitbucket.org/user/repo", - ] - for url in test_cases: - result = parse_url(url) - assert result["user_name"] == "user" - assert result["repo_name"] == "repo" - 
assert result["url"] == url - - -def test_parse_url_invalid() -> None: - url = "https://only-domain.com" - with pytest.raises(ValueError, match="Invalid repository URL"): - parse_url(url) - - -def test_parse_query_basic() -> None: - test_cases = ["https://github.com/user/repo", "https://gitlab.com/user/repo"] - for url in test_cases: - result = parse_query(url, max_file_size=50, from_web=True, ignore_patterns='*.txt') - assert result["user_name"] == "user" - assert result["repo_name"] == "repo" - assert result["url"] == url - assert "*.txt" in result["ignore_patterns"] - - -def test_parse_query_include_pattern() -> None: - url = "https://github.com/user/repo" - result = parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py') - assert result["include_patterns"] == ["*.py"] - assert result["ignore_patterns"] == DEFAULT_IGNORE_PATTERNS - - -def test_parse_query_invalid_pattern() -> None: - url = "https://github.com/user/repo" - with pytest.raises(ValueError, match="Pattern.*contains invalid characters"): - parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py;rm -rf') - -================ -File: src/gitingest/__init__.py -================ -from gitingest.clone import clone_repo -from gitingest.ingest import ingest -from gitingest.ingest_from_query import ingest_from_query -from gitingest.parse_query import parse_query - -__all__ = ["ingest_from_query", "clone_repo", "parse_query", "ingest"] - -================ -File: src/gitingest/cli.py -================ -import os -from typing import Optional, Tuple - -import click - -from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS -from gitingest.ingest import ingest -from gitingest.ingest_from_query import MAX_FILE_SIZE - - -def normalize_pattern(pattern: str) -> str: - pattern = pattern.strip() - pattern = pattern.lstrip(os.sep) - if pattern.endswith(os.sep): - pattern += "*" - return pattern - - -@click.command() -@click.argument('source', type=str, required=True) 
-@click.option('--output', '-o', default=None, help='Output file path (default: .txt in current directory)') -@click.option('--max-size', '-s', default=MAX_FILE_SIZE, help='Maximum file size to process in bytes') -@click.option('--exclude-pattern', '-e', multiple=True, help='Patterns to exclude') -@click.option('--include-pattern', '-i', multiple=True, help='Patterns to include') -def main( - source: str, - output: Optional[str], - max_size: int, - exclude_pattern: Tuple[str, ...], - include_pattern: Tuple[str, ...], -) -> None: - """Analyze a directory and create a text dump of its contents.""" - try: - # Combine default and custom ignore patterns - exclude_patterns = list(exclude_pattern) - include_patterns = list(set(include_pattern)) - - if not output: - output = "digest.txt" - summary, tree, content = ingest(source, max_size, include_patterns, exclude_patterns, output=output) - - click.echo(f"Analysis complete! Output written to: {output}") - click.echo("\nSummary:") - click.echo(summary) - - except Exception as e: - click.echo(f"Error: {str(e)}", err=True) - raise click.Abort() - - -if __name__ == '__main__': - main() - -================ -File: src/gitingest/clone.py -================ -import asyncio -from typing import Any, Dict, Tuple - -from gitingest.utils import async_timeout - -CLONE_TIMEOUT = 20 - - -async def check_repo_exists(url: str) -> bool: - proc = await asyncio.create_subprocess_exec( - "curl", - "-I", - url, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate() - if proc.returncode != 0: - return False - # Check if stdout contains "404" status code - stdout_str = stdout.decode() - return "HTTP/1.1 404" not in stdout_str and "HTTP/2 404" not in stdout_str - - -@async_timeout(CLONE_TIMEOUT) -async def clone_repo(query: Dict[str, Any]) -> Tuple[bytes, bytes]: - if not await check_repo_exists(query['url']): - raise ValueError("Repository not found, make sure it is public") - - if 
query['commit']: - proc = await asyncio.create_subprocess_exec( - "git", - "clone", - "--single-branch", - query['url'], - query['local_path'], - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate() - - proc = await asyncio.create_subprocess_exec( - "git", - "-C", - query['local_path'], - "checkout", - query['branch'], - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate() - elif query['branch'] != 'main' and query['branch'] != 'master' and query['branch']: - proc = await asyncio.create_subprocess_exec( - "git", - "clone", - "--depth=1", - "--single-branch", - "--branch", - query['branch'], - query['url'], - query['local_path'], - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - else: - proc = await asyncio.create_subprocess_exec( - "git", - "clone", - "--depth=1", - "--single-branch", - query['url'], - query['local_path'], - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - - stdout, stderr = await proc.communicate() - - return stdout, stderr - -================ -File: src/gitingest/ignore_patterns.py -================ -from typing import List - -DEFAULT_IGNORE_PATTERNS: List[str] = [ - # Python - '*.pyc', - '*.pyo', - '*.pyd', - '__pycache__', - '.pytest_cache', - '.coverage', - '.tox', - '.nox', - '.mypy_cache', - '.ruff_cache', - '.hypothesis', - 'poetry.lock', - 'Pipfile.lock', - # JavaScript/Node - 'node_modules', - 'bower_components', - 'package-lock.json', - 'yarn.lock', - '.npm', - '.yarn', - '.pnpm-store', - # Version control - '.git', - '.svn', - '.hg', - '.gitignore', - '.gitattributes', - '.gitmodules', - # Images and media - '*.svg', - '*.png', - '*.jpg', - '*.jpeg', - '*.gif', - '*.ico', - '*.pdf', - '*.mov', - '*.mp4', - '*.mp3', - '*.wav', - # Virtual environments - 'venv', - '.venv', - 'env', - '.env', - 'virtualenv', - # IDEs and editors - '.idea', - '.vscode', - '.vs', - '*.swp', - 
'*.swo', - '*.swn', - '.settings', - '.project', - '.classpath', - '*.sublime-*', - # Temporary and cache files - '*.log', - '*.bak', - '*.swp', - '*.tmp', - '*.temp', - '.cache', - '.sass-cache', - '.eslintcache', - '.DS_Store', - 'Thumbs.db', - 'desktop.ini', - # Build directories and artifacts - 'build', - 'dist', - 'target', - 'out', - '*.egg-info', - '*.egg', - '*.whl', - '*.so', - '*.dylib', - '*.dll', - '*.class', - # Documentation - 'site-packages', - '.docusaurus', - '.next', - '.nuxt', - # Other common patterns - ## Minified files - '*.min.js', - '*.min.css', - ## Source maps - '*.map', - ## Terraform - '.terraform', - '*.tfstate*', - ## Dependencies in various languages - 'vendor/', -] - -================ -File: src/gitingest/ingest_from_query.py -================ -import os -from fnmatch import fnmatch -from typing import Any, Dict, List, Optional, Set, Tuple - -import tiktoken - -MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB -MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal -MAX_FILES = 10_000 # Maximum number of files to process -MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB - - -def should_include(path: str, base_path: str, include_patterns: List[str]) -> bool: - rel_path = path.replace(base_path, "").lstrip(os.sep) - include = False - for pattern in include_patterns: - if fnmatch(rel_path, pattern): - include = True - return include - - -def should_exclude(path: str, base_path: str, ignore_patterns: List[str]) -> bool: - rel_path = path.replace(base_path, "").lstrip(os.sep) - for pattern in ignore_patterns: - if pattern == '': - continue - if fnmatch(rel_path, pattern): - return True - return False - - -def is_safe_symlink(symlink_path: str, base_path: str) -> bool: - """Check if a symlink points to a location within the base directory.""" - try: - target_path = os.path.realpath(symlink_path) - base_path = os.path.realpath(base_path) - return os.path.commonpath([target_path, base_path]) == base_path - except (OSError, ValueError): - # 
If there's any error resolving the paths, consider it unsafe - return False - - -def is_text_file(file_path: str) -> bool: - """Determines if a file is likely a text file based on its content.""" - try: - with open(file_path, 'rb') as file: - chunk = file.read(1024) - return not bool(chunk.translate(None, bytes([7, 8, 9, 10, 12, 13, 27] + list(range(0x20, 0x100))))) - except OSError: - return False - - -def read_file_content(file_path: str) -> str: - try: - with open(file_path, encoding='utf-8', errors='ignore') as f: - return f.read() - except Exception as e: - return f"Error reading file: {str(e)}" - - -def scan_directory( - path: str, - query: Dict[str, Any], - seen_paths: Optional[Set[str]] = None, - depth: int = 0, - stats: Optional[Dict[str, int]] = None, -) -> Optional[Dict[str, Any]]: - """Recursively analyzes a directory and its contents with safety limits.""" - if seen_paths is None: - seen_paths = set() - if stats is None: - stats = {"total_files": 0, "total_size": 0} - - if depth > MAX_DIRECTORY_DEPTH: - print(f"Skipping deep directory: {path} (max depth {MAX_DIRECTORY_DEPTH} reached)") - return None - - if stats["total_files"] >= MAX_FILES: - print(f"Skipping further processing: maximum file limit ({MAX_FILES}) reached") - return None - - if stats["total_size"] >= MAX_TOTAL_SIZE_BYTES: - print(f"Skipping further processing: maximum total size ({MAX_TOTAL_SIZE_BYTES/1024/1024:.1f}MB) reached") - return None - - real_path = os.path.realpath(path) - if real_path in seen_paths: - print(f"Skipping already visited path: {path}") - return None - - seen_paths.add(real_path) - - result = { - "name": os.path.basename(path), - "type": "directory", - "size": 0, - "children": [], - "file_count": 0, - "dir_count": 0, - "path": path, - "ignore_content": False, - } - - ignore_patterns = query['ignore_patterns'] - base_path = query['local_path'] - include_patterns = query['include_patterns'] - - try: - for item in os.listdir(path): - item_path = os.path.join(path, 
item) - - if should_exclude(item_path, base_path, ignore_patterns): - continue - - is_file = os.path.isfile(item_path) - if is_file and query['include_patterns']: - if not should_include(item_path, base_path, include_patterns): - result["ignore_content"] = True - continue - - # Handle symlinks - if os.path.islink(item_path): - if not is_safe_symlink(item_path, base_path): - print(f"Skipping symlink that points outside base directory: {item_path}") - continue - real_path = os.path.realpath(item_path) - if real_path in seen_paths: - print(f"Skipping already visited symlink target: {item_path}") - continue - - if os.path.isfile(real_path): - file_size = os.path.getsize(real_path) - if stats["total_size"] + file_size > MAX_TOTAL_SIZE_BYTES: - print(f"Skipping file {item_path}: would exceed total size limit") - continue - - stats["total_files"] += 1 - stats["total_size"] += file_size - - if stats["total_files"] > MAX_FILES: - print(f"Maximum file limit ({MAX_FILES}) reached") - return result - - is_text = is_text_file(real_path) - content = read_file_content(real_path) if is_text else "[Non-text file]" - - child = { - "name": item, - "type": "file", - "size": file_size, - "content": content, - "path": item_path, - } - result["children"].append(child) - result["size"] += file_size - result["file_count"] += 1 - - elif os.path.isdir(real_path): - subdir = scan_directory( - path=real_path, - query=query, - seen_paths=seen_paths, - depth=depth + 1, - stats=stats, - ) - if subdir and (not include_patterns or subdir["file_count"] > 0): - subdir["name"] = item - subdir["path"] = item_path - result["children"].append(subdir) - result["size"] += subdir["size"] - result["file_count"] += subdir["file_count"] - result["dir_count"] += 1 + subdir["dir_count"] - continue - - if os.path.isfile(item_path): - file_size = os.path.getsize(item_path) - if stats["total_size"] + file_size > MAX_TOTAL_SIZE_BYTES: - print(f"Skipping file {item_path}: would exceed total size limit") - continue - 
- stats["total_files"] += 1 - stats["total_size"] += file_size - - if stats["total_files"] > MAX_FILES: - print(f"Maximum file limit ({MAX_FILES}) reached") - return result - - is_text = is_text_file(item_path) - content = read_file_content(item_path) if is_text else "[Non-text file]" - - child = { - "name": item, - "type": "file", - "size": file_size, - "content": content, - "path": item_path, - } - result["children"].append(child) - result["size"] += file_size - result["file_count"] += 1 - - elif os.path.isdir(item_path): - subdir = scan_directory( - path=item_path, - query=query, - seen_paths=seen_paths, - depth=depth + 1, - stats=stats, - ) - if subdir and (not include_patterns or subdir["file_count"] > 0): - result["children"].append(subdir) - result["size"] += subdir["size"] - result["file_count"] += subdir["file_count"] - result["dir_count"] += 1 + subdir["dir_count"] - - except PermissionError: - print(f"Permission denied: {path}") - - return result - - -def extract_files_content( - query: Dict[str, Any], - node: Dict[str, Any], - max_file_size: int, - files: Optional[List[Dict[str, Any]]] = None, -) -> List[Dict[str, Any]]: - """Recursively collects all text files with their contents.""" - if files is None: - files = [] - - if node["type"] == "file" and node["content"] != "[Non-text file]": - content = node["content"] - if node["size"] > max_file_size: - content = None - - files.append( - { - "path": node["path"].replace(query['local_path'], ""), - "content": content, - "size": node["size"], - }, - ) - elif node["type"] == "directory": - for child in node["children"]: - extract_files_content(query=query, node=child, max_file_size=max_file_size, files=files) - - return files - - -def create_file_content_string(files: List[Dict[str, Any]]) -> str: - """Creates a formatted string of file contents with separators.""" - output = "" - separator = "=" * 48 + "\n" - - # First add README.md if it exists - for file in files: - if not file['content']: - continue - - 
if file['path'].lower() == '/readme.md': - output += separator - output += f"File: {file['path']}\n" - output += separator - output += f"{file['content']}\n\n" - break - - # Then add all other files in their original order - for file in files: - if not file['content'] or file['path'].lower() == '/readme.md': - continue - - output += separator - output += f"File: {file['path']}\n" - output += separator - output += f"{file['content']}\n\n" - - return output - - -def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any], files: List[Dict[str, Any]]) -> str: - """Creates a summary string with file counts and content size.""" - if "user_name" in query: - summary = f"Repository: {query['user_name']}/{query['repo_name']}\n" - else: - summary = f"Repository: {query['slug']}\n" - - summary += f"Files analyzed: {nodes['file_count']}\n" - - if 'subpath' in query and query['subpath'] != '/': - summary += f"Subpath: {query['subpath']}\n" - if 'commit' in query and query['commit']: - summary += f"Commit: {query['commit']}\n" - elif 'branch' in query and query['branch'] != 'main' and query['branch'] != 'master' and query['branch']: - summary += f"Branch: {query['branch']}\n" - - return summary - - -def create_tree_structure( - query: Dict[str, Any], - node: Dict[str, Any], - prefix: str = "", - is_last: bool = True, -) -> str: - """Creates a tree-like string representation of the file structure.""" - tree = "" - - if not node["name"]: - node["name"] = query['slug'] - - if node["name"]: - current_prefix = "└── " if is_last else "├── " - name = node["name"] + "/" if node["type"] == "directory" else node["name"] - tree += prefix + current_prefix + name + "\n" - - if node["type"] == "directory": - # Adjust prefix only if we added a node name - new_prefix = prefix + (" " if is_last else "│ ") if node["name"] else prefix - children = node["children"] - for i, child in enumerate(children): - tree += create_tree_structure(query, child, new_prefix, i == len(children) - 1) - - 
return tree - - -def generate_token_string(context_string: str) -> Optional[str]: - """Returns the number of tokens in a text string.""" - formatted_tokens = "" - try: - encoding = tiktoken.get_encoding("cl100k_base") - total_tokens = len(encoding.encode(context_string, disallowed_special=())) - - except Exception as e: - print(e) - return None - - if total_tokens > 1_000_000: - formatted_tokens = f"{total_tokens / 1_000_000:.1f}M" - elif total_tokens > 1_000: - formatted_tokens = f"{total_tokens / 1_000:.1f}k" - else: - formatted_tokens = f"{total_tokens}" - - return formatted_tokens - - -def ingest_single_file(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]: - if not os.path.isfile(path): - raise ValueError(f"Path {path} is not a file") - - file_size = os.path.getsize(path) - is_text = is_text_file(path) - if not is_text: - raise ValueError(f"File {path} is not a text file") - - content = read_file_content(path) - if file_size > query['max_file_size']: - content = "[Content ignored: file too large]" - - file_info = { - "path": path.replace(query['local_path'], ""), - "content": content, - "size": file_size, - } - - summary = ( - f"Repository: {query['user_name']}/{query['repo_name']}\n" - f"File: {os.path.basename(path)}\n" - f"Size: {file_size:,} bytes\n" - f"Lines: {len(content.splitlines()):,}\n" - ) - - files_content = create_file_content_string([file_info]) - tree = "Directory structure:\n└── " + os.path.basename(path) - - formatted_tokens = generate_token_string(files_content) - if formatted_tokens: - summary += f"\nEstimated tokens: {formatted_tokens}" - - return summary, tree, files_content - - -def ingest_directory(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]: - nodes = scan_directory(path=path, query=query) - if not nodes: - raise ValueError(f"No files found in {path}") - files = extract_files_content(query=query, node=nodes, max_file_size=query['max_file_size']) - summary = create_summary_string(query, nodes, files) - tree = 
"Directory structure:\n" + create_tree_structure(query, nodes) - files_content = create_file_content_string(files) - - formatted_tokens = generate_token_string(tree + files_content) - if formatted_tokens: - summary += f"\nEstimated tokens: {formatted_tokens}" - - return summary, tree, files_content - - -def ingest_from_query(query: Dict[str, Any]) -> Tuple[str, str, str]: - """Main entry point for analyzing a codebase directory or single file.""" - path = f"{query['local_path']}{query['subpath']}" - if not os.path.exists(path): - raise ValueError(f"{query['slug']} cannot be found") - - if query.get('type') == 'blob': - return ingest_single_file(path, query) - - return ingest_directory(path, query) - -================ -File: src/gitingest/ingest.py -================ -import asyncio -import inspect -import shutil -from pathlib import Path -from typing import List, Optional, Tuple, Union - -from gitingest.clone import clone_repo -from gitingest.ingest_from_query import ingest_from_query -from gitingest.parse_query import parse_query - - -def ingest( - source: str, - max_file_size: int = 10 * 1024 * 1024, - include_patterns: Union[List[str], str, None] = None, - exclude_patterns: Union[List[str], str, None] = None, - output: Optional[str] = None, -) -> Tuple[str, str, str]: - try: - query = parse_query( - source=source, - max_file_size=max_file_size, - from_web=False, - include_patterns=include_patterns, - ignore_patterns=exclude_patterns, - ) - if query['url']: - clone_result = clone_repo(query) - if inspect.iscoroutine(clone_result): - asyncio.run(clone_result) - else: - raise TypeError("clone_repo did not return a coroutine as expected.") - - summary, tree, content = ingest_from_query(query) - - if output: - with open(f"{output}", "w") as f: - f.write(tree + "\n" + content) - - return summary, tree, content - - finally: - # Clean up the temporary directory if it was created - if query['url']: - # Get parent directory two levels up from local_path (../tmp) - 
cleanup_path = str(Path(query['local_path']).parents[1]) - shutil.rmtree(cleanup_path, ignore_errors=True) - -================ -File: src/gitingest/parse_query.py -================ -import os -import uuid -from typing import Any, Dict, List, Optional, Union - -from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS - -TMP_BASE_PATH = "../tmp" - - -def parse_url(url: str) -> Dict[str, Any]: - parsed = { - "user_name": None, - "repo_name": None, - "type": None, - "branch": None, - "commit": None, - "subpath": "/", - "local_path": None, - "url": None, - "slug": None, - "id": None, - } - - url = url.split(" ")[0] - if not url.startswith('https://'): - url = 'https://' + url - - # Extract domain and path - url_parts = url.split('/') - domain = url_parts[2] - path_parts = url_parts[3:] - - if len(path_parts) < 2: - raise ValueError("Invalid repository URL. Please provide a valid Git repository URL.") - - parsed["user_name"] = path_parts[0] - parsed["repo_name"] = path_parts[1] - - # Keep original URL format - parsed["url"] = f"https://{domain}/{parsed['user_name']}/{parsed['repo_name']}" - parsed['slug'] = f"{parsed['user_name']}-{parsed['repo_name']}" - parsed["id"] = str(uuid.uuid4()) - parsed["local_path"] = f"{TMP_BASE_PATH}/{parsed['id']}/{parsed['slug']}" - - if len(path_parts) > 3: - parsed["type"] = path_parts[2] - parsed["branch"] = path_parts[3] - if len(parsed['branch']) == 40 and all(c in '0123456789abcdefABCDEF' for c in parsed['branch']): - parsed["commit"] = parsed['branch'] - - parsed["subpath"] = "/" + "/".join(path_parts[4:]) - - return parsed - - -def normalize_pattern(pattern: str) -> str: - pattern = pattern.strip() - pattern = pattern.lstrip(os.sep) - if pattern.endswith(os.sep): - pattern += "*" - return pattern - - -def parse_patterns(pattern: Union[List[str], str]) -> List[str]: - if isinstance(pattern, list): - pattern = ",".join(pattern) - - for p in pattern.split(","): - if not all(c.isalnum() or c in "-_./+*" for c in p.strip()): - 
raise ValueError( - f"Pattern '{p}' contains invalid characters. Only alphanumeric characters, dash (-), " - "underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed." - ) - patterns = [normalize_pattern(p) for p in pattern.split(",")] - return patterns - - -def override_ignore_patterns(ignore_patterns: List[str], include_patterns: List[str]) -> List[str]: - for pattern in include_patterns: - if pattern in ignore_patterns: - ignore_patterns.remove(pattern) - return ignore_patterns - - -def parse_path(path: str) -> Dict[str, Any]: - query = { - "local_path": os.path.abspath(path), - "slug": os.path.basename(os.path.dirname(path)) + "/" + os.path.basename(path), - "subpath": "/", - "id": str(uuid.uuid4()), - "url": None, - } - return query - - -def parse_query( - source: str, - max_file_size: int, - from_web: bool, - include_patterns: Optional[Union[List[str], str]] = None, - ignore_patterns: Optional[Union[List[str], str]] = None, -) -> Dict[str, Any]: - if from_web: - query = parse_url(source) - else: - if source.startswith("https://") or "github.com" in source: - query = parse_url(source) - else: - query = parse_path(source) - - query['max_file_size'] = max_file_size - - if ignore_patterns and ignore_patterns != "": - ignore_patterns = DEFAULT_IGNORE_PATTERNS + parse_patterns(ignore_patterns) - else: - ignore_patterns = DEFAULT_IGNORE_PATTERNS - - if include_patterns and include_patterns != "": - include_patterns = parse_patterns(include_patterns) - ignore_patterns = override_ignore_patterns(ignore_patterns, include_patterns) - else: - include_patterns = None - - query['ignore_patterns'] = ignore_patterns - query['include_patterns'] = include_patterns - - return query - -================ -File: src/gitingest/utils.py -================ -## Async Timeout decorator -import asyncio -import functools -from typing import Awaitable, Callable, ParamSpec, TypeVar - -T = TypeVar("T") -P = ParamSpec("P") - - -class AsyncTimeoutError(Exception): - 
"""Raised when an async operation exceeds its timeout limit.""" - - pass - - -def async_timeout(seconds: int = 10) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]: - def decorator(func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]: - @functools.wraps(func) - async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: - try: - return await asyncio.wait_for(func(*args, **kwargs), timeout=seconds) - except asyncio.TimeoutError: - raise AsyncTimeoutError(f"Operation timed out after {seconds} seconds") - - return wrapper - - return decorator - -================ -File: src/routers/__init__.py -================ -from routers.download import router as download -from routers.dynamic import router as dynamic -from routers.index import router as index - -__all__ = ["download", "dynamic", "index"] - -================ -File: src/routers/download.py -================ -import os - -from fastapi import APIRouter, HTTPException -from fastapi.responses import Response - -from config import TMP_BASE_PATH - -router = APIRouter() - - -@router.get("/download/{digest_id}") -async def download_ingest(digest_id: str) -> Response: - try: - # Find the first .txt file in the directory - directory = f"{TMP_BASE_PATH}/{digest_id}" - txt_files = [f for f in os.listdir(directory) if f.endswith('.txt')] - - if not txt_files: - raise FileNotFoundError("No .txt file found") - - with open(f"{directory}/{txt_files[0]}") as f: - content = f.read() - - return Response( - content=content, - media_type="text/plain", - headers={"Content-Disposition": f"attachment; filename={txt_files[0]}"}, - ) - except FileNotFoundError: - raise HTTPException(status_code=404, detail="Digest not found") - -================ -File: src/routers/dynamic.py -================ -from fastapi import APIRouter, Form, Request -from fastapi.responses import HTMLResponse -from fastapi.templating import Jinja2Templates - -from process_query import process_query -from server_utils import limiter - -router = 
APIRouter() -templates = Jinja2Templates(directory="templates") - - -@router.get("/{full_path:path}") -async def catch_all(request: Request, full_path: str) -> HTMLResponse: - return templates.TemplateResponse( - "github.jinja", - { - "request": request, - "github_url": f"https://github.com/{full_path}", - "loading": True, - "default_file_size": 243, - }, - ) - - -@router.post("/{full_path:path}", response_class=HTMLResponse) -@limiter.limit("10/minute") -async def process_catch_all( - request: Request, - input_text: str = Form(...), - max_file_size: int = Form(...), - pattern_type: str = Form(...), - pattern: str = Form(...), -) -> HTMLResponse: - return await process_query( - request, - input_text, - max_file_size, - pattern_type, - pattern, - is_index=False, - ) - -================ -File: src/routers/index.py -================ -from fastapi import APIRouter, Form, Request -from fastapi.responses import HTMLResponse -from fastapi.templating import Jinja2Templates - -from config import EXAMPLE_REPOS -from process_query import process_query -from server_utils import limiter - -router = APIRouter() -templates = Jinja2Templates(directory="templates") - - -@router.get("/", response_class=HTMLResponse) -async def home(request: Request) -> HTMLResponse: - return templates.TemplateResponse( - "index.jinja", - { - "request": request, - "examples": EXAMPLE_REPOS, - "default_file_size": 243, - }, - ) - - -@router.post("/", response_class=HTMLResponse) -@limiter.limit("10/minute") -async def index_post( - request: Request, - input_text: str = Form(...), - max_file_size: int = Form(...), - pattern_type: str = Form(...), - pattern: str = Form(...), -) -> HTMLResponse: - return await process_query( - request, - input_text, - max_file_size, - pattern_type, - pattern, - is_index=True, - ) - -================ -File: src/static/js/snow.js -================ -// Snow effect initialization -function initSnow() { - const snowCanvas = document.getElementById('snow-canvas'); - const ctx 
= snowCanvas.getContext('2d'); - - // Configure snow - const snowflakes = []; - const maxSnowflakes = 50; - const spawnInterval = 200; - let currentSnowflakes = 0; - let lastSpawnTime = 0; - - // Resize canvas to window size - function resizeCanvas() { - snowCanvas.width = window.innerWidth; - snowCanvas.height = window.innerHeight; - } - - // Initial setup - resizeCanvas(); - window.addEventListener('resize', resizeCanvas); - - // Snowflake class definition - class Snowflake { - constructor() { - this.reset(); - } - - reset() { - this.x = Math.random() * snowCanvas.width; - this.y = 0; - this.size = Math.random() * 3 + 2; - this.speed = Math.random() * 1 + 0.5; - this.wind = Math.random() * 0.5 - 0.25; - } - - update() { - this.y += this.speed; - this.x += this.wind; - - if (this.y > snowCanvas.height) { - this.reset(); - } - } - - draw() { - ctx.save(); - - ctx.shadowColor = 'rgba(0, 0, 0, 0.3)'; - ctx.shadowBlur = 5; - ctx.shadowOffsetX = 2; - ctx.shadowOffsetY = 2; - - ctx.beginPath(); - ctx.arc(this.x, this.y, this.size, 0, Math.PI * 2); - ctx.fillStyle = 'rgba(255, 255, 255, 1)'; - ctx.fill(); - - ctx.strokeStyle = 'rgba(200, 200, 200, 0.8)'; - ctx.lineWidth = 0.5; - ctx.stroke(); - - ctx.restore(); - } - } - - function animate(currentTime) { - ctx.clearRect(0, 0, snowCanvas.width, snowCanvas.height); - - if (currentSnowflakes < maxSnowflakes && currentTime - lastSpawnTime > spawnInterval) { - snowflakes.push(new Snowflake()); - currentSnowflakes++; - lastSpawnTime = currentTime; - } - - snowflakes.forEach(snowflake => { - snowflake.update(); - snowflake.draw(); - }); - - requestAnimationFrame(animate); - } - - requestAnimationFrame(animate); -} - -// Initialize snow when DOM content is loaded -document.addEventListener('DOMContentLoaded', initSnow); - -// Also initialize when the HTMX content is swapped -document.addEventListener('htmx:afterSettle', initSnow); - -================ -File: src/static/js/utils.js -================ -// Copy functionality 
-function copyText(className) { - const textarea = document.querySelector('.' + className); - const button = document.querySelector(`button[onclick="copyText('${className}')"]`); - if (!textarea || !button) return; - - // Copy text - navigator.clipboard.writeText(textarea.value) - .then(() => { - // Store original content - const originalContent = button.innerHTML; - - // Change button content - button.innerHTML = 'Copied!'; - - // Reset after 1 second - setTimeout(() => { - button.innerHTML = originalContent; - }, 1000); - }) - .catch(err => { - // Show error in button - const originalContent = button.innerHTML; - button.innerHTML = 'Failed to copy'; - setTimeout(() => { - button.innerHTML = originalContent; - }, 1000); - }); -} - - -function handleSubmit(event, showLoading = false) { - event.preventDefault(); - const form = event.target || document.getElementById('ingestForm'); - if (!form) return; - - const submitButton = form.querySelector('button[type="submit"]'); - if (!submitButton) return; - - const formData = new FormData(form); - - // Update file size - const slider = document.getElementById('file_size'); - if (slider) { - formData.delete('max_file_size'); - formData.append('max_file_size', slider.value); - } - - // Update pattern type and pattern - const patternType = document.getElementById('pattern_type'); - const pattern = document.getElementById('pattern'); - if (patternType && pattern) { - formData.delete('pattern_type'); - formData.delete('pattern'); - formData.append('pattern_type', patternType.value); - formData.append('pattern', pattern.value); - } - - const originalContent = submitButton.innerHTML; - const currentStars = document.getElementById('github-stars')?.textContent; - - if (showLoading) { - submitButton.disabled = true; - submitButton.innerHTML = ` -
- - - - - Processing... -
- `; - submitButton.classList.add('bg-[#ffb14d]'); - } - - // Submit the form - fetch(form.action, { - method: 'POST', - body: formData - }) - .then(response => response.text()) - .then(html => { - // Store the star count before updating the DOM - const starCount = currentStars; - - - // TEMPORARY SNOW LOGIC // - const parser = new DOMParser(); - const newDoc = parser.parseFromString(html, 'text/html'); - - const existingCanvas = document.getElementById('snow-canvas'); - document.body.innerHTML = newDoc.body.innerHTML; - if (existingCanvas) { - document.body.insertBefore(existingCanvas, document.body.firstChild); - } - // END TEMPORARY SNOW LOGIC // - - // Wait for next tick to ensure DOM is updated - setTimeout(() => { - // Reinitialize slider functionality - initializeSlider(); - - const starsElement = document.getElementById('github-stars'); - if (starsElement && starCount) { - starsElement.textContent = starCount; - } - - // Scroll to results if they exist - const resultsSection = document.querySelector('[data-results]'); - if (resultsSection) { - resultsSection.scrollIntoView({ behavior: 'smooth', block: 'start' }); - } - }, 0); - }) - .catch(error => { - submitButton.disabled = false; - submitButton.innerHTML = originalContent; - }); -} - -function copyFullDigest() { - const directoryStructure = document.querySelector('.directory-structure').value; - const filesContent = document.querySelector('.result-text').value; - const fullDigest = `${directoryStructure}\n\nFiles Content:\n\n${filesContent}`; - const button = document.querySelector('[onclick="copyFullDigest()"]'); - const originalText = button.innerHTML; - - navigator.clipboard.writeText(fullDigest).then(() => { - button.innerHTML = ` - - - - Copied! 
- `; - - setTimeout(() => { - button.innerHTML = originalText; - }, 2000); - }).catch(err => { - console.error('Failed to copy text: ', err); - }); -} - -// Add the logSliderToSize helper function -function logSliderToSize(position) { - const minp = 0; - const maxp = 500; - const minv = Math.log(1); - const maxv = Math.log(102400); - - const value = Math.exp(minv + (maxv - minv) * Math.pow(position / maxp, 1.5)); - return Math.round(value); -} - -// Move slider initialization to a separate function -function initializeSlider() { - const slider = document.getElementById('file_size'); - const sizeValue = document.getElementById('size_value'); - - if (!slider || !sizeValue) return; - - function updateSlider() { - const value = logSliderToSize(slider.value); - sizeValue.textContent = formatSize(value); - slider.style.backgroundSize = `${(slider.value / slider.max) * 100}% 100%`; - } - - // Update on slider change - slider.addEventListener('input', updateSlider); - - // Initialize slider position - updateSlider(); -} - -// Add helper function for formatting size -function formatSize(sizeInKB) { - if (sizeInKB >= 1024) { - return Math.round(sizeInKB / 1024) + 'mb'; - } - return Math.round(sizeInKB) + 'kb'; -} - -// Initialize slider on page load -document.addEventListener('DOMContentLoaded', initializeSlider); - -// Make sure these are available globally -window.copyText = copyText; - -window.handleSubmit = handleSubmit; -window.initializeSlider = initializeSlider; -window.formatSize = formatSize; - -// Add this new function -function setupGlobalEnterHandler() { - document.addEventListener('keydown', function (event) { - if (event.key === 'Enter' && !event.target.matches('textarea')) { - const form = document.getElementById('ingestForm'); - if (form) { - handleSubmit(new Event('submit'), true); - } - } - }); -} - -// Add to the DOMContentLoaded event listener -document.addEventListener('DOMContentLoaded', () => { - initializeSlider(); - setupGlobalEnterHandler(); -}); - 
-================ -File: src/static/favicon.svg -================ -1 - -================ -File: src/static/robots.txt -================ -User-agent: * -Allow: / -Allow: /api/ -Allow: /cyclotruc/gitingest/ - -================ -File: src/templates/components/footer.jinja -================ - - -================ -File: src/templates/components/github_form.jinja -================ -
-
-
- -
-
-
- -
-
-
- -
- - -
-
- -
-
-
-
-
- - - - -
- -
-
-
-
- - -
-
- {% if show_examples %} - -
-

Try these example repositories:

-
- {% for example in examples %} - - {% endfor %} -
-
- {% endif %} -
-
- -================ -File: src/templates/components/navbar.jinja -================ - -
-
-
- - - - -
-
-
- -================ -File: src/templates/components/result.jinja -================ -{% if result %} -
-
-
-
- -
- -
-
-

Summary

-
-
-
- -
- {% if ingest_id %} - -
-
- -
- {% endif %} -
- -
-
-

Directory Structure

-
-
- -
-
-
-
- -
-
-
- -
-
-

Files Content

-
-
- -
-
-
-
- -
-
-
-
-
-{% endif %} - -================ -File: src/templates/api.jinja -================ -{% extends "base.jinja" %} -{% block title %}Git ingest API{% endblock %} -{% block content %} -
-
-
-

API Documentation

-
-
-
-
- - - -
-
-

The API is currently under development..

-
-
-
-

- We're working on making our API available to the public. - In the meantime, you can - open an issue on github - to suggest features. -

-
-
-
-{% endblock %} - -================ -File: src/templates/base.jinja -================ - - - - - - - - - - - - - - - - - - - - - - - - - - - - {% block title %}Git ingest{% endblock %} - - - - - - {% block extra_head %}{% endblock %} - - - - {% include 'components/navbar.jinja' %} - -
-
- {% block content %}{% endblock %} -
-
- {% include 'components/footer.jinja' %} - {% block extra_scripts %}{% endblock %} - - - -================ -File: src/templates/github.jinja -================ -{% extends "base.jinja" %} -{% block content %} - {% if error_message %} -
{{ error_message }}
- {% endif %} - {% with is_index=true, show_examples=false %} - {% include 'components/github_form.jinja' %} - {% endwith %} - {% if loading %} -
-
-
-
-

Loading...

-
-
- {% endif %} - {% include 'components/result.jinja' %} -{% endblock content %} -{% block extra_scripts %} - -{% endblock extra_scripts %} - -================ -File: src/templates/index.jinja -================ -{% extends "base.jinja" %} -{% block extra_head %} - -{% endblock %} -{% block content %} -
-
- - - - - - -

- Prompt-friendly -
- codebase  -

- -
-

- Turn any Git repository into a simple text ingest of its codebase. -

-

- This is useful for feeding a codebase into any LLM. -

-

- You can also replace 'hub' with 'ingest' in any Github URL -

-
- {% if error_message %} -
{{ error_message }}
- {% endif %} - {% with is_index=true, show_examples=true %} - {% include 'components/github_form.jinja' %} - {% endwith %} - {% include 'components/result.jinja' %} -{% endblock %} - -================ -File: src/config.py -================ -MAX_DISPLAY_SIZE = 300_000 -TMP_BASE_PATH = "../tmp" - -EXAMPLE_REPOS = [ - {"name": "Gitingest", "url": "https://github.com/cyclotruc/gitingest"}, - {"name": "FastAPI", "url": "https://github.com/tiangolo/fastapi"}, - {"name": "Flask", "url": "https://github.com/pallets/flask"}, - {"name": "Tldraw", "url": "https://github.com/tldraw/tldraw"}, - {"name": "ApiAnalytics", "url": "https://github.com/tom-draper/api-analytics"}, -] - -================ -File: src/main.py -================ -import os -from typing import Dict - -from api_analytics.fastapi import Analytics -from dotenv import load_dotenv -from fastapi import FastAPI, Request -from fastapi.responses import FileResponse, HTMLResponse, Response -from fastapi.staticfiles import StaticFiles -from fastapi.templating import Jinja2Templates -from slowapi import _rate_limit_exceeded_handler -from slowapi.errors import RateLimitExceeded -from starlette.middleware.trustedhost import TrustedHostMiddleware - -from routers import download, dynamic, index -from server_utils import limiter - -load_dotenv() - -app = FastAPI() -app.state.limiter = limiter - - -# Define a wrapper handler with the correct signature -async def rate_limit_exception_handler(request: Request, exc: Exception) -> Response: - if isinstance(exc, RateLimitExceeded): - # Delegate to the actual handler - return _rate_limit_exceeded_handler(request, exc) - # Optionally, handle other exceptions or re-raise - raise exc - - -# Register the wrapper handler -app.add_exception_handler(RateLimitExceeded, rate_limit_exception_handler) - -app.mount("/static", StaticFiles(directory="static"), name="static") -app_analytics_key = os.getenv('API_ANALYTICS_KEY') -if app_analytics_key: - app.add_middleware(Analytics, 
api_key=app_analytics_key) - -# Define the default allowed hosts -default_allowed_hosts = ["gitingest.com", "*.gitingest.com", "localhost", "127.0.0.1"] - -# Fetch allowed hosts from the environment variable or use the default -allowed_hosts = os.getenv("ALLOWED_HOSTS") -if allowed_hosts: - allowed_hosts = allowed_hosts.split(",") -else: - allowed_hosts = default_allowed_hosts - -app.add_middleware(TrustedHostMiddleware, allowed_hosts=allowed_hosts) -templates = Jinja2Templates(directory="templates") - - -@app.get("/health") -async def health_check() -> Dict[str, str]: - return {"status": "healthy"} - - -@app.head("/") -async def head_root() -> HTMLResponse: - """Mirror the headers and status code of the index page""" - return HTMLResponse(content=None, headers={"content-type": "text/html; charset=utf-8"}) - - -@app.get("/api/", response_class=HTMLResponse) -@app.get("/api", response_class=HTMLResponse) -async def api_docs(request: Request) -> HTMLResponse: - return templates.TemplateResponse("api.jinja", {"request": request}) - - -@app.get("/robots.txt") -async def robots() -> FileResponse: - return FileResponse('static/robots.txt') - - -app.include_router(index) -app.include_router(download) -app.include_router(dynamic) - -================ -File: src/process_query.py -================ -from typing import Any, Dict - -from fastapi import Request -from fastapi.templating import Jinja2Templates -from starlette.templating import _TemplateResponse - -from config import EXAMPLE_REPOS, MAX_DISPLAY_SIZE -from gitingest import clone_repo, ingest_from_query, parse_query - -from server_utils import Colors, logSliderToSize - -templates = Jinja2Templates(directory="templates") - - -def print_query( - query: Dict[str, Any], - request: Request, - max_file_size: int, - pattern_type: str, - pattern: str, -) -> None: - print(f"{Colors.WHITE}{query['url']:<20}{Colors.END}", end="") - if int(max_file_size / 1024) != 50: - print(f" | {Colors.YELLOW}Size: 
{int(max_file_size/1024)}kb{Colors.END}", end="") - if pattern_type == "include" and pattern != "": - print(f" | {Colors.YELLOW}Include {pattern}{Colors.END}", end="") - elif pattern_type == "exclude" and pattern != "": - print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="") - - -def print_error( - query: Dict[str, Any], - request: Request, - e: Exception, - max_file_size: int, - pattern_type: str, - pattern: str, -) -> None: - print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") - print_query(query, request, max_file_size, pattern_type, pattern) - print(f" | {Colors.RED}{e}{Colors.END}") - - -def print_success( - query: Dict[str, Any], - request: Request, - max_file_size: int, - pattern_type: str, - pattern: str, - summary: str, -) -> None: - estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] - print(f"{Colors.GREEN}INFO{Colors.END}: {Colors.GREEN}<- {Colors.END}", end="") - print_query(query, request, max_file_size, pattern_type, pattern) - print(f" | {Colors.PURPLE}{estimated_tokens}{Colors.END}") - - -async def process_query( - request: Request, - input_text: str, - slider_position: int, - pattern_type: str = "exclude", - pattern: str = "", - is_index: bool = False, -) -> _TemplateResponse: - template = "index.jinja" if is_index else "github.jinja" - max_file_size = logSliderToSize(slider_position) - - if pattern_type == "include": - include_patterns = pattern - exclude_patterns = None - elif pattern_type == "exclude": - exclude_patterns = pattern - include_patterns = None - - try: - query = parse_query( - source=input_text, - max_file_size=max_file_size, - from_web=True, - include_patterns=include_patterns, - ignore_patterns=exclude_patterns, - ) - await clone_repo(query) - summary, tree, content = ingest_from_query(query) - with open(f"{query['local_path']}.txt", "w") as f: - f.write(tree + "\n" + content) - - except Exception as e: - # hack to print error message when query is not defined 
- if 'query' in locals() and query is not None and isinstance(query, dict): - print_error(query, request, e, max_file_size, pattern_type, pattern) - else: - print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") - print(f"{Colors.RED}{e}{Colors.END}") - - return templates.TemplateResponse( - template, - { - "request": request, - "github_url": input_text, - "error_message": f"Error: {e}", - "examples": EXAMPLE_REPOS if is_index else [], - "default_file_size": slider_position, - "pattern_type": pattern_type, - "pattern": pattern, - }, - ) - - if len(content) > MAX_DISPLAY_SIZE: - content = ( - f"(Files content cropped to {int(MAX_DISPLAY_SIZE / 1_000)}k characters, " - "download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] - ) - - print_success( - query=query, - request=request, - max_file_size=max_file_size, - pattern_type=pattern_type, - pattern=pattern, - summary=summary, - ) - - return templates.TemplateResponse( - template, - { - "request": request, - "github_url": input_text, - "result": True, - "summary": summary, - "tree": tree, - "content": content, - "examples": EXAMPLE_REPOS if is_index else [], - "ingest_id": query['id'], - "default_file_size": slider_position, - "pattern_type": pattern_type, - "pattern": pattern, - }, - ) - -================ -File: src/server_utils.py -================ -import math - -## Rate Limiter -from slowapi import Limiter -from slowapi.util import get_remote_address - -limiter = Limiter(key_func=get_remote_address) - - -## Logarithmic slider to file size conversion -def logSliderToSize(position: int) -> int: - """Convert slider position to file size in KB""" - maxp = 500 - minv = math.log(1) - maxv = math.log(102400) - - return round(math.exp(minv + (maxv - minv) * pow(position / maxp, 1.5))) * 1024 - - -## Color printing utility -class Colors: - """ANSI color codes""" - - BLACK = "\033[0;30m" - RED = "\033[0;31m" - GREEN = "\033[0;32m" - BROWN = "\033[0;33m" - BLUE = "\033[0;34m" - PURPLE = 
"\033[0;35m" - CYAN = "\033[0;36m" - LIGHT_GRAY = "\033[0;37m" - DARK_GRAY = "\033[1;30m" - LIGHT_RED = "\033[1;31m" - LIGHT_GREEN = "\033[1;32m" - YELLOW = "\033[1;33m" - LIGHT_BLUE = "\033[1;34m" - LIGHT_PURPLE = "\033[1;35m" - LIGHT_CYAN = "\033[1;36m" - WHITE = "\033[1;37m" - BOLD = "\033[1m" - FAINT = "\033[2m" - ITALIC = "\033[3m" - UNDERLINE = "\033[4m" - BLINK = "\033[5m" - NEGATIVE = "\033[7m" - CROSSED = "\033[9m" - END = "\033[0m" - -================ -File: .dockerignore -================ -# Git -.git -.gitignore - -# Python -__pycache__ -*.pyc -*.pyo -*.pyd -.Python -env -pip-log.txt -pip-delete-this-directory.txt -.tox -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.log - -# Virtual environment -venv -.env -.venv -ENV - -# IDE -.idea -.vscode -*.swp -*.swo - -# Project specific -docs/ -tests/ -*.md -LICENSE -pytest.ini -setup.py - -================ -File: .gitignore -================ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -tmp/* - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/latest/usage/project/#working-with-version-control -.pdm.toml -.pdm-python -.pdm-build/ - -# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ -.python-version - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ -.vscode/settings.json -.DS_Store - -# Project specific -history.txt -cleanup.py -Caddyfile - -# ignore default output directory -tmp/* - -================ -File: .pre-commit-config.yaml -================ -repos: - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 - hooks: - # Files - - id: check-added-large-files - description: 'Prevent large files from being committed.' - args: ['--maxkb=10000'] - - id: check-case-conflict - description: 'Check for files that would conflict in case-insensitive filesystems.' - - id: fix-byte-order-marker - description: 'Remove utf-8 byte order marker.' - - id: mixed-line-ending - description: 'Replace mixed line ending.' - - # Links - - id: destroyed-symlinks - description: 'Detect symlinks which are changed to regular files with a content of a path which that symlink was pointing to.' - - # File files for parseable syntax: python - - id: check-ast - - # File and line endings - - id: end-of-file-fixer - description: 'Ensure that a file is either empty, or ends with one newline.' 
- - id: trailing-whitespace - description: 'Trim trailing whitespace.' - - # Python - - id: check-docstring-first - description: 'Check a common error of defining a docstring after code.' - - id: requirements-txt-fixer - description: 'Sort entries in requirements.txt.' - - - repo: https://github.com/MarcoGorelli/absolufy-imports - rev: v0.3.1 - hooks: - - id: absolufy-imports - description: 'Automatically convert relative imports to absolute. (Use `args: [--never]` to revert.)' - - - repo: https://github.com/psf/black - rev: 24.10.0 - hooks: - - id: black - - - repo: https://github.com/asottile/pyupgrade - rev: v3.19.1 - hooks: - - id: pyupgrade - description: 'Automatically upgrade syntax for newer versions.' - args: [--py3-plus, --py36-plus, --py38-plus] - - - repo: https://github.com/pre-commit/pygrep-hooks - rev: v1.10.0 - hooks: - - id: python-check-blanket-noqa - description: 'Enforce that `noqa` annotations always occur with specific codes. Sample annotations: `# noqa: F401`, `# noqa: F401,W203`.' - - id: python-check-blanket-type-ignore - description: 'Enforce that `# type: ignore` annotations always occur with specific codes. Sample annotations: `# type: ignore[attr-defined]`, `# type: ignore[attr-defined, name-defined]`.' - - id: python-use-type-annotations - description: 'Enforce that python3.6+ type annotations are used instead of type comments.' - - - repo: https://github.com/PyCQA/isort - rev: 5.13.2 - hooks: - - id: isort - description: 'Sort imports alphabetically, and automatically separated into sections and by type.' - - - repo: https://github.com/hadialqattan/pycln - rev: v2.4.0 - hooks: - - id: pycln - description: 'Remove unused import statements.' 
- - - repo: https://github.com/djlint/djLint - rev: v1.36.4 - hooks: - - id: djlint-reformat-jinja - -================ -File: CODE_OF_CONDUCT.md -================ -# Contributor Covenant Code of Conduct - -## Our Pledge - -We as members, contributors, and leaders pledge to make participation in our -community a harassment-free experience for everyone, regardless of age, body -size, visible or invisible disability, ethnicity, sex characteristics, gender -identity and expression, level of experience, education, socio-economic status, -nationality, personal appearance, race, religion, or sexual identity -and orientation. - -We pledge to act and interact in ways that contribute to an open, welcoming, -diverse, inclusive, and healthy community. - -## Our Standards - -Examples of behavior that contributes to a positive environment for our -community include: - -* Demonstrating empathy and kindness toward other people -* Being respectful of differing opinions, viewpoints, and experiences -* Giving and gracefully accepting constructive feedback -* Accepting responsibility and apologizing to those affected by our mistakes, - and learning from the experience -* Focusing on what is best not just for us as individuals, but for the - overall community - -Examples of unacceptable behavior include: - -* The use of sexualized language or imagery, and sexual attention or - advances of any kind -* Trolling, insulting or derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or email - address, without their explicit permission -* Other conduct which could reasonably be considered inappropriate in a - professional setting - -## Enforcement Responsibilities - -Community leaders are responsible for clarifying and enforcing our standards of -acceptable behavior and will take appropriate and fair corrective action in -response to any behavior that they deem inappropriate, threatening, offensive, 
-or harmful. - -Community leaders have the right and responsibility to remove, edit, or reject -comments, commits, code, wiki edits, issues, and other contributions that are -not aligned to this Code of Conduct, and will communicate reasons for moderation -decisions when appropriate. - -## Scope - -This Code of Conduct applies within all community spaces, and also applies when -an individual is officially representing the community in public spaces. -Examples of representing our community include using an official e-mail address, -posting via an official social media account, or acting as an appointed -representative at an online or offline event. - -## Enforcement - -Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported to the community leaders responsible for enforcement at -. -All complaints will be reviewed and investigated promptly and fairly. - -All community leaders are obligated to respect the privacy and security of the -reporter of any incident. - -## Enforcement Guidelines - -Community leaders will follow these Community Impact Guidelines in determining -the consequences for any action they deem in violation of this Code of Conduct: - -### 1. Correction - -**Community Impact**: Use of inappropriate language or other behavior deemed -unprofessional or unwelcome in the community. - -**Consequence**: A private, written warning from community leaders, providing -clarity around the nature of the violation and an explanation of why the -behavior was inappropriate. A public apology may be requested. - -### 2. Warning - -**Community Impact**: A violation through a single incident or series -of actions. - -**Consequence**: A warning with consequences for continued behavior. No -interaction with the people involved, including unsolicited interaction with -those enforcing the Code of Conduct, for a specified period of time. This -includes avoiding interactions in community spaces as well as external channels -like social media. 
Violating these terms may lead to a temporary or -permanent ban. - -### 3. Temporary Ban - -**Community Impact**: A serious violation of community standards, including -sustained inappropriate behavior. - -**Consequence**: A temporary ban from any sort of interaction or public -communication with the community for a specified period of time. No public or -private interaction with the people involved, including unsolicited interaction -with those enforcing the Code of Conduct, is allowed during this period. -Violating these terms may lead to a permanent ban. - -### 4. Permanent Ban - -**Community Impact**: Demonstrating a pattern of violation of community -standards, including sustained inappropriate behavior, harassment of an -individual, or aggression toward or disparagement of classes of individuals. - -**Consequence**: A permanent ban from any sort of public interaction within -the community. - -## Attribution - -This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), -version 2.0, available at -. - -Community Impact Guidelines were inspired by [Mozilla's code of conduct -enforcement ladder](https://github.com/mozilla/diversity). - -For answers to common questions about this code of conduct, see the FAQ at -. Translations are available at -. - -================ -File: Dockerfile -================ -# Build stage -FROM python:3.12-slim AS builder - -WORKDIR /build - -# Copy requirements first to leverage Docker cache -COPY requirements.txt . 
- -# Install build dependencies and Python packages -RUN apt-get update \ - && apt-get install -y --no-install-recommends gcc python3-dev \ - && pip install --no-cache-dir --upgrade pip \ - && pip install --no-cache-dir --timeout 1000 -r requirements.txt \ - && rm -rf /var/lib/apt/lists/* - -# Runtime stage -FROM python:3.12-slim - -# Set Python environment variables -ENV PYTHONUNBUFFERED=1 -ENV PYTHONDONTWRITEBYTECODE=1 - -# Install git -RUN apt-get update \ - && apt-get install -y --no-install-recommends git \ - && rm -rf /var/lib/apt/lists/* - -WORKDIR /app - -# Create a non-root user -RUN useradd -m -u 1000 appuser - -COPY --from=builder /usr/local/lib/python3.12/site-packages/ /usr/local/lib/python3.12/site-packages/ -COPY src/ ./ - -# Change ownership of the application files -RUN chown -R appuser:appuser /app - -# Switch to non-root user -USER appuser - -EXPOSE 8000 - -CMD ["python", "-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] - -================ -File: LICENSE -================ -MIT License - -Copyright (c) 2024 Romain Courtois - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -================ -File: pyproject.toml -================ -[tool.pylint.format] -max-line-length = 119 - -[tool.pycln] -all = true - -[tool.isort] -profile = "black" -line_length = 119 -remove_redundant_aliases = true -float_to_top = true -order_by_type = true -filter_files = true - -[tool.black] -line-length = 119 -skip-string-normalization = true - -================ -File: pytest.ini -================ -[pytest] -pythonpath = src -testpaths = src/gitingest/tests -asyncio_mode = auto - -python_files = test_*.py -python_classes = Test* -python_functions = test_* - -================ -File: README.md -================ -[![Image](./docs/frontpage.png "GitIngest main page")](https://gitingest.com) - - - - License - - - - PyPI version - - - - Downloads - - - - GitHub issues - - - - Code style: black - - - - - Discord - - -# GitIngest - -Turn any Git repository into a prompt-friendly text ingest for LLMs. - -You can also replace `hub` with `ingest` in any github url to access the coresponding digest - -[gitingest.com](https://gitingest.com) - -## 🚀 Features - -- **Easy code context**: Get a text digest from a git repository URL or a directory -- **Smart Formatting**: Optimized output format for LLM prompts -- **Statistics about**: - - File and directory structure - - Size of the extract - - Token count -- **CLI tool**: Run it as a command (Currently on Linux only) -- **Python package**: Import it in your code - -## 📦 Installation - -``` bash -pip install gitingest -``` - -## 💡 Command Line usage - -The `gitingest` command line tool allows you to analyze codebases and create a text dump of their contents. 
- -```bash -# Basic usage -gitingest /path/to/directory - -# From url -gitingest https://github.com/cyclotruc/gitingest - -# See more options -gitingest --help -``` - -This will write the digest in a text file (default `digest.txt`) in your current working directory. - -## 🐛 Python package usage - -```python -from gitingest import ingest - -summary, tree, content = ingest("path/to/directory") - -# or from URL -summary, tree, content = ingest("https://github.com/cyclotruc/gitingest") -``` - -By default, this won't write a file but can be enabled with the `output` argument - -## 🛠️ Using - -- Tailwind CSS - Frontend -- [FastAPI](https://github.com/fastapi/fastapi) - Backend framework -- [tiktoken](https://github.com/openai/tiktoken) - Token estimation -- [apianalytics.dev](https://www.apianalytics.dev/) - Simple Analytics - -## 🌐 Self-host - -1. Build the image: - -``` bash -docker build -t gitingest . -``` - -2. Run the container: - -``` bash -docker run -d --name gitingest -p 8000:8000 gitingest -``` - -The application will be available at `http://localhost:8000` -Ensure environment variables are set before running the application or deploying it via Docker. - -## ✔️ Contributing - -Contributions are welcome! - -Gitingest aims to be friendly for first time contributors, with a simple python and html codebase. If you need any help while working with the code, reach out to us on [discord](https://discord.com/invite/zerRaGK9EC) - -### Ways to contribute - -1. Provide your feedback and ideas on discord -2. Open an Issue on github to report a bug -3. Create a Pull request - - Fork the repository - - Make your changes and test them locally - - Open a pull request for review and feedback - -### 🔧 Local dev - -#### Environment Configuration - -- **`ALLOWED_HOSTS`**: Specify allowed hostnames for the application. Default: `"gitingest.com,*.gitingest.com,gitdigest.dev,localhost"`. 
-You can configure the application using the following environment variables: - -```bash -ALLOWED_HOSTS="gitingest.local,localhost" -``` - -#### Run locally - -1. Clone the repository - -```bash -git clone https://github.com/cyclotruc/gitingest.git -cd gitingest -``` - -2. Install dependencies - -```bash -pip install -r requirements.txt -``` - -3. Run the application: - -```bash -cd src -uvicorn main:app --reload -``` - -The frontend will be available at `localhost:8000` - -================ -File: requirements.txt -================ -black -click>=8.0.0 -djlint -dotenv -fastapi-analytics -fastapi[standard] -pre-commit -pytest -pytest-asyncio -slowapi -starlette -tiktoken -uvicorn - -================ -File: SECURITY.md -================ -# Security Policy - -## Reporting a Vulnerability - -If you have discovered a vulnerability inside the project, report it privately at . This way the maintainer can work on a proper fix without disclosing the problem to the public before it has been solved. 
- -================ -File: setup.py -================ -from setuptools import find_packages, setup - -setup( - name="gitingest", - version="0.1.2", - packages=find_packages(where="src"), - package_dir={"": "src"}, - include_package_data=True, - install_requires=[ - "click>=8.0.0", - "tiktoken", - ], - entry_points={ - "console_scripts": [ - "gitingest=gitingest.cli:main", - ], - }, - python_requires=">=3.6", - author="Romain Courtois", - author_email="romain@coderamp.io", - description="CLI tool to analyze and create text dumps of codebases for LLMs", - long_description=open("README.md").read(), - long_description_content_type="text/markdown", - url="https://github.com/cyclotruc/gitingest", - classifiers=[ - "Development Status :: 3 - Alpha", - "Intended Audience :: Developers", - "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3", - ], -) diff --git a/requirements.txt b/requirements.txt index 7d3680e2..2688a88d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,12 @@ black click>=8.0.0 djlint -dotenv fastapi-analytics fastapi[standard] pre-commit pytest pytest-asyncio +python-dotenv slowapi starlette tiktoken From 086aba050fce2e04ed20583864d13850600ec562 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sat, 28 Dec 2024 19:13:52 +0100 Subject: [PATCH 5/9] Refactor and enhance gitingest module for improved clarity, maintainability, and functionality. - **Introduced the `CloneConfig` dataclass** to encapsulate cloning parameters, including `url`, `local_path`, `commit`, and `branch`. - **Enhanced documentation** by adding detailed docstrings to the functions `check_repo_exists`, `run_git_command`, and `clone_repo`. - **Improved error handling** by refining exception management processes. - **Streamlined repository existence checks** for increased reliability. - **Added the `run_git_command` function** to centralize and simplify the execution of Git commands. 
- **Refactored code structure** to enhance readability and maintainability. --- - **Replaced manual hexadecimal comparison (`"0123456789abcdefABCDEF"`)** with the `string` module by defining `HEX_DIGITS = set(string.hexdigits)`. - **Revised the construction of the `parsed` dictionary** in the `parse_url` function for clarity. - **Refactored the `parse_patterns` function** to store patterns in a list (`patterns`) instead of repeatedly joining and splitting them. - **Enhanced documentation** by adding docstrings to the `override_ignore_patterns` and `parse_query` functions. - **Removed redundant `pattern.strip()` call** in `normalize_pattern`, as this is now handled within `parse_patterns`. - **Optimized the `override_ignore_patterns` function** by implementing set difference for unordered comparisons. - **Improved the `parse_query` function's structure** for better readability and maintainability. --- - **Refined `print_query`, `print_error`, and `print_success` functions** to accept only the `url` parameter, removing the dependency on the entire `query` object. - **Eliminated the unused `request` argument** from the above functions. - **Integrated the `CloneConfig` dataclass** for improved parameter handling. --- - **Adopted the `CloneConfig` dataclass** for consistent parameter management. --- - **Removed the unused `files` argument** from the `create_summary_string` function to reduce unnecessary complexity. --- - **Simplified the `AsyncTimeoutError` class** by removing a redundant `pass` statement. --- - **Updated tests** to utilize the `CloneConfig` dataclass and align with the newly introduced `run_git_command` function for encapsulated Git command execution. --- - **Aligned comparison with `DEFAULT_IGNORE_PATTERNS`** to use a set difference, ensuring unordered existence comparison. 
--- src/gitingest/cli.py | 3 +- src/gitingest/clone.py | 174 ++++++++++++++++-------- src/gitingest/ingest.py | 13 +- src/gitingest/ingest_from_query.py | 11 +- src/gitingest/parse_query.py | 144 ++++++++++++-------- src/gitingest/tests/test_clone.py | 44 +++--- src/gitingest/tests/test_parse_query.py | 2 +- src/gitingest/utils.py | 2 - src/process_query.py | 51 +++---- 9 files changed, 261 insertions(+), 183 deletions(-) diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 14df2190..d634d880 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -3,7 +3,6 @@ import click -from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS from gitingest.ingest import ingest from gitingest.ingest_from_query import MAX_FILE_SIZE @@ -37,7 +36,7 @@ def main( if not output: output = "digest.txt" - summary, tree, content = ingest(source, max_size, include_patterns, exclude_patterns, output=output) + summary, _, _ = ingest(source, max_size, include_patterns, exclude_patterns, output=output) click.echo(f"Analysis complete! Output written to: {output}") click.echo("\nSummary:") diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py index e7994c14..d058ed7b 100644 --- a/src/gitingest/clone.py +++ b/src/gitingest/clone.py @@ -1,12 +1,34 @@ import asyncio -from typing import Any, Dict, Tuple +from dataclasses import dataclass +from typing import Optional, Tuple -from gitingest.utils import async_timeout +from gitingest.utils import AsyncTimeoutError, async_timeout CLONE_TIMEOUT = 20 +@dataclass +class CloneConfig: + url: str + local_path: str + commit: Optional[str] = None + branch: Optional[str] = None + + async def check_repo_exists(url: str) -> bool: + """ + Check if a repository exists at the given URL using an HTTP HEAD request. + + Parameters + ---------- + url : str + The URL of the repository. + + Returns + ------- + bool + True if the repository exists, False otherwise. 
+ """ proc = await asyncio.create_subprocess_exec( "curl", "-I", @@ -14,7 +36,7 @@ async def check_repo_exists(url: str) -> bool: stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) - stdout, stderr = await proc.communicate() + stdout, _ = await proc.communicate() if proc.returncode != 0: return False # Check if stdout contains "404" status code @@ -22,58 +44,104 @@ async def check_repo_exists(url: str) -> bool: return "HTTP/1.1 404" not in stdout_str and "HTTP/2 404" not in stdout_str -@async_timeout(CLONE_TIMEOUT) -async def clone_repo(query: Dict[str, Any]) -> Tuple[bytes, bytes]: - if not await check_repo_exists(query['url']): - raise ValueError("Repository not found, make sure it is public") +async def run_git_command(*args: str) -> Tuple[bytes, bytes]: + """ + Executes a git command asynchronously and captures its output. + + Parameters + ---------- + *args : str + The git command and its arguments to execute. - if query['commit']: - proc = await asyncio.create_subprocess_exec( - "git", - "clone", - "--single-branch", - query['url'], - query['local_path'], - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate() - - proc = await asyncio.create_subprocess_exec( - "git", - "-C", - query['local_path'], - "checkout", - query['branch'], - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate() - elif query['branch'] != 'main' and query['branch'] != 'master' and query['branch']: - proc = await asyncio.create_subprocess_exec( - "git", - "clone", - "--depth=1", - "--single-branch", - "--branch", - query['branch'], - query['url'], - query['local_path'], - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - else: - proc = await asyncio.create_subprocess_exec( - "git", - "clone", - "--depth=1", - "--single-branch", - query['url'], - query['local_path'], - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - 
) + Returns + ------- + Tuple[bytes, bytes] + A tuple containing the stdout and stderr of the git command. + Raises + ------ + RuntimeError + If the git command exits with a non-zero status. + """ + proc = await asyncio.create_subprocess_exec( + *args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) stdout, stderr = await proc.communicate() + if proc.returncode != 0: + error_message = stderr.decode().strip() + raise RuntimeError(f"Git command failed: {' '.join(args)}\nError: {error_message}") return stdout, stderr + + +@async_timeout(CLONE_TIMEOUT) +async def clone_repo(config: CloneConfig) -> Tuple[bytes, bytes]: + """ + Clones a repository to a local path based on the provided query parameters. + + Parameters + ---------- + config : CloneConfig + A dictionary containing the following keys: + - url (str): The URL of the repository. + - local_path (str): The local path to clone the repository to. + - commit (Optional[str]): The specific commit hash to checkout. + - branch (Optional[str]): The branch to clone. Defaults to 'main' or 'master' if not provided. + + Returns + ------- + Tuple[bytes, bytes] + A tuple containing the stdout and stderr of the git commands executed. + + Raises + ------ + ValueError + If the repository does not exist or if required query parameters are missing. + RuntimeError + If any git command fails during execution. + AsyncTimeoutError + If the cloning process exceeds the specified timeout. 
+ """ + # Extract and validate query parameters + url: str = config.url + local_path: str = config.local_path + commit: Optional[str] = config.commit + branch: Optional[str] = config.branch + + if not url: + raise ValueError("The 'url' parameter is required.") + + if not local_path: + raise ValueError("The 'local_path' parameter is required.") + + # if commit and branch: + # raise ValueError("Provide either 'commit' or 'branch', not both.") + + # Check if the repository exists + if not await check_repo_exists(url): + raise ValueError("Repository not found, make sure it is public") + + try: + if commit: + # Scenario 1: Clone and checkout a specific commit + # Clone the repository without depth to ensure full history for checkout + clone_cmd = ["git", "clone", "--single-branch", url, local_path] + await run_git_command(*clone_cmd) + + # Checkout the specific commit + checkout_cmd = ["git", "-C", local_path, "checkout", commit] + return await run_git_command(*checkout_cmd) + + if branch and branch.lower() not in ('main', 'master'): + # Scenario 2: Clone a specific branch with shallow depth + clone_cmd = ["git", "clone", "--depth=1", "--single-branch", "--branch", branch, url, local_path] + return await run_git_command(*clone_cmd) + + # Scenario 3: Clone the default branch with shallow depth + clone_cmd = ["git", "clone", "--depth=1", "--single-branch", url, local_path] + return await run_git_command(*clone_cmd) + + except (RuntimeError, asyncio.TimeoutError, AsyncTimeoutError): + raise # Re-raise the exception diff --git a/src/gitingest/ingest.py b/src/gitingest/ingest.py index 22fae6d2..9eeeafef 100644 --- a/src/gitingest/ingest.py +++ b/src/gitingest/ingest.py @@ -4,14 +4,14 @@ from pathlib import Path from typing import List, Optional, Tuple, Union -from gitingest.clone import clone_repo +from gitingest.clone import CloneConfig, clone_repo from gitingest.ingest_from_query import ingest_from_query from gitingest.parse_query import parse_query def ingest( source: 
str, - max_file_size: int = 10 * 1024 * 1024, + max_file_size: int = 10 * 1024 * 1024, # 10 MB include_patterns: Union[List[str], str, None] = None, exclude_patterns: Union[List[str], str, None] = None, output: Optional[str] = None, @@ -25,7 +25,14 @@ def ingest( ignore_patterns=exclude_patterns, ) if query['url']: - clone_result = clone_repo(query) + # Extract relevant fields for CloneConfig + clone_config = CloneConfig( + url=f"https://github.com/{query['slug']}.git", + local_path=query['local_path'], + commit=query.get('commit'), + branch=query.get('branch'), + ) + clone_result = clone_repo(clone_config) if inspect.iscoroutine(clone_result): asyncio.run(clone_result) else: diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py index 0080c25b..a9130a39 100644 --- a/src/gitingest/ingest_from_query.py +++ b/src/gitingest/ingest_from_query.py @@ -278,7 +278,7 @@ def create_file_content_string(files: List[Dict[str, Any]]) -> str: return output -def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any], files: List[Dict[str, Any]]) -> str: +def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any]) -> str: """Creates a summary string with file counts and content size.""" if "user_name" in query: summary = f"Repository: {query['user_name']}/{query['repo_name']}\n" @@ -297,12 +297,7 @@ def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any], files: L return summary -def create_tree_structure( - query: Dict[str, Any], - node: Dict[str, Any], - prefix: str = "", - is_last: bool = True, -) -> str: +def create_tree_structure(query: Dict[str, Any], node: Dict[str, Any], prefix: str = "", is_last: bool = True) -> str: """Creates a tree-like string representation of the file structure.""" tree = "" @@ -386,7 +381,7 @@ def ingest_directory(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]: if not nodes: raise ValueError(f"No files found in {path}") files = extract_files_content(query=query, 
node=nodes, max_file_size=query['max_file_size']) - summary = create_summary_string(query, nodes, files) + summary = create_summary_string(query, nodes) tree = "Directory structure:\n" + create_tree_structure(query, nodes) files_content = create_file_content_string(files) diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index 669f28f3..00b09333 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -1,26 +1,15 @@ import os +import string import uuid from typing import Any, Dict, List, Optional, Union from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS TMP_BASE_PATH = "../tmp" +HEX_DIGITS = set(string.hexdigits) def parse_url(url: str) -> Dict[str, Any]: - parsed = { - "user_name": None, - "repo_name": None, - "type": None, - "branch": None, - "commit": None, - "subpath": "/", - "local_path": None, - "url": None, - "slug": None, - "id": None, - } - url = url.split(" ")[0] if not url.startswith('https://'): url = 'https://' + url @@ -33,28 +22,38 @@ def parse_url(url: str) -> Dict[str, Any]: if len(path_parts) < 2: raise ValueError("Invalid repository URL. 
Please provide a valid Git repository URL.") - parsed["user_name"] = path_parts[0] - parsed["repo_name"] = path_parts[1] + user_name = path_parts[0] + repo_name = path_parts[1] + slug = f"{user_name}-{repo_name}" + _id = str(uuid.uuid4()) - # Keep original URL format - parsed["url"] = f"https://{domain}/{parsed['user_name']}/{parsed['repo_name']}" - parsed['slug'] = f"{parsed['user_name']}-{parsed['repo_name']}" - parsed["id"] = str(uuid.uuid4()) - parsed["local_path"] = f"{TMP_BASE_PATH}/{parsed['id']}/{parsed['slug']}" + parsed = { + "url": f"https://{domain}/{user_name}/{repo_name}", + "local_path": f"{TMP_BASE_PATH}/{_id}/{slug}", + "commit": None, + "branch": None, + "user_name": user_name, + "repo_name": repo_name, + "type": None, + "subpath": "/", + "slug": slug, + "id": _id, + } if len(path_parts) > 3: parsed["type"] = path_parts[2] - parsed["branch"] = path_parts[3] - if len(parsed['branch']) == 40 and all(c in '0123456789abcdefABCDEF' for c in parsed['branch']): - parsed["commit"] = parsed['branch'] + branch = path_parts[3] + + parsed["branch"] = branch + if len(branch) == 40 and all(c in HEX_DIGITS for c in branch): + parsed["commit"] = branch - parsed["subpath"] = "/" + "/".join(path_parts[4:]) + parsed["subpath"] += "/".join(path_parts[4:]) return parsed def normalize_pattern(pattern: str) -> str: - pattern = pattern.strip() pattern = pattern.lstrip(os.sep) if pattern.endswith(os.sep): pattern += "*" @@ -62,33 +61,45 @@ def normalize_pattern(pattern: str) -> str: def parse_patterns(pattern: Union[List[str], str]) -> List[str]: - if isinstance(pattern, list): - pattern = ",".join(pattern) + patterns = pattern if isinstance(pattern, list) else [pattern] + patterns = [p.strip() for p in patterns] - for p in pattern.split(","): - if not all(c.isalnum() or c in "-_./+*" for c in p.strip()): + for p in patterns: + if not all(c.isalnum() or c in "-_./+*" for c in p): raise ValueError( f"Pattern '{p}' contains invalid characters. 
Only alphanumeric characters, dash (-), " "underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed." ) - patterns = [normalize_pattern(p) for p in pattern.split(",")] - return patterns + + return [normalize_pattern(p) for p in patterns] def override_ignore_patterns(ignore_patterns: List[str], include_patterns: List[str]) -> List[str]: - for pattern in include_patterns: - if pattern in ignore_patterns: - ignore_patterns.remove(pattern) - return ignore_patterns + """ + Removes patterns from ignore_patterns that are present in include_patterns using set difference. + + Parameters + ---------- + ignore_patterns : List[str] + The list of patterns to potentially remove. + include_patterns : List[str] + The list of patterns to exclude from ignore_patterns. + + Returns + ------- + List[str] + A new list of ignore_patterns with specified patterns removed. + """ + return list(set(ignore_patterns) - set(include_patterns)) def parse_path(path: str) -> Dict[str, Any]: query = { + "url": None, "local_path": os.path.abspath(path), "slug": os.path.basename(os.path.dirname(path)) + "/" + os.path.basename(path), "subpath": "/", "id": str(uuid.uuid4()), - "url": None, } return query @@ -100,28 +111,53 @@ def parse_query( include_patterns: Optional[Union[List[str], str]] = None, ignore_patterns: Optional[Union[List[str], str]] = None, ) -> Dict[str, Any]: - if from_web: + """ + Parses the input source to construct a query dictionary with specified parameters. + + Parameters + ---------- + source : str + The source URL or file path to parse. + max_file_size : int + The maximum file size in bytes to include. + from_web : bool + Flag indicating whether the source is a web URL. + include_patterns : Optional[Union[List[str], str]], optional + Patterns to include, by default None. Can be a list of strings or a single string. + ignore_patterns : Optional[Union[List[str], str]], optional + Patterns to ignore, by default None. 
Can be a list of strings or a single string. + + Returns + ------- + Dict[str, Any] + A dictionary containing the parsed query parameters, including 'max_file_size', + 'ignore_patterns', and 'include_patterns'. + """ + # Determine the parsing method based on the source type + if from_web or source.startswith("https://") or "github.com" in source: query = parse_url(source) else: - if source.startswith("https://") or "github.com" in source: - query = parse_url(source) - else: - query = parse_path(source) + query = parse_path(source) - query['max_file_size'] = max_file_size + # Process ignore patterns + ignore_patterns_list = DEFAULT_IGNORE_PATTERNS.copy() + if ignore_patterns: + ignore_patterns_list += parse_patterns(ignore_patterns) - if ignore_patterns and ignore_patterns != "": - ignore_patterns = DEFAULT_IGNORE_PATTERNS + parse_patterns(ignore_patterns) + # Process include patterns and override ignore patterns accordingly + if include_patterns: + parsed_include = parse_patterns(include_patterns) + ignore_patterns_list = override_ignore_patterns(ignore_patterns_list, include_patterns=parsed_include) else: - ignore_patterns = DEFAULT_IGNORE_PATTERNS - - if include_patterns and include_patterns != "": - include_patterns = parse_patterns(include_patterns) - ignore_patterns = override_ignore_patterns(ignore_patterns, include_patterns) - else: - include_patterns = None - - query['ignore_patterns'] = ignore_patterns - query['include_patterns'] = include_patterns + parsed_include = None + + # Update the query dictionary with max_file_size and processed patterns + query.update( + { + 'max_file_size': max_file_size, + 'ignore_patterns': ignore_patterns_list, + 'include_patterns': parsed_include, + } + ) return query diff --git a/src/gitingest/tests/test_clone.py b/src/gitingest/tests/test_clone.py index 680181c8..69a7d943 100644 --- a/src/gitingest/tests/test_clone.py +++ b/src/gitingest/tests/test_clone.py @@ -2,62 +2,54 @@ import pytest -from gitingest.clone import 
check_repo_exists, clone_repo +from gitingest.clone import CloneConfig, check_repo_exists, clone_repo @pytest.mark.asyncio async def test_clone_repo_with_commit() -> None: - query = { - 'commit': 'a' * 40, # Simulating a valid commit hash - 'branch': 'main', - 'url': 'https://github.com/user/repo', - 'local_path': '/tmp/repo', - } + clone_config = CloneConfig( + url='https://github.com/user/repo', + local_path='/tmp/repo', + commit='a' * 40, # Simulating a valid commit hash + branch='main', + ) with patch('gitingest.clone.check_repo_exists', return_value=True) as mock_check: - with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: + with patch('gitingest.clone.run_git_command', new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() mock_process.communicate.return_value = (b'output', b'error') mock_exec.return_value = mock_process - await clone_repo(query) - mock_check.assert_called_once_with(query['url']) + await clone_repo(clone_config) + mock_check.assert_called_once_with(clone_config.url) assert mock_exec.call_count == 2 # Clone and checkout calls @pytest.mark.asyncio async def test_clone_repo_without_commit() -> None: - query = { - 'commit': None, - 'branch': 'main', - 'url': 'https://github.com/user/repo', - 'local_path': '/tmp/repo', - } + query = CloneConfig(url='https://github.com/user/repo', local_path='/tmp/repo', commit=None, branch='main') with patch('gitingest.clone.check_repo_exists', return_value=True) as mock_check: - with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: + with patch('gitingest.clone.run_git_command', new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() mock_process.communicate.return_value = (b'output', b'error') mock_exec.return_value = mock_process await clone_repo(query) - mock_check.assert_called_once_with(query['url']) + mock_check.assert_called_once_with(query.url) assert mock_exec.call_count == 1 # Only clone call @pytest.mark.asyncio async def 
test_clone_repo_nonexistent_repository() -> None: - query = { - 'commit': None, - 'branch': 'main', - 'url': 'https://github.com/user/nonexistent-repo', - 'local_path': '/tmp/repo', - } + clone_config = CloneConfig( + url='https://github.com/user/nonexistent-repo', local_path='/tmp/repo', commit=None, branch='main' + ) with patch('gitingest.clone.check_repo_exists', return_value=False) as mock_check: with pytest.raises(ValueError, match="Repository not found"): - await clone_repo(query) - mock_check.assert_called_once_with(query['url']) + await clone_repo(clone_config) + mock_check.assert_called_once_with(clone_config.url) @pytest.mark.asyncio diff --git a/src/gitingest/tests/test_parse_query.py b/src/gitingest/tests/test_parse_query.py index ae4c1659..1ab5e447 100644 --- a/src/gitingest/tests/test_parse_query.py +++ b/src/gitingest/tests/test_parse_query.py @@ -37,7 +37,7 @@ def test_parse_query_include_pattern() -> None: url = "https://github.com/user/repo" result = parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py') assert result["include_patterns"] == ["*.py"] - assert result["ignore_patterns"] == DEFAULT_IGNORE_PATTERNS + assert set(result["ignore_patterns"]) == set(DEFAULT_IGNORE_PATTERNS) def test_parse_query_invalid_pattern() -> None: diff --git a/src/gitingest/utils.py b/src/gitingest/utils.py index 1f07b533..2445f14e 100644 --- a/src/gitingest/utils.py +++ b/src/gitingest/utils.py @@ -10,8 +10,6 @@ class AsyncTimeoutError(Exception): """Raised when an async operation exceeds its timeout limit.""" - pass - def async_timeout(seconds: int = 10) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]: def decorator(func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]: diff --git a/src/process_query.py b/src/process_query.py index 466b11d2..19ac2dbb 100644 --- a/src/process_query.py +++ b/src/process_query.py @@ -1,11 +1,9 @@ -from typing import Any, Dict - from fastapi import Request from fastapi.templating import 
Jinja2Templates from starlette.templating import _TemplateResponse from config import EXAMPLE_REPOS, MAX_DISPLAY_SIZE -from gitingest.clone import clone_repo +from gitingest.clone import CloneConfig, clone_repo from gitingest.ingest_from_query import ingest_from_query from gitingest.parse_query import parse_query from server_utils import Colors, logSliderToSize @@ -13,14 +11,8 @@ templates = Jinja2Templates(directory="templates") -def print_query( - query: Dict[str, Any], - request: Request, - max_file_size: int, - pattern_type: str, - pattern: str, -) -> None: - print(f"{Colors.WHITE}{query['url']:<20}{Colors.END}", end="") +def print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None: + print(f"{Colors.WHITE}{url:<20}{Colors.END}", end="") if int(max_file_size / 1024) != 50: print(f" | {Colors.YELLOW}Size: {int(max_file_size/1024)}kb{Colors.END}", end="") if pattern_type == "include" and pattern != "": @@ -29,30 +21,16 @@ def print_query( print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="") -def print_error( - query: Dict[str, Any], - request: Request, - e: Exception, - max_file_size: int, - pattern_type: str, - pattern: str, -) -> None: +def print_error(url: str, e: Exception, max_file_size: int, pattern_type: str, pattern: str) -> None: print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") - print_query(query, request, max_file_size, pattern_type, pattern) + print_query(url, max_file_size, pattern_type, pattern) print(f" | {Colors.RED}{e}{Colors.END}") -def print_success( - query: Dict[str, Any], - request: Request, - max_file_size: int, - pattern_type: str, - pattern: str, - summary: str, -) -> None: +def print_success(url: str, max_file_size: int, pattern_type: str, pattern: str, summary: str) -> None: estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] print(f"{Colors.GREEN}INFO{Colors.END}: {Colors.GREEN}<- {Colors.END}", end="") - print_query(query, request, 
max_file_size, pattern_type, pattern) + print_query(url, max_file_size, pattern_type, pattern) print(f" | {Colors.PURPLE}{estimated_tokens}{Colors.END}") @@ -82,15 +60,21 @@ async def process_query( include_patterns=include_patterns, ignore_patterns=exclude_patterns, ) - await clone_repo(query) + clone_config = CloneConfig( + url=f"https://github.com/{query['slug']}.git", + local_path=query['local_path'], + commit=query.get('commit'), + branch=query.get('branch'), + ) + await clone_repo(clone_config) summary, tree, content = ingest_from_query(query) - with open(f"{query['local_path']}.txt", "w") as f: + with open(f"{clone_config.local_path}.txt", "w") as f: f.write(tree + "\n" + content) except Exception as e: # hack to print error message when query is not defined if 'query' in locals() and query is not None and isinstance(query, dict): - print_error(query, request, e, max_file_size, pattern_type, pattern) + print_error(query['url'], e, max_file_size, pattern_type, pattern) else: print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") print(f"{Colors.RED}{e}{Colors.END}") @@ -115,8 +99,7 @@ async def process_query( ) print_success( - query=query, - request=request, + url=query['url'], max_file_size=max_file_size, pattern_type=pattern_type, pattern=pattern, From 075b4549cef33b2636b7da7ed33bfe7efe4ffa7e Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sat, 28 Dec 2024 19:52:48 +0100 Subject: [PATCH 6/9] resolve merge conflicts --- src/gitingest/parse_query.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index 45bd8f08..a2fcedb5 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -2,6 +2,7 @@ import string import uuid from typing import Any, Dict, List, Optional, Union +from urllib.parse import unquote from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS @@ 
-25,7 +26,7 @@ def parse_url(url: str) -> Dict[str, Any]: url = url.split(" ")[0] url = unquote(url) # Decode URL-encoded characters - + if not url.startswith('https://'): url = 'https://' + url @@ -49,7 +50,7 @@ def parse_url(url: str) -> Dict[str, Any]: if len(path_parts) > 3: parsed["type"] = path_parts[2] # Usually 'tree' or 'blob' - + # Find the commit hash or reconstruct the branch name remaining_parts = path_parts[3:] if remaining_parts[0] and len(remaining_parts[0]) == 40 and all(c in HEX_DIGITS for c in remaining_parts[0]): @@ -61,14 +62,15 @@ def parse_url(url: str) -> Dict[str, Any]: if part in ('tree', 'blob'): # Found another type indicator, everything before this was the branch name parsed["branch"] = "/".join(remaining_parts[:i]) - parsed["subpath"] = "/" + "/".join(remaining_parts[i+2:]) if len(remaining_parts) > i+2 else "/" + parsed["subpath"] = ( + "/" + "/".join(remaining_parts[i + 2 :]) if len(remaining_parts) > i + 2 else "/" + ) break else: # No additional type indicator found, assume everything is part of the branch name parsed["branch"] = "/".join(remaining_parts) parsed["subpath"] = "/" - return parsed @@ -130,7 +132,6 @@ def parse_query( include_patterns: Optional[Union[List[str], str]] = None, ignore_patterns: Optional[Union[List[str], str]] = None, ) -> Dict[str, Any]: - """ Parses the input source to construct a query dictionary with specified parameters. From 70ff34e6c3a30fbe51a36c75382057fba5886c7f Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sat, 28 Dec 2024 20:41:04 +0100 Subject: [PATCH 7/9] Refactor parse_url and parse_query for improved clarity and maintainability - **Revised the construction of the parsed dictionary** in the `parse_url` function for clarity. - **Improved the `parse_query` function's structure** for better readability and maintainability. 
--- src/gitingest/parse_query.py | 97 +++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 45 deletions(-) diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index a2fcedb5..fe7b01a9 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -11,19 +11,6 @@ def parse_url(url: str) -> Dict[str, Any]: - parsed = { - "user_name": None, - "repo_name": None, - "type": None, - "branch": None, - "commit": None, - "subpath": "/", - "local_path": None, - "url": None, - "slug": None, - "id": None, - } - url = url.split(" ")[0] url = unquote(url) # Decode URL-encoded characters @@ -38,42 +25,62 @@ def parse_url(url: str) -> Dict[str, Any]: if len(path_parts) < 2: raise ValueError("Invalid repository URL. Please provide a valid Git repository URL.") - parsed["user_name"] = path_parts[0] - parsed["repo_name"] = path_parts[1] - - # Keep original URL format but with decoded components - parsed["url"] = f"https://{domain}/{parsed['user_name']}/{parsed['repo_name']}" - parsed['slug'] = f"{parsed['user_name']}-{parsed['repo_name']}" - parsed["id"] = str(uuid.uuid4()) - parsed["local_path"] = f"{TMP_BASE_PATH}/{parsed['id']}/{parsed['slug']}" - - if len(path_parts) > 3: - - parsed["type"] = path_parts[2] # Usually 'tree' or 'blob' - - # Find the commit hash or reconstruct the branch name - remaining_parts = path_parts[3:] - if remaining_parts[0] and len(remaining_parts[0]) == 40 and all(c in HEX_DIGITS for c in remaining_parts[0]): - parsed["commit"] = remaining_parts[0] - parsed["subpath"] = "/" + "/".join(remaining_parts[1:]) if len(remaining_parts) > 1 else "/" - else: - # Handle branch names with slashes and special characters - for i, part in enumerate(remaining_parts): - if part in ('tree', 'blob'): - # Found another type indicator, everything before this was the branch name - parsed["branch"] = "/".join(remaining_parts[:i]) - parsed["subpath"] = ( - "/" + "/".join(remaining_parts[i + 2 :]) if len(remaining_parts) > i 
+ 2 else "/" - ) - break - else: - # No additional type indicator found, assume everything is part of the branch name - parsed["branch"] = "/".join(remaining_parts) - parsed["subpath"] = "/" + user_name = path_parts[0] + repo_name = path_parts[1] + _id = str(uuid.uuid4()) + slug = f"{user_name}-{repo_name}" + + parsed = { + "user_name": user_name, + "repo_name": repo_name, + "type": None, + "branch": None, + "commit": None, + "subpath": "/", + "local_path": f"{TMP_BASE_PATH}/{_id}/{slug}", + # Keep original URL format but with decoded components + "url": f"https://{domain}/{user_name}/{repo_name}", + "slug": slug, + "id": _id, + } + + if len(path_parts) < 4: + return parsed + + parsed["type"] = path_parts[2] # Usually 'tree' or 'blob' + commit = path_parts[3] + + # Find the commit hash or reconstruct the branch name + remaining_parts = path_parts[3:] + + if _is_valid_git_commit_hash(commit): + parsed["commit"] = commit + if len(remaining_parts) > 1: + parsed["subpath"] += "/".join(remaining_parts[1:]) + return parsed + + # Handle branch names with slashes and special characters + + # Find the index of the first type indicator ('tree' or 'blob'), if any + type_indicator_index = next((i for i, part in enumerate(remaining_parts) if part in ('tree', 'blob')), None) + + if type_indicator_index is None: + # No type indicator found; assume the entire input is the branch name + parsed["branch"] = "/".join(remaining_parts) + return parsed + + # Found a type indicator; update branch and subpath + parsed["branch"] = "/".join(remaining_parts[:type_indicator_index]) + if len(remaining_parts) > type_indicator_index + 2: + parsed["subpath"] += "/".join(remaining_parts[type_indicator_index + 2 :]) return parsed +def _is_valid_git_commit_hash(commit: str) -> bool: + return len(commit) == 40 and all(c in HEX_DIGITS for c in commit) + + def normalize_pattern(pattern: str) -> str: pattern = pattern.lstrip(os.sep) if pattern.endswith(os.sep): From 
cdf0c65f20bb2cb6c3b1412f1b526afdd828dea3 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sun, 29 Dec 2024 00:00:06 +0100 Subject: [PATCH 8/9] Update src/gitingest/ingest.py --- src/gitingest/ingest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gitingest/ingest.py b/src/gitingest/ingest.py index 812649f1..4889bc5c 100644 --- a/src/gitingest/ingest.py +++ b/src/gitingest/ingest.py @@ -28,7 +28,7 @@ def ingest( # Extract relevant fields for CloneConfig clone_config = CloneConfig( - url=f"https://github.com/{query['slug']}.git", + url=query["url"], local_path=query['local_path'], commit=query.get('commit'), branch=query.get('branch'), From 1724720ca4ba8953f56aff2b57789fc706efbc28 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sun, 29 Dec 2024 00:00:11 +0100 Subject: [PATCH 9/9] Update src/process_query.py --- src/process_query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/process_query.py b/src/process_query.py index 19ac2dbb..761fdf27 100644 --- a/src/process_query.py +++ b/src/process_query.py @@ -61,7 +61,7 @@ async def process_query( ignore_patterns=exclude_patterns, ) clone_config = CloneConfig( - url=f"https://github.com/{query['slug']}.git", + url=query["url"], local_path=query['local_path'], commit=query.get('commit'), branch=query.get('branch'),