diff --git a/.github/workflows/unitest.yml b/.github/workflows/unitest.yml index 0a537771..6759ecd5 100644 --- a/.github/workflows/unitest.yml +++ b/.github/workflows/unitest.yml @@ -15,19 +15,19 @@ jobs: steps: - uses: actions/checkout@v4 - + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - + - name: Install dependencies run: | python -m pip install --upgrade pip pip install pytest pytest-asyncio if [ -f requirements.txt ]; then pip install -r requirements.txt; fi pip install -e . - + - name: Run tests run: | pytest diff --git a/.gitignore b/.gitignore index e98f538f..09c9945b 100644 --- a/.gitignore +++ b/.gitignore @@ -131,6 +131,7 @@ venv/ ENV/ env.bak/ venv.bak/ +.python-version # Spyder project settings .spyderproject diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..1b3eabd5 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,78 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + # Files + - id: check-added-large-files + description: 'Prevent large files from being committed.' + args: ['--maxkb=10000'] + - id: check-case-conflict + description: 'Check for files that would conflict in case-insensitive filesystems.' + - id: fix-byte-order-marker + description: 'Remove utf-8 byte order marker.' + - id: mixed-line-ending + description: 'Replace mixed line ending.' + + # Links + - id: destroyed-symlinks + description: 'Detect symlinks which are changed to regular files with a content of a path which that symlink was pointing to.' + + # File files for parseable syntax: python + - id: check-ast + + # File and line endings + - id: end-of-file-fixer + description: 'Ensure that a file is either empty, or ends with one newline.' + - id: trailing-whitespace + description: 'Trim trailing whitespace.' + + # Python + - id: check-docstring-first + description: 'Check a common error of defining a docstring after code.' + - id: requirements-txt-fixer + description: 'Sort entries in requirements.txt.' + + - repo: https://github.com/MarcoGorelli/absolufy-imports + rev: v0.3.1 + hooks: + - id: absolufy-imports + description: 'Automatically convert relative imports to absolute. (Use `args: [--never]` to revert.)' + + - repo: https://github.com/psf/black + rev: 24.10.0 + hooks: + - id: black + + - repo: https://github.com/asottile/pyupgrade + rev: v3.19.1 + hooks: + - id: pyupgrade + description: 'Automatically upgrade syntax for newer versions.' + args: [--py3-plus, --py36-plus, --py38-plus] + + - repo: https://github.com/pre-commit/pygrep-hooks + rev: v1.10.0 + hooks: + - id: python-check-blanket-noqa + description: 'Enforce that `noqa` annotations always occur with specific codes. Sample annotations: `# noqa: F401`, `# noqa: F401,W203`.' + - id: python-check-blanket-type-ignore + description: 'Enforce that `# type: ignore` annotations always occur with specific codes. Sample annotations: `# type: ignore[attr-defined]`, `# type: ignore[attr-defined, name-defined]`.' + - id: python-use-type-annotations + description: 'Enforce that python3.6+ type annotations are used instead of type comments.' + + - repo: https://github.com/PyCQA/isort + rev: 5.13.2 + hooks: + - id: isort + description: 'Sort imports alphabetically, and automatically separated into sections and by type.' + + - repo: https://github.com/hadialqattan/pycln + rev: v2.4.0 + hooks: + - id: pycln + description: 'Remove unused import statements.' 
+ + - repo: https://github.com/djlint/djLint + rev: v1.36.4 + hooks: + - id: djlint-reformat-jinja diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 48ba75f2..2293c260 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -60,7 +60,7 @@ representative at an online or offline event. Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at -romain@coderamp.io. +. All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the @@ -114,15 +114,13 @@ the community. ## Attribution -This Code of Conduct is adapted from the [Contributor Covenant][homepage], +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 2.0, available at -https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. +. Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). -[homepage]: https://www.contributor-covenant.org - For answers to common questions about this code of conduct, see the FAQ at -https://www.contributor-covenant.org/faq. Translations are available at -https://www.contributor-covenant.org/translations. +. Translations are available at +. diff --git a/README.md b/README.md index 991aeafe..6d0747a2 100644 --- a/README.md +++ b/README.md @@ -1,34 +1,56 @@ -[![Image](./docs/frontpage.png "GitIngest main page")](https://gitingest.com/) +[![Image](./docs/frontpage.png "GitIngest main page")](https://gitingest.com) + + + + License + + + + PyPI version + + + + Downloads + + + + GitHub issues + + + + Code style: black + + + + + Discord + + +# GitIngest -![License](https://img.shields.io/badge/license-MIT-blue.svg) - -# GitIngest 🔍 Turn any Git repository into a prompt-friendly text ingest for LLMs. You can also replace `hub` with `ingest` in any github url to access the coresponding digest -[gitingest.com](https://gitingest.com/) - +[gitingest.com](https://gitingest.com) ## 🚀 Features - **Easy code context**: Get a text digest from a git repository URL or a directory - **Smart Formatting**: Optimized output format for LLM prompts -- **Statistics about**: : +- **Statistics about**: - File and directory structure - Size of the extract - - Token count + - Token count - **CLI tool**: Run it as a command (Currently on Linux only) - **Python package**: Import it in your code - ## 📦 Installation -``` +``` bash pip install gitingest ``` - ## 💡 Command Line usage The `gitingest` command line tool allows you to analyze codebases and create a text dump of their contents. @@ -46,60 +68,62 @@ gitingest --help This will write the digest in a text file (default `digest.txt`) in your current working directory. - ## 🐛 Python package usage - ```python from gitingest import ingest summary, tree, content = ingest("path/to/directory") -#or from URL +# or from URL summary, tree, content = ingest("https://github.com/cyclotruc/gitingest") ``` By default, this won't write a file but can be enabled with the `output` argument - ## 🛠️ Using + - Tailwind CSS - Frontend - [FastAPI](https://github.com/fastapi/fastapi) - Backend framework - [tiktoken](https://github.com/openai/tiktoken) - Token estimation - [apianalytics.dev](https://www.apianalytics.dev/) - Simple Analytics +## 🌐 Self-host -## 🌐 Self-host 1. Build the image: -``` + +``` bash docker build -t gitingest . ``` 2. 
Run the container: -``` + +``` bash docker run -d --name gitingest -p 8000:8000 gitingest ``` + The application will be available at `http://localhost:8000` Ensure environment variables are set before running the application or deploying it via Docker. ## ✔️ Contributing -Contributions are welcome! +Contributions are welcome! Gitingest aims to be friendly for first time contributors, with a simple python and html codebase. If you need any help while working with the code, reach out to us on [discord](https://discord.com/invite/zerRaGK9EC) -### Ways to contribute +### Ways to contribute 1. Provide your feedback and ideas on discord -2. Open an Issue on github to report a bug -2. Create a Pull request +2. Open an Issue on github to report a bug +3. Create a Pull request - Fork the repository - Make your changes and test them locally - Open a pull request for review and feedback ### 🔧 Local dev -#### Environment Configuration +#### Environment Configuration + - **`ALLOWED_HOSTS`**: Specify allowed hostnames for the application. Default: `"gitingest.com,*.gitingest.com,gitdigest.dev,localhost"`. You can configure the application using the following environment variables: @@ -108,23 +132,25 @@ ALLOWED_HOSTS="gitingest.local,localhost" ``` #### Run locally -1. Clone the repository + +1. Clone the repository + ```bash git clone https://github.com/cyclotruc/gitingest.git cd gitingest ``` 2. Install dependencies + ```bash pip install -r requirements.txt ``` 3. Run the application: + ```bash cd src uvicorn main:app --reload ``` -The frontend will be available at `localhost:8000` - - +The frontend will be available at `localhost:8000` diff --git a/SECURITY.md b/SECURITY.md index cf4a494c..90a6d689 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -2,4 +2,4 @@ ## Reporting a Vulnerability -If you have discovered a vulnerability inside the project, report it privately at romain@coderamp.io. This way the maintainer can work on a proper fix without disclosing the problem to the public before it has been solved. +If you have discovered a vulnerability inside the project, report it privately at . This way the maintainer can work on a proper fix without disclosing the problem to the public before it has been solved. 
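For reference, a minimal usage sketch (editorial, not part of the patch) of the `ingest()` API documented in the README above, exercising the `include_patterns`, `exclude_patterns`, and `output` parameters whose signature this patch introduces in `src/gitingest/ingest.py`; the directory path and patterns are illustrative placeholders.

```python
# Hypothetical example: the path and patterns are placeholders, not project defaults.
from gitingest import ingest

summary, tree, content = ingest(
    "path/to/directory",           # local directory or repository URL
    max_file_size=1_000_000,       # skip files larger than ~1 MB
    include_patterns=["*.py"],     # only ingest Python sources
    exclude_patterns=["tests/*"],  # drop test files from the digest
    output="digest.txt",           # also write the combined digest to disk
)
print(summary)
```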
diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..f7d6c65f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,17 @@ +[tool.pylint.format] +max-line-length = 119 + +[tool.pycln] +all = true + +[tool.isort] +profile = "black" +line_length = 119 +remove_redundant_aliases = true +float_to_top = true +order_by_type = true +filter_files = true + +[tool.black] +line-length = 119 +skip-string-normalization = true diff --git a/pytest.ini b/pytest.ini index 7444d64d..2a155008 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,7 +3,6 @@ pythonpath = src testpaths = src/gitingest/tests asyncio_mode = auto - python_files = test_*.py python_classes = Test* -python_functions = test_* \ No newline at end of file +python_functions = test_* diff --git a/requirements.txt b/requirements.txt index 6848603b..2688a88d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,13 @@ -fastapi[standard] -uvicorn +black +click>=8.0.0 +djlint fastapi-analytics -slowapi -tiktoken +fastapi[standard] +pre-commit pytest pytest-asyncio -click>=8.0.0 +python-dotenv +slowapi +starlette +tiktoken +uvicorn diff --git a/setup.py b/setup.py index 8afe6b73..6778a92c 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -from setuptools import setup, find_packages +from setuptools import find_packages, setup setup( name="gitingest", @@ -28,4 +28,4 @@ "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", ], -) \ No newline at end of file +) diff --git a/src/config.py b/src/config.py index cdf2849b..b918fb2a 100644 --- a/src/config.py +++ b/src/config.py @@ -1,4 +1,4 @@ -MAX_DISPLAY_SIZE = 300000 +MAX_DISPLAY_SIZE = 300_000 TMP_BASE_PATH = "../tmp" EXAMPLE_REPOS = [ diff --git a/src/gitingest/__init__.py b/src/gitingest/__init__.py index ed84b214..212fefcb 100644 --- a/src/gitingest/__init__.py +++ b/src/gitingest/__init__.py @@ -1,6 +1,6 @@ -from .ingest_from_query import ingest_from_query -from .clone import clone_repo -from .parse_query import parse_query -from .ingest import ingest +from gitingest.clone import clone_repo +from gitingest.ingest import ingest +from gitingest.ingest_from_query import ingest_from_query +from gitingest.parse_query import parse_query -__all__ = ["ingest_from_query", "clone_repo", "parse_query", "ingest"] \ No newline at end of file +__all__ = ["ingest_from_query", "clone_repo", "parse_query", "ingest"] diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 81823e63..14df2190 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -1,10 +1,12 @@ import os -import pathlib +from typing import Optional, Tuple + import click +from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS from gitingest.ingest import ingest from gitingest.ingest_from_query import MAX_FILE_SIZE -from gitingest.parse_query import DEFAULT_IGNORE_PATTERNS + def normalize_pattern(pattern: str) -> str: pattern = pattern.strip() @@ -13,30 +15,38 @@ def normalize_pattern(pattern: str) -> str: pattern += "*" return pattern + @click.command() @click.argument('source', type=str, required=True) @click.option('--output', '-o', default=None, help='Output file path (default: .txt in current directory)') @click.option('--max-size', '-s', default=MAX_FILE_SIZE, help='Maximum file size to process in bytes') @click.option('--exclude-pattern', '-e', multiple=True, help='Patterns to exclude') @click.option('--include-pattern', '-i', multiple=True, help='Patterns to include') -def main(source, output, max_size, exclude_pattern, include_pattern): +def main( + source: str, + 
output: Optional[str], + max_size: int, + exclude_pattern: Tuple[str, ...], + include_pattern: Tuple[str, ...], +) -> None: """Analyze a directory and create a text dump of its contents.""" try: # Combine default and custom ignore patterns exclude_patterns = list(exclude_pattern) include_patterns = list(set(include_pattern)) - + if not output: output = "digest.txt" summary, tree, content = ingest(source, max_size, include_patterns, exclude_patterns, output=output) - + click.echo(f"Analysis complete! Output written to: {output}") click.echo("\nSummary:") click.echo(summary) - + except Exception as e: click.echo(f"Error: {str(e)}", err=True) raise click.Abort() + if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py index 4b69bc76..e7994c14 100644 --- a/src/gitingest/clone.py +++ b/src/gitingest/clone.py @@ -1,10 +1,11 @@ import asyncio -from typing import Tuple +from typing import Any, Dict, Tuple from gitingest.utils import async_timeout CLONE_TIMEOUT = 20 + async def check_repo_exists(url: str) -> bool: proc = await asyncio.create_subprocess_exec( "curl", @@ -20,14 +21,15 @@ async def check_repo_exists(url: str) -> bool: stdout_str = stdout.decode() return "HTTP/1.1 404" not in stdout_str and "HTTP/2 404" not in stdout_str + @async_timeout(CLONE_TIMEOUT) -async def clone_repo(query: dict) -> str: +async def clone_repo(query: Dict[str, Any]) -> Tuple[bytes, bytes]: if not await check_repo_exists(query['url']): raise ValueError("Repository not found, make sure it is public") - + if query['commit']: proc = await asyncio.create_subprocess_exec( - "git", + "git", "clone", "--single-branch", query['url'], @@ -36,21 +38,21 @@ async def clone_repo(query: dict) -> str: stderr=asyncio.subprocess.PIPE, ) stdout, stderr = await proc.communicate() - + proc = await asyncio.create_subprocess_exec( "git", "-C", query['local_path'], "checkout", query['branch'], - stdout=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) stdout, stderr = await proc.communicate() elif query['branch'] != 'main' and query['branch'] != 'master' and query['branch']: proc = await asyncio.create_subprocess_exec( "git", - "clone", + "clone", "--depth=1", "--single-branch", "--branch", @@ -71,7 +73,7 @@ async def clone_repo(query: dict) -> str: stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) - + stdout, stderr = await proc.communicate() - - return stdout, stderr \ No newline at end of file + + return stdout, stderr diff --git a/src/gitingest/ignore_patterns.py b/src/gitingest/ignore_patterns.py new file mode 100644 index 00000000..803c6edc --- /dev/null +++ b/src/gitingest/ignore_patterns.py @@ -0,0 +1,102 @@ +from typing import List + +DEFAULT_IGNORE_PATTERNS: List[str] = [ + # Python + '*.pyc', + '*.pyo', + '*.pyd', + '__pycache__', + '.pytest_cache', + '.coverage', + '.tox', + '.nox', + '.mypy_cache', + '.ruff_cache', + '.hypothesis', + 'poetry.lock', + 'Pipfile.lock', + # JavaScript/Node + 'node_modules', + 'bower_components', + 'package-lock.json', + 'yarn.lock', + '.npm', + '.yarn', + '.pnpm-store', + # Version control + '.git', + '.svn', + '.hg', + '.gitignore', + '.gitattributes', + '.gitmodules', + # Images and media + '*.svg', + '*.png', + '*.jpg', + '*.jpeg', + '*.gif', + '*.ico', + '*.pdf', + '*.mov', + '*.mp4', + '*.mp3', + '*.wav', + # Virtual environments + 'venv', + '.venv', + 'env', + '.env', + 'virtualenv', + # IDEs and editors + '.idea', + '.vscode', + '.vs', + '*.swp', + 
'*.swo', + '*.swn', + '.settings', + '.project', + '.classpath', + '*.sublime-*', + # Temporary and cache files + '*.log', + '*.bak', + '*.swp', + '*.tmp', + '*.temp', + '.cache', + '.sass-cache', + '.eslintcache', + '.DS_Store', + 'Thumbs.db', + 'desktop.ini', + # Build directories and artifacts + 'build', + 'dist', + 'target', + 'out', + '*.egg-info', + '*.egg', + '*.whl', + '*.so', + '*.dylib', + '*.dll', + '*.class', + # Documentation + 'site-packages', + '.docusaurus', + '.next', + '.nuxt', + # Other common patterns + ## Minified files + '*.min.js', + '*.min.css', + ## Source maps + '*.map', + ## Terraform + '.terraform', + '*.tfstate*', + ## Dependencies in various languages + 'vendor/', +] diff --git a/src/gitingest/ingest.py b/src/gitingest/ingest.py index eac20818..22fae6d2 100644 --- a/src/gitingest/ingest.py +++ b/src/gitingest/ingest.py @@ -1,18 +1,36 @@ import asyncio +import inspect import shutil -from typing import Union, List from pathlib import Path +from typing import List, Optional, Tuple, Union -from .ingest_from_query import ingest_from_query -from .clone import clone_repo -from .parse_query import parse_query +from gitingest.clone import clone_repo +from gitingest.ingest_from_query import ingest_from_query +from gitingest.parse_query import parse_query -def ingest(source: str, max_file_size: int = 10 * 1024 * 1024, include_patterns: Union[List[str], str] = None, exclude_patterns: Union[List[str], str] = None, output: str = None) -> str: + +def ingest( + source: str, + max_file_size: int = 10 * 1024 * 1024, + include_patterns: Union[List[str], str, None] = None, + exclude_patterns: Union[List[str], str, None] = None, + output: Optional[str] = None, +) -> Tuple[str, str, str]: try: - query = parse_query(source, max_file_size, False, include_patterns, exclude_patterns) + query = parse_query( + source=source, + max_file_size=max_file_size, + from_web=False, + include_patterns=include_patterns, + ignore_patterns=exclude_patterns, + ) if query['url']: - asyncio.run(clone_repo(query)) - + clone_result = clone_repo(query) + if inspect.iscoroutine(clone_result): + asyncio.run(clone_result) + else: + raise TypeError("clone_repo did not return a coroutine as expected.") + summary, tree, content = ingest_from_query(query) if output: @@ -20,9 +38,10 @@ def ingest(source: str, max_file_size: int = 10 * 1024 * 1024, include_patterns: f.write(tree + "\n" + content) return summary, tree, content + finally: # Clean up the temporary directory if it was created if query['url']: # Get parent directory two levels up from local_path (../tmp) cleanup_path = str(Path(query['local_path']).parents[1]) - shutil.rmtree(cleanup_path, ignore_errors=True) \ No newline at end of file + shutil.rmtree(cleanup_path, ignore_errors=True) diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py index 4e7d5e78..0080c25b 100644 --- a/src/gitingest/ingest_from_query.py +++ b/src/gitingest/ingest_from_query.py @@ -1,13 +1,13 @@ import os from fnmatch import fnmatch -from typing import Dict, List, Union -import tiktoken +from typing import Any, Dict, List, Optional, Set, Tuple +import tiktoken -MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB +MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal -MAX_FILES = 10000 # Maximum number of files to process -MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500MB +MAX_FILES = 10_000 # Maximum number of files to process +MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB def should_include(path: str, 
base_path: str, include_patterns: List[str]) -> bool: @@ -18,6 +18,7 @@ def should_include(path: str, base_path: str, include_patterns: List[str]) -> bo include = True return include + def should_exclude(path: str, base_path: str, ignore_patterns: List[str]) -> bool: rel_path = path.replace(base_path, "").lstrip(os.sep) for pattern in ignore_patterns: @@ -27,6 +28,7 @@ def should_exclude(path: str, base_path: str, ignore_patterns: List[str]) -> boo return True return False + def is_safe_symlink(symlink_path: str, base_path: str) -> bool: """Check if a symlink points to a location within the base directory.""" try: @@ -37,23 +39,32 @@ def is_safe_symlink(symlink_path: str, base_path: str) -> bool: # If there's any error resolving the paths, consider it unsafe return False + def is_text_file(file_path: str) -> bool: """Determines if a file is likely a text file based on its content.""" try: with open(file_path, 'rb') as file: chunk = file.read(1024) return not bool(chunk.translate(None, bytes([7, 8, 9, 10, 12, 13, 27] + list(range(0x20, 0x100))))) - except IOError: + except OSError: return False + def read_file_content(file_path: str) -> str: try: - with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + with open(file_path, encoding='utf-8', errors='ignore') as f: return f.read() except Exception as e: return f"Error reading file: {str(e)}" -def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = 0, stats: Dict = None) -> Dict: + +def scan_directory( + path: str, + query: Dict[str, Any], + seen_paths: Optional[Set[str]] = None, + depth: int = 0, + stats: Optional[Dict[str, int]] = None, +) -> Optional[Dict[str, Any]]: """Recursively analyzes a directory and its contents with safety limits.""" if seen_paths is None: seen_paths = set() @@ -76,6 +87,7 @@ def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = if real_path in seen_paths: print(f"Skipping already visited path: {path}") return None + seen_paths.add(real_path) result = { @@ -86,7 +98,7 @@ def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = "file_count": 0, "dir_count": 0, "path": path, - "ignore_content": False + "ignore_content": False, } ignore_patterns = query['ignore_patterns'] @@ -137,14 +149,20 @@ def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = "type": "file", "size": file_size, "content": content, - "path": item_path + "path": item_path, } result["children"].append(child) result["size"] += file_size result["file_count"] += 1 elif os.path.isdir(real_path): - subdir = scan_directory(real_path, query, seen_paths, depth + 1, stats) + subdir = scan_directory( + path=real_path, + query=query, + seen_paths=seen_paths, + depth=depth + 1, + stats=stats, + ) if subdir and (not include_patterns or subdir["file_count"] > 0): subdir["name"] = item subdir["path"] = item_path @@ -175,14 +193,20 @@ def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = "type": "file", "size": file_size, "content": content, - "path": item_path + "path": item_path, } result["children"].append(child) result["size"] += file_size result["file_count"] += 1 elif os.path.isdir(item_path): - subdir = scan_directory(item_path, query, seen_paths, depth + 1, stats) + subdir = scan_directory( + path=item_path, + query=query, + seen_paths=seen_paths, + depth=depth + 1, + stats=stats, + ) if subdir and (not include_patterns or subdir["file_count"] > 0): result["children"].append(subdir) result["size"] += subdir["size"] @@ 
-194,7 +218,13 @@ def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = return result -def extract_files_content(query: dict, node: Dict, max_file_size: int, files: List = None) -> List[Dict]: + +def extract_files_content( + query: Dict[str, Any], + node: Dict[str, Any], + max_file_size: int, + files: Optional[List[Dict[str, Any]]] = None, +) -> List[Dict[str, Any]]: """Recursively collects all text files with their contents.""" if files is None: files = [] @@ -204,17 +234,21 @@ def extract_files_content(query: dict, node: Dict, max_file_size: int, files: Li if node["size"] > max_file_size: content = None - files.append({ - "path": node["path"].replace(query['local_path'], ""), - "content": content, - "size": node["size"] - }) + files.append( + { + "path": node["path"].replace(query['local_path'], ""), + "content": content, + "size": node["size"], + }, + ) elif node["type"] == "directory": for child in node["children"]: - extract_files_content(query, child, max_file_size, files) + extract_files_content(query=query, node=child, max_file_size=max_file_size, files=files) + return files -def create_file_content_string(files: List[Dict]) -> str: + +def create_file_content_string(files: List[Dict[str, Any]]) -> str: """Creates a formatted string of file contents with separators.""" output = "" separator = "=" * 48 + "\n" @@ -223,6 +257,7 @@ def create_file_content_string(files: List[Dict]) -> str: for file in files: if not file['content']: continue + if file['path'].lower() == '/readme.md': output += separator output += f"File: {file['path']}\n" @@ -234,6 +269,7 @@ def create_file_content_string(files: List[Dict]) -> str: for file in files: if not file['content'] or file['path'].lower() == '/readme.md': continue + output += separator output += f"File: {file['path']}\n" output += separator @@ -241,12 +277,14 @@ def create_file_content_string(files: List[Dict]) -> str: return output -def create_summary_string(query: dict, nodes: Dict, files: List[Dict]) -> str: + +def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any], files: List[Dict[str, Any]]) -> str: """Creates a summary string with file counts and content size.""" if "user_name" in query: summary = f"Repository: {query['user_name']}/{query['repo_name']}\n" else: summary = f"Repository: {query['slug']}\n" + summary += f"Files analyzed: {nodes['file_count']}\n" if 'subpath' in query and query['subpath'] != '/': @@ -255,11 +293,19 @@ def create_summary_string(query: dict, nodes: Dict, files: List[Dict]) -> str: summary += f"Commit: {query['commit']}\n" elif 'branch' in query and query['branch'] != 'main' and query['branch'] != 'master' and query['branch']: summary += f"Branch: {query['branch']}\n" + return summary -def create_tree_structure(query: dict, node: Dict, prefix: str = "", is_last: bool = True) -> str: + +def create_tree_structure( + query: Dict[str, Any], + node: Dict[str, Any], + prefix: str = "", + is_last: bool = True, +) -> str: """Creates a tree-like string representation of the file structure.""" tree = "" + if not node["name"]: node["name"] = query['slug'] @@ -267,6 +313,7 @@ def create_tree_structure(query: dict, node: Dict, prefix: str = "", is_last: bo current_prefix = "└── " if is_last else "├── " name = node["name"] + "/" if node["type"] == "directory" else node["name"] tree += prefix + current_prefix + name + "\n" + if node["type"] == "directory": # Adjust prefix only if we added a node name new_prefix = prefix + (" " if is_last else "│ ") if node["name"] else prefix @@ -276,25 
+323,29 @@ def create_tree_structure(query: dict, node: Dict, prefix: str = "", is_last: bo return tree -def generate_token_string(context_string: str) -> str: + +def generate_token_string(context_string: str) -> Optional[str]: """Returns the number of tokens in a text string.""" formatted_tokens = "" try: - encoding = tiktoken.get_encoding("cl100k_base", ) + encoding = tiktoken.get_encoding("cl100k_base") total_tokens = len(encoding.encode(context_string, disallowed_special=())) - + except Exception as e: print(e) return None - if total_tokens > 1000000: - formatted_tokens = f"{total_tokens/1000000:.1f}M" - elif total_tokens > 1000: - formatted_tokens = f"{total_tokens/1000:.1f}k" + + if total_tokens > 1_000_000: + formatted_tokens = f"{total_tokens / 1_000_000:.1f}M" + elif total_tokens > 1_000: + formatted_tokens = f"{total_tokens / 1_000:.1f}k" else: formatted_tokens = f"{total_tokens}" + return formatted_tokens -def ingest_single_file(path: str, query: dict) -> Dict: + +def ingest_single_file(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]: if not os.path.isfile(path): raise ValueError(f"Path {path} is not a file") @@ -310,7 +361,7 @@ def ingest_single_file(path: str, query: dict) -> Dict: file_info = { "path": path.replace(query['local_path'], ""), "content": content, - "size": file_size + "size": file_size, } summary = ( @@ -326,11 +377,15 @@ def ingest_single_file(path: str, query: dict) -> Dict: formatted_tokens = generate_token_string(files_content) if formatted_tokens: summary += f"\nEstimated tokens: {formatted_tokens}" - return (summary, tree, files_content) -def ingest_directory(path: str, query: dict) -> Dict: - nodes = scan_directory(path, query) - files = extract_files_content(query, nodes, query['max_file_size']) + return summary, tree, files_content + + +def ingest_directory(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]: + nodes = scan_directory(path=path, query=query) + if not nodes: + raise ValueError(f"No files found in {path}") + files = extract_files_content(query=query, node=nodes, max_file_size=query['max_file_size']) summary = create_summary_string(query, nodes, files) tree = "Directory structure:\n" + create_tree_structure(query, nodes) files_content = create_file_content_string(files) @@ -338,9 +393,11 @@ def ingest_directory(path: str, query: dict) -> Dict: formatted_tokens = generate_token_string(tree + files_content) if formatted_tokens: summary += f"\nEstimated tokens: {formatted_tokens}" - return (summary, tree, files_content) -def ingest_from_query(query: dict) -> Dict: + return summary, tree, files_content + + +def ingest_from_query(query: Dict[str, Any]) -> Tuple[str, str, str]: """Main entry point for analyzing a codebase directory or single file.""" path = f"{query['local_path']}{query['subpath']}" if not os.path.exists(path): @@ -348,6 +405,5 @@ def ingest_from_query(query: dict) -> Dict: if query.get('type') == 'blob': return ingest_single_file(path, query) - else: - return ingest_directory(path, query) + return ingest_directory(path, query) diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index 8b8f97a8..669f28f3 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -1,55 +1,13 @@ -from typing import List, Union +import os import uuid -import os - - -DEFAULT_IGNORE_PATTERNS = [ - # Python - '*.pyc', '*.pyo', '*.pyd', '__pycache__', '.pytest_cache', '.coverage', - '.tox', '.nox', '.mypy_cache', '.ruff_cache', '.hypothesis', - 'poetry.lock', 'Pipfile.lock', - - # JavaScript/Node 
- 'node_modules', 'bower_components', 'package-lock.json', 'yarn.lock', - '.npm', '.yarn', '.pnpm-store', - - # Version control - '.git', '.svn', '.hg', '.gitignore', '.gitattributes', '.gitmodules', - - # Images and media - '*.svg', '*.png', '*.jpg', '*.jpeg', '*.gif', '*.ico', '*.pdf', - '*.mov', '*.mp4', '*.mp3', '*.wav', - - # Virtual environments - 'venv', '.venv', 'env', '.env', 'virtualenv', - - # IDEs and editors - '.idea', '.vscode', '.vs', '*.swp', '*.swo', '*.swn', - '.settings', '.project', '.classpath', '*.sublime-*', - - # Temporary and cache files - '*.log', '*.bak', '*.swp', '*.tmp', '*.temp', - '.cache', '.sass-cache', '.eslintcache', - '.DS_Store', 'Thumbs.db', 'desktop.ini', - - # Build directories and artifacts - 'build', 'dist', 'target', 'out', - '*.egg-info', '*.egg', '*.whl', - '*.so', '*.dylib', '*.dll', '*.class', - - # Documentation - 'site-packages', '.docusaurus', '.next', '.nuxt', - - # Other common patterns - '*.min.js', '*.min.css', # Minified files - '*.map', # Source maps - '.terraform', '*.tfstate*', # Terraform - 'vendor/', # Dependencies in various languages -] +from typing import Any, Dict, List, Optional, Union + +from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS TMP_BASE_PATH = "../tmp" -def parse_url(url: str) -> dict: + +def parse_url(url: str) -> Dict[str, Any]: parsed = { "user_name": None, "repo_name": None, @@ -62,22 +20,22 @@ def parse_url(url: str) -> dict: "slug": None, "id": None, } - + url = url.split(" ")[0] if not url.startswith('https://'): url = 'https://' + url - + # Extract domain and path url_parts = url.split('/') domain = url_parts[2] path_parts = url_parts[3:] - + if len(path_parts) < 2: raise ValueError("Invalid repository URL. Please provide a valid Git repository URL.") - + parsed["user_name"] = path_parts[0] parsed["repo_name"] = path_parts[1] - + # Keep original URL format parsed["url"] = f"https://{domain}/{parsed['user_name']}/{parsed['repo_name']}" parsed['slug'] = f"{parsed['user_name']}-{parsed['repo_name']}" @@ -89,10 +47,12 @@ def parse_url(url: str) -> dict: parsed["branch"] = path_parts[3] if len(parsed['branch']) == 40 and all(c in '0123456789abcdefABCDEF' for c in parsed['branch']): parsed["commit"] = parsed['branch'] - + parsed["subpath"] = "/" + "/".join(path_parts[4:]) + return parsed + def normalize_pattern(pattern: str) -> str: pattern = pattern.strip() pattern = pattern.lstrip(os.sep) @@ -100,16 +60,21 @@ def normalize_pattern(pattern: str) -> str: pattern += "*" return pattern + def parse_patterns(pattern: Union[List[str], str]) -> List[str]: if isinstance(pattern, list): pattern = ",".join(pattern) for p in pattern.split(","): if not all(c.isalnum() or c in "-_./+*" for c in p.strip()): - raise ValueError(f"Pattern '{p}' contains invalid characters. Only alphanumeric characters, dash (-), underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed.") + raise ValueError( + f"Pattern '{p}' contains invalid characters. Only alphanumeric characters, dash (-), " + "underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed." 
+ ) patterns = [normalize_pattern(p) for p in pattern.split(",")] return patterns + def override_ignore_patterns(ignore_patterns: List[str], include_patterns: List[str]) -> List[str]: for pattern in include_patterns: if pattern in ignore_patterns: @@ -117,8 +82,7 @@ def override_ignore_patterns(ignore_patterns: List[str], include_patterns: List[ return ignore_patterns -def parse_path(path: str) -> dict: - +def parse_path(path: str) -> Dict[str, Any]: query = { "local_path": os.path.abspath(path), "slug": os.path.basename(os.path.dirname(path)) + "/" + os.path.basename(path), @@ -128,7 +92,14 @@ def parse_path(path: str) -> dict: } return query -def parse_query(source: str, max_file_size: int, from_web: bool, include_patterns: Union[List[str], str] = None, ignore_patterns: Union[List[str], str] = None) -> dict: + +def parse_query( + source: str, + max_file_size: int, + from_web: bool, + include_patterns: Optional[Union[List[str], str]] = None, + ignore_patterns: Optional[Union[List[str], str]] = None, +) -> Dict[str, Any]: if from_web: query = parse_url(source) else: @@ -136,21 +107,21 @@ def parse_query(source: str, max_file_size: int, from_web: bool, include_pattern query = parse_url(source) else: query = parse_path(source) + query['max_file_size'] = max_file_size if ignore_patterns and ignore_patterns != "": ignore_patterns = DEFAULT_IGNORE_PATTERNS + parse_patterns(ignore_patterns) else: ignore_patterns = DEFAULT_IGNORE_PATTERNS - + if include_patterns and include_patterns != "": include_patterns = parse_patterns(include_patterns) ignore_patterns = override_ignore_patterns(ignore_patterns, include_patterns) - else: + else: include_patterns = None - + query['ignore_patterns'] = ignore_patterns query['include_patterns'] = include_patterns - - return query + return query diff --git a/src/gitingest/tests/conftest.py b/src/gitingest/tests/conftest.py index d31de7bb..31dba62d 100644 --- a/src/gitingest/tests/conftest.py +++ b/src/gitingest/tests/conftest.py @@ -6,4 +6,4 @@ # Add both the project root and src directory to PYTHONPATH sys.path.insert(0, project_root) -sys.path.insert(0, os.path.join(project_root, 'src')) \ No newline at end of file +sys.path.insert(0, os.path.join(project_root, 'src')) diff --git a/src/gitingest/tests/test_clone.py b/src/gitingest/tests/test_clone.py index 06579b6e..680181c8 100644 --- a/src/gitingest/tests/test_clone.py +++ b/src/gitingest/tests/test_clone.py @@ -1,72 +1,78 @@ +from unittest.mock import AsyncMock, patch + import pytest -from clone import clone_repo, check_repo_exists -from unittest.mock import patch, AsyncMock + +from gitingest.clone import check_repo_exists, clone_repo + @pytest.mark.asyncio -async def test_clone_repo_with_commit(): +async def test_clone_repo_with_commit() -> None: query = { 'commit': 'a' * 40, # Simulating a valid commit hash 'branch': 'main', 'url': 'https://github.com/user/repo', - 'local_path': '/tmp/repo' + 'local_path': '/tmp/repo', } - - with patch('clone.check_repo_exists', return_value=True) as mock_check: + + with patch('gitingest.clone.check_repo_exists', return_value=True) as mock_check: with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() mock_process.communicate.return_value = (b'output', b'error') mock_exec.return_value = mock_process - + await clone_repo(query) mock_check.assert_called_once_with(query['url']) assert mock_exec.call_count == 2 # Clone and checkout calls + @pytest.mark.asyncio -async def test_clone_repo_without_commit(): +async def 
test_clone_repo_without_commit() -> None: query = { 'commit': None, 'branch': 'main', 'url': 'https://github.com/user/repo', - 'local_path': '/tmp/repo' + 'local_path': '/tmp/repo', } - - with patch('clone.check_repo_exists', return_value=True) as mock_check: + + with patch('gitingest.clone.check_repo_exists', return_value=True) as mock_check: with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() mock_process.communicate.return_value = (b'output', b'error') mock_exec.return_value = mock_process - + await clone_repo(query) mock_check.assert_called_once_with(query['url']) assert mock_exec.call_count == 1 # Only clone call + @pytest.mark.asyncio -async def test_clone_repo_nonexistent_repository(): +async def test_clone_repo_nonexistent_repository() -> None: query = { 'commit': None, 'branch': 'main', 'url': 'https://github.com/user/nonexistent-repo', - 'local_path': '/tmp/repo' + 'local_path': '/tmp/repo', } - + with patch('gitingest.clone.check_repo_exists', return_value=False) as mock_check: with pytest.raises(ValueError, match="Repository not found"): await clone_repo(query) mock_check.assert_called_once_with(query['url']) + @pytest.mark.asyncio -async def test_check_repo_exists(): +async def test_check_repo_exists() -> None: url = "https://github.com/user/repo" - + with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() mock_process.communicate.return_value = (b'HTTP/1.1 200 OK\n', b'') mock_exec.return_value = mock_process - + # Test existing repository mock_process.returncode = 0 assert await check_repo_exists(url) is True - + # Test non-existing repository (404 response) mock_process.communicate.return_value = (b'HTTP/1.1 404 Not Found\n', b'') mock_process.returncode = 0 @@ -74,4 +80,4 @@ async def test_check_repo_exists(): # Test failed request mock_process.returncode = 1 - assert await check_repo_exists(url) is False \ No newline at end of file + assert await check_repo_exists(url) is False diff --git a/src/gitingest/tests/test_ingest.py b/src/gitingest/tests/test_ingest.py index 19a57b58..33b174b1 100644 --- a/src/gitingest/tests/test_ingest.py +++ b/src/gitingest/tests/test_ingest.py @@ -1,12 +1,14 @@ +from pathlib import Path +from typing import Any, Dict + import pytest -from src.gitingest.ingest_from_query import ( - scan_directory, - extract_files_content, -) + +from gitingest.ingest_from_query import extract_files_content, scan_directory + # Test fixtures @pytest.fixture -def sample_query(): +def sample_query() -> Dict[str, Any]: return { 'user_name': 'test_user', 'repo_name': 'test_repo', @@ -14,16 +16,16 @@ def sample_query(): 'subpath': '/', 'branch': 'main', 'commit': None, - 'max_file_size': 1000000, + 'max_file_size': 1_000_000, 'slug': 'test_user/test_repo', 'ignore_patterns': ['*.pyc', '__pycache__', '.git'], 'include_patterns': None, - 'pattern_type': 'exclude' - + 'pattern_type': 'exclude', } + @pytest.fixture -def temp_directory(tmp_path): +def temp_directory(tmp_path: Path) -> Path: # Creates the following structure: # test_repo/ # ├── file1.txt @@ -38,58 +40,57 @@ def temp_directory(tmp_path): # | └── file_dir1.txt # └── dir2/ # └── file_dir2.txt - + test_dir = tmp_path / "test_repo" test_dir.mkdir() - + # Root files (test_dir / "file1.txt").write_text("Hello World") (test_dir / "file2.py").write_text("print('Hello')") - + # src directory and its files src_dir = test_dir / "src" src_dir.mkdir() (src_dir / "subfile1.txt").write_text("Hello from src") 
(src_dir / "subfile2.py").write_text("print('Hello from src')") - + # src/subdir and its files subdir = src_dir / "subdir" subdir.mkdir() (subdir / "file_subdir.txt").write_text("Hello from subdir") (subdir / "file_subdir.py").write_text("print('Hello from subdir')") - + # dir1 and its file dir1 = test_dir / "dir1" dir1.mkdir() (dir1 / "file_dir1.txt").write_text("Hello from dir1") - + # dir2 and its file dir2 = test_dir / "dir2" dir2.mkdir() (dir2 / "file_dir2.txt").write_text("Hello from dir2") - + return test_dir -def test_scan_directory(temp_directory, sample_query): - result = scan_directory( - str(temp_directory), - query=sample_query - ) - + +def test_scan_directory(temp_directory: Path, sample_query: Dict[str, Any]) -> None: + result = scan_directory(str(temp_directory), query=sample_query) + if result is None: + assert False, "Result is None" + assert result['type'] == 'directory' assert result['file_count'] == 8 # All .txt and .py files - assert result['dir_count'] == 4 # src, src/subdir, dir1, dir2 + assert result['dir_count'] == 4 # src, src/subdir, dir1, dir2 assert len(result['children']) == 5 # file1.txt, file2.py, src, dir1, dir2 -def test_extract_files_content(temp_directory, sample_query): - nodes = scan_directory( - str(temp_directory), - query=sample_query - ) - - files = extract_files_content(sample_query, nodes, max_file_size=1000000) + +def test_extract_files_content(temp_directory: Path, sample_query: Dict[str, Any]) -> None: + nodes = scan_directory(str(temp_directory), query=sample_query) + if nodes is None: + assert False, "Nodes is None" + files = extract_files_content(query=sample_query, node=nodes, max_file_size=1_000_000) assert len(files) == 8 # All .txt and .py files - + # Check for presence of key files paths = [f['path'] for f in files] assert any('file1.txt' in p for p in paths) @@ -101,22 +102,17 @@ def test_extract_files_content(temp_directory, sample_query): assert any('file_dir2.txt' in p for p in paths) - # TODO: test with include patterns: ['*.txt'] # TODO: test with wrong include patterns: ['*.qwerty'] -#single folder patterns +# single folder patterns # TODO: test with include patterns: ['src/*'] # TODO: test with include patterns: ['/src/*'] # TODO: test with include patterns: ['/src/'] # TODO: test with include patterns: ['/src*'] -#multiple patterns +# multiple patterns # TODO: test with multiple include patterns: ['*.txt', '*.py'] # TODO: test with multiple include patterns: ['/src/*', '*.txt'] # TODO: test with multiple include patterns: ['/src*', '*.txt'] - - - - diff --git a/src/gitingest/tests/test_parse_query.py b/src/gitingest/tests/test_parse_query.py index da614048..ae4c1659 100644 --- a/src/gitingest/tests/test_parse_query.py +++ b/src/gitingest/tests/test_parse_query.py @@ -1,12 +1,14 @@ import pytest -from gitingest.parse_query import parse_query, parse_url, DEFAULT_IGNORE_PATTERNS +from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS +from gitingest.parse_query import parse_query, parse_url -def test_parse_url_valid(): + +def test_parse_url_valid() -> None: test_cases = [ "https://github.com/user/repo", - "https://gitlab.com/user/repo", - "https://bitbucket.org/user/repo" + "https://gitlab.com/user/repo", + "https://bitbucket.org/user/repo", ] for url in test_cases: result = parse_url(url) @@ -14,16 +16,15 @@ def test_parse_url_valid(): assert result["repo_name"] == "repo" assert result["url"] == url -def test_parse_url_invalid(): + +def test_parse_url_invalid() -> None: url = "https://only-domain.com" with 
pytest.raises(ValueError, match="Invalid repository URL"): parse_url(url) -def test_parse_query_basic(): - test_cases = [ - "https://github.com/user/repo", - "https://gitlab.com/user/repo" - ] + +def test_parse_query_basic() -> None: + test_cases = ["https://github.com/user/repo", "https://gitlab.com/user/repo"] for url in test_cases: result = parse_query(url, max_file_size=50, from_web=True, ignore_patterns='*.txt') assert result["user_name"] == "user" @@ -31,13 +32,15 @@ def test_parse_query_basic(): assert result["url"] == url assert "*.txt" in result["ignore_patterns"] -def test_parse_query_include_pattern(): + +def test_parse_query_include_pattern() -> None: url = "https://github.com/user/repo" result = parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py') assert result["include_patterns"] == ["*.py"] assert result["ignore_patterns"] == DEFAULT_IGNORE_PATTERNS -def test_parse_query_invalid_pattern(): + +def test_parse_query_invalid_pattern() -> None: url = "https://github.com/user/repo" with pytest.raises(ValueError, match="Pattern.*contains invalid characters"): - parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py;rm -rf') \ No newline at end of file + parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py;rm -rf') diff --git a/src/gitingest/utils.py b/src/gitingest/utils.py index ebbb4090..1f07b533 100644 --- a/src/gitingest/utils.py +++ b/src/gitingest/utils.py @@ -1,22 +1,27 @@ - ## Async Timeout decorator import asyncio import functools -from typing import TypeVar, Callable +from typing import Awaitable, Callable, ParamSpec, TypeVar T = TypeVar("T") +P = ParamSpec("P") + class AsyncTimeoutError(Exception): """Raised when an async operation exceeds its timeout limit.""" + pass -def async_timeout(seconds: int = 10): - def decorator(func: Callable[..., T]) -> Callable[..., T]: + +def async_timeout(seconds: int = 10) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]: + def decorator(func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]: @functools.wraps(func) - async def wrapper(*args, **kwargs) -> T: + async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: try: return await asyncio.wait_for(func(*args, **kwargs), timeout=seconds) except asyncio.TimeoutError: - raise AsyncTimeoutError(f"Clone timed out after {seconds} seconds") + raise AsyncTimeoutError(f"Operation timed out after {seconds} seconds") + return wrapper - return decorator \ No newline at end of file + + return decorator diff --git a/src/main.py b/src/main.py index 3fe94233..a50a1c5f 100644 --- a/src/main.py +++ b/src/main.py @@ -1,27 +1,41 @@ import os -from dotenv import load_dotenv +from typing import Dict +from api_analytics.fastapi import Analytics +from dotenv import load_dotenv from fastapi import FastAPI, Request -from fastapi.templating import Jinja2Templates -from fastapi.responses import HTMLResponse, FileResponse, Response +from fastapi.responses import FileResponse, HTMLResponse, Response from fastapi.staticfiles import StaticFiles -from starlette.middleware.trustedhost import TrustedHostMiddleware -from api_analytics.fastapi import Analytics +from fastapi.templating import Jinja2Templates from slowapi import _rate_limit_exceeded_handler from slowapi.errors import RateLimitExceeded +from starlette.middleware.trustedhost import TrustedHostMiddleware -from server_utils import limiter from routers import download, dynamic, index - +from server_utils import limiter load_dotenv() app = FastAPI() app.state.limiter = limiter 
-app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler) + + +# Define a wrapper handler with the correct signature +async def rate_limit_exception_handler(request: Request, exc: Exception) -> Response: + if isinstance(exc, RateLimitExceeded): + # Delegate to the actual handler + return _rate_limit_exceeded_handler(request, exc) + # Optionally, handle other exceptions or re-raise + raise exc + + +# Register the wrapper handler +app.add_exception_handler(RateLimitExceeded, rate_limit_exception_handler) app.mount("/static", StaticFiles(directory="static"), name="static") -app.add_middleware(Analytics, api_key=os.getenv('API_ANALYTICS_KEY')) +app_analytics_key = os.getenv('API_ANALYTICS_KEY') +if app_analytics_key: + app.add_middleware(Analytics, api_key=app_analytics_key) # Define the default allowed hosts default_allowed_hosts = ["gitingest.com", "*.gitingest.com", "localhost", "127.0.0.1"] @@ -36,31 +50,29 @@ app.add_middleware(TrustedHostMiddleware, allowed_hosts=allowed_hosts) templates = Jinja2Templates(directory="templates") + @app.get("/health") -async def health_check(): +async def health_check() -> Dict[str, str]: return {"status": "healthy"} + @app.head("/") -async def head_root(): +async def head_root() -> HTMLResponse: """Mirror the headers and status code of the index page""" - return HTMLResponse( - content=None, - headers={ - "content-type": "text/html; charset=utf-8" - } - ) - + return HTMLResponse(content=None, headers={"content-type": "text/html; charset=utf-8"}) + + @app.get("/api/", response_class=HTMLResponse) @app.get("/api", response_class=HTMLResponse) -async def api_docs(request: Request): - return templates.TemplateResponse( - "api.jinja", {"request": request} - ) +async def api_docs(request: Request) -> HTMLResponse: + return templates.TemplateResponse("api.jinja", {"request": request}) + @app.get("/robots.txt") -async def robots(): +async def robots() -> FileResponse: return FileResponse('static/robots.txt') + app.include_router(index) app.include_router(download) -app.include_router(dynamic) \ No newline at end of file +app.include_router(dynamic) diff --git a/src/process_query.py b/src/process_query.py index 18d8d76c..466b11d2 100644 --- a/src/process_query.py +++ b/src/process_query.py @@ -1,16 +1,27 @@ -from typing import List -from fastapi.templating import Jinja2Templates +from typing import Any, Dict + from fastapi import Request +from fastapi.templating import Jinja2Templates +from starlette.templating import _TemplateResponse -from config import MAX_DISPLAY_SIZE, EXAMPLE_REPOS -from gitingest import ingest_from_query, clone_repo, parse_query -from server_utils import logSliderToSize, Colors +from config import EXAMPLE_REPOS, MAX_DISPLAY_SIZE +from gitingest.clone import clone_repo +from gitingest.ingest_from_query import ingest_from_query +from gitingest.parse_query import parse_query +from server_utils import Colors, logSliderToSize templates = Jinja2Templates(directory="templates") -def print_query(query, request, max_file_size, pattern_type, pattern): + +def print_query( + query: Dict[str, Any], + request: Request, + max_file_size: int, + pattern_type: str, + pattern: str, +) -> None: print(f"{Colors.WHITE}{query['url']:<20}{Colors.END}", end="") - if int(max_file_size/1024) != 50: + if int(max_file_size / 1024) != 50: print(f" | {Colors.YELLOW}Size: {int(max_file_size/1024)}kb{Colors.END}", end="") if pattern_type == "include" and pattern != "": print(f" | {Colors.YELLOW}Include {pattern}{Colors.END}", end="") @@ -18,46 +29,74 @@ 
def print_query(query, request, max_file_size, pattern_type, pattern): print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="") -def print_error(query, request, e, max_file_size, pattern_type, pattern): +def print_error( + query: Dict[str, Any], + request: Request, + e: Exception, + max_file_size: int, + pattern_type: str, + pattern: str, +) -> None: print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") print_query(query, request, max_file_size, pattern_type, pattern) print(f" | {Colors.RED}{e}{Colors.END}") -def print_success(query, request, max_file_size, pattern_type, pattern, summary): + +def print_success( + query: Dict[str, Any], + request: Request, + max_file_size: int, + pattern_type: str, + pattern: str, + summary: str, +) -> None: estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] print(f"{Colors.GREEN}INFO{Colors.END}: {Colors.GREEN}<- {Colors.END}", end="") print_query(query, request, max_file_size, pattern_type, pattern) print(f" | {Colors.PURPLE}{estimated_tokens}{Colors.END}") - -async def process_query(request: Request, input_text: str, slider_position: int, pattern_type: str = "exclude", pattern: str = "", is_index: bool = False) -> str: +async def process_query( + request: Request, + input_text: str, + slider_position: int, + pattern_type: str = "exclude", + pattern: str = "", + is_index: bool = False, +) -> _TemplateResponse: template = "index.jinja" if is_index else "github.jinja" max_file_size = logSliderToSize(slider_position) + if pattern_type == "include": include_patterns = pattern exclude_patterns = None elif pattern_type == "exclude": exclude_patterns = pattern include_patterns = None + try: - query = parse_query(input_text, max_file_size, True, include_patterns, exclude_patterns) + query = parse_query( + source=input_text, + max_file_size=max_file_size, + from_web=True, + include_patterns=include_patterns, + ignore_patterns=exclude_patterns, + ) await clone_repo(query) summary, tree, content = ingest_from_query(query) with open(f"{query['local_path']}.txt", "w") as f: f.write(tree + "\n" + content) - - except Exception as e: - #hack to print error message when query is not defined + # hack to print error message when query is not defined if 'query' in locals() and query is not None and isinstance(query, dict): - print_error(query, request, e, max_file_size, pattern_type, pattern) + print_error(query, request, e, max_file_size, pattern_type, pattern) else: print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") print(f"{Colors.RED}{e}{Colors.END}") + return templates.TemplateResponse( - template, + template, { "request": request, "github_url": input_text, @@ -66,25 +105,37 @@ async def process_query(request: Request, input_text: str, slider_position: int, "default_file_size": slider_position, "pattern_type": pattern_type, "pattern": pattern, - } + }, ) - + if len(content) > MAX_DISPLAY_SIZE: - content = f"(Files content cropped to {int(MAX_DISPLAY_SIZE/1000)}k characters, download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] - print_success(query, request, max_file_size, pattern_type, pattern, summary) + content = ( + f"(Files content cropped to {int(MAX_DISPLAY_SIZE / 1_000)}k characters, " + "download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] + ) + + print_success( + query=query, + request=request, + max_file_size=max_file_size, + pattern_type=pattern_type, + pattern=pattern, + summary=summary, + ) + return templates.TemplateResponse( - template, 
+ template, { - "request": request, + "request": request, "github_url": input_text, - "result": True, + "result": True, "summary": summary, - "tree": tree, + "tree": tree, "content": content, "examples": EXAMPLE_REPOS if is_index else [], "ingest_id": query['id'], "default_file_size": slider_position, "pattern_type": pattern_type, "pattern": pattern, - } + }, ) diff --git a/src/routers/__init__.py b/src/routers/__init__.py index b1871c1a..ace7bd09 100644 --- a/src/routers/__init__.py +++ b/src/routers/__init__.py @@ -1,5 +1,5 @@ -from .download import router as download -from .dynamic import router as dynamic -from .index import router as index +from routers.download import router as download +from routers.dynamic import router as dynamic +from routers.index import router as index -__all__ = ["download", "dynamic", "index"] \ No newline at end of file +__all__ = ["download", "dynamic", "index"] diff --git a/src/routers/download.py b/src/routers/download.py index e26b2df2..95cec0fe 100644 --- a/src/routers/download.py +++ b/src/routers/download.py @@ -1,29 +1,30 @@ -from fastapi import HTTPException, APIRouter +import os + +from fastapi import APIRouter, HTTPException from fastapi.responses import Response + from config import TMP_BASE_PATH -import os router = APIRouter() + @router.get("/download/{digest_id}") -async def download_ingest(digest_id: str): +async def download_ingest(digest_id: str) -> Response: try: # Find the first .txt file in the directory directory = f"{TMP_BASE_PATH}/{digest_id}" txt_files = [f for f in os.listdir(directory) if f.endswith('.txt')] - + if not txt_files: raise FileNotFoundError("No .txt file found") - - with open(f"{directory}/{txt_files[0]}", "r") as f: + + with open(f"{directory}/{txt_files[0]}") as f: content = f.read() - + return Response( content=content, media_type="text/plain", - headers={ - "Content-Disposition": f"attachment; filename={txt_files[0]}" - } + headers={"Content-Disposition": f"attachment; filename={txt_files[0]}"}, ) except FileNotFoundError: - raise HTTPException(status_code=404, detail="Digest not found") \ No newline at end of file + raise HTTPException(status_code=404, detail="Digest not found") diff --git a/src/routers/dynamic.py b/src/routers/dynamic.py index 6a0a2f99..12216f15 100644 --- a/src/routers/dynamic.py +++ b/src/routers/dynamic.py @@ -1,4 +1,4 @@ -from fastapi import APIRouter, Request, Form +from fastapi import APIRouter, Form, Request from fastapi.responses import HTMLResponse from fastapi.templating import Jinja2Templates @@ -8,26 +8,34 @@ router = APIRouter() templates = Jinja2Templates(directory="templates") + @router.get("/{full_path:path}") -async def catch_all(request: Request, full_path: str): +async def catch_all(request: Request, full_path: str) -> HTMLResponse: return templates.TemplateResponse( "github.jinja", { "request": request, "github_url": f"https://github.com/{full_path}", "loading": True, - "default_file_size": 243 - } + "default_file_size": 243, + }, ) + @router.post("/{full_path:path}", response_class=HTMLResponse) -@limiter.limit("10/minute") +@limiter.limit("10/minute") async def process_catch_all( - request: Request, + request: Request, input_text: str = Form(...), max_file_size: int = Form(...), pattern_type: str = Form(...), - pattern: str = Form(...) 
diff --git a/src/routers/index.py b/src/routers/index.py
index 610d87ce..f2728805 100644
--- a/src/routers/index.py
+++ b/src/routers/index.py
@@ -1,40 +1,41 @@
-from fastapi import APIRouter, Request, Form
+from fastapi import APIRouter, Form, Request
 from fastapi.responses import HTMLResponse
 from fastapi.templating import Jinja2Templates
-from server_utils import limiter
-from process_query import process_query
 from config import EXAMPLE_REPOS
-
+from process_query import process_query
+from server_utils import limiter

 router = APIRouter()
 templates = Jinja2Templates(directory="templates")


 @router.get("/", response_class=HTMLResponse)
-async def home(request: Request):
+async def home(request: Request) -> HTMLResponse:
     return templates.TemplateResponse(
-        "index.jinja", 
+        "index.jinja",
         {
             "request": request,
             "examples": EXAMPLE_REPOS,
-            "default_file_size": 243
-        }
+            "default_file_size": 243,
+        },
     )


 @router.post("/", response_class=HTMLResponse)
-@limiter.limit("10/minute")
+@limiter.limit("10/minute")
 async def index_post(
-    request: Request, 
+    request: Request,
     input_text: str = Form(...),
     max_file_size: int = Form(...),
     pattern_type: str = Form(...),
-    pattern: str = Form(...)
-):
-    return await process_query(request, input_text, max_file_size, pattern_type, pattern, is_index=True)
-
-
-
-
-
+    pattern: str = Form(...),
+) -> HTMLResponse:
+    return await process_query(
+        request,
+        input_text,
+        max_file_size,
+        pattern_type,
+        pattern,
+        is_index=True,
+    )
diff --git a/src/server_utils.py b/src/server_utils.py
index 584041bd..2a6e186f 100644
--- a/src/server_utils.py
+++ b/src/server_utils.py
@@ -1,21 +1,26 @@
+import math
+
 ## Rate Limiter
 from slowapi import Limiter
 from slowapi.util import get_remote_address
+
 limiter = Limiter(key_func=get_remote_address)

-## Logarithmic slider to file size
-import math
-def logSliderToSize(position):
+
+## Logarithmic slider to file size conversion
+def logSliderToSize(position: int) -> int:
     """Convert slider position to file size in KB"""
     maxp = 500
     minv = math.log(1)
     maxv = math.log(102400)
-
+
     return round(math.exp(minv + (maxv - minv) * pow(position / maxp, 1.5))) * 1024

+
 ## Color printing utility
 class Colors:
     """ANSI color codes"""
+
     BLACK = "\033[0;30m"
     RED = "\033[0;31m"
     GREEN = "\033[0;32m"
diff --git a/src/static/favicon.svg b/src/static/favicon.svg
index f9b0ae4c..dc5a443a 100644
--- a/src/static/favicon.svg
+++ b/src/static/favicon.svg
@@ -1 +1 @@
[single-line SVG favicon; the change adds the missing end-of-file newline]
diff --git a/src/static/js/snow.js b/src/static/js/snow.js
index 0576bff1..a5e1d87f 100644
--- a/src/static/js/snow.js
+++ b/src/static/js/snow.js
@@ -88,4 +88,4 @@ function initSnow() {
 document.addEventListener('DOMContentLoaded', initSnow);

 // Also initialize when the HTMX content is swapped
-document.addEventListener('htmx:afterSettle', initSnow);
\ No newline at end of file
+document.addEventListener('htmx:afterSettle', initSnow);
diff --git a/src/static/robots.txt b/src/static/robots.txt
index 49e4f2d9..b757ab6a 100644
--- a/src/static/robots.txt
+++ b/src/static/robots.txt
@@ -1,5 +1,4 @@
 User-agent: *
-Allow: / 
+Allow: /
 Allow: /api/
 Allow: /cyclotruc/gitingest/
-
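As a quick check of the slider formula added in `src/server_utils.py` above, the function can be re-run standalone. Feeding it 243 (the `default_file_size` value the routers pass to the templates, which I am reading as the default slider position) gives a 50 KB cutoff:

```python
import math


def logSliderToSize(position: int) -> int:
    """Convert slider position to file size in KB"""
    maxp = 500
    minv = math.log(1)
    maxv = math.log(102400)
    return round(math.exp(minv + (maxv - minv) * pow(position / maxp, 1.5))) * 1024


print(logSliderToSize(243))  # 51200 (50 * 1024), used as max_file_size in process_query
print(logSliderToSize(500))  # 104857600 (102400 * 1024), the top of the slider
```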
diff --git a/src/templates/api.jinja b/src/templates/api.jinja
index 41f0e836..c5e57bdb 100644
--- a/src/templates/api.jinja
+++ b/src/templates/api.jinja
@@ -1,41 +1,35 @@
[Markup-level changes to the API page template; recoverable copy: "API Documentation", "The API is currently under development..", "We're working on making our API available to the public. In the meantime, you can open an issue on github to suggest features."]
diff --git a/src/templates/base.jinja b/src/templates/base.jinja
index 3ef8bd70..e6c3fcda 100644
--- a/src/templates/base.jinja
+++ b/src/templates/base.jinja
@@ -1,41 +1,44 @@
[Markup-level changes to the base layout: the "Git ingest" title block, the extra_head block, the navbar include, the content block, the footer include, and the extra_scripts block.]
diff --git a/src/templates/components/footer.jinja b/src/templates/components/footer.jinja
index a0820416..e8ffa9ee 100644
--- a/src/templates/components/footer.jinja
+++ b/src/templates/components/footer.jinja
@@ -4,19 +4,23 @@
[Markup-level changes to the footer component.]
diff --git a/src/templates/components/github_form.jinja b/src/templates/components/github_form.jinja
index ec6054ef..7be65aee 100644
--- a/src/templates/components/github_form.jinja
+++ b/src/templates/components/github_form.jinja
@@ -2,28 +2,30 @@
[Markup-level changes to the ingest form; the form element carries id="ingestForm" and onsubmit="handleSubmit(event{% if is_index %}, true{% endif %})".]
@@ -31,74 +33,62 @@
[Markup-level changes to the form controls and to the show_examples block, which loops over examples under the "Try these example repositories:" heading.]
diff --git a/src/templates/components/navbar.jinja b/src/templates/components/navbar.jinja
index 6275cb87..6f4b2ce0 100644
--- a/src/templates/components/navbar.jinja
+++ b/src/templates/components/navbar.jinja
@@ -21,7 +21,6 @@ fetchGitHubStars();
[Whitespace and end-of-file newline cleanup around the navbar's fetchGitHubStars() script.]
diff --git a/src/templates/components/result.jinja b/src/templates/components/result.jinja
index 00b6f934..cd0a9783 100644
--- a/src/templates/components/result.jinja
+++ b/src/templates/components/result.jinja
@@ -1,115 +1,94 @@
[Markup-level changes to the result component inside the {% if result %} guard; it covers the "Summary" pane, the ingest_id-guarded download control, the "Directory Structure" and "Files Content" panes, and a "Copy" action.]
diff --git a/src/templates/github.jinja b/src/templates/github.jinja
index fdedcce7..c373367c 100644
--- a/src/templates/github.jinja
+++ b/src/templates/github.jinja
@@ -1,39 +1,33 @@
[Markup-level changes to the GitHub results page: the error_message banner, the github_form include ({% with is_index=true, show_examples=false %}), the "Loading..." spinner, the result include, and the extra_scripts block.]
diff --git a/src/templates/index.jinja b/src/templates/index.jinja
index 80015ade..e29066f6 100644
--- a/src/templates/index.jinja
+++ b/src/templates/index.jinja
@@ -1,67 +1,57 @@
[Markup-level changes to the landing page: the "Prompt-friendly codebase" hero, the copy "Turn any Git repository into a simple text ingest of its codebase.", "This is useful for feeding a codebase into any LLM.", and "You can also replace 'hub' with 'ingest' in any Github URL", the error_message banner, the github_form include ({% with is_index=true, show_examples=true %}), and the result include.]