From 935fcedfb64e102c6dcd31b0d3750555a5348009 Mon Sep 17 00:00:00 2001 From: Shrey Purohit Date: Sun, 2 Feb 2025 17:14:26 +0530 Subject: [PATCH 1/4] Improvement: Make the CLI work on windows --- setup.py | 6 ++- src/gitingest/query_ingestion.py | 65 ++++++++++++++++++++++++++------ 2 files changed, 58 insertions(+), 13 deletions(-) diff --git a/setup.py b/setup.py index 6778a92c..1bb29a10 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,8 @@ from setuptools import find_packages, setup +from pathlib import Path + +this_directory = Path(__file__).parent +long_description = (this_directory / "README.md").read_text(encoding="utf-8") setup( name="gitingest", @@ -19,7 +23,7 @@ author="Romain Courtois", author_email="romain@coderamp.io", description="CLI tool to analyze and create text dumps of codebases for LLMs", - long_description=open("README.md").read(), + long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/cyclotruc/gitingest", classifiers=[ diff --git a/src/gitingest/query_ingestion.py b/src/gitingest/query_ingestion.py index a0bcfdf7..ea3a230c 100644 --- a/src/gitingest/query_ingestion.py +++ b/src/gitingest/query_ingestion.py @@ -3,6 +3,9 @@ from fnmatch import fnmatch from pathlib import Path from typing import Any +import locale +import os +import platform import tiktoken @@ -16,6 +19,25 @@ from gitingest.notebook_utils import process_notebook from gitingest.query_parser import ParsedQuery +try: + locale.setlocale(locale.LC_ALL, '') +except locale.Error: + locale.setlocale(locale.LC_ALL, 'C') + +def _normalize_path(path: Path) -> Path: + """Normalize path for cross-platform compatibility.""" + return Path(os.path.normpath(str(path))) + +def _normalize_path_str(path: str | Path) -> str: + """Convert path to string with forward slashes for consistent output.""" + return str(path).replace(os.sep, '/') + +def _get_encoding_list() -> list[str]: + """Get list of encodings to try, prioritized for the current 
platform.""" + encodings = ['utf-8', 'utf-8-sig'] + if platform.system() == 'Windows': + encodings.extend(['cp1252', 'iso-8859-1']) + return encodings + [locale.getpreferredencoding()] def _should_include(path: Path, base_path: Path, include_patterns: set[str]) -> bool: """ @@ -107,9 +129,13 @@ def _is_safe_symlink(symlink_path: Path, base_path: Path) -> bool: `True` if the symlink points within the base directory, `False` otherwise. """ try: - target_path = symlink_path.resolve() - base_resolved = base_path.resolve() - # It's "safe" if target_path == base_resolved or is inside base_resolved + if platform.system() == 'Windows': + if not os.path.islink(str(symlink_path)): + return False + + target_path = _normalize_path(symlink_path.resolve()) + base_resolved = _normalize_path(base_path.resolve()) + return base_resolved in target_path.parents or target_path == base_resolved except (OSError, ValueError): # If there's any error resolving the paths, consider it unsafe @@ -162,10 +188,22 @@ def _read_file_content(file_path: Path) -> str: """ try: if file_path.suffix == ".ipynb": - return process_notebook(file_path) + try: + return process_notebook(file_path) + except Exception as e: + return f"Error processing notebook: {e}" + + for encoding in _get_encoding_list(): + try: + with open(file_path, encoding=encoding) as f: + return f.read() + except UnicodeDecodeError: + continue + except OSError as e: + return f"Error reading file: {e}" + + return "Error: Unable to decode file with available encodings" - with open(file_path, encoding="utf-8", errors="ignore") as f: - return f.read() except (OSError, InvalidNotebookError) as e: return f"Error reading file: {e}" @@ -531,10 +569,10 @@ def _extract_files_content( content = node["content"] relative_path = Path(node["path"]).relative_to(query.local_path) - + # Store paths with forward slashes files.append( { - "path": str(relative_path), + "path": _normalize_path_str(relative_path), "content": content, "size": node["size"], }, 
@@ -572,7 +610,8 @@ def _create_file_content_string(files: list[dict[str, Any]]) -> str: continue output += separator - output += f"File: {file['path']}\n" + # Use forward slashes in output paths + output += f"File: {_normalize_path_str(file['path'])}\n" output += separator output += f"{file['content']}\n\n" @@ -815,11 +854,13 @@ def run_ingest_query(query: ParsedQuery) -> tuple[str, str, str]: ValueError If the specified path cannot be found or if the file is not a text file. """ - path = query.local_path / query.subpath.lstrip("/") + subpath = _normalize_path(Path(query.subpath.strip("/"))).as_posix() + path = _normalize_path(query.local_path / subpath) + if not path.exists(): raise ValueError(f"{query.slug} cannot be found") if query.type and query.type == "blob": - return _ingest_single_file(path, query) + return _ingest_single_file(_normalize_path(path.resolve()), query) - return _ingest_directory(path, query) + return _ingest_directory(_normalize_path(path.resolve()), query) From dceff33cb35a96dc7707e0b9bb8f17969d770398 Mon Sep 17 00:00:00 2001 From: Romain Courtois Date: Tue, 4 Feb 2025 05:59:37 +0100 Subject: [PATCH 2/4] Fix tmp file creation and add test --- src/gitingest/config.py | 4 +++- src/gitingest/repository_clone.py | 11 ++++++++++ tests/test_repository_clone.py | 34 +++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/src/gitingest/config.py b/src/gitingest/config.py index 291942f3..d0733b92 100644 --- a/src/gitingest/config.py +++ b/src/gitingest/config.py @@ -1,5 +1,6 @@ """ Configuration file for the project. 
""" +import tempfile from pathlib import Path MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB @@ -8,4 +9,5 @@ MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB OUTPUT_FILE_PATH = "digest.txt" -TMP_BASE_PATH = Path("/tmp/gitingest") + +TMP_BASE_PATH = Path(tempfile.gettempdir()) / "gitingest" diff --git a/src/gitingest/repository_clone.py b/src/gitingest/repository_clone.py index 4adfcd9f..51d40bea 100644 --- a/src/gitingest/repository_clone.py +++ b/src/gitingest/repository_clone.py @@ -1,7 +1,9 @@ """ This module contains functions for cloning a Git repository to a local path. """ import asyncio +import os from dataclasses import dataclass +from pathlib import Path from gitingest.utils import async_timeout @@ -61,6 +63,8 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: ------ ValueError If the 'url' or 'local_path' parameters are missing, or if the repository is not found. + OSError + If there is an error creating the parent directory structure. """ # Extract and validate query parameters url: str = config.url @@ -74,6 +78,13 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: if not local_path: raise ValueError("The 'local_path' parameter is required.") + # Create parent directory if it doesn't exist + parent_dir = Path(local_path).parent + try: + os.makedirs(parent_dir, exist_ok=True) + except OSError as e: + raise OSError(f"Failed to create parent directory {parent_dir}: {e}") from e + # Check if the repository exists if not await _check_repo_exists(url): raise ValueError("Repository not found, make sure it is public") diff --git a/tests/test_repository_clone.py b/tests/test_repository_clone.py index 380ad5d0..d8a749e7 100644 --- a/tests/test_repository_clone.py +++ b/tests/test_repository_clone.py @@ -7,6 +7,7 @@ import asyncio import os +from pathlib import Path from unittest.mock import AsyncMock, patch import pytest @@ -362,3 +363,36 @@ async def test_clone_branch_with_slashes(tmp_path): clone_config.url, 
clone_config.local_path, ) + + +@pytest.mark.asyncio +async def test_clone_repo_creates_parent_directory(tmp_path: Path) -> None: + """ + Test that clone_repo creates parent directories if they don't exist. + + Given a local path with non-existent parent directories: + When `clone_repo` is called, + Then it should create the parent directories before attempting to clone. + """ + nested_path = tmp_path / "deep" / "nested" / "path" / "repo" + clone_config = CloneConfig( + url="https://github.com/user/repo", + local_path=str(nested_path), + ) + + with patch("gitingest.repository_clone._check_repo_exists", return_value=True): + with patch("gitingest.repository_clone._run_git_command", new_callable=AsyncMock) as mock_exec: + await clone_repo(clone_config) + + # Verify parent directory was created + assert nested_path.parent.exists() + + # Verify git clone was called with correct parameters + mock_exec.assert_called_once_with( + "git", + "clone", + "--depth=1", + "--single-branch", + clone_config.url, + str(nested_path), + ) From 3bb13a18e1fa5d13345ce0ec23d8f9a206e7ec0d Mon Sep 17 00:00:00 2001 From: Romain Courtois Date: Tue, 4 Feb 2025 06:11:15 +0100 Subject: [PATCH 3/4] add error message when git missing --- src/gitingest/repository_clone.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/gitingest/repository_clone.py b/src/gitingest/repository_clone.py index 51d40bea..1fa38641 100644 --- a/src/gitingest/repository_clone.py +++ b/src/gitingest/repository_clone.py @@ -193,8 +193,24 @@ async def _run_git_command(*args: str) -> tuple[bytes, bytes]: Raises ------ RuntimeError - If the Git command exits with a non-zero status. + If Git is not installed or if the Git command exits with a non-zero status. 
""" + # Check if Git is installed + try: + version_proc = await asyncio.create_subprocess_exec( + "git", + "--version", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + _, stderr = await version_proc.communicate() + if version_proc.returncode != 0: + error_message = stderr.decode().strip() if stderr else "Git command not found" + raise RuntimeError(f"Git is not installed or not accessible: {error_message}") + except FileNotFoundError as exc: + raise RuntimeError("Git is not installed. Please install Git before proceeding.") from exc + + # Execute the requested Git command proc = await asyncio.create_subprocess_exec( *args, stdout=asyncio.subprocess.PIPE, From 0628eb40d05789375f821b6e770d3e0fbf216737 Mon Sep 17 00:00:00 2001 From: Romain Courtois Date: Tue, 4 Feb 2025 06:34:25 +0100 Subject: [PATCH 4/4] update CI to test windows and Macos --- .github/workflows/ci.yml | 2 +- setup.py | 3 +- src/gitingest/query_ingestion.py | 68 ++++++++++++++++++++++++-------- 3 files changed, 55 insertions(+), 18 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9fbbf5d0..6e7c3b1b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,7 +12,7 @@ jobs: strategy: fail-fast: true matrix: - os: [ubuntu-latest, macos-latest] + os: [ubuntu-latest, macos-latest, windows-latest] python-version: ["3.10", "3.11", "3.12", "3.13"] steps: diff --git a/setup.py b/setup.py index 1bb29a10..62d78116 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,7 @@ -from setuptools import find_packages, setup from pathlib import Path +from setuptools import find_packages, setup + this_directory = Path(__file__).parent long_description = (this_directory / "README.md").read_text(encoding="utf-8") diff --git a/src/gitingest/query_ingestion.py b/src/gitingest/query_ingestion.py index ea3a230c..5f0cddb2 100644 --- a/src/gitingest/query_ingestion.py +++ b/src/gitingest/query_ingestion.py @@ -1,11 +1,11 @@ """ Functions to ingest and analyze 
a codebase directory or single file. """ -from fnmatch import fnmatch -from pathlib import Path -from typing import Any import locale import os import platform +from fnmatch import fnmatch +from pathlib import Path +from typing import Any import tiktoken @@ -20,25 +20,61 @@ from gitingest.query_parser import ParsedQuery try: - locale.setlocale(locale.LC_ALL, '') + locale.setlocale(locale.LC_ALL, "") except locale.Error: - locale.setlocale(locale.LC_ALL, 'C') + locale.setlocale(locale.LC_ALL, "C") + def _normalize_path(path: Path) -> Path: - """Normalize path for cross-platform compatibility.""" + """ + Normalize path for cross-platform compatibility. + + Parameters + ---------- + path : Path + The Path object to normalize. + + Returns + ------- + Path + The normalized path with platform-specific separators and resolved components. + """ return Path(os.path.normpath(str(path))) + def _normalize_path_str(path: str | Path) -> str: - """Convert path to string with forward slashes for consistent output.""" - return str(path).replace(os.sep, '/') + """ + Convert path to string with forward slashes for consistent output. + + Parameters + ---------- + path : str | Path + The path to convert, can be string or Path object. + + Returns + ------- + str + The normalized path string with forward slashes as separators. + """ + return str(path).replace(os.sep, "/") + def _get_encoding_list() -> list[str]: - """Get list of encodings to try, prioritized for the current platform.""" - encodings = ['utf-8', 'utf-8-sig'] - if platform.system() == 'Windows': - encodings.extend(['cp1252', 'iso-8859-1']) + """ + Get list of encodings to try, prioritized for the current platform. + + Returns + ------- + list[str] + List of encoding names to try in priority order: UTF-8 and UTF-8 with BOM first, + then cp1252 and iso-8859-1 on Windows only, and finally the locale's preferred encoding. 
+ """ + encodings = ["utf-8", "utf-8-sig"] + if platform.system() == "Windows": + encodings.extend(["cp1252", "iso-8859-1"]) return encodings + [locale.getpreferredencoding()] + def _should_include(path: Path, base_path: Path, include_patterns: set[str]) -> bool: """ Determine if the given file or directory path matches any of the include patterns. @@ -129,13 +165,13 @@ def _is_safe_symlink(symlink_path: Path, base_path: Path) -> bool: `True` if the symlink points within the base directory, `False` otherwise. """ try: - if platform.system() == 'Windows': + if platform.system() == "Windows": if not os.path.islink(str(symlink_path)): return False - + target_path = _normalize_path(symlink_path.resolve()) base_resolved = _normalize_path(base_path.resolve()) - + return base_resolved in target_path.parents or target_path == base_resolved except (OSError, ValueError): # If there's any error resolving the paths, consider it unsafe @@ -201,7 +237,7 @@ def _read_file_content(file_path: Path) -> str: continue except OSError as e: return f"Error reading file: {e}" - + return "Error: Unable to decode file with available encodings" except (OSError, InvalidNotebookError) as e: