Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions src/gitingest/cloning.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,12 @@ async def clone_repo(config: CloneConfig) -> None:
checkout_cmd = ["git", "-C", local_path]

if partial_clone:
subpath = config.subpath.lstrip("/")
if config.blob:
# When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name
checkout_cmd += ["sparse-checkout", "set", Path(config.subpath.lstrip("/")).parent]
else:
checkout_cmd += ["sparse-checkout", "set", config.subpath.lstrip("/")]
# When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name.
subpath = str(Path(subpath).parent.as_posix())

checkout_cmd += ["sparse-checkout", "set", subpath]

if commit:
checkout_cmd += ["checkout", commit]
Expand Down
21 changes: 0 additions & 21 deletions src/gitingest/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,27 +30,6 @@ class AsyncTimeoutError(Exception):
"""


class MaxFilesReachedError(Exception):
    """
    Exception raised when the maximum number of files is reached.

    Parameters
    ----------
    max_files : int
        The file-count limit that was hit; it is interpolated into the error message.

    Attributes
    ----------
    max_files : int
        The same limit, kept on the instance so handlers can inspect it without
        parsing the message text.
    """

    def __init__(self, max_files: int) -> None:
        self.max_files = max_files
        super().__init__(f"Maximum number of files ({max_files}) reached.")


class MaxFileSizeReachedError(Exception):
    """
    Exception raised when the maximum file size is reached.

    Parameters
    ----------
    max_size : int
        The size limit that was hit. The message divides this value by 1024 twice
        and labels the result "MB", i.e. it is treated as a byte count.

    Attributes
    ----------
    max_size : int
        The same limit, unconverted, kept on the instance so handlers can inspect
        it without parsing the message text.
    """

    def __init__(self, max_size: int) -> None:
        self.max_size = max_size
        # Report the limit in megabytes with one decimal place.
        super().__init__(f"Maximum file size limit ({max_size/1024/1024:.1f}MB) reached.")


class AlreadyVisitedError(Exception):
    """
    Exception raised when a symlink target has already been visited.

    Parameters
    ----------
    path : str
        The path reported as already visited; it is interpolated into the error message.

    Attributes
    ----------
    path : str
        The same path, kept on the instance so handlers can inspect it without
        parsing the message text.
    """

    def __init__(self, path: str) -> None:
        self.path = path
        super().__init__(f"Symlink target already visited: {path}")


class InvalidNotebookError(Exception):
"""Exception raised when a Jupyter notebook is invalid or cannot be processed."""

Expand Down
126 changes: 61 additions & 65 deletions src/gitingest/filesystem_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,11 @@
from enum import Enum, auto
from pathlib import Path

from gitingest.exceptions import InvalidNotebookError
from gitingest.utils.ingestion_utils import _get_encoding_list
from gitingest.utils.notebook_utils import process_notebook
from gitingest.utils.textfile_checker_utils import is_textfile

SEPARATOR = "=" * 48 + "\n"
SEPARATOR = "=" * 48 # Tiktoken, the tokenizer openai uses, counts 2 tokens if we have more than 48


class FileSystemNodeType(Enum):
Expand All @@ -36,108 +35,105 @@ class FileSystemNode: # pylint: disable=too-many-instance-attributes
"""
Class representing a node in the file system (either a file or directory).

This class has more than the recommended number of attributes because it needs to
track various properties of files and directories for comprehensive analysis.
Tracks properties of files/directories for comprehensive analysis.
"""

name: str
type: FileSystemNodeType # e.g., "directory" or "file"
type: FileSystemNodeType
path_str: str
path: Path
size: int = 0
file_count: int = 0
dir_count: int = 0
depth: int = 0
children: list[FileSystemNode] = field(default_factory=list) # Using default_factory instead of empty list
children: list[FileSystemNode] = field(default_factory=list)

def sort_children(self) -> None:
"""
Sort the children nodes of a directory according to a specific order.

Order of sorting:
1. README.md first
2. Regular files (not starting with dot)
3. Hidden files (starting with dot)
4. Regular directories (not starting with dot)
5. Hidden directories (starting with dot)
All groups are sorted alphanumerically within themselves.
"""
# Separate files and directories
files = [child for child in self.children if child.type == FileSystemNodeType.FILE]
directories = [child for child in self.children if child.type == FileSystemNodeType.DIRECTORY]
2. Regular files (not starting with dot)
3. Hidden files (starting with dot)
4. Regular directories (not starting with dot)
5. Hidden directories (starting with dot)

# Find README.md
readme_files = [f for f in files if f.name.lower() == "readme.md"]
other_files = [f for f in files if f.name.lower() != "readme.md"]
All groups are sorted alphanumerically within themselves.

# Separate hidden and regular files/directories
regular_files = [f for f in other_files if not f.name.startswith(".")]
hidden_files = [f for f in other_files if f.name.startswith(".")]
regular_dirs = [d for d in directories if not d.name.startswith(".")]
hidden_dirs = [d for d in directories if d.name.startswith(".")]
Raises
------
ValueError
If the node is not a directory.
"""
if self.type != FileSystemNodeType.DIRECTORY:
raise ValueError("Cannot sort children of a non-directory node")

# Sort each group alphanumerically
regular_files.sort(key=lambda x: x.name)
hidden_files.sort(key=lambda x: x.name)
regular_dirs.sort(key=lambda x: x.name)
hidden_dirs.sort(key=lambda x: x.name)
def _sort_key(child: FileSystemNode) -> tuple[int, str]:
# returns the priority order for the sort function, 0 is first
# Groups: 0=README, 1=regular file, 2=hidden file, 3=regular dir, 4=hidden dir
name = child.name.lower()
if child.type == FileSystemNodeType.FILE:
if name == "readme.md":
return (0, name)
return (1 if not name.startswith(".") else 2, name)
return (3 if not name.startswith(".") else 4, name)

self.children = readme_files + regular_files + hidden_files + regular_dirs + hidden_dirs
self.children.sort(key=_sort_key)

@property
def content_string(self) -> str:
"""
Return the content of the node as a string.

This property returns the content of the node as a string, including the path and content.
Return the content of the node as a string, including path and content.

Returns
-------
str
A string representation of the node's content.
"""
content_repr = SEPARATOR
parts = [
SEPARATOR,
f"File: {str(self.path_str).replace(os.sep, '/')}",
SEPARATOR,
f"{self.content}",
]

# Use forward slashes in output paths
content_repr += f"File: {str(self.path_str).replace(os.sep, '/')}\n"
content_repr += SEPARATOR
content_repr += f"{self.content}\n\n"
return content_repr
return "\n".join(parts) + "\n\n"

@property
def content(self) -> str: # pylint: disable=too-many-return-statements
"""
Read the content of a file.

This function attempts to open a file and read its contents using UTF-8 encoding.
If an error occurs during reading (e.g., file is not found or permission error),
it returns an error message.
Read the content of a file if it's text (or a notebook). Return an error message otherwise.

Returns
-------
str
The content of the file, or an error message if the file could not be read.

Raises
------
ValueError
If the node is a directory.
"""
if self.type == FileSystemNodeType.FILE and not is_textfile(self.path):
if self.type == FileSystemNodeType.DIRECTORY:
raise ValueError("Cannot read content of a directory node")

if not is_textfile(self.path):
return "[Non-text file]"

try:
if self.path.suffix == ".ipynb":
try:
return process_notebook(self.path)
except Exception as exc:
return f"Error processing notebook: {exc}"

for encoding in _get_encoding_list():
try:
with self.path.open(encoding=encoding) as f:
return f.read()
except UnicodeDecodeError:
continue
except OSError as exc:
return f"Error reading file: {exc}"

return "Error: Unable to decode file with available encodings"

except (OSError, InvalidNotebookError) as exc:
return f"Error reading file: {exc}"
if self.path.suffix == ".ipynb":
try:
return process_notebook(self.path)
except Exception as exc:
return f"Error processing notebook: {exc}"

# Try multiple encodings
for encoding in _get_encoding_list():
try:
with self.path.open(encoding=encoding) as f:
return f.read()
except UnicodeDecodeError:
continue
except OSError as exc:
return f"Error reading file: {exc}"

return "Error: Unable to decode file with available encodings"
12 changes: 8 additions & 4 deletions src/gitingest/ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES
from gitingest.filesystem_schema import FileSystemNode, FileSystemNodeType, FileSystemStats
from gitingest.output_formatters import format_directory, format_single_file
from gitingest.output_formatters import format_node
from gitingest.query_parsing import ParsedQuery
from gitingest.utils.ingestion_utils import _should_exclude, _should_include
from gitingest.utils.path_utils import _is_safe_symlink
Expand Down Expand Up @@ -38,7 +38,7 @@ def ingest_query(query: ParsedQuery) -> Tuple[str, str, str]:
Raises
------
ValueError
If the specified path cannot be found or if the file is not a text file.
If the path cannot be found, is not a file, or the file has no content.
"""
subpath = Path(query.subpath.strip("/")).as_posix()
path = query.local_path / subpath
Expand All @@ -63,7 +63,11 @@ def ingest_query(query: ParsedQuery) -> Tuple[str, str, str]:
path_str=str(relative_path),
path=path,
)
return format_single_file(file_node, query)

if not file_node.content:
raise ValueError(f"File {file_node.name} has no content")

return format_node(file_node, query)

root_node = FileSystemNode(
name=path.name,
Expand All @@ -80,7 +84,7 @@ def ingest_query(query: ParsedQuery) -> Tuple[str, str, str]:
stats=stats,
)

return format_directory(root_node, query)
return format_node(root_node, query)


def apply_gitingest_file(path: Path, query: ParsedQuery) -> None:
Expand Down
Loading
Loading