diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..a0565651 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,12 @@ +{ + "configurations": [ + { + "name": "Python Debugger: Module", + "type": "debugpy", + "request": "launch", + "module": "uvicorn", + "args": ["server.main:app", "--host", "0.0.0.0", "--port", "8000"], + "cwd": "${workspaceFolder}/src" + } + ] +} diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index 72e11c4f..d3005250 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -9,7 +9,6 @@ from gitingest.query_parsing import IngestionQuery from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats from gitingest.utils.ingestion_utils import _should_exclude, _should_include -from gitingest.utils.path_utils import _is_safe_symlink try: import tomllib # type: ignore[import] @@ -171,11 +170,6 @@ def _process_node( The parsed query object containing information about the repository and query parameters. stats : FileSystemStats Statistics tracking object for the total file count and size. - - Raises - ------ - ValueError - If an unexpected error occurs during processing. """ if limit_exceeded(stats, node.depth): @@ -183,28 +177,15 @@ def _process_node( for sub_path in node.path.iterdir(): - symlink_path = None - if sub_path.is_symlink(): - if not _is_safe_symlink(sub_path, query.local_path): - print(f"Skipping unsafe symlink: {sub_path}") - continue - - symlink_path = sub_path - sub_path = sub_path.resolve() - - if sub_path in stats.visited: - print(f"Skipping already visited path: {sub_path}") - continue - - stats.visited.add(sub_path) - if query.ignore_patterns and _should_exclude(sub_path, query.local_path, query.ignore_patterns): continue if query.include_patterns and not _should_include(sub_path, query.local_path, query.include_patterns): continue - if sub_path.is_file(): + if sub_path.is_symlink(): + _process_symlink(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path) + elif sub_path.is_file(): _process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path) elif sub_path.is_dir(): @@ -216,11 +197,6 @@ def _process_node( depth=node.depth + 1, ) - # rename the subdir to reflect the symlink name - if symlink_path: - child_directory_node.name = symlink_path.name - child_directory_node.path_str = str(symlink_path) - _process_node( node=child_directory_node, query=query, @@ -230,13 +206,41 @@ def _process_node( node.size += child_directory_node.size node.file_count += child_directory_node.file_count node.dir_count += 1 + child_directory_node.dir_count - else: - raise ValueError(f"Unexpected error: {sub_path} is neither a file nor a directory") + print(f"Warning: {sub_path} is an unknown file type, skipping") node.sort_children() +def _process_symlink(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path) -> None: + """ + Process a symlink in the file system. + + This function checks the symlink's target. + + Parameters + ---------- + path : Path + The full path of the symlink. + parent_node : FileSystemNode + The parent directory node. + stats : FileSystemStats + Statistics tracking object for the total file count and size. + local_path : Path + The base path of the repository or directory being processed. + """ + child = FileSystemNode( + name=path.name, + type=FileSystemNodeType.SYMLINK, + path_str=str(path.relative_to(local_path)), + path=path, + depth=parent_node.depth + 1, + ) + stats.total_files += 1 + parent_node.children.append(child) + parent_node.file_count += 1 + + def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path) -> None: """ Process a file in the file system. diff --git a/src/gitingest/output_formatters.py b/src/gitingest/output_formatters.py index 7169d5c9..5bacba22 100644 --- a/src/gitingest/output_formatters.py +++ b/src/gitingest/output_formatters.py @@ -31,7 +31,7 @@ def format_node(node: FileSystemNode, query: IngestionQuery) -> Tuple[str, str, if node.type == FileSystemNodeType.DIRECTORY: summary += f"Files analyzed: {node.file_count}\n" - else: + elif node.type == FileSystemNodeType.FILE: summary += f"File: {node.name}\n" summary += f"Lines: {len(node.content.splitlines()):,}\n" @@ -101,7 +101,7 @@ def _gather_file_contents(node: FileSystemNode) -> str: str The concatenated content of all files under the given node. """ - if node.type == FileSystemNodeType.FILE: + if node.type != FileSystemNodeType.DIRECTORY: return node.content_string # Recursively gather contents of all files under the current directory @@ -142,6 +142,8 @@ def _create_tree_structure(query: IngestionQuery, node: FileSystemNode, prefix: display_name = node.name if node.type == FileSystemNodeType.DIRECTORY: display_name += "/" + elif node.type == FileSystemNodeType.SYMLINK: + display_name += " -> " + node.path.readlink().name tree_str += f"{prefix}{current_prefix}{display_name}\n" diff --git a/src/gitingest/schemas/filesystem_schema.py b/src/gitingest/schemas/filesystem_schema.py index fdd3e338..6bb4569a 100644 --- a/src/gitingest/schemas/filesystem_schema.py +++ b/src/gitingest/schemas/filesystem_schema.py @@ -18,6 +18,7 @@ class FileSystemNodeType(Enum): DIRECTORY = auto() FILE = auto() + SYMLINK = auto() @dataclass @@ -91,7 +92,8 @@ def content_string(self) -> str: """ parts = [ SEPARATOR, - f"File: {str(self.path_str).replace(os.sep, '/')}", + f"{self.type.name}: {str(self.path_str).replace(os.sep, '/')}" + + (f" -> {self.path.readlink().name}" if self.type == FileSystemNodeType.SYMLINK else ""), SEPARATOR, f"{self.content}", ] @@ -116,6 +118,9 @@ def content(self) -> str: # pylint: disable=too-many-return-statements if self.type == FileSystemNodeType.DIRECTORY: raise ValueError("Cannot read content of a directory node") + if self.type == FileSystemNodeType.SYMLINK: + return "" + if not is_text_file(self.path): return "[Non-text file]"