-
Notifications
You must be signed in to change notification settings - Fork 5
WIP: Add basic filter command as suggested in #834
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,6 +10,7 @@ | |
| import dfetch.commands.check | ||
| import dfetch.commands.diff | ||
| import dfetch.commands.environment | ||
| import dfetch.commands.filter | ||
| import dfetch.commands.freeze | ||
| import dfetch.commands.import_ | ||
| import dfetch.commands.init | ||
|
|
@@ -29,7 +30,9 @@ class DfetchFatalException(Exception): | |
| def create_parser() -> argparse.ArgumentParser: | ||
| """Create the main argument parser.""" | ||
| parser = argparse.ArgumentParser( | ||
| formatter_class=argparse.RawTextHelpFormatter, epilog=__doc__ | ||
| formatter_class=argparse.RawTextHelpFormatter, | ||
| epilog=__doc__, | ||
| exit_on_error=False, | ||
| ) | ||
| parser.add_argument( | ||
| "--verbose", "-v", action="store_true", help="Increase verbosity" | ||
|
|
@@ -40,6 +43,7 @@ def create_parser() -> argparse.ArgumentParser: | |
| dfetch.commands.check.Check.create_menu(subparsers) | ||
| dfetch.commands.diff.Diff.create_menu(subparsers) | ||
| dfetch.commands.environment.Environment.create_menu(subparsers) | ||
| dfetch.commands.filter.Filter.create_menu(subparsers) | ||
| dfetch.commands.freeze.Freeze.create_menu(subparsers) | ||
| dfetch.commands.import_.Import.create_menu(subparsers) | ||
| dfetch.commands.init.Init.create_menu(subparsers) | ||
|
|
@@ -57,8 +61,15 @@ def _help(args: argparse.Namespace) -> None: | |
|
|
||
| def run(argv: Sequence[str]) -> None: | ||
| """Start dfetch.""" | ||
| logger.print_title() | ||
| args = create_parser().parse_args(argv) | ||
| parser = create_parser() | ||
| try: | ||
| args = parser.parse_args(argv) | ||
| except argparse.ArgumentError as exc: | ||
| logger.print_title() | ||
| parser.error(exc.message) | ||
|
|
||
| if args.verbose or not getattr(args.func, "SILENT", False): | ||
| logger.print_title() | ||
|
Comment on lines
+71
to
+72
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Magic String Attribute Lookup
Tell me moreWhat is the issue?The use of a magic string 'SILENT' as an attribute lookup makes the code's intent unclear without additional context. Why this mattersFuture maintainers will need to search for where SILENT is defined and understand its purpose. This creates cognitive overhead and potential maintenance issues. Suggested change ∙ Feature Preview# Define a constant at module level
SILENT_COMMAND_FLAG = 'SILENT'
# Use in the code
if args.verbose or not getattr(args.func, SILENT_COMMAND_FLAG, False):
logger.print_title()Provide feedback to improve future suggestions💬 Looking for more details? Reply to this comment to chat with Korbit. |
||
|
|
||
| if args.verbose: | ||
| dfetch.log.increase_verbosity() | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,142 @@ | ||
| """*Dfetch* can filter files in the repo. | ||
| It can either accept no input to list all files. A list of files can be piped in (such as through ``find``) | ||
| or it can be used as a wrapper around a certain tool to block or allow files under control by dfetch. | ||
| """ | ||
|
|
||
| import argparse | ||
| import os | ||
| import sys | ||
| from pathlib import Path | ||
| from typing import Optional | ||
|
|
||
| import dfetch.commands.command | ||
| import dfetch.log | ||
| import dfetch.manifest.manifest | ||
| from dfetch.log import get_logger | ||
| from dfetch.util.cmdline import run_on_cmdline_uncaptured | ||
| from dfetch.util.util import in_directory | ||
|
|
||
| logger = get_logger(__name__) | ||
|
|
||
|
|
||
| class Filter(dfetch.commands.command.Command): | ||
| """Filter files based on flags and pass on any command. | ||
| Based on the provided arguments filter files, and call the given arguments or print them out if no command given. | ||
| """ | ||
|
|
||
| SILENT = True | ||
|
|
||
| @staticmethod | ||
| def create_menu(subparsers: dfetch.commands.command.SubparserActionType) -> None: | ||
| """Add the parser menu for this action.""" | ||
| parser = dfetch.commands.command.Command.parser(subparsers, Filter) | ||
| parser.add_argument( | ||
| "--in-manifest", | ||
| "-i", | ||
| action="store_true", | ||
| default=False, | ||
| help="Keep files that came here through the manifest.", | ||
| ) | ||
|
|
||
| parser.add_argument( | ||
| "cmd", | ||
| metavar="<cmd>", | ||
| type=str, | ||
| nargs="?", | ||
| help="Command to call", | ||
| ) | ||
|
|
||
| parser.add_argument( | ||
| "args", | ||
| metavar="<args>", | ||
| type=str, | ||
| nargs="*", | ||
| help="Arguments to pass to the command", | ||
| ) | ||
|
|
||
| def __call__(self, args: argparse.Namespace) -> None: | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Mixed Responsibilities in Entry Point
Tell me moreWhat is the issue?The call method mixes configuration, business logic, and output handling in a single method. Why this mattersThis violates the Single Responsibility Principle and makes the code less maintainable and harder to test individual components. Suggested change ∙ Feature PreviewSplit the call method into separate methods for configuration, filtering, and output handling: def __call__(self, args: argparse.Namespace) -> None:
self._configure_logging(args)
filtered_args = self._process_filtering(args)
self._handle_output(args, filtered_args)Provide feedback to improve future suggestions💬 Looking for more details? Reply to this comment to chat with Korbit. |
||
| """Perform the filter.""" | ||
| if not args.verbose: | ||
| dfetch.log.set_level("ERROR") | ||
| manifest = dfetch.manifest.manifest.get_manifest() | ||
|
|
||
| pwd = Path.cwd() | ||
| topdir = Path(manifest.path).parent | ||
| with in_directory(topdir): | ||
|
|
||
| project_paths = { | ||
| Path(project.destination).resolve() for project in manifest.projects | ||
| } | ||
|
|
||
| input_list = self._determine_input_list(args) | ||
| block_inside, block_outside = self._filter_files( | ||
| pwd, topdir, project_paths, input_list | ||
| ) | ||
|
|
||
| blocklist = block_outside if args.in_manifest else block_inside | ||
|
|
||
| filtered_args = [arg for arg in input_list if arg not in blocklist] | ||
|
|
||
| if args.cmd: | ||
| run_on_cmdline_uncaptured(logger, [args.cmd] + filtered_args) | ||
| else: | ||
| print(os.linesep.join(filtered_args)) | ||
|
|
||
| def _determine_input_list(self, args: argparse.Namespace) -> list[str]: | ||
| """Determine list of inputs to process.""" | ||
| input_list: list[str] = list(str(arg) for arg in args.args) | ||
| if not sys.stdin.isatty(): | ||
| input_list += list(str(arg).strip() for arg in sys.stdin.readlines()) | ||
spoorcc marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| # If no input from stdin or args loop over all files | ||
| if not input_list: | ||
| input_list = list( | ||
| str(file) for file in Path(".").rglob("*") if file.is_file() | ||
| ) | ||
|
|
||
| return input_list | ||
|
|
||
| def _filter_files( | ||
| self, pwd: Path, topdir: Path, project_paths: set[Path], input_list: list[str] | ||
| ) -> tuple[list[str], list[str]]: | ||
|
Comment on lines
+101
to
+103
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Non-descriptive tuple return type
Tell me moreWhat is the issue?The return type annotation using tuple[list[str], list[str]] is not descriptive enough to understand what the two lists represent. Why this mattersGeneric tuple return types make it difficult to understand the meaning of each component without looking at the implementation. Suggested change ∙ Feature Previewfrom typing import NamedTuple
class FilterResult(NamedTuple):
files_inside_projects: list[str]
files_outside_projects: list[str]
def _filter_files(
self, pwd: Path, topdir: Path, project_paths: set[Path], input_list: list[str]
) -> FilterResult:Provide feedback to improve future suggestions💬 Looking for more details? Reply to this comment to chat with Korbit. |
||
| """Filter files in input_set in files in one of the project_paths or not.""" | ||
| block_inside: list[str] = [] | ||
| block_outside: list[str] = [] | ||
|
Comment on lines
+105
to
+106
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unclear list variable names
Tell me moreWhat is the issue?The variable names 'block_inside' and 'block_outside' are not immediately clear about what they represent in the context of file filtering. Why this mattersUnclear variable names force readers to trace through the code to understand their purpose, increasing cognitive load. Suggested change ∙ Feature Previewfiles_inside_projects: list[str] = []
files_outside_projects: list[str] = []Provide feedback to improve future suggestions💬 Looking for more details? Reply to this comment to chat with Korbit. |
||
|
|
||
| for path_or_arg in input_list: | ||
| arg_abs_path = Path(pwd / path_or_arg.strip()).resolve() | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Expensive path resolution per file
Tell me moreWhat is the issue?Path resolution with resolve() is called for every input file, which involves expensive filesystem operations including symlink resolution and path canonicalization. Why this mattersThe resolve() method performs multiple filesystem syscalls per file, creating significant I/O overhead that scales linearly with the number of input files and can become a bottleneck for large file sets. Suggested change ∙ Feature PreviewCache resolved paths or use absolute path construction without full resolution when symlink handling isn't critical: arg_abs_path = (pwd / path_or_arg.strip()).absolute()Only call resolve() when necessary for symlink handling. Provide feedback to improve future suggestions💬 Looking for more details? Reply to this comment to chat with Korbit. |
||
| if not arg_abs_path.exists(): | ||
| logger.print_info_line(path_or_arg.strip(), "not a file / dir") | ||
| continue | ||
| try: | ||
| arg_abs_path.relative_to(topdir) | ||
| except ValueError: | ||
| logger.print_info_line(path_or_arg.strip(), "outside project") | ||
| block_inside.append(path_or_arg) | ||
| block_outside.append(path_or_arg) | ||
| continue | ||
|
|
||
| containing_dir = self._file_in_project(arg_abs_path, project_paths) | ||
|
|
||
| if containing_dir: | ||
| block_inside.append(path_or_arg) | ||
| logger.print_info_line( | ||
| path_or_arg.strip(), f"inside project ({containing_dir})" | ||
| ) | ||
| else: | ||
| block_outside.append(path_or_arg) | ||
| logger.print_info_line(path_or_arg.strip(), "not inside any project") | ||
|
|
||
| return block_inside, block_outside | ||
|
|
||
| def _file_in_project(self, file: Path, project_paths: set[Path]) -> Optional[Path]: | ||
| """Check if a specific file is somewhere in one of the project paths.""" | ||
| for project_path in project_paths: | ||
| try: | ||
| file.relative_to(project_path) | ||
| return project_path | ||
| except ValueError: | ||
| continue | ||
|
Comment on lines
+136
to
+141
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Inefficient O(n*m) project path lookup
Tell me moreWhat is the issue?The file-in-project check performs O(n) linear search through all project paths for each file, resulting in O(n*m) complexity where n is files and m is projects. Why this mattersWith many files and projects, this nested loop creates quadratic time complexity that will significantly slow down filtering operations as the number of projects grows. Suggested change ∙ Feature PreviewPre-sort project paths by depth (deepest first) and use early termination, or consider using a trie-based structure for path prefix matching to reduce average case complexity. Provide feedback to improve future suggestions💬 Looking for more details? Reply to this comment to chat with Korbit. |
||
| return None | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -69,6 +69,34 @@ def run_on_cmdline( | |
| return proc | ||
|
|
||
|
|
||
| def run_on_cmdline_uncaptured( | ||
| logger: logging.Logger, cmd: Union[str, list[str]] | ||
| ) -> "subprocess.CompletedProcess[Any]": | ||
| """Run a command and log the output, and raise if something goes wrong.""" | ||
| logger.debug(f"Running {cmd}") | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Command execution logged at DEBUG level
Tell me moreWhat is the issue?The log message for running a command uses the DEBUG level, which might be too low for important command executions. Why this mattersUsing DEBUG level for command execution logs may result in these important events being missed in production environments where DEBUG logs are typically disabled. Suggested change ∙ Feature PreviewChange the log level to INFO for command execution: logger.info(f"Running command: {cmd}")Provide feedback to improve future suggestions💬 Looking for more details? Reply to this comment to chat with Korbit. |
||
|
|
||
| if not isinstance(cmd, list): | ||
| cmd = cmd.split(" ") | ||
|
Comment on lines
+78
to
+79
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Naive string splitting for command parsing
Tell me moreWhat is the issue?String splitting on single space fails for commands with multiple consecutive spaces or complex arguments. Why this mattersThis naive splitting approach will create empty strings in the command list when there are multiple spaces, potentially causing subprocess execution failures or incorrect argument parsing. Suggested change ∙ Feature PreviewUse import shlex
if not isinstance(cmd, list):
cmd = shlex.split(cmd)Provide feedback to improve future suggestions💬 Looking for more details? Reply to this comment to chat with Korbit. |
||
|
|
||
| try: | ||
| proc = subprocess.run(cmd, capture_output=False, check=True) # nosec | ||
| except subprocess.CalledProcessError as exc: | ||
| raise SubprocessCommandError( | ||
| exc.cmd, | ||
| "", | ||
| "", | ||
| exc.returncode, | ||
| ) from exc | ||
| except FileNotFoundError as exc: | ||
| cmd = cmd[0] | ||
| raise RuntimeError(f"{cmd} not available on system, please install") from exc | ||
|
|
||
| if proc.returncode: | ||
| raise SubprocessCommandError(cmd, "", "", proc.returncode) | ||
|
|
||
| return proc | ||
|
|
||
|
|
||
| def _log_output(proc: subprocess.CompletedProcess, logger: logging.Logger) -> None: # type: ignore | ||
| logger.debug(f"Return code: {proc.returncode}") | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
| Feature: Filtering file paths before executing a tool | ||
|
|
||
| Projects are dfetched and used in parent projects, users would like to run | ||
| static analysis tools but ignore externally vendored projects. The dfetch filter | ||
| command makes it possible to wrap a cal to another tool and filter out any external files. | ||
| Also it is possible to list all files that are under control of dfetch and this can be used | ||
| to automate various tasks. Paths outside the top-level directory should be excluded to prevent | ||
| any path traversal. | ||
|
|
||
| Background: | ||
| Given a git repository "SomeProject.git" | ||
| And a fetched and committed MyProject with the manifest | ||
| """ | ||
| manifest: | ||
| version: 0.0 | ||
| projects: | ||
| - name: SomeProject | ||
| url: some-remote-server/SomeProject.git | ||
| """ | ||
|
|
||
| Scenario: Tool receives only managed files | ||
| When I run "git ls-files | dfetch filter echo" | ||
| Then the output shows | ||
| """ | ||
| Dfetch (0.10.0) | ||
| ext/test-repo-rev-only: wanted (e1fda19a57b873eb8e6ae37780594cbb77b70f1a), available (e1fda19a57b873eb8e6ae37780594cbb77b70f1a) | ||
| ext/test-rev-and-branch: wanted (main - 8df389d0524863b85f484f15a91c5f2c40aefda1), available (main - e1fda19a57b873eb8e6ae37780594cbb77b70f1a) | ||
| """ | ||
|
|
||
| # Scenario: Tool receives only unmanaged files | ||
|
|
||
| # Scenario: Fail on path traversal outside top-level manifest directory |
Uh oh!
There was an error while loading. Please reload this page.