dfetch-org · spoorcc · Oct 18, 2025 · Oct 18, 2025 · Oct 18, 2025 · korbit-ai
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -14,17 +14,17 @@ repos:
     hooks:
     -   id: isort
         name: Sort import
-        entry: isort
+        entry: dfetch
+        args: ['filter', 'isort']
         language: system
         types: [file, python]
-        exclude: ^doc/_ext/sphinxcontrib_asciinema
 
     -   id: black
         name: Black (auto-format)
-        entry: black
+        entry: dfetch
+        args: ['filter', 'black']
         language: system
         types: [file, python]
-        exclude: ^doc/_ext/sphinxcontrib_asciinema
 
     -   id: pylint
         name: pylint
@@ -101,9 +101,10 @@ repos:
     -   id: codespell
         name: codespell
         description: Checks for common misspellings in text files.
-        entry: codespell
+        entry: dfetch
+        args: ['filter', 'codespell']
         language: python
-        exclude: ^doc/_ext/sphinxcontrib_asciinema/_static/asciinema-player_3.12.1.js
+        # exclude: ^doc/_ext/sphinxcontrib_asciinema/_static/asciinema-player_3.12.1.js
         types: [text]
     -   id: ruff
         name: ruff

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -15,6 +15,7 @@ Release 0.11.0 (unreleased)
 * Handle SVN tags with special characters (#811)
 * Don't return non-zero exit code if tool not found during environment (#701)
 * Create standalone binaries for Linux, Mac & Windows (#705)
+* Add filter command (#19)
 
 Release 0.10.0 (released 2025-03-12)
 ====================================

diff --git a/dfetch/__main__.py b/dfetch/__main__.py
@@ -10,6 +10,7 @@
 import dfetch.commands.check
 import dfetch.commands.diff
 import dfetch.commands.environment
+import dfetch.commands.filter
 import dfetch.commands.freeze
 import dfetch.commands.import_
 import dfetch.commands.init
@@ -29,7 +30,9 @@ class DfetchFatalException(Exception):
 def create_parser() -> argparse.ArgumentParser:
     """Create the main argument parser."""
     parser = argparse.ArgumentParser(
-        formatter_class=argparse.RawTextHelpFormatter, epilog=__doc__
+        formatter_class=argparse.RawTextHelpFormatter,
+        epilog=__doc__,
+        exit_on_error=False,
     )
     parser.add_argument(
         "--verbose", "-v", action="store_true", help="Increase verbosity"
@@ -40,6 +43,7 @@ def create_parser() -> argparse.ArgumentParser:
     dfetch.commands.check.Check.create_menu(subparsers)
     dfetch.commands.diff.Diff.create_menu(subparsers)
     dfetch.commands.environment.Environment.create_menu(subparsers)
+    dfetch.commands.filter.Filter.create_menu(subparsers)
     dfetch.commands.freeze.Freeze.create_menu(subparsers)
     dfetch.commands.import_.Import.create_menu(subparsers)
     dfetch.commands.init.Init.create_menu(subparsers)
@@ -57,8 +61,15 @@ def _help(args: argparse.Namespace) -> None:
 
 def run(argv: Sequence[str]) -> None:
     """Start dfetch."""
-    logger.print_title()
-    args = create_parser().parse_args(argv)
+    parser = create_parser()
+    try:
+        args = parser.parse_args(argv)
+    except argparse.ArgumentError as exc:
+        logger.print_title()
+        parser.error(exc.message)
+
+    if args.verbose or not getattr(args.func, "SILENT", False):
+        logger.print_title()
 
     if args.verbose:
         dfetch.log.increase_verbosity()

diff --git a/dfetch/commands/filter.py b/dfetch/commands/filter.py
@@ -0,0 +1,142 @@
+"""*Dfetch* can filter files in the repo.
+
+Without any input it lists all files. A list of files can also be piped in (for example from ``find``),
+or it can wrap another tool and pass on only the files that are, or are not, under control of dfetch.
+"""
+
+import argparse
+import os
+import sys
+from pathlib import Path
+from typing import Optional
+
+import dfetch.commands.command
+import dfetch.log
+import dfetch.manifest.manifest
+from dfetch.log import get_logger
+from dfetch.util.cmdline import run_on_cmdline_uncaptured
+from dfetch.util.util import in_directory
+
+logger = get_logger(__name__)
+
+
+class Filter(dfetch.commands.command.Command):
+    """Filter files based on flags and pass on any command.
+
+    Classify every given path as inside or outside a project listed in the manifest,
+    drop the unwanted ones, and either call the given command with the remaining
+    arguments or print them out if no command is given.
+    """
+
+    # Read by ``dfetch.__main__.run``: the title banner is skipped unless --verbose is given.
+    SILENT = True
+
+    @staticmethod
+    def create_menu(subparsers: dfetch.commands.command.SubparserActionType) -> None:
+        """Add the parser menu for this action.
+
+        Registers the ``--in-manifest`` flag, an optional ``cmd`` to wrap and any
+        number of ``args`` (paths or other arguments) for that command.
+        """
+        parser = dfetch.commands.command.Command.parser(subparsers, Filter)
+        parser.add_argument(
+            "--in-manifest",
+            "-i",
+            action="store_true",
+            default=False,
+            help="Keep files that came here through the manifest.",
+        )
+
+        parser.add_argument(
+            "cmd",
+            metavar="<cmd>",
+            type=str,
+            nargs="?",
+            help="Command to call",
+        )
+
+        parser.add_argument(
+            "args",
+            metavar="<args>",
+            type=str,
+            nargs="*",
+            help="Arguments to pass to the command",
+        )
+
+    def __call__(self, args: argparse.Namespace) -> None:
+        """Perform the filter.
+
+        Args:
+            args: Parsed command line; ``verbose``, ``in_manifest``, ``cmd`` and
+                ``args`` are read.
+
+        Without ``--in-manifest`` every path inside a manifest project is dropped;
+        with it every path outside the projects is dropped. Arguments that are not
+        an existing path are always kept. The remaining arguments are passed to
+        ``cmd``, or printed one per line when no command was given.
+        """
+        if not args.verbose:
+            # Without --verbose only errors are logged.
+            dfetch.log.set_level("ERROR")
+        manifest = dfetch.manifest.manifest.get_manifest()
+
+        pwd = Path.cwd()
+        # Resolve once so comparisons against resolved candidate paths are reliable,
+        # also when the manifest path is relative or goes through a symlink.
+        topdir = Path(manifest.path).parent.resolve()
+        # Destinations are anchored explicitly at the manifest directory; no chdir
+        # is needed, so everything else keeps working relative to ``pwd``.
+        project_paths = {
+            (topdir / project.destination).resolve() for project in manifest.projects
+        }
+
+        # Collected relative to ``pwd``: that is where the paths are classified
+        # and where the wrapped command will run.
+        input_list = self._determine_input_list(args)
+        block_inside, block_outside = self._filter_files(
+            pwd, topdir, project_paths, input_list
+        )
+
+        # A set gives constant-time membership checks for long file lists.
+        blocklist = set(block_outside if args.in_manifest else block_inside)
+
+        filtered_args = [arg for arg in input_list if arg not in blocklist]
+
+        if args.cmd:
+            run_on_cmdline_uncaptured(logger, [args.cmd] + filtered_args)
+        else:
+            # print() writes in text mode, which already translates "\n" to the
+            # platform line ending; joining with os.linesep would double the "\r".
+            print("\n".join(filtered_args))
+
+    def _determine_input_list(self, args: argparse.Namespace) -> list[str]:
+        """Determine list of inputs to process.
+
+        Args:
+            args: Parsed command line; only ``args.args`` is read.
+
+        Returns:
+            The positional arguments followed by the non-blank lines read from
+            stdin (when stdin is not a terminal). When both are empty, every file
+            found recursively below the current working directory.
+        """
+        input_list: list[str] = [str(arg) for arg in args.args]
+        if sys.stdin is not None and not sys.stdin.isatty():
+            # A blank line would resolve to the working directory itself and be
+            # passed on as an empty argument, so those are dropped.
+            stripped_lines = (line.strip() for line in sys.stdin)
+            input_list += [line for line in stripped_lines if line]
+
+        # If no input from stdin or args loop over all files
+        if not input_list:
+            input_list = [str(file) for file in Path(".").rglob("*") if file.is_file()]
+
+        return input_list
+
+    def _filter_files(
+        self, pwd: Path, topdir: Path, project_paths: set[Path], input_list: list[str]
+    ) -> tuple[list[str], list[str]]:
+        """Split the inputs in paths inside one of the project_paths and paths that are not.
+
+        Args:
+            pwd: Directory that relative inputs are interpreted against.
+            topdir: Directory containing the manifest.
+            project_paths: Absolute, resolved project directories.
+            input_list: Paths or other arguments to classify.
+
+        Returns:
+            ``(block_inside, block_outside)``: the inputs to block when files inside
+            projects are unwanted, and the inputs to block when files outside
+            projects are unwanted. Existing paths outside ``topdir`` appear in both
+            lists; inputs that are not an existing path appear in neither.
+        """
+        block_inside: list[str] = []
+        block_outside: list[str] = []
+        # Candidates are resolved below, so the reference directory must be too.
+        topdir = topdir.resolve()
+
+        for path_or_arg in input_list:
+            name = path_or_arg.strip()
+            arg_abs_path = (pwd / name).resolve()
+            if not arg_abs_path.exists():
+                # Not a path (e.g. an option for the wrapped command): never blocked.
+                logger.print_info_line(name, "not a file / dir")
+                continue
+
+            if not arg_abs_path.is_relative_to(topdir):
+                # Always blocked, regardless of the chosen mode.
+                logger.print_info_line(name, "outside project")
+                block_inside.append(path_or_arg)
+                block_outside.append(path_or_arg)
+                continue
+
+            containing_dir = self._file_in_project(arg_abs_path, project_paths)
+
+            if containing_dir:
+                block_inside.append(path_or_arg)
+                logger.print_info_line(name, f"inside project ({containing_dir})")
+            else:
+                block_outside.append(path_or_arg)
+                logger.print_info_line(name, "not inside any project")
+
+        return block_inside, block_outside
+
+    def _file_in_project(self, file: Path, project_paths: set[Path]) -> Optional[Path]:
+        """Check if a specific file is somewhere in one of the project paths.
+
+        Returns:
+            A project path that contains ``file``, or ``None`` when there is none.
+        """
+        return next(
+            (path for path in project_paths if file.is_relative_to(path)), None
+        )
diff --git a/dfetch/log.py b/dfetch/log.py
@@ -57,6 +57,11 @@ def increase_verbosity() -> None:
     coloredlogs.increase_verbosity()
 
 
+def set_level(level: str) -> None:
+    """Set the level of the logger.
+
+    Thin wrapper that forwards ``level`` unchanged to ``coloredlogs.set_level``.
+
+    Args:
+        level: Name of the log level, e.g. ``"ERROR"``.
+    """
+    coloredlogs.set_level(level)
+
+
 def get_logger(name: str) -> DLogger:
     """Get logger for a module."""
     logging.setLoggerClass(DLogger)

diff --git a/dfetch/util/cmdline.py b/dfetch/util/cmdline.py
@@ -69,6 +69,34 @@ def run_on_cmdline(
     return proc
 
 
+def run_on_cmdline_uncaptured(
+    logger: logging.Logger, cmd: Union[str, list[str]]
+) -> "subprocess.CompletedProcess[Any]":
+    """Run a command without capturing its output, and raise if something goes wrong.
+
+    The output of the command is not captured, so it is neither logged nor
+    available on the returned object or on the raised error.
+
+    Args:
+        logger: Logger used to report the command that is about to run.
+        cmd: Command as a list of arguments, or a string that is split on single spaces.
+
+    Returns:
+        The completed process; its return code is always 0.
+
+    Raises:
+        SubprocessCommandError: The command exited with a non-zero return code.
+        RuntimeError: The executable could not be found.
+    """
+    logger.debug(f"Running {cmd}")
+
+    if not isinstance(cmd, list):
+        cmd = cmd.split(" ")
+
+    try:
+        # check=True turns every non-zero exit status into CalledProcessError,
+        # so no separate return code check is needed afterwards.
+        return subprocess.run(cmd, capture_output=False, check=True)  # nosec
+    except subprocess.CalledProcessError as exc:
+        raise SubprocessCommandError(
+            exc.cmd,
+            "",
+            "",
+            exc.returncode,
+        ) from exc
+    except FileNotFoundError as exc:
+        program = cmd[0]
+        raise RuntimeError(
+            f"{program} not available on system, please install"
+        ) from exc
+
+
 def _log_output(proc: subprocess.CompletedProcess, logger: logging.Logger) -> None:  # type: ignore
     logger.debug(f"Return code: {proc.returncode}")
 

diff --git a/dfetch/util/util.py b/dfetch/util/util.py
@@ -63,14 +63,14 @@ def safe_rmtree(path: str) -> None:
 
 
 @contextmanager
-def in_directory(path: str) -> Generator[str, None, None]:
+def in_directory(path: Union[str, Path]) -> Generator[str, None, None]:
     """Work temporarily in a given directory."""
     pwd = os.getcwd()
     if not os.path.isdir(path):
         path = os.path.dirname(path)
     os.chdir(path)
     try:
-        yield path
+        yield str(path)
     finally:
         os.chdir(pwd)
 

diff --git a/doc/manual.rst b/doc/manual.rst
@@ -139,3 +139,13 @@ Import
 .. asciinema:: asciicasts/import.cast
 
 .. automodule:: dfetch.commands.import_
+
+Filter
+------
+.. argparse::
+   :module: dfetch.__main__
+   :func: create_parser
+   :prog: dfetch
+   :path: filter
+
+.. automodule:: dfetch.commands.filter
diff --git a/features/filter-projects.feature b/features/filter-projects.feature
@@ -0,0 +1,32 @@
+Feature: Filtering file paths before executing a tool
+
+    Projects are dfetched and used in parent projects. Users would like to run
+    static analysis tools but ignore externally vendored projects. The dfetch filter
+    command makes it possible to wrap a call to another tool and filter out any external files.
+    It is also possible to list all files that are under control of dfetch, which can be used
+    to automate various tasks. Paths outside the top-level directory should be excluded to prevent
+    any path traversal.
+
+    Background:
+        Given a git repository "SomeProject.git"
+        And a fetched and committed MyProject with the manifest
+            """
+            manifest:
+                version: 0.0
+                projects:
+                    - name: SomeProject
+                      url: some-remote-server/SomeProject.git
+            """
+
+    Scenario: Tool receives only managed files
+        When I run "git ls-files | dfetch filter echo"
+        Then the output shows
+            """
+            Dfetch (0.10.0)
+            ext/test-repo-rev-only: wanted (e1fda19a57b873eb8e6ae37780594cbb77b70f1a), available (e1fda19a57b873eb8e6ae37780594cbb77b70f1a)
+            ext/test-rev-and-branch: wanted (main - 8df389d0524863b85f484f15a91c5f2c40aefda1), available (main - e1fda19a57b873eb8e6ae37780594cbb77b70f1a)
+            """
+
+#   Scenario: Tool receives only unmanaged files
+
+#   Scenario: Fail on path traversal outside top-level manifest directory