diff --git a/Cargo.lock b/Cargo.lock index 5e087c99..126b714a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -112,7 +112,9 @@ version = "0.1.0" dependencies = [ "chardetng", "encoding_rs", + "ignore", "log", + "path-slash", "pyo3", "pyo3-log", "rayon", @@ -159,12 +161,41 @@ dependencies = [ "wasi", ] +[[package]] +name = "globset" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57da3b9b5b85bd66f31093f8c408b90a74431672542466497dcbdfdc02034be1" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata", + "regex-syntax", +] + [[package]] name = "heck" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "ignore" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b46810df39e66e925525d6e38ce1e7f6e1d208f72dc39757880fcb66e2c58af1" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata", + "same-file", + "walkdir", + "winapi-util", +] + [[package]] name = "indoc" version = "2.0.5" @@ -270,6 +301,12 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "path-slash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e91099d4268b0e11973f036e885d652fb0b21fedcf69738c627f94db6a44f42" + [[package]] name = "phf" version = "0.11.2" @@ -575,6 +612,15 @@ version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -696,12 +742,53 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + [[package]] name = "windows-targets" version = "0.48.5" diff --git a/Cargo.toml b/Cargo.toml index dd6e9501..0af30f5b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,9 @@ crate-type = ["cdylib"] [dependencies] chardetng = "0.1.17" encoding_rs = "0.8.33" +ignore = "0.4.22" log = "0.4.21" +path-slash = "0.2.1" pyo3 = { version = "0.20.3", features = ["abi3-py38"] } pyo3-log = "0.9.0" rayon = "1.9.0" diff --git a/pdm.lock b/pdm.lock index e2260b4f..fd58e05c 100644 --- a/pdm.lock +++ b/pdm.lock @@ -643,7 +643,7 @@ name = "pathspec" version = "0.12.1" requires_python = ">=3.8" summary = "Utility library for gitignore style pattern matching of file paths." -groups = ["default", "docs"] +groups = ["docs"] files = [ {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"}, {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, diff --git a/pyproject.toml b/pyproject.toml index f602a9bb..4c1320c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,6 @@ classifiers = [ ] dependencies = [ "click>=8.0.0,<9", - "pathspec>=0.9.0", "colorama>=0.4.6; sys_platform == 'win32'", "tomli>=2.0.1; python_version < '3.11'" ] diff --git a/python/deptry/core.py b/python/deptry/core.py index 45422cd8..2fe63530 100644 --- a/python/deptry/core.py +++ b/python/deptry/core.py @@ -14,7 +14,7 @@ from deptry.exceptions import IncorrectDependencyFormatError, UnsupportedPythonVersionError from deptry.imports.extract import get_imported_modules_from_list_of_files from deptry.module import ModuleBuilder, ModuleLocations -from deptry.python_file_finder import PythonFileFinder +from deptry.python_file_finder import get_all_python_files_in from deptry.reporters import JSONReporter, TextReporter from deptry.stdlibs import STDLIBS_PYTHON from deptry.violations import ( @@ -65,10 +65,7 @@ def run(self) -> None: self._log_dependencies(dependencies_extract) - all_python_files = PythonFileFinder( - self.exclude, self.extend_exclude, self.using_default_exclude, self.ignore_notebooks - ).get_all_python_files_in(self.root) - + python_files = self._find_python_files() local_modules = self._get_local_modules() stdlib_modules = self._get_stdlib_modules() @@ -83,7 +80,7 @@ def run(self) -> None: ).build(), locations, ) - for module, locations in get_imported_modules_from_list_of_files(all_python_files).items() + for module, locations in get_imported_modules_from_list_of_files(python_files).items() ] imported_modules_with_locations = [ module_with_locations @@ -99,6 +96,19 @@ def run(self) -> None: self._exit(violations) + def _find_python_files(self) -> list[Path]: + logging.debug("Collecting Python files to scan...") + + python_files = get_all_python_files_in( + self.root, self.exclude, self.extend_exclude, self.using_default_exclude, self.ignore_notebooks + ) + + logging.debug( + "Python files to scan for imports:\n%s\n", "\n".join(str(python_file) for python_file in python_files) + ) + + return python_files + def _find_violations( self, imported_modules_with_locations: list[ModuleLocations], dependencies: list[Dependency] ) -> list[Violation]: diff --git a/python/deptry/python_file_finder.py b/python/deptry/python_file_finder.py index 3392c877..2594c1d6 100644 --- a/python/deptry/python_file_finder.py +++ b/python/deptry/python_file_finder.py @@ -1,79 +1,18 @@ from __future__ import annotations -import logging -import os -import re -from dataclasses import dataclass from pathlib import Path -from typing import Pattern -from pathspec import PathSpec +from deptry.rust import find_python_files -@dataclass -class PythonFileFinder: - """ - Get a list of all .py and .ipynb files recursively within a directory. - Args: - exclude: A list of regex patterns of paths to ignore. - extend_exclude: An additional list of regex patterns of paths to ignore. - using_default_exclude: Whether the exclude list was explicitly set, or the default was used. - ignore_notebooks: If ignore_notebooks is set to True, .ipynb files are ignored and only .py files are returned. - """ - - exclude: tuple[str, ...] - extend_exclude: tuple[str, ...] - using_default_exclude: bool - ignore_notebooks: bool = False - - def get_all_python_files_in(self, directories: tuple[Path, ...]) -> list[Path]: - logging.debug("Collecting Python files to scan...") - - source_files = set() - - ignore_regex = re.compile("|".join(self.exclude + self.extend_exclude)) - file_lookup_suffixes = {".py"} if self.ignore_notebooks else {".py", ".ipynb"} - - gitignore_spec = self._generate_gitignore_pathspec(Path()) - - for directory in directories: - for root_str, dirs, files in os.walk(directory, topdown=True): - root = Path(root_str) - - if self._is_directory_ignored(root, ignore_regex): - dirs[:] = [] - continue - - for file_str in files: - file = root / file_str - if not self._is_file_ignored(file, file_lookup_suffixes, ignore_regex, gitignore_spec): - source_files.add(file) - - source_files_list = list(source_files) - - logging.debug("Python files to scan for imports:\n%s\n", "\n".join([str(file) for file in source_files_list])) - - return source_files_list - - def _is_directory_ignored(self, directory: Path, ignore_regex: Pattern[str]) -> bool: - return bool((self.exclude + self.extend_exclude) and ignore_regex.match(str(directory))) - - def _is_file_ignored( - self, file: Path, file_lookup_suffixes: set[str], ignore_regex: Pattern[str], gitignore_spec: PathSpec | None - ) -> bool: - return bool( - file.suffix not in file_lookup_suffixes - or ((self.exclude + self.extend_exclude) and ignore_regex.match(file.as_posix())) - or (gitignore_spec and gitignore_spec.match_file(file)) - ) - - def _generate_gitignore_pathspec(self, directory: Path) -> PathSpec | None: - # If `exclude` is explicitly set, `.gitignore` is not taken into account. - if not self.using_default_exclude: - return None - - try: - with (directory / ".gitignore").open() as gitignore: - return PathSpec.from_lines("gitwildmatch", gitignore) - except FileNotFoundError: - return None +def get_all_python_files_in( + directories: tuple[Path, ...], + exclude: tuple[str, ...], + extend_exclude: tuple[str, ...], + using_default_exclude: bool, + ignore_notebooks: bool = False, +) -> list[Path]: + return [ + Path(f) + for f in find_python_files(directories, exclude, extend_exclude, using_default_exclude, ignore_notebooks) + ] diff --git a/python/deptry/rust.pyi b/python/deptry/rust.pyi index 98ae6f4a..b84cc011 100644 --- a/python/deptry/rust.pyi +++ b/python/deptry/rust.pyi @@ -1,7 +1,16 @@ +from pathlib import Path + from .rust import Location as RustLocation def get_imports_from_py_files(file_paths: list[str]) -> dict[str, list[RustLocation]]: ... def get_imports_from_ipynb_files(file_paths: list[str]) -> dict[str, list[RustLocation]]: ... +def find_python_files( + directories: tuple[Path, ...], + exclude: tuple[str, ...], + extend_exclude: tuple[str, ...], + using_default_exclude: bool, + ignore_notebooks: bool = False, +) -> list[str]: ... class Location: file: str diff --git a/src/lib.rs b/src/lib.rs index a1114596..4ae8852e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,6 +5,7 @@ use pyo3::prelude::*; mod file_utils; mod imports; mod location; +mod python_file_finder; mod visitor; use location::Location; @@ -18,6 +19,7 @@ fn rust(_py: Python, m: &PyModule) -> PyResult<()> { imports::ipynb::get_imports_from_ipynb_files, m )?)?; + m.add_function(wrap_pyfunction!(python_file_finder::find_python_files, m)?)?; m.add_class::()?; Ok(()) } diff --git a/src/python_file_finder.rs b/src/python_file_finder.rs new file mode 100644 index 00000000..e34b3ba8 --- /dev/null +++ b/src/python_file_finder.rs @@ -0,0 +1,93 @@ +use ignore::types::{Types, TypesBuilder}; +use ignore::{DirEntry, Walk, WalkBuilder}; +use path_slash::PathExt; +use pyo3::types::PyList; +use pyo3::{pyfunction, PyObject, PyResult, Python}; +use regex::Regex; +use std::path::PathBuf; + +#[pyfunction] +#[pyo3(signature = (directories, exclude, extend_exclude, using_default_exclude, ignore_notebooks=false))] +pub fn find_python_files( + py: Python, + directories: Vec, + exclude: Vec<&str>, + extend_exclude: Vec<&str>, + using_default_exclude: bool, + ignore_notebooks: bool, +) -> PyResult { + let mut unique_directories = directories; + unique_directories.dedup(); + + let python_files: Vec<_> = build_walker( + unique_directories, + [exclude, extend_exclude].concat(), + using_default_exclude, + ignore_notebooks, + ) + .flatten() + .filter(|entry| entry.path().is_file()) + .map(|entry| { + entry + .path() + .to_string_lossy() + .strip_prefix("./") + .unwrap_or(&entry.path().to_string_lossy()) + .to_owned() + }) + .collect(); + + Ok(PyList::new(py, &python_files).into()) +} + +fn build_walker( + directories: Vec, + excluded_patterns: Vec<&str>, + use_git_ignore: bool, + ignore_notebooks: bool, +) -> Walk { + let (first_directory, additional_directories) = directories.split_first().unwrap(); + + let mut walk_builder = WalkBuilder::new(first_directory); + for path in additional_directories { + walk_builder.add(path); + } + + let re: Option = if excluded_patterns.is_empty() { + None + } else { + Some(Regex::new(format!(r"^({})", excluded_patterns.join("|")).as_str()).unwrap()) + }; + + walk_builder + .types(build_types(ignore_notebooks).unwrap()) + .hidden(false) + .git_ignore(use_git_ignore) + .require_git(false) + .filter_entry(move |entry| entry_satisfies_predicate(entry, re.as_ref())) + .build() +} + +fn build_types(ignore_notebooks: bool) -> Result { + let mut types_builder = TypesBuilder::new(); + types_builder.add("python", "*.py").unwrap(); + types_builder.select("python"); + + if !ignore_notebooks { + types_builder.add("jupyter", "*.ipynb").unwrap(); + types_builder.select("jupyter"); + } + + types_builder.build() +} + +fn entry_satisfies_predicate(entry: &DirEntry, regex: Option<&Regex>) -> bool { + if regex.is_none() { + return true; + } + + let path_str = entry.path().to_slash_lossy(); + !regex + .unwrap() + .is_match(path_str.strip_prefix("./").unwrap_or(&path_str).as_ref()) +} diff --git a/tests/unit/test_python_file_finder.py b/tests/unit/test_python_file_finder.py index bb8a0a5d..d777c035 100644 --- a/tests/unit/test_python_file_finder.py +++ b/tests/unit/test_python_file_finder.py @@ -3,10 +3,8 @@ from pathlib import Path import pytest -from pathspec import PathSpec -from pathspec.patterns.gitwildmatch import GitWildMatchPattern -from deptry.python_file_finder import PythonFileFinder +from deptry.python_file_finder import get_all_python_files_in from tests.utils import create_files, run_within_dir @@ -20,9 +18,12 @@ def test_simple(tmp_path: Path) -> None: Path("other_dir/subdir/file2.py"), ]) - files = PythonFileFinder( - exclude=(".venv",), extend_exclude=("other_dir",), using_default_exclude=False - ).get_all_python_files_in((Path(),)) + files = get_all_python_files_in( + (Path(),), + exclude=(".venv",), + extend_exclude=("other_dir",), + using_default_exclude=False, + ) assert sorted(files) == [ Path("dir/subdir/file1.py"), @@ -44,9 +45,9 @@ def test_only_matches_start(tmp_path: Path) -> None: Path("subdir/file2.py"), ]) - files = PythonFileFinder( - exclude=("subdir",), extend_exclude=(), using_default_exclude=False - ).get_all_python_files_in((Path(),)) + files = get_all_python_files_in( + (Path(),), exclude=("foo",), extend_exclude=("subdir",), using_default_exclude=False + ) assert sorted(files) == [ Path("dir/subdir/file1.py"), @@ -72,9 +73,9 @@ def test_matches_ipynb(ignore_notebooks: bool, expected: list[Path], tmp_path: P with run_within_dir(tmp_path): create_files([Path("dir/subdir/file1.ipynb")]) - files = PythonFileFinder( - exclude=(), extend_exclude=(), using_default_exclude=False, ignore_notebooks=ignore_notebooks - ).get_all_python_files_in((Path(),)) + files = get_all_python_files_in( + (Path(),), exclude=(), extend_exclude=(), using_default_exclude=False, ignore_notebooks=ignore_notebooks + ) assert sorted(files) == expected @@ -120,9 +121,7 @@ def test_regex_argument(exclude: tuple[str], expected: list[Path], tmp_path: Pat Path("other_dir/subdir/file2.py"), ]) - files = PythonFileFinder( - exclude=exclude, extend_exclude=(), using_default_exclude=False - ).get_all_python_files_in((Path(),)) + files = get_all_python_files_in((Path(),), exclude=exclude, extend_exclude=(), using_default_exclude=False) assert sorted(files) == expected @@ -161,9 +160,9 @@ def test_multiple_source_directories(exclude: tuple[str], expected: list[Path], Path("another_dir/subdir/file1.py"), ]) - files = PythonFileFinder( - exclude=exclude, extend_exclude=(), using_default_exclude=False - ).get_all_python_files_in((Path("dir"), Path("other_dir"))) + files = get_all_python_files_in( + (Path("dir"), Path("other_dir")), exclude=exclude, extend_exclude=(), using_default_exclude=False + ) assert sorted(files) == expected @@ -172,42 +171,6 @@ def test_duplicates_are_removed(tmp_path: Path) -> None: with run_within_dir(tmp_path): create_files([Path("dir/subdir/file1.py")]) - files = PythonFileFinder(exclude=(), extend_exclude=(), using_default_exclude=False).get_all_python_files_in(( - Path(), - Path(), - )) + files = get_all_python_files_in((Path(), Path()), exclude=(), extend_exclude=(), using_default_exclude=False) assert sorted(files) == [Path("dir/subdir/file1.py")] - - -def test__generate_gitignore_pathspec_with_non_default_exclude(tmp_path: Path) -> None: - gitignore_pathspec = PythonFileFinder( - exclude=(), extend_exclude=(), using_default_exclude=False - )._generate_gitignore_pathspec(Path()) - - assert gitignore_pathspec is None - - -def test__generate_gitignore_pathspec_with_non_existing_gitignore(tmp_path: Path) -> None: - with run_within_dir(tmp_path): - gitignore_pathspec = PythonFileFinder( - exclude=(), extend_exclude=(), using_default_exclude=True - )._generate_gitignore_pathspec(Path()) - - assert gitignore_pathspec is None - - -def test__generate_gitignore_pathspec_with_existing_gitignore(tmp_path: Path) -> None: - with run_within_dir(tmp_path): - with Path(".gitignore").open("w") as gitignore: - gitignore.write("foo.py\nbar/") - - gitignore_pathspec = PythonFileFinder( - exclude=(), extend_exclude=(), using_default_exclude=True - )._generate_gitignore_pathspec(Path()) - - assert isinstance(gitignore_pathspec, PathSpec) - assert gitignore_pathspec.patterns == [ - GitWildMatchPattern("foo.py"), - GitWildMatchPattern("bar/"), - ]