diff --git a/AGENTS.md b/AGENTS.md index af852ed..152eded 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -54,6 +54,7 @@ AI PatchLab is an AI-assisted security remediation toolkit. The MVP focuses on a - `scanner/run_scan.py` - CLI entry point (`python scanner/run_scan.py --repo ` or `--from-git-url `) - `scanner/git_source.py` - Shallow-clone a public git URL into a temp directory via the `cloned_repo` context manager; cleanup-on-exit, `shell=False`, no remote API calls - `scanner/paths.py` - `rebase_finding_paths(findings, repo_root)` rewrites each finding's `file` (and `id` when it embeds the same path) to a repo-relative POSIX path so reports survive temp-dir cleanup +- `scanner/ignore.py` - `apply_ignore(findings, patterns)` + `load_ignore_patterns(path)` provide `.gitignore`-style path suppression (used by the `--ignore-file` CLI flag). Empty-file findings are never suppressed - `scanner/models.py` - Normalized `Finding` dataclass + severity/confidence enums + `FINDING_FIELDS` - `scanner/recommendations.py` - Deterministic keyword-based recommendation enrichment - `scanner/confidence.py` - Centralized `Finding.confidence` rules (one function per scanner + `confidence_for_meta_finding` for shared `not-installed` / `scan-error` / etc.) diff --git a/CLAUDE.md b/CLAUDE.md index d9f3b20..3c1dbdf 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -50,6 +50,7 @@ This project can optionally include a parallel Codex/OpenAI runtime via `AGENTS. 
- `scanner/run_scan.py` — CLI entry point (`python scanner/run_scan.py --repo ` or `--from-git-url `) - `scanner/git_source.py` — Shallow-clone a public git URL into a temp directory via the `cloned_repo` context manager; cleanup-on-exit, `shell=False`, no remote API calls - `scanner/paths.py` — `rebase_finding_paths(findings, repo_root)` rewrites each finding's `file` (and `id` when it embeds the same path) to a repo-relative POSIX path so reports survive temp-dir cleanup +- `scanner/ignore.py` — `apply_ignore(findings, patterns)` + `load_ignore_patterns(path)` provide `.gitignore`-style path suppression of findings (used by the `--ignore-file` CLI flag). Empty-file findings are never suppressed - `scanner/models.py` — Normalized `Finding` dataclass + severity/confidence enums + `FINDING_FIELDS` - `scanner/recommendations.py` — Deterministic keyword-based recommendation enrichment - `scanner/confidence.py` — Centralized `Finding.confidence` rules (one function per scanner + `confidence_for_meta_finding` for shared `not-installed` / `scan-error` / etc.) diff --git a/README.md b/README.md index 4e512f8..ffcf735 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,9 @@ python scanner/run_scan.py --from-git-url "https://github.com/owner/repo" --repo # Filter low-noise findings out of public reports (default keeps everything) python scanner/run_scan.py --from-git-url "https://github.com/owner/repo" --reports-dir "reports\owner-repo" --min-severity medium +# Suppress known false-positive paths with a .gitignore-style ignore file +python scanner/run_scan.py --from-git-url "https://github.com/owner/repo" --reports-dir "reports\owner-repo" --ignore-file "reports\owner-repo\.aipatchlabignore" + # Tests python -m pytest tests/ -v @@ -389,6 +392,31 @@ ai-patchlab/ `-- pyproject.toml # Dependencies and tool config ``` +## Ignore File + +`--ignore-file` accepts a `.gitignore`-style file whose patterns suppress matching +findings *after* path rebasing. 
Patterns match the repo-relative POSIX path of +each finding (e.g. `tests/cassettes/foo.yaml`). Lines starting with `#` are +comments; `!`-prefixed lines re-include previously excluded paths. + +Example for a project whose own safety-engine tests embed crafted fake secrets +that look real to Gitleaks: + +``` +# Crafted fixtures in the safety policy engine tests. +tests/unit_tests/safety_engine/** + +# Smoke tests that ship fake API tokens to exercise integrations. +tests/smoke_tests/integrations/** + +# Re-include one specific file that's actually worth scanning. +!tests/unit_tests/safety_engine/test_real_findings.py +``` + +Findings with an empty `file` field (e.g. info-level "tool not installed" +placeholders) are never suppressed — they describe infrastructure state, not +file content, and a `**` pattern should not silently drop them. + ## Notes - No web app is included in v0.1. diff --git a/pyproject.toml b/pyproject.toml index 86320e9..ffab70d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ dependencies = [ "aiomysql>=0.2.0", "httpx>=0.27.0", "loguru>=0.7.0", + "pathspec>=0.12.0", "pydantic>=2.0", "pydantic-settings>=2.0", "python-dotenv>=1.0", diff --git a/scanner/ignore.py b/scanner/ignore.py new file mode 100644 index 0000000..c4e4458 --- /dev/null +++ b/scanner/ignore.py @@ -0,0 +1,66 @@ +"""Path-based suppression of findings via .gitignore-style patterns. + +A scan can be paired with an "ignore file" (passed via `--ignore-file`) +that lists path patterns to exclude from the report. Patterns use the +same syntax as `.gitignore`: glob with `**` for any-depth matches, `!` +prefix for negation. This is invaluable for scanning targets that have +recurring false-positive shapes (test cassettes, security-tool detector +fixtures, vendored libraries) without having to teach our scanner-level +rules about every project's conventions. 
def parse_ignore_patterns(raw: str) -> list[str]:
    """Split raw ignore-file text into a list of patterns.

    Every line is stripped of leading and trailing whitespace. Blank
    lines and comment lines (first non-whitespace character is `#`) are
    dropped; all other lines - including `!`-prefixed negations - are
    kept in file order.
    """
    patterns: list[str] = []
    for line in raw.splitlines():
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue
        patterns.append(stripped)
    return patterns


def load_ignore_patterns(path: Path | None) -> list[str]:
    """Read an ignore file from disk and return its patterns.

    Returns an empty list when `path` is None (no suppression configured).
    The file is decoded as UTF-8; a leading byte-order mark, if present,
    is discarded so it cannot leak into the first pattern.

    Raises `FileNotFoundError` if `path` is given but does not exist.
    """
    if path is None:
        return []
    # "utf-8-sig" reads plain UTF-8 identically but also drops a leading
    # BOM. `str.strip()` does not remove U+FEFF, so with plain "utf-8" a
    # BOM-prefixed file would turn a first-line comment into a pattern and
    # make a first-line pattern never match.
    return parse_ignore_patterns(path.read_text(encoding="utf-8-sig"))


def apply_ignore(findings: list[Finding], patterns: list[str]) -> list[Finding]:
    """Drop findings whose `file` matches one of the gitignore-style patterns.

    Returns a new list; the relative order of the kept findings is
    preserved. With no patterns, a shallow copy of `findings` is returned.

    Findings with an empty `file` field (e.g. info-level "tool not
    installed" placeholders that point at the repo root) are never
    suppressed - they don't represent a real path and the user
    presumably wants to keep seeing infrastructure signals.
    """
    if not patterns:
        return list(findings)

    # Compile once; the same spec is reused for every finding.
    spec = PathSpec.from_lines(GitWildMatchPattern, patterns)
    return [
        finding
        for finding in findings
        # The `not finding.file` guard short-circuits before matching so a
        # catch-all pattern such as `**` cannot drop path-less findings.
        if not finding.file or not spec.match_file(finding.file)
    ]
""" with cloned_repo(url) as clone: - return run_scan(clone.repo_path, reports_dir, min_severity=min_severity) + return run_scan( + clone.repo_path, + reports_dir, + min_severity=min_severity, + ignore_file=ignore_file, + ) def parse_args(argv: list[str] | None = None) -> argparse.Namespace: @@ -79,6 +90,12 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: choices=list(SEVERITIES), help="Drop findings strictly less severe than this threshold (default: info, keeps everything).", ) + parser.add_argument( + "--ignore-file", + dest="ignore_file", + default=None, + help="Path to a .gitignore-style file whose patterns suppress matching findings.", + ) return parser.parse_args(argv) @@ -86,14 +103,23 @@ def main(argv: list[str] | None = None) -> int: """CLI wrapper.""" args = parse_args(argv) reports_dir = Path(args.reports_dir) + ignore_file = Path(args.ignore_file) if args.ignore_file else None try: if args.from_git_url: report_paths = run_scan_from_url( - args.from_git_url, reports_dir, min_severity=args.min_severity + args.from_git_url, + reports_dir, + min_severity=args.min_severity, + ignore_file=ignore_file, ) else: - report_paths = run_scan(Path(args.repo), reports_dir, min_severity=args.min_severity) - except (ValueError, GitCloneError) as exc: + report_paths = run_scan( + Path(args.repo), + reports_dir, + min_severity=args.min_severity, + ignore_file=ignore_file, + ) + except (ValueError, GitCloneError, FileNotFoundError) as exc: print(f"Error: {exc}", file=sys.stderr) return 2 diff --git a/tests/test_ignore.py b/tests/test_ignore.py new file mode 100644 index 0000000..8cfaaee --- /dev/null +++ b/tests/test_ignore.py @@ -0,0 +1,101 @@ +"""Tests for path-based finding suppression via .gitignore-style patterns.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from scanner.ignore import ( + apply_ignore, + load_ignore_patterns, + parse_ignore_patterns, +) +from scanner.models import Finding + + +def 
_finding(file: str, finding_id: str = "x") -> Finding: + return Finding( + id=finding_id, + tool="semgrep", + severity="high", + title="t", + description="d", + file=file, + line=1, + recommendation="r", + confidence="medium", + ) + + +class TestParseIgnorePatterns: + def test_strips_comments_and_blank_lines(self) -> None: + raw = "\n# top comment\ntests/**\n\n # indented comment\n**/cassettes/**\n" + patterns = parse_ignore_patterns(raw) + assert patterns == ["tests/**", "**/cassettes/**"] + + def test_keeps_negation_lines(self) -> None: + raw = "tests/**\n!tests/test_critical.py\n" + patterns = parse_ignore_patterns(raw) + assert patterns == ["tests/**", "!tests/test_critical.py"] + + +class TestLoadIgnorePatterns: + def test_returns_empty_for_none_path(self) -> None: + assert load_ignore_patterns(None) == [] + + def test_reads_existing_file(self, tmp_path: Path) -> None: + path = tmp_path / "ignore.txt" + path.write_text("tests/**\n# comment\n**/cassettes/**\n", encoding="utf-8") + assert load_ignore_patterns(path) == ["tests/**", "**/cassettes/**"] + + def test_missing_file_raises(self, tmp_path: Path) -> None: + with pytest.raises(FileNotFoundError): + load_ignore_patterns(tmp_path / "nope.txt") + + +class TestApplyIgnore: + def test_empty_patterns_returns_input_unchanged(self) -> None: + findings = [_finding("src/a.py", "1"), _finding("tests/b.py", "2")] + assert apply_ignore(findings, []) == findings + + def test_drops_findings_matching_pattern(self) -> None: + findings = [ + _finding("src/a.py", "src"), + _finding("tests/b.py", "tst"), + ] + result = apply_ignore(findings, ["tests/**"]) + assert [f.id for f in result] == ["src"] + + def test_double_star_matches_any_depth(self) -> None: + findings = [ + _finding("packages/x/tests/cassettes/foo.yaml", "deep"), + _finding("tests/cassettes/bar.yaml", "shallow"), + _finding("src/foo.py", "keep"), + ] + result = apply_ignore(findings, ["**/cassettes/**"]) + assert [f.id for f in result] == ["keep"] + + def 
test_negation_re_includes_specific_file(self) -> None: + findings = [ + _finding("tests/test_a.py", "a"), + _finding("tests/test_critical.py", "critical"), + ] + result = apply_ignore(findings, ["tests/**", "!tests/test_critical.py"]) + assert [f.id for f in result] == ["critical"] + + def test_preserves_order(self) -> None: + findings = [ + _finding("src/a.py", "a"), + _finding("tests/b.py", "b"), + _finding("src/c.py", "c"), + ] + result = apply_ignore(findings, ["tests/**"]) + assert [f.id for f in result] == ["a", "c"] + + def test_empty_file_field_is_never_suppressed(self) -> None: + # A finding with empty file (e.g. dependency-scan info) should + # not be silently dropped by a `**` pattern. + findings = [_finding("", "empty"), _finding("tests/b.py", "tst")] + result = apply_ignore(findings, ["tests/**", "**"]) + assert "empty" in [f.id for f in result]