From f446ff74d8d6d690098fc65610c92686d8261354 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 29 Apr 2026 08:21:19 -0400 Subject: [PATCH 01/34] feat(scripts): Add dependency version scanner tool --- scripts/version_scanner/.gitignore | 2 + scripts/version_scanner/benchmark.py | 172 +++++++++ scripts/version_scanner/regex_config.yaml | 90 +++++ .../tests/data/.kokoro/build.sh | 1 + .../tests/data/packages/pkg_a/setup.py | 1 + .../tests/data/packages/pkg_b/clean.py | 1 + .../integration/test_scanner_integration.py | 35 ++ .../tests/unit/test_benchmark.py | 77 ++++ .../tests/unit/test_version_scanner.py | 221 +++++++++++ scripts/version_scanner/version_scanner.py | 344 ++++++++++++++++++ 10 files changed, 944 insertions(+) create mode 100644 scripts/version_scanner/.gitignore create mode 100644 scripts/version_scanner/benchmark.py create mode 100644 scripts/version_scanner/regex_config.yaml create mode 100644 scripts/version_scanner/tests/data/.kokoro/build.sh create mode 100644 scripts/version_scanner/tests/data/packages/pkg_a/setup.py create mode 100644 scripts/version_scanner/tests/data/packages/pkg_b/clean.py create mode 100644 scripts/version_scanner/tests/integration/test_scanner_integration.py create mode 100644 scripts/version_scanner/tests/unit/test_benchmark.py create mode 100644 scripts/version_scanner/tests/unit/test_version_scanner.py create mode 100644 scripts/version_scanner/version_scanner.py diff --git a/scripts/version_scanner/.gitignore b/scripts/version_scanner/.gitignore new file mode 100644 index 000000000000..3d90478f0355 --- /dev/null +++ b/scripts/version_scanner/.gitignore @@ -0,0 +1,2 @@ +.conductor/ +scanner_report.csv diff --git a/scripts/version_scanner/benchmark.py b/scripts/version_scanner/benchmark.py new file mode 100644 index 000000000000..773d7a4f488c --- /dev/null +++ b/scripts/version_scanner/benchmark.py @@ -0,0 +1,172 @@ +import argparse +import os +import random +import subprocess +import sys +import time +from typing import List, Dict + +def get_package_subset(packages_dir: str, count: int) -> List[str]: + """ + Get a randomized subset of package names from the specified directory. + + Args: + packages_dir: Path to the directory containing packages. + count: Number of packages to return. + + Returns: + A list of package directory names. + """ + try: + all_packages = [d for d in os.listdir(packages_dir) if os.path.isdir(os.path.join(packages_dir, d))] + except FileNotFoundError: + print(f"Error: Packages directory not found: {packages_dir}") + return [] + + if count >= len(all_packages): + return all_packages + + return random.sample(all_packages, count) + +def run_benchmark( + scanner_path: str, + root_path: str, + package_file: str, + dependency: str, + version: str +) -> float: + """ + Run the scanner and return the duration in seconds. + """ + cmd = [ + "python3", scanner_path, + "-d", dependency, + "-v", version, + "-p", root_path, + "--package-file", package_file + ] + + start_time = time.perf_counter() + + try: + result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + except subprocess.CalledProcessError as e: + print(f"Error running benchmark: {e}") + return -1.0 + + duration = time.perf_counter() - start_time + return duration + +def run_benchmarks( + scanner_path: str, + root_path: str, + packages_dir: str, + counts: List[int], + dependency: str, + version: str +) -> Dict[int, float]: + """Runs benchmarks for specified counts and returns a dict of results.""" + results = {} + + for count in counts: + subset = get_package_subset(packages_dir, count) + print(f" Testing {len(subset)} packages (e.g., {subset[:3]}...)") + + # Create temp package file + pkg_file = "temp_packages.txt" + with open(pkg_file, 'w') as f: + for pkg in subset: + f.write(f"packages/{pkg}\n") + + duration = run_benchmark(scanner_path, root_path, pkg_file, dependency, version) + results[count] = duration + + # Clean up + if os.path.exists(pkg_file): + os.remove(pkg_file) + + return results + +def main(): + parser = argparse.ArgumentParser(description="Benchmark the version scanner.") + + parser.add_argument( + "-s", "--scanner-path", + default="version_scanner.py", + help="Path to version_scanner.py" + ) + + parser.add_argument( + "-r", "--root-path", + required=True, + help="Path to the monorepo root directory" + ) + + parser.add_argument( + "-p", "--packages-dir", + help="Path to packages directory (defaults to /packages)" + ) + + parser.add_argument( + "-d", "--dependency", + default="python", + help="Dependency to search for" + ) + + parser.add_argument( + "-v", "--version", + default="3.7", + help="Version to search for" + ) + + parser.add_argument( + "-c", "--counts", + default="1,10,50", + help="Comma-separated list of package counts to test" + ) + + args = parser.parse_args() + + packages_dir = args.packages_dir or os.path.join(args.root_path, "packages") + + if not os.path.exists(packages_dir): + print(f"Error: Packages directory not found: {packages_dir}", file=sys.stderr) + sys.exit(1) + + counts = [int(c) for c in args.counts.split(',')] + + try: + all_packages = [d for d in os.listdir(packages_dir) if os.path.isdir(os.path.join(packages_dir, d))] + except FileNotFoundError: + print(f"Error: Packages directory not found: {packages_dir}", file=sys.stderr) + sys.exit(1) + + total_packages = len(all_packages) + + print(f"Found {total_packages} packages in {packages_dir}") + + # Filter counts that are greater than total packages + counts = [c for c in counts if c <= total_packages] + # Add total if not already there + if total_packages not in counts: + counts.append(total_packages) + + print(f"Running benchmarks for counts: {counts}") + + results = run_benchmarks( + scanner_path=args.scanner_path, + root_path=args.root_path, + packages_dir=packages_dir, + counts=counts, + dependency=args.dependency, + version=args.version + ) + + print("\nBenchmark Results:") + print(f"{'Packages':<10} | {'Time (seconds)':<15}") + print("-" * 30) + for count, duration in results.items(): + print(f"{count:<10} | {duration:<15.4f}") + +if __name__ == "__main__": + main() diff --git a/scripts/version_scanner/regex_config.yaml b/scripts/version_scanner/regex_config.yaml new file mode 100644 index 000000000000..e630e0c836df --- /dev/null +++ b/scripts/version_scanner/regex_config.yaml @@ -0,0 +1,90 @@ +description: Search rules for identifying dependency versions +rules: + - name: explicit_version_string + description: Finds explicit version strings in code or configs. + examples: + - "'3.7'" + - '"3.7.1"' + - "'3.7.12'" + rules: + - | + ['"]{major}\.{minor}(\.\d+)?['"] + + - name: python_requires + description: Finds various forms of python_requires declarations. + applies_to: [python] + examples: + - "python_requires = '==3.7'" + - "python_requires = '>=3.7'" + - "python_requires = '<=3.7'" + - "python_requires = '>3.6'" + - "python_requires = '<3.8'" + rules: + - | + python_requires\s*=\s*['"]==3\.{minor}['"] + - | + python_requires\s*=\s*['"]>=3\.{minor}['"] + - | + python_requires\s*=\s*['"]<=3\.{minor}['"] + - | + python_requires\s*=\s*['"]>3\.{minor_minus_one}['"] + - | + python_requires\s*=\s*['"]<3\.{minor_plus_one}['"] + + - name: sys_version_info + description: Finds sys.version_info checks in code. + applies_to: [python] + examples: + - "sys.version_info == (3, 7)" + - "sys.version_info >= (3, 7)" + - "sys.version_info <= (3, 7)" + - "sys.version_info > (3, 6)" + - "sys.version_info < (3, 8)" + - "sys.version_info.minor == 7" + - "sys.version_info.minor >= 7" + - "sys.version_info.minor <= 7" + - "sys.version_info.minor > 6" + - "sys.version_info.minor < 8" + rules: + - | + sys\.version_info\s*==\s*\(3,\s*{minor}\) + - | + sys\.version_info\s*>=\s*\(3,\s*{minor}\) + - | + sys\.version_info\s*<=\s*\(3,\s*{minor}\) + - | + sys\.version_info\s*>\s*\(3,\s*{minor_minus_one}\) + - | + sys\.version_info\s*<\s*\(3,\s*{minor_plus_one}\) + - | + sys\.version_info\.minor\s*==\s*{minor} + - | + sys\.version_info\.minor\s*>=\s*{minor} + - | + sys\.version_info\.minor\s*<=\s*{minor} + - | + sys\.version_info\.minor\s*>\s*{minor_minus_one} + - | + sys\.version_info\.minor\s*<\s*{minor_plus_one} + + - name: python_env_short + description: Finds short python environment names often used in tox or nox. + applies_to: [python] + examples: + - "py37" + - "py37-cover" + rules: + - | + py3{minor} + + - name: explicit_python_command + description: Finds explicit python commands with version. + applies_to: [python] + examples: + - "python3.7" + - "python3.7 -m pip" + rules: + - | + python3\.{minor} + + diff --git a/scripts/version_scanner/tests/data/.kokoro/build.sh b/scripts/version_scanner/tests/data/.kokoro/build.sh new file mode 100644 index 000000000000..a3079c597bd1 --- /dev/null +++ b/scripts/version_scanner/tests/data/.kokoro/build.sh @@ -0,0 +1 @@ +python3.7 diff --git a/scripts/version_scanner/tests/data/packages/pkg_a/setup.py b/scripts/version_scanner/tests/data/packages/pkg_a/setup.py new file mode 100644 index 000000000000..a5ff7d1dc955 --- /dev/null +++ b/scripts/version_scanner/tests/data/packages/pkg_a/setup.py @@ -0,0 +1 @@ +python_requires = '>=3.7' diff --git a/scripts/version_scanner/tests/data/packages/pkg_b/clean.py b/scripts/version_scanner/tests/data/packages/pkg_b/clean.py new file mode 100644 index 000000000000..2f9a147db12e --- /dev/null +++ b/scripts/version_scanner/tests/data/packages/pkg_b/clean.py @@ -0,0 +1 @@ +print("Hello") diff --git a/scripts/version_scanner/tests/integration/test_scanner_integration.py b/scripts/version_scanner/tests/integration/test_scanner_integration.py new file mode 100644 index 000000000000..2d5c1a9bf04e --- /dev/null +++ b/scripts/version_scanner/tests/integration/test_scanner_integration.py @@ -0,0 +1,35 @@ +import csv +import os +import subprocess +import pytest + +def test_integration_scan(tmp_path): + # Paths to real tools + scanner_path = os.path.abspath("version_scanner.py") + config_path = os.path.abspath("regex_config.yaml") + + # Static data directory (which we haven't created yet!) + data_dir = os.path.abspath("tests/data") + + # Run the scanner in the tmp_path so the output file is created there + cmd = [ + "python3", scanner_path, + "-d", "python", + "-v", "3.7", + "-p", data_dir, + "--config", config_path, + "-o", "scanner_report.csv" + ] + + # This will fail because tests/data doesn't exist or is empty! + result = subprocess.run(cmd, cwd=tmp_path, capture_output=True, text=True) + + report_file = tmp_path / "scanner_report.csv" + assert report_file.exists(), f"Report file not found. Stderr: {result.stderr}" + + with open(report_file, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + rows = list(reader) + + # We expect at least some matches when we build the data directory + assert len(rows) > 0 diff --git a/scripts/version_scanner/tests/unit/test_benchmark.py b/scripts/version_scanner/tests/unit/test_benchmark.py new file mode 100644 index 000000000000..128493342ae1 --- /dev/null +++ b/scripts/version_scanner/tests/unit/test_benchmark.py @@ -0,0 +1,77 @@ +import os +import pytest +from unittest.mock import patch +from benchmark import get_package_subset, run_benchmark, run_benchmarks + +def test_get_package_subset(tmp_path): + # Create mock packages directory + packages_dir = tmp_path / "packages" + packages_dir.mkdir() + + for i in range(10): + (packages_dir / f"pkg_{i}").mkdir() + + # Test getting a subset of 5 + subset = get_package_subset(str(packages_dir), 5) + assert len(subset) == 5 + for pkg in subset: + assert pkg.startswith("pkg_") + +def test_get_package_subset_all(tmp_path): + packages_dir = tmp_path / "packages" + packages_dir.mkdir() + + for i in range(5): + (packages_dir / f"pkg_{i}").mkdir() + + # Test getting all + subset = get_package_subset(str(packages_dir), 10) # Request more than available + assert len(subset) == 5 # Should return all available + +def test_run_benchmark(tmp_path): + # Create a dummy package file + package_file = tmp_path / "packages.txt" + package_file.write_text("pkg1\n") + + # Create dummy package directory + packages_dir = tmp_path / "packages" + packages_dir.mkdir() + (packages_dir / "pkg1").mkdir() + (packages_dir / "pkg1" / "test.py").write_text("version = '3.7'\n") + + scanner_path = "version_scanner.py" + + duration = run_benchmark( + scanner_path=scanner_path, + root_path=str(tmp_path), + package_file=str(package_file), + dependency="python", + version="3.7" + ) + + assert isinstance(duration, float) + assert duration >= 0 + +# Test run_benchmarks +@patch('benchmark.run_benchmark') +def test_run_benchmarks(mock_run, tmp_path): + mock_run.return_value = 1.5 + + packages_dir = tmp_path / "packages" + packages_dir.mkdir() + for i in range(5): + (packages_dir / f"pkg_{i}").mkdir() + + results = run_benchmarks( + scanner_path="dummy.py", + root_path=str(tmp_path), + packages_dir=str(packages_dir), + counts=[1, 3], + dependency="python", + version="3.7" + ) + + assert len(results) == 2 + assert results[1] == 1.5 + assert results[3] == 1.5 + assert mock_run.call_count == 2 diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py new file mode 100644 index 000000000000..9a32c4ba6c52 --- /dev/null +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -0,0 +1,221 @@ +import csv +import os +import re +import pytest +from version_scanner import ConfigManager, scan_file, write_csv_report + +# Test ConfigManager +@pytest.mark.parametrize("dependency, version, expected", [ + ( + "python", + "3.7", + {"name": "python", "version": "3.7", "major": "3", "minor": "7", "minor_plus_one": "8", "minor_minus_one": "6"} + ), + ( + "protobuf", + "4.25.8", + {"name": "protobuf", "version": "4.25.8", "major": "4", "minor": "25", "patch": "8", "minor_plus_one": "26", "minor_minus_one": "24"} + ), + ( + "foo", + "3", + {"name": "foo", "version": "3", "major": "3"} + ), +]) +def test_compute_variables(dependency, version, expected): + cm = ConfigManager("dummy_path", dependency, version) + vars = cm._compute_variables() + assert vars == expected + +# Test scan_file +def test_scan_file_positive(tmp_path): + test_file = tmp_path / "test.py" + test_file.write_text("python_requires = '>=3.7'\n") + + rules = [ + {"name": "python_requires_check", "pattern": r"python_requires\s*=\s*['\"]>=3\.7['\"]"} + ] + + results = scan_file(str(test_file), rules) + assert len(results) == 1 + assert results[0]["rule_name"] == "python_requires_check" + assert results[0]["line_number"] == 1 + assert results[0]["matched_string"] == "python_requires = '>=3.7'" + +def test_scan_file_negative(tmp_path): + test_file = tmp_path / "test.py" + test_file.write_text("python_requires = '>=3.8'\n") + + rules = [ + {"name": "python_requires_check", "pattern": r"python_requires\s*=\s*['\"]>=3\.7['\"]"} + ] + + results = scan_file(str(test_file), rules) + assert len(results) == 0 + +# Test directory scan simulation +def test_directory_scan(tmp_path): + # Create dummy files + p1 = tmp_path / "pkg1" + p1.mkdir() + f1 = p1 / "setup.py" + f1.write_text("python_requires = '>=3.7'\n") + + p2 = tmp_path / "pkg2" + p2.mkdir() + f2 = p2 / "clean.py" + f2.write_text("print('Hello')\n") + + rules = [ + {"name": "python_requires_check", "pattern": r"python_requires\s*=\s*['\"]>=3\.7['\"]"} + ] + + results = [] + for root, dirs, files in os.walk(tmp_path): + for file in files: + file_path = os.path.join(root, file) + results.extend(scan_file(file_path, rules)) + + assert len(results) == 1 + assert results[0]["rule_name"] == "python_requires_check" + +# Test write_csv_report +def test_write_csv_report(tmp_path): + output_file = tmp_path / "report.csv" + matches = [ + { + "file_path": "./setup.py", + "rule_name": "python_requires_check", + "line_number": 1, + "matched_string": "python_requires = '>=3.7'", + "context_line": "python_requires = '>=3.7'" + } + ] + + write_csv_report(str(output_file), matches) + + assert output_file.exists() + + with open(output_file, 'r', encoding='utf-8', newline='') as f: + reader = csv.DictReader(f) + rows = list(reader) + + assert len(rows) == 1 + assert rows[0]["file_path"] == "./setup.py" + assert rows[0]["rule_name"] == "python_requires_check" + assert rows[0]["line_number"] == "1" + assert rows[0]["matched_string"] == "python_requires = '>=3.7'" + assert rows[0]["context_line"] == "python_requires = '>=3.7'" + + +def test_load_config(tmp_path): + config_file = tmp_path / "config.yaml" + config_file.write_text(""" +rules: + - name: test_rule + rules: + - python{version} +""") + + cm = ConfigManager(str(config_file), "python", "3.7") + rules = cm.load_config() + + assert len(rules) == 1 + assert rules[0]["name"] == "test_rule" + assert rules[0]["pattern"] == "python3.7" + +def test_regex_patterns(): + """Test that core regex patterns match expected strings based on groups in regex_config.yaml.""" + + # Group: sys_version_info + # Pattern: sys.version_info < (3, 8) + pattern = re.compile(r"sys\.version_info\s*<\s*\(3,\s*8\)") + assert pattern.search("sys.version_info < (3, 8)") is not None + assert pattern.search("sys.version_info<(3,8)") is not None + + # Pattern: sys.version_info.minor <= 7 + pattern = re.compile(r"sys\.version_info\.minor\s*[<=]=?\s*7") + assert pattern.search("sys.version_info.minor <= 7") is not None + assert pattern.search("sys.version_info.minor==7") is not None + + # Pattern: sys.version_info.minor < 8 + pattern = re.compile(r"sys\.version_info\.minor\s*<\s*8") + assert pattern.search("sys.version_info.minor < 8") is not None + + # Group: python_env_short + # Pattern: py37 + pattern = re.compile(r"py37") + assert pattern.search("py37") is not None + + # Group: explicit_python_command + # Pattern: python3.7 + pattern = re.compile(r"python3\.7") + assert pattern.search("python3.7") is not None + + # Group: python_requires + # Pattern: python_requires == '3.7' + pattern = re.compile(r"python_requires\s*=\s*['\"]==3\.7['\"]") + assert pattern.search("python_requires = '==3.7'") is not None + + # Pattern: python_requires >= '3.7' + pattern = re.compile(r"python_requires\s*=\s*['\"]>=3\.7['\"]") + assert pattern.search("python_requires = '>=3.7'") is not None + + # Pattern: python_requires <= '3.7' + pattern = re.compile(r"python_requires\s*=\s*['\"]<=3\.7['\"]") + assert pattern.search("python_requires = '<=3.7'") is not None + + # Pattern: python_requires > '3.6' + pattern = re.compile(r"python_requires\s*=\s*['\"]>3\.6['\"]") + assert pattern.search("python_requires = '>3.6'") is not None + + # Pattern: python_requires < '3.8' + pattern = re.compile(r"python_requires\s*=\s*['\"]<3\.8['\"]") + assert pattern.search("python_requires = '<3.8'") is not None + + +def test_regex_examples_from_config(): + """Test that examples in config match at least one rule in the group.""" + import yaml + config_path = "regex_config.yaml" + + try: + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + except FileNotFoundError: + pytest.fail(f"Config file not found: {config_path}") + + rules_list = config.get("rules", []) + + # Variables for interpolation (simulate Python 3.7) + vars = { + "major": "3", + "minor": "7", + "version": "3.7", + "minor_plus_one": "8", + "minor_minus_one": "6" + } + + for rule_group in rules_list: + name = rule_group.get("name") + examples = rule_group.get("examples", []) + templates = rule_group.get("rules", []) + + if not examples or not templates: + continue + + compiled_patterns = [] + for template in templates: + try: + resolved = template.strip().format(**vars) + compiled_patterns.append(re.compile(resolved)) + except KeyError: + continue + + for example in examples: + matched = False + for pattern in compiled_patterns: + if pattern.search(example): + matched = True + break + assert matched, f"Example '{example}' in group '{name}' did not match any pattern." diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py new file mode 100644 index 000000000000..2726326082b1 --- /dev/null +++ b/scripts/version_scanner/version_scanner.py @@ -0,0 +1,344 @@ +#!/usr/bin/env python3 +""" +Automated Dependency Version Scanner +Scans a repository for references to specific dependency versions. +""" + +import argparse +import csv +import os +import re +import sys +from typing import Dict, List, Tuple +import yaml + +class ConfigManager: + """Handles loading and interpolation of regex configurations.""" + + def __init__(self, config_path: str, dependency: str, version: str): + self.config_path = config_path + self.dependency = dependency + self.version = version + self.variables = self._compute_variables() + + def _compute_variables(self) -> Dict[str, str]: + """Compute variables for interpolation from version string.""" + vars = { + "name": self.dependency, + "version": self.version, + } + + parts = self.version.split('.') + if len(parts) >= 1: + vars["major"] = parts[0] + if len(parts) >= 2: + vars["minor"] = parts[1] + try: + vars["minor_plus_one"] = str(int(parts[1]) + 1) + except ValueError: + vars["minor_plus_one"] = parts[1] + try: + vars["minor_minus_one"] = str(int(parts[1]) - 1) + except ValueError: + vars["minor_minus_one"] = parts[1] + if len(parts) >= 3: + vars["patch"] = parts[2] + + return vars + + def load_config(self) -> List[Dict[str, str]]: + """Load and resolve rules from config.""" + try: + with open(self.config_path, 'r', encoding='utf-8') as f: + config = yaml.safe_load(f) + except FileNotFoundError: + print(f"Error: Config file not found: {self.config_path}", file=sys.stderr) + sys.exit(1) + except yaml.YAMLError as e: + print(f"Error parsing config file: {e}", file=sys.stderr) + sys.exit(1) + + rules_list = config.get("rules", []) + resolved_rules = [] + + for rule_group in rules_list: + name = rule_group.get("name") + applies_to = rule_group.get("applies_to", []) + + # Filter by dependency + if applies_to and self.dependency not in applies_to: + continue + + templates = rule_group.get("rules", []) + + for template in templates: + try: + resolved_pattern = template.strip().format(**self.variables) + resolved_rules.append({ + "name": name, + "pattern": resolved_pattern + }) + except KeyError as e: + print(f"Warning: Missing variable for interpolation in rule {name}: {e}", file=sys.stderr) + + return resolved_rules + +def scan_file(file_path: str, rules: List[Dict[str, str]]) -> List[Dict[str, str]]: + """ + Scan a single file for matching patterns. + + Args: + file_path: Path to the file to scan. + rules: A list of dictionaries containing 'name' and 'pattern' (string). + + Returns: + A list of dictionaries containing match details. + """ + results = [] + + # Compile patterns + compiled_rules = [] + for rule in rules: + try: + compiled_rules.append({ + "name": rule["name"], + "pattern": re.compile(rule["pattern"]) + }) + except re.error as e: + print(f"Error compiling regex for rule {rule['name']}: {e}", file=sys.stderr) + continue + + try: + with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + for line_num, line in enumerate(f, 1): + for rule in compiled_rules: + match = rule["pattern"].search(line) + if match: + results.append({ + "rule_name": rule["name"], + "line_number": line_num, + "matched_string": match.group(0), + "context_line": line.strip() + }) + except IOError as e: + print(f"Warning: Could not read file {file_path}: {e}", file=sys.stderr) + + return results + +def write_csv_report(output_path: str, matches: List[Dict[str, str]]) -> None: + """ + Write the collected matches to a CSV file. + + Args: + output_path: Path to the output CSV file. + matches: A list of dictionaries containing match details. + """ + fieldnames = ["file_path", "package_name", "rule_name", "line_number", "matched_string", "context_line"] + + try: + with open(output_path, 'w', encoding='utf-8', newline='') as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + + for match in matches: + # Ensure only specified fields are written + row = {field: match.get(field, "") for field in fieldnames} + writer.writerow(row) + + print(f"\nReport written to: {output_path}") + except IOError as e: + print(f"Error writing CSV report: {e}", file=sys.stderr) + +def scan_repository( + root_path: str, + rules: List[Dict[str, str]], + target_packages: List[str] = None +) -> List[Dict[str, str]]: + """ + Scan repository for matching patterns. + + Args: + root_path: Path to the repository root. + rules: A list of dictionaries containing 'name' and 'pattern'. + target_packages: A list of package paths to include (e.g., ['packages/pkg_a']). + If None or empty, all packages are scanned. + + Returns: + A list of match details. + """ + ignore_dirs = {'.git', '__pycache__', '.tox', '.nox', 'venv', '.venv', '.conductor'} + results = [] + + print(f"\nScanning repository: {root_path}") + if target_packages: + print(f"Filtering for packages: {target_packages}") + + for root, dirs, files in os.walk(root_path): + # Prune ignore directories + dirs[:] = [d for d in dirs if d not in ignore_dirs] + + rel_root = os.path.relpath(root, root_path) + parts = rel_root.split(os.sep) + + # Monorepo filtering + if target_packages and parts[0] == "packages": + if len(parts) >= 2: + current_package_path = os.path.join(parts[0], parts[1]) + if current_package_path not in target_packages: + # Skip this directory and all subdirectories + dirs[:] = [] + continue + else: + # We are in the "packages" directory itself. Continue to walk. + pass + + for file in files: + file_path = os.path.join(root, file) + matches = scan_file(file_path, rules) + + # Compute display path and package name + rel_file_path = os.path.relpath(file_path, root_path) + + package_name = "" + path_parts = rel_file_path.split(os.sep) + if len(path_parts) >= 2 and path_parts[0] == "packages": + package_name = path_parts[1] + + root_parts = os.path.abspath(root_path).split(os.sep) + if len(root_parts) >= 2: + prefix = os.path.join(root_parts[-2], root_parts[-1]) + display_path = os.path.join(prefix, rel_file_path) + else: + display_path = rel_file_path + + for m in matches: + m["file_path"] = display_path + m["package_name"] = package_name + results.append(m) + + return results + + +def main(): + parser = argparse.ArgumentParser( + description="Scan repository for references to specific dependency versions." + ) + + parser.add_argument( + "-d", "--dependency", + required=True, + help="Name of the dependency (e.g., python, protobuf)" + ) + + parser.add_argument( + "-v", "--version", + required=True, + help="Specific version to search for (e.g., 3.7, 4.25.8)" + ) + + parser.add_argument( + "-p", "--path", + default=".", + help="Root directory to scan (defaults to current directory)" + ) + + + + package_group = parser.add_mutually_exclusive_group() + + package_group.add_argument( + "--package", + help="Specific subdirectory filter (useful for monorepos)" + ) + + package_group.add_argument( + "--package-file", + help="Path to a file containing a list of package directories to scan" + ) + + parser.add_argument( + "--config", + default="regex_config.yaml", + help="Path to the regex configuration file" + ) + + parser.add_argument( + "-o", "--output", + help="Path to the output CSV file (defaults to --.csv)" + ) + + args = parser.parse_args() + + # Resolve target paths + targets = [] + + if args.package: + targets.append(os.path.join(args.path, args.package)) + + if args.package_file: + if os.path.exists(args.package_file): + with open(args.package_file, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if line and not line.startswith('#'): + targets.append(os.path.join(args.path, line)) + else: + print(f"Error: Package file not found: {args.package_file}", file=sys.stderr) + sys.exit(1) + + # Fallback: if neither package nor package-file is given, use the path directly + if not targets: + targets.append(args.path) + + print(f"Starting scan for dependency: {args.dependency} version: {args.version}") + print(f"Root path: {args.path}") + print(f"Targets to scan:") + for target in targets: + print(f" - {target}") + print(f"Using config: {args.config}") + + # Load and resolve rules + config_manager = ConfigManager(args.config, args.dependency, args.version) + rules = config_manager.load_config() + + print(f"\nLoaded {len(rules)} rules:") + for rule in rules: + print(f" - {rule['name']}: {rule['pattern']}") + + # Resolve target packages if filtering is requested + target_packages = [] + if args.package: + target_packages.append(os.path.join("packages", args.package)) + elif args.package_file: + if os.path.exists(args.package_file): + with open(args.package_file, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if line and not line.startswith('#'): + target_packages.append(line) + else: + print(f"Error: Package file not found: {args.package_file}", file=sys.stderr) + sys.exit(1) + + # Scan repository + all_matches = scan_repository(args.path, rules, target_packages) + + print(f"\nFound {len(all_matches)} matches.") + for m in all_matches[:10]: # Show first 10 + print(f" {m['file_path']}:{m['line_number']} [{m['rule_name']}] {m['matched_string']}") + + if len(all_matches) > 10: + print(f" ... and {len(all_matches) - 10} more matches.") + + # Write report + import datetime + if args.output: + output_path = args.output + else: + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + output_path = f"{args.dependency}-{args.version}-{timestamp}.csv" + + write_csv_report(output_path, all_matches) + +if __name__ == "__main__": + main() From 256b0485392127f8130fe16d29f06d1722fb3eea Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 29 Apr 2026 08:40:50 -0400 Subject: [PATCH 02/34] perf(search): Apply bot suggestions for regex optimization and imports --- .../tests/unit/test_version_scanner.py | 6 ++-- scripts/version_scanner/version_scanner.py | 32 +++++++++---------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 9a32c4ba6c52..69dfdd53dad1 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -33,7 +33,7 @@ def test_scan_file_positive(tmp_path): test_file.write_text("python_requires = '>=3.7'\n") rules = [ - {"name": "python_requires_check", "pattern": r"python_requires\s*=\s*['\"]>=3\.7['\"]"} + {"name": "python_requires_check", "pattern": re.compile(r"python_requires\s*=\s*['\"]>=3\.7['\"]")} ] results = scan_file(str(test_file), rules) @@ -47,7 +47,7 @@ def test_scan_file_negative(tmp_path): test_file.write_text("python_requires = '>=3.8'\n") rules = [ - {"name": "python_requires_check", "pattern": r"python_requires\s*=\s*['\"]>=3\.7['\"]"} + {"name": "python_requires_check", "pattern": re.compile(r"python_requires\s*=\s*['\"]>=3\.7['\"]")} ] results = scan_file(str(test_file), rules) @@ -67,7 +67,7 @@ def test_directory_scan(tmp_path): f2.write_text("print('Hello')\n") rules = [ - {"name": "python_requires_check", "pattern": r"python_requires\s*=\s*['\"]>=3\.7['\"]"} + {"name": "python_requires_check", "pattern": re.compile(r"python_requires\s*=\s*['\"]>=3\.7['\"]")} ] results = [] diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index 2726326082b1..e147d1b5cf2d 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -6,6 +6,7 @@ import argparse import csv +import datetime import os import re import sys @@ -83,30 +84,18 @@ def load_config(self) -> List[Dict[str, str]]: return resolved_rules -def scan_file(file_path: str, rules: List[Dict[str, str]]) -> List[Dict[str, str]]: +def scan_file(file_path: str, compiled_rules: List[Dict[str, re.Pattern]]) -> List[Dict[str, str]]: """ Scan a single file for matching patterns. Args: file_path: Path to the file to scan. - rules: A list of dictionaries containing 'name' and 'pattern' (string). + compiled_rules: A list of dictionaries containing 'name' and 'pattern' (compiled regex). Returns: A list of dictionaries containing match details. """ results = [] - - # Compile patterns - compiled_rules = [] - for rule in rules: - try: - compiled_rules.append({ - "name": rule["name"], - "pattern": re.compile(rule["pattern"]) - }) - except re.error as e: - print(f"Error compiling regex for rule {rule['name']}: {e}", file=sys.stderr) - continue try: with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: @@ -169,6 +158,18 @@ def scan_repository( ignore_dirs = {'.git', '__pycache__', '.tox', '.nox', 'venv', '.venv', '.conductor'} results = [] + # Compile patterns once here + compiled_rules = [] + for rule in rules: + try: + compiled_rules.append({ + "name": rule["name"], + "pattern": re.compile(rule["pattern"]) + }) + except re.error as e: + print(f"Error compiling regex for rule {rule['name']}: {e}", file=sys.stderr) + continue + print(f"\nScanning repository: {root_path}") if target_packages: print(f"Filtering for packages: {target_packages}") @@ -194,7 +195,7 @@ def scan_repository( for file in files: file_path = os.path.join(root, file) - matches = scan_file(file_path, rules) + matches = scan_file(file_path, compiled_rules) # Compute display path and package name rel_file_path = os.path.relpath(file_path, root_path) @@ -331,7 +332,6 @@ def main(): print(f" ... and {len(all_matches) - 10} more matches.") # Write report - import datetime if args.output: output_path = args.output else: From 101039999b0d43dcb4fd3194d7eee7fe61c48fd5 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 29 Apr 2026 08:45:47 -0400 Subject: [PATCH 03/34] refactor(benchmark): Use tempfile for unique names and safe cleanup --- scripts/version_scanner/benchmark.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/scripts/version_scanner/benchmark.py b/scripts/version_scanner/benchmark.py index 773d7a4f488c..7fb89c6ee928 100644 --- a/scripts/version_scanner/benchmark.py +++ b/scripts/version_scanner/benchmark.py @@ -3,6 +3,7 @@ import random import subprocess import sys +import tempfile import time from typing import List, Dict @@ -73,17 +74,18 @@ def run_benchmarks( print(f" Testing {len(subset)} packages (e.g., {subset[:3]}...)") # Create temp package file - pkg_file = "temp_packages.txt" - with open(pkg_file, 'w') as f: + with tempfile.NamedTemporaryFile(mode='w', delete=False) as f: for pkg in subset: f.write(f"packages/{pkg}\n") + pkg_file = f.name - duration = run_benchmark(scanner_path, root_path, pkg_file, dependency, version) - results[count] = duration - - # Clean up - if os.path.exists(pkg_file): - os.remove(pkg_file) + try: + duration = run_benchmark(scanner_path, root_path, pkg_file, dependency, version) + results[count] = duration + finally: + # Clean up + if os.path.exists(pkg_file): + os.remove(pkg_file) return results From 68f61eec3284592396efe62597ccdeb0976e44d7 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 29 Apr 2026 08:45:54 -0400 Subject: [PATCH 04/34] refactor(benchmark): Remove redundant directory check --- scripts/version_scanner/benchmark.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/scripts/version_scanner/benchmark.py b/scripts/version_scanner/benchmark.py index 7fb89c6ee928..bd53b7da5e9c 100644 --- a/scripts/version_scanner/benchmark.py +++ b/scripts/version_scanner/benchmark.py @@ -18,11 +18,7 @@ def get_package_subset(packages_dir: str, count: int) -> List[str]: Returns: A list of package directory names. """ - try: - all_packages = [d for d in os.listdir(packages_dir) if os.path.isdir(os.path.join(packages_dir, d))] - except FileNotFoundError: - print(f"Error: Packages directory not found: {packages_dir}") - return [] + all_packages = [d for d in os.listdir(packages_dir) if os.path.isdir(os.path.join(packages_dir, d))] if count >= len(all_packages): return all_packages From cc960b4776703e4911f98922ba14373e5a6c52b1 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 29 Apr 2026 08:46:05 -0400 Subject: [PATCH 05/34] test(integration): Check exit code of subprocess in integration test --- .../tests/integration/test_scanner_integration.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/version_scanner/tests/integration/test_scanner_integration.py b/scripts/version_scanner/tests/integration/test_scanner_integration.py index 2d5c1a9bf04e..daa3ef19c7b9 100644 --- a/scripts/version_scanner/tests/integration/test_scanner_integration.py +++ b/scripts/version_scanner/tests/integration/test_scanner_integration.py @@ -8,7 +8,7 @@ def test_integration_scan(tmp_path): scanner_path = os.path.abspath("version_scanner.py") config_path = os.path.abspath("regex_config.yaml") - # Static data directory (which we haven't created yet!) + # Static data directory data_dir = os.path.abspath("tests/data") # Run the scanner in the tmp_path so the output file is created there @@ -21,8 +21,7 @@ def test_integration_scan(tmp_path): "-o", "scanner_report.csv" ] - # This will fail because tests/data doesn't exist or is empty! - result = subprocess.run(cmd, cwd=tmp_path, capture_output=True, text=True) + result = subprocess.run(cmd, cwd=tmp_path, capture_output=True, text=True, check=True) report_file = tmp_path / "scanner_report.csv" assert report_file.exists(), f"Report file not found. Stderr: {result.stderr}" From a4ad9ce256591a85b2203c042d4037634fd38b9c Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 29 Apr 2026 08:52:49 -0400 Subject: [PATCH 06/34] test(unit): Remove redundant and brittle test_regex_patterns --- .../tests/unit/test_version_scanner.py | 49 +------------------ 1 file changed, 1 insertion(+), 48 deletions(-) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 69dfdd53dad1..210ace6ccedd 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -124,54 +124,7 @@ def test_load_config(tmp_path): assert rules[0]["name"] == "test_rule" assert rules[0]["pattern"] == "python3.7" -def test_regex_patterns(): - """Test that core regex patterns match expected strings based on groups in regex_config.yaml.""" - - # Group: sys_version_info - # Pattern: sys.version_info < (3, 8) - pattern = re.compile(r"sys\.version_info\s*<\s*\(3,\s*8\)") - assert pattern.search("sys.version_info < (3, 8)") is not None - assert pattern.search("sys.version_info<(3,8)") is not None - - # Pattern: sys.version_info.minor <= 7 - pattern = re.compile(r"sys\.version_info\.minor\s*[<=]=?\s*7") - assert pattern.search("sys.version_info.minor <= 7") is not None - assert pattern.search("sys.version_info.minor==7") is not None - - # Pattern: sys.version_info.minor < 8 - pattern = re.compile(r"sys\.version_info\.minor\s*<\s*8") - assert pattern.search("sys.version_info.minor < 8") is not None - - # Group: python_env_short - # Pattern: py37 - pattern = re.compile(r"py37") - assert pattern.search("py37") is not None - - # Group: explicit_python_command - # Pattern: python3.7 - pattern = re.compile(r"python3\.7") - assert pattern.search("python3.7") is not None - - # Group: python_requires - # Pattern: python_requires == '3.7' - pattern = re.compile(r"python_requires\s*=\s*['\"]==3\.7['\"]") - assert pattern.search("python_requires = '==3.7'") is not None - - # Pattern: python_requires >= '3.7' - pattern = re.compile(r"python_requires\s*=\s*['\"]>=3\.7['\"]") - assert pattern.search("python_requires = '>=3.7'") is not None - - # Pattern: python_requires <= '3.7' - pattern = re.compile(r"python_requires\s*=\s*['\"]<=3\.7['\"]") - assert pattern.search("python_requires = '<=3.7'") is not None - - # Pattern: python_requires > '3.6' - pattern = re.compile(r"python_requires\s*=\s*['\"]>3\.6['\"]") - assert pattern.search("python_requires = '>3.6'") is not None - - # Pattern: python_requires < '3.8' - pattern = re.compile(r"python_requires\s*=\s*['\"]<3\.8['\"]") - assert pattern.search("python_requires = '<3.8'") is not None + def test_regex_examples_from_config(): From 274395744ee5338cbc904c319204bcf28414bac9 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 29 Apr 2026 09:08:40 -0400 Subject: [PATCH 07/34] test(unit): Move import yaml to top of file --- scripts/version_scanner/tests/unit/test_version_scanner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 210ace6ccedd..d66556f74e85 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -2,6 +2,7 @@ import os import re import pytest +import yaml from version_scanner import ConfigManager, scan_file, write_csv_report # Test ConfigManager From 47450bb37ceea13cd650591b101f95102f1cbd02 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 29 Apr 2026 09:08:46 -0400 Subject: [PATCH 08/34] refactor(benchmark): Remove redundant directory check in main --- scripts/version_scanner/benchmark.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/scripts/version_scanner/benchmark.py b/scripts/version_scanner/benchmark.py index bd53b7da5e9c..36179af1dc4b 100644 --- a/scripts/version_scanner/benchmark.py +++ b/scripts/version_scanner/benchmark.py @@ -133,11 +133,7 @@ def main(): counts = [int(c) for c in args.counts.split(',')] - try: - all_packages = [d for d in os.listdir(packages_dir) if os.path.isdir(os.path.join(packages_dir, d))] - except FileNotFoundError: - print(f"Error: Packages directory not found: {packages_dir}", file=sys.stderr) - sys.exit(1) + all_packages = [d for d in os.listdir(packages_dir) if os.path.isdir(os.path.join(packages_dir, d))] total_packages = len(all_packages) From c777e441b6c34928c0966e85d888c83e7542436e Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 29 Apr 2026 09:12:57 -0400 Subject: [PATCH 09/34] test(unit): Remove duplicate import yaml from function --- scripts/version_scanner/tests/unit/test_version_scanner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index d66556f74e85..7feb0b8ab67e 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -130,7 +130,6 @@ def test_load_config(tmp_path): def test_regex_examples_from_config(): """Test that examples in config match at least one rule in the group.""" - import yaml config_path = "regex_config.yaml" try: From 8aab80188e6e8d316ce7cde625f19c77947d6d0a Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 05:22:13 -0400 Subject: [PATCH 10/34] feat(version_scanner): handle invalid format strings in config and add tests --- .../tests/unit/test_version_scanner.py | 22 +++++++++++++++++++ scripts/version_scanner/version_scanner.py | 2 ++ 2 files changed, 24 insertions(+) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 7feb0b8ab67e..177f2c5a5818 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -126,6 +126,28 @@ def test_load_config(tmp_path): assert rules[0]["pattern"] == "python3.7" +@pytest.mark.parametrize("template, expected_warning", [ + ("python{missing_var}", "Warning: Missing variable for interpolation"), + ("python{version", "Warning: Invalid format string"), +]) +def test_load_config_error_handling(tmp_path, capsys, template, expected_warning): + config_file = tmp_path / "config.yaml" + config_file.write_text(f""" +rules: + - name: test_rule + rules: + - {template} +""") + + cm = ConfigManager(str(config_file), "python", "3.7") + rules = cm.load_config() + + assert len(rules) == 0 + + captured = capsys.readouterr() + assert expected_warning in captured.err + + def test_regex_examples_from_config(): diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index e147d1b5cf2d..e1fa1deb9e54 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -81,6 +81,8 @@ def load_config(self) -> List[Dict[str, str]]: }) except KeyError as e: print(f"Warning: Missing variable for interpolation in rule {name}: {e}", file=sys.stderr) + except ValueError as e: + print(f"Warning: Invalid format string in rule {name}: {e}", file=sys.stderr) return resolved_rules From f63053cfa6c8c298ccd6cd22c7c309785b57736f Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 05:29:09 -0400 Subject: [PATCH 11/34] feat(version_scanner): handle PermissionError when reading config file and add tests --- .../tests/unit/test_version_scanner.py | 16 ++++++++++++++++ scripts/version_scanner/version_scanner.py | 3 +++ 2 files changed, 19 insertions(+) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 177f2c5a5818..353800bebb11 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -1,6 +1,7 @@ import csv import os import re +from unittest.mock import patch import pytest import yaml from version_scanner import ConfigManager, scan_file, write_csv_report @@ -148,6 +149,21 @@ def test_load_config_error_handling(tmp_path, capsys, template, expected_warning assert expected_warning in captured.err +def test_load_config_permission_error(tmp_path, capsys): + config_file = tmp_path / "config.yaml" + config_file.write_text("rules: []") + + cm = ConfigManager(str(config_file), "python", "3.7") + + with patch("builtins.open", side_effect=PermissionError("Permission denied")): + with pytest.raises(SystemExit) as excinfo: + cm.load_config() + + assert excinfo.value.code == 1 + captured = capsys.readouterr() + assert "Error: Permission denied reading config file" in captured.err + + def test_regex_examples_from_config(): diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index e1fa1deb9e54..d4f307c140cc 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -55,6 +55,9 @@ def load_config(self) -> List[Dict[str, str]]: except FileNotFoundError: print(f"Error: Config file not found: {self.config_path}", file=sys.stderr) sys.exit(1) + except PermissionError: + print(f"Error: Permission denied reading config file: {self.config_path}", file=sys.stderr) + sys.exit(1) except yaml.YAMLError as e: print(f"Error parsing config file: {e}", file=sys.stderr) sys.exit(1) From 2af97b33631da99b5584073f1df8e0fc1b262635 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 05:46:33 -0400 Subject: [PATCH 12/34] feat(version_scanner): extract read_package_file and handle file errors --- .../tests/unit/test_version_scanner.py | 36 ++++++++++++- scripts/version_scanner/version_scanner.py | 50 ++++++++++++------- 2 files changed, 66 insertions(+), 20 deletions(-) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 353800bebb11..724897e22b5f 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -162,8 +162,40 @@ def test_load_config_permission_error(tmp_path, capsys): assert excinfo.value.code == 1 captured = capsys.readouterr() assert "Error: Permission denied reading config file" in captured.err - - +def test_main_package_file_permission_error(tmp_path, capsys): + package_file = tmp_path / "packages.txt" + package_file.write_text("packages/pkg_a") + + import sys + test_args = ["version_scanner.py", "-d", "python", "-v", "3.7", "--package-file", str(package_file)] + + real_open = open + def side_effect(file, *args, **kwargs): + if str(file) == str(package_file): + raise PermissionError("Permission denied") + return real_open(file, *args, **kwargs) + + with patch("sys.argv", test_args): + with patch("builtins.open", side_effect=side_effect): + with pytest.raises(SystemExit) as excinfo: + from version_scanner import main + main() + + assert excinfo.value.code == 1 + captured = capsys.readouterr() + assert "Error: Permission denied reading package file" in captured.err +def test_main_package_file_not_found(capsys): + import sys + test_args = ["version_scanner.py", "-d", "python", "-v", "3.7", "--package-file", "non_existent_file.txt"] + + with patch("sys.argv", test_args): + with pytest.raises(SystemExit) as excinfo: + from version_scanner import main + main() + + assert excinfo.value.code == 1 + captured = capsys.readouterr() + assert "Error: Package file not found" in captured.err def test_regex_examples_from_config(): diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index d4f307c140cc..9b84ace2ef51 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -142,6 +142,34 @@ def write_csv_report(output_path: str, matches: List[Dict[str, str]]) -> None: print(f"\nReport written to: {output_path}") except IOError as e: print(f"Error writing CSV report: {e}", file=sys.stderr) +def read_package_file(file_path: str) -> List[str]: + """ + Read package paths from a file. + + Args: + file_path: Path to the package file. + + Returns: + A list of package paths. + """ + packages = [] + try: + with open(file_path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if line and not line.startswith('#'): + packages.append(line) + except FileNotFoundError: + print(f"Error: Package file not found: {file_path}", file=sys.stderr) + sys.exit(1) + except PermissionError: + print(f"Error: Permission denied reading package file: {file_path}", file=sys.stderr) + sys.exit(1) + except IOError as e: + print(f"Error reading package file: {e}", file=sys.stderr) + sys.exit(1) + return packages + def scan_repository( root_path: str, @@ -282,15 +310,9 @@ def main(): targets.append(os.path.join(args.path, args.package)) if args.package_file: - if os.path.exists(args.package_file): - with open(args.package_file, 'r', encoding='utf-8') as f: - for line in f: - line = line.strip() - if line and not line.startswith('#'): - targets.append(os.path.join(args.path, line)) - else: - print(f"Error: Package file not found: {args.package_file}", file=sys.stderr) - sys.exit(1) + packages = read_package_file(args.package_file) + for p in packages: + targets.append(os.path.join(args.path, p)) # Fallback: if neither package nor package-file is given, use the path directly if not targets: @@ -316,15 +338,7 @@ def main(): if args.package: target_packages.append(os.path.join("packages", args.package)) elif args.package_file: - if os.path.exists(args.package_file): - with open(args.package_file, 'r', encoding='utf-8') as f: - for line in f: - line = line.strip() - if line and not line.startswith('#'): - target_packages.append(line) - else: - print(f"Error: Package file not found: {args.package_file}", file=sys.stderr) - sys.exit(1) + target_packages = read_package_file(args.package_file) # Scan repository all_matches = scan_repository(args.path, rules, target_packages) From cb294386e14a16d3856d222ce786f08ddccad652 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 05:48:11 -0400 Subject: [PATCH 13/34] refactor(version_scanner): simplify target resolution and remove duplication --- scripts/version_scanner/version_scanner.py | 32 ++++++++-------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index 9b84ace2ef51..33f0699fc85a 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -303,26 +303,21 @@ def main(): args = parser.parse_args() - # Resolve target paths - targets = [] - + # Resolve target packages if filtering is requested + target_packages = [] if args.package: - targets.append(os.path.join(args.path, args.package)) - - if args.package_file: - packages = read_package_file(args.package_file) - for p in packages: - targets.append(os.path.join(args.path, p)) - - # Fallback: if neither package nor package-file is given, use the path directly - if not targets: - targets.append(args.path) + target_packages.append(os.path.join("packages", args.package)) + elif args.package_file: + target_packages = read_package_file(args.package_file) print(f"Starting scan for dependency: {args.dependency} version: {args.version}") print(f"Root path: {args.path}") print(f"Targets to scan:") - for target in targets: - print(f" - {target}") + if target_packages: + for pkg in target_packages: + print(f" - {os.path.join(args.path, pkg)}") + else: + print(f" - {args.path} (all packages)") print(f"Using config: {args.config}") # Load and resolve rules @@ -333,12 +328,7 @@ def main(): for rule in rules: print(f" - {rule['name']}: {rule['pattern']}") - # Resolve target packages if filtering is requested - target_packages = [] - if args.package: - target_packages.append(os.path.join("packages", args.package)) - elif args.package_file: - target_packages = read_package_file(args.package_file) + # Scan repository all_matches = scan_repository(args.path, rules, target_packages) From ea0e8bef93691300e4047d887d53e18000b9c3a5 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 06:07:09 -0400 Subject: [PATCH 14/34] feat(version_scanner): add format_match_for_csv helper and tests --- .../tests/unit/test_version_scanner.py | 17 ++++++++++++ scripts/version_scanner/version_scanner.py | 26 +++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 724897e22b5f..ceadb91cacb1 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -196,6 +196,23 @@ def test_main_package_file_not_found(capsys): assert excinfo.value.code == 1 captured = capsys.readouterr() assert "Error: Package file not found" in captured.err +def test_format_match_for_csv(): + from version_scanner import format_match_for_csv + match = { + "file_path": "google-cloud-python/main/packages/pkg_a/setup.py", + "repo_path": "packages/pkg_a/setup.py", + "line_number": 123, + "rule_name": "test_rule" + } + + # Test without github_repo + formatted = format_match_for_csv(match) + assert formatted["line_number"] == 123 + + # Test with github_repo + formatted = format_match_for_csv(match, github_repo="https://github.com/user/repo", branch="main") + expected_url = "https://github.com/user/repo/blob/main/packages/pkg_a/setup.py#L123" + assert formatted["line_number"] == f'=HYPERLINK("{expected_url}", "123")' def test_regex_examples_from_config(): diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index 33f0699fc85a..cb005f1f8ae3 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -119,6 +119,31 @@ def scan_file(file_path: str, compiled_rules: List[Dict[str, re.Pattern]]) -> Li return results + +def format_match_for_csv( + match: Dict[str, str], + github_repo: str = None, + branch: str = "main" +) -> Dict[str, str]: + """ + Format a match result for CSV output, adding GitHub links if requested. + """ + formatted = match.copy() + + if github_repo: + # Use repo_path if available, fallback to file_path + file_path = match.get("repo_path", match.get("file_path", "")) + line_number = match.get("line_number", "") + + # Construct URL + url = f"{github_repo}/blob/{branch}/{file_path}#L{line_number}" + + # Format as Google Sheets formula + formatted["line_number"] = f'=HYPERLINK("{url}", "{line_number}")' + + return formatted + + def write_csv_report(output_path: str, matches: List[Dict[str, str]]) -> None: """ Write the collected matches to a CSV file. @@ -247,6 +272,7 @@ def scan_repository( for m in matches: m["file_path"] = display_path + m["repo_path"] = rel_file_path m["package_name"] = package_name results.append(m) From a8824afa210dfde5ddcd46f1ecaf4ba8903ed550 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 06:12:23 -0400 Subject: [PATCH 15/34] feat(version_scanner): integrate GitHub link generation into CSV report --- .../tests/unit/test_version_scanner.py | 24 ++++++++++++++++++ scripts/version_scanner/version_scanner.py | 25 ++++++++++++++++--- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index ceadb91cacb1..112308f29732 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -213,6 +213,30 @@ def test_format_match_for_csv(): formatted = format_match_for_csv(match, github_repo="https://github.com/user/repo", branch="main") expected_url = "https://github.com/user/repo/blob/main/packages/pkg_a/setup.py#L123" assert formatted["line_number"] == f'=HYPERLINK("{expected_url}", "123")' +def test_write_csv_report_with_links(tmp_path): + output_file = tmp_path / "report.csv" + matches = [ + { + "file_path": "google-cloud-python/main/packages/pkg_a/setup.py", + "repo_path": "packages/pkg_a/setup.py", + "line_number": 1, + "rule_name": "python_requires_check", + "matched_string": "python_requires = '>=3.7'", + "context_line": "python_requires = '>=3.7'" + } + ] + + from version_scanner import write_csv_report + write_csv_report(str(output_file), matches, github_repo="https://github.com/user/repo", branch="main") + + assert output_file.exists() + + with open(output_file, 'r', encoding='utf-8', newline='') as f: + reader = csv.DictReader(f) + rows = list(reader) + + assert len(rows) == 1 + assert "HYPERLINK" in rows[0]["line_number"] def test_regex_examples_from_config(): diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index cb005f1f8ae3..e4ffb101ac4a 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -144,13 +144,20 @@ def format_match_for_csv( return formatted -def write_csv_report(output_path: str, matches: List[Dict[str, str]]) -> None: +def write_csv_report( + output_path: str, + matches: List[Dict[str, str]], + github_repo: str = None, + branch: str = "main" +) -> None: """ Write the collected matches to a CSV file. Args: output_path: Path to the output CSV file. matches: A list of dictionaries containing match details. + github_repo: Optional GitHub repository URL base. + branch: GitHub branch for links (defaults to main). """ fieldnames = ["file_path", "package_name", "rule_name", "line_number", "matched_string", "context_line"] @@ -160,8 +167,9 @@ def write_csv_report(output_path: str, matches: List[Dict[str, str]]) -> None: writer.writeheader() for match in matches: + formatted_match = format_match_for_csv(match, github_repo, branch) # Ensure only specified fields are written - row = {field: match.get(field, "") for field in fieldnames} + row = {field: formatted_match.get(field, "") for field in fieldnames} writer.writerow(row) print(f"\nReport written to: {output_path}") @@ -327,6 +335,17 @@ def main(): help="Path to the output CSV file (defaults to --.csv)" ) + parser.add_argument( + "--github-repo", + help="GitHub repository URL base (e.g., https://github.com/googleapis/google-cloud-python)" + ) + + parser.add_argument( + "--branch", + default="main", + help="GitHub branch for links (defaults to main)" + ) + args = parser.parse_args() # Resolve target packages if filtering is requested @@ -373,7 +392,7 @@ def main(): timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") output_path = f"{args.dependency}-{args.version}-{timestamp}.csv" - write_csv_report(output_path, all_matches) + write_csv_report(output_path, all_matches, github_repo=args.github_repo, branch=args.branch) if __name__ == "__main__": main() From baafb7455dfed4c3c6f27ff1fcfcf59a43338b94 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 06:23:07 -0400 Subject: [PATCH 16/34] feat(version_scanner): default output to results directory --- scripts/version_scanner/version_scanner.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index e4ffb101ac4a..7f605d9f6325 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -390,7 +390,10 @@ def main(): output_path = args.output else: timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - output_path = f"{args.dependency}-{args.version}-{timestamp}.csv" + script_dir = os.path.dirname(os.path.abspath(__file__)) + results_dir = os.path.join(script_dir, "results") + os.makedirs(results_dir, exist_ok=True) + output_path = os.path.join(results_dir, f"{args.dependency}-{args.version}-{timestamp}.csv") write_csv_report(output_path, all_matches, github_repo=args.github_repo, branch=args.branch) From a1cc08e8afb0fd77c60ebe2e6b959388eab2c740 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 06:42:22 -0400 Subject: [PATCH 17/34] feat(version_scanner): ignore version_scanner directory during scan --- .../tests/unit/test_version_scanner.py | 14 ++++++++++++++ scripts/version_scanner/version_scanner.py | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 112308f29732..82230acb28d6 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -237,6 +237,20 @@ def test_write_csv_report_with_links(tmp_path): assert len(rows) == 1 assert "HYPERLINK" in rows[0]["line_number"] +def test_scan_repository_ignores_version_scanner(tmp_path): + vs_dir = tmp_path / "version_scanner" + vs_dir.mkdir() + f = vs_dir / "test.py" + f.write_text("python_requires = '>=3.7'\n") + + rules = [ + {"name": "python_requires_check", "pattern": "python_requires\\s*=\\s*['\"]>=3\\.7['\"]"} + ] + + from version_scanner import scan_repository + results = scan_repository(str(tmp_path), rules) + + assert len(results) == 0 def test_regex_examples_from_config(): diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index 7f605d9f6325..be8961b95669 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -221,7 +221,7 @@ def scan_repository( Returns: A list of match details. """ - ignore_dirs = {'.git', '__pycache__', '.tox', '.nox', 'venv', '.venv', '.conductor'} + ignore_dirs = {'.git', '__pycache__', '.tox', '.nox', 'venv', '.venv', '.conductor', 'version_scanner'} results = [] # Compile patterns once here From 3ceea9b276d09ba277e0dad8bf58ac314fd7c07e Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 07:06:52 -0400 Subject: [PATCH 18/34] feat(version_scanner): broaden version regex and add case insensitivity --- scripts/version_scanner/regex_config.yaml | 4 +++- scripts/version_scanner/tests/unit/test_version_scanner.py | 2 +- scripts/version_scanner/version_scanner.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/version_scanner/regex_config.yaml b/scripts/version_scanner/regex_config.yaml index e630e0c836df..a18209d10122 100644 --- a/scripts/version_scanner/regex_config.yaml +++ b/scripts/version_scanner/regex_config.yaml @@ -6,9 +6,10 @@ rules: - "'3.7'" - '"3.7.1"' - "'3.7.12'" + - "Python 3.7" rules: - | - ['"]{major}\.{minor}(\.\d+)?['"] + (?:['"]|\s|^){major}\.{minor}(\.\d+)?(?:['"]|\s|$) - name: python_requires description: Finds various forms of python_requires declarations. @@ -83,6 +84,7 @@ rules: examples: - "python3.7" - "python3.7 -m pip" + - "Python3.7" rules: - | python3\.{minor} diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 82230acb28d6..a993c823c2e5 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -286,7 +286,7 @@ def test_regex_examples_from_config(): for template in templates: try: resolved = template.strip().format(**vars) - compiled_patterns.append(re.compile(resolved)) + compiled_patterns.append(re.compile(resolved, re.IGNORECASE)) except KeyError: continue diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index be8961b95669..788f2b61745e 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -230,7 +230,7 @@ def scan_repository( try: compiled_rules.append({ "name": rule["name"], - "pattern": re.compile(rule["pattern"]) + "pattern": re.compile(rule["pattern"], re.IGNORECASE) }) except re.error as e: print(f"Error compiling regex for rule {rule['name']}: {e}", file=sys.stderr) From d756c072e652d6b5889240d2a3985223f9b7ffb5 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 07:18:54 -0400 Subject: [PATCH 19/34] feat(version_scanner): strip newlines from matched strings --- .../tests/unit/test_version_scanner.py | 17 +++++++++++++++++ scripts/version_scanner/version_scanner.py | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index a993c823c2e5..0f926e42721c 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -213,6 +213,23 @@ def test_format_match_for_csv(): formatted = format_match_for_csv(match, github_repo="https://github.com/user/repo", branch="main") expected_url = "https://github.com/user/repo/blob/main/packages/pkg_a/setup.py#L123" assert formatted["line_number"] == f'=HYPERLINK("{expected_url}", "123")' + + +def test_scan_file_removes_newline_from_match(tmp_path): + test_file = tmp_path / "test.py" + test_file.write_text("Python 3.7\n") + + rules = [ + {"name": "explicit_version_string", "pattern": re.compile(r"(?:['\"]|\s|^)3\.7(\.\d+)?(?:['\"]|\s|$)")} + ] + + from version_scanner import scan_file + results = scan_file(str(test_file), rules) + + assert len(results) == 1 + assert "\n" not in results[0]["matched_string"] + + def test_write_csv_report_with_links(tmp_path): output_file = tmp_path / "report.csv" matches = [ diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index 788f2b61745e..a62822f03ef7 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -111,7 +111,7 @@ def scan_file(file_path: str, compiled_rules: List[Dict[str, re.Pattern]]) -> Li results.append({ "rule_name": rule["name"], "line_number": line_num, - "matched_string": match.group(0), + "matched_string": match.group(0).strip(), "context_line": line.strip() }) except IOError as e: From 075d04b5c274b33eda940b9fbd447888d95e7ddf Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 07:43:37 -0400 Subject: [PATCH 20/34] feat(version_scanner): add word boundaries and truncate long context lines --- scripts/version_scanner/regex_config.yaml | 2 +- .../tests/unit/test_version_scanner.py | 20 +++++++++++++++++++ scripts/version_scanner/version_scanner.py | 16 +++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/scripts/version_scanner/regex_config.yaml b/scripts/version_scanner/regex_config.yaml index a18209d10122..6322b8a1caa8 100644 --- a/scripts/version_scanner/regex_config.yaml +++ b/scripts/version_scanner/regex_config.yaml @@ -76,7 +76,7 @@ rules: - "py37-cover" rules: - | - py3{minor} + \bpy3{minor}\b - name: explicit_python_command description: Finds explicit python commands with version. diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 0f926e42721c..d5488ab9e918 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -215,6 +215,26 @@ def test_format_match_for_csv(): assert formatted["line_number"] == f'=HYPERLINK("{expected_url}", "123")' +def test_format_match_for_csv_truncates_long_line(): + from version_scanner import format_match_for_csv + + long_line = "a" * 1000 + "PY37" + "b" * 1000 + match = { + "file_path": "test.py", + "line_number": 1, + "rule_name": "test_rule", + "matched_string": "PY37", + "context_line": long_line + } + + formatted = format_match_for_csv(match) + context = formatted["context_line"] + + assert len(context) <= 600 + assert "PY37" in context + assert "..." in context + + def test_scan_file_removes_newline_from_match(tmp_path): test_file = tmp_path / "test.py" test_file.write_text("Python 3.7\n") diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index a62822f03ef7..4dd8b09739e8 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -141,6 +141,22 @@ def format_match_for_csv( # Format as Google Sheets formula formatted["line_number"] = f'=HYPERLINK("{url}", "{line_number}")' + context = formatted.get("context_line", "") + matched = formatted.get("matched_string", "") + + if len(context) > 500: + match_start = context.find(matched) + if match_start != -1: + start = max(0, match_start - 200) + end = min(len(context), match_start + len(matched) + 200) + + prefix = "..." if start > 0 else "" + suffix = "..." if end < len(context) else "" + + formatted["context_line"] = prefix + context[start:end] + suffix + else: + formatted["context_line"] = context[:500] + "..." + return formatted From 85e9ff52bad093a790cfc019af1ded258551a7ee Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 07:59:21 -0400 Subject: [PATCH 21/34] feat(version_scanner): add console summary table --- .../tests/unit/test_version_scanner.py | 15 +++++++ scripts/version_scanner/version_scanner.py | 39 +++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index d5488ab9e918..9df4f9a90c07 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -235,6 +235,21 @@ def test_format_match_for_csv_truncates_long_line(): assert "..." in context +def test_get_match_counts(): + from version_scanner import get_match_counts + + matches = [ + {"rule_name": "rule1", "package_name": "pkg1"}, + {"rule_name": "rule1", "package_name": "pkg2"}, + {"rule_name": "rule2", "package_name": "pkg1"}, + ] + + rule_counts, package_counts = get_match_counts(matches) + + assert rule_counts == {"rule1": 2, "rule2": 1} + assert package_counts == {"pkg1": 2, "pkg2": 1} + + def test_scan_file_removes_newline_from_match(tmp_path): test_file = tmp_path / "test.py" test_file.write_text("Python 3.7\n") diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index 4dd8b09739e8..7ba9bb1f59e7 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -160,6 +160,41 @@ def format_match_for_csv( return formatted +def get_match_counts(matches: List[Dict[str, str]]) -> Tuple[Dict[str, int], Dict[str, int]]: + """ + Aggregate matches by rule and by package. + """ + rule_counts = {} + package_counts = {} + for m in matches: + r = m.get("rule_name") + p = m.get("package_name") + rule_counts[r] = rule_counts.get(r, 0) + 1 + package_counts[p] = package_counts.get(p, 0) + 1 + return rule_counts, package_counts + + +def print_summary_table(rule_counts: Dict[str, int], package_counts: Dict[str, int]) -> None: + """ + Print a summary table to the console. + """ + print("\n=== Scan Summary ===") + print(f"{'Rule Name':<30} {'Matches':<10}") + print("-" * 42) + for rule, count in sorted(rule_counts.items(), key=lambda x: x[1], reverse=True): + print(f"{rule:<30} {count:<10}") + + print(f"\n{'Package Name':<40} {'Matches':<10}") + print("-" * 52) + sorted_packages = sorted(package_counts.items(), key=lambda x: x[1], reverse=True) + for pkg, count in sorted_packages[:10]: + display_name = pkg if pkg else '[Root/None]' + print(f"{display_name:<40} {count:<10}") + + if len(sorted_packages) > 10: + print(f'... and {len(sorted_packages) - 10} more packages.') + + def write_csv_report( output_path: str, matches: List[Dict[str, str]], @@ -401,6 +436,10 @@ def main(): if len(all_matches) > 10: print(f" ... and {len(all_matches) - 10} more matches.") + # Get and print summary counts + rule_counts, package_counts = get_match_counts(all_matches) + print_summary_table(rule_counts, package_counts) + # Write report if args.output: output_path = args.output From 5c8f673dc2e8a5a09e16bc117ec95901671676f7 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 08:03:13 -0400 Subject: [PATCH 22/34] feat(version_scanner): add .scannerignore file support --- .scannerignore | 4 +++ .../tests/unit/test_version_scanner.py | 11 +++++++ scripts/version_scanner/version_scanner.py | 32 +++++++++++++++++-- 3 files changed, 44 insertions(+), 3 deletions(-) create mode 100644 .scannerignore diff --git a/.scannerignore b/.scannerignore new file mode 100644 index 000000000000..9a192cfa425a --- /dev/null +++ b/.scannerignore @@ -0,0 +1,4 @@ +# Directories to ignore by the version scanner +# (defaults like .git, __pycache__, .tox, .nox, venv, .venv, .conductor, version_scanner are already ignored) +docs +samples diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 9df4f9a90c07..9852df7a7e67 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -305,6 +305,17 @@ def test_scan_repository_ignores_version_scanner(tmp_path): assert len(results) == 0 +def test_load_ignore_file(tmp_path): + from version_scanner import load_ignore_file + + ignore_file = tmp_path / ".scannerignore" + ignore_file.write_text("dir1\n# comment\n \ndir2\n") + + ignore_dirs = load_ignore_file(str(ignore_file)) + + assert ignore_dirs == ["dir1", "dir2"] + + def test_regex_examples_from_config(): """Test that examples in config match at least one rule in the group.""" config_path = "regex_config.yaml" diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index 7ba9bb1f59e7..fb84fdfaa7f7 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -195,6 +195,22 @@ def print_summary_table(rule_counts: Dict[str, int], package_counts: Dict[str, i print(f'... and {len(sorted_packages) - 10} more packages.') +def load_ignore_file(file_path: str) -> List[str]: + """ + Read ignore paths from a file. + """ + ignore_dirs = [] + try: + with open(file_path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if line and not line.startswith('#'): + ignore_dirs.append(line) + except FileNotFoundError: + pass + return ignore_dirs + + def write_csv_report( output_path: str, matches: List[Dict[str, str]], @@ -258,7 +274,8 @@ def read_package_file(file_path: str) -> List[str]: def scan_repository( root_path: str, rules: List[Dict[str, str]], - target_packages: List[str] = None + target_packages: List[str] = None, + ignore_dirs: List[str] = None ) -> List[Dict[str, str]]: """ Scan repository for matching patterns. @@ -272,7 +289,10 @@ def scan_repository( Returns: A list of match details. """ - ignore_dirs = {'.git', '__pycache__', '.tox', '.nox', 'venv', '.venv', '.conductor', 'version_scanner'} + defaults = {'.git', '__pycache__', '.tox', '.nox', 'venv', '.venv', '.conductor', 'version_scanner'} + if ignore_dirs: + defaults.update(ignore_dirs) + ignore_dirs = defaults results = [] # Compile patterns once here @@ -426,8 +446,14 @@ def main(): + # Load ignore file + ignore_file_path = os.path.join(args.path, ".scannerignore") + ignore_dirs = load_ignore_file(ignore_file_path) + if ignore_dirs: + print(f"Loaded {len(ignore_dirs)} ignore patterns from {ignore_file_path}") + # Scan repository - all_matches = scan_repository(args.path, rules, target_packages) + all_matches = scan_repository(args.path, rules, target_packages, ignore_dirs) print(f"\nFound {len(all_matches)} matches.") for m in all_matches[:10]: # Show first 10 From efb33312698acb1ceb1d7e6ba412b14726b341a7 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 08:17:02 -0400 Subject: [PATCH 23/34] feat(version_scanner): move ignore defaults to .scannerignore file --- .scannerignore | 9 ++++++++- .../version_scanner/tests/unit/test_version_scanner.py | 2 +- scripts/version_scanner/version_scanner.py | 5 +---- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.scannerignore b/.scannerignore index 9a192cfa425a..190ff2285404 100644 --- a/.scannerignore +++ b/.scannerignore @@ -1,4 +1,11 @@ # Directories to ignore by the version scanner -# (defaults like .git, __pycache__, .tox, .nox, venv, .venv, .conductor, version_scanner are already ignored) +.git +__pycache__ +.tox +.nox +venv +.venv +.conductor +version_scanner docs samples diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 9852df7a7e67..28aa652d5e08 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -300,7 +300,7 @@ def test_scan_repository_ignores_version_scanner(tmp_path): ] from version_scanner import scan_repository - results = scan_repository(str(tmp_path), rules) + results = scan_repository(str(tmp_path), rules, ignore_dirs=['version_scanner']) assert len(results) == 0 diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index fb84fdfaa7f7..70d04f40c8ed 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -289,10 +289,7 @@ def scan_repository( Returns: A list of match details. """ - defaults = {'.git', '__pycache__', '.tox', '.nox', 'venv', '.venv', '.conductor', 'version_scanner'} - if ignore_dirs: - defaults.update(ignore_dirs) - ignore_dirs = defaults + ignore_dirs = set(ignore_dirs) if ignore_dirs else set() results = [] # Compile patterns once here From bf39072ae506fb96f344a24fbefc18ea92df5b80 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 08:17:09 -0400 Subject: [PATCH 24/34] docs(version_scanner): add README.md --- scripts/version_scanner/README.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 scripts/version_scanner/README.md diff --git a/scripts/version_scanner/README.md b/scripts/version_scanner/README.md new file mode 100644 index 000000000000..733e83a6f6fe --- /dev/null +++ b/scripts/version_scanner/README.md @@ -0,0 +1,31 @@ +# Automated Dependency Version Scanner + +This tool scans the repository for hardcoded references to specific dependency versions (like Python 3.7) that need to be upgraded or removed. + +## Usage + +Run the script from the repository root: + +```bash +python3 scripts/version_scanner/version_scanner.py -d -v [options] +``` + +### Options + +* `-d`, `--dependency`: The dependency name (e.g., `python`). +* `-v`, `--version`: The specific version to search for (e.g., `3.7`). +* `-p`, `--path`: Root directory to scan (defaults to current directory). +* `--package`: Specific package directory to scan (useful for testing). +* `--package-file`: Path to a file containing a list of package directories to scan. +* `--config`: Path to the regex configuration file (defaults to `regex_config.yaml`). +* `-o`, `--output`: Path to the output CSV file. +* `--github-repo`: GitHub repository URL base for generating links. +* `--branch`: GitHub branch for links (defaults to `main`). + +## Configuration + +The scanner uses a YAML configuration file (`regex_config.yaml`) to define rules and regex patterns. + +## Ignoring Directories + +You can create a `.scannerignore` file in the directory you are scanning (usually the repo root) to list directories to skip, one per line. From 9d9ce226901484af66362e94dfc28c8b7cb49943 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 08:23:13 -0400 Subject: [PATCH 25/34] docs(version_scanner): update README options and CLI help strings --- scripts/version_scanner/README.md | 18 +++++++++--------- scripts/version_scanner/version_scanner.py | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/scripts/version_scanner/README.md b/scripts/version_scanner/README.md index 733e83a6f6fe..ff032f79f7fe 100644 --- a/scripts/version_scanner/README.md +++ b/scripts/version_scanner/README.md @@ -12,15 +12,15 @@ python3 scripts/version_scanner/version_scanner.py -d -v ### Options -* `-d`, `--dependency`: The dependency name (e.g., `python`). -* `-v`, `--version`: The specific version to search for (e.g., `3.7`). -* `-p`, `--path`: Root directory to scan (defaults to current directory). -* `--package`: Specific package directory to scan (useful for testing). -* `--package-file`: Path to a file containing a list of package directories to scan. -* `--config`: Path to the regex configuration file (defaults to `regex_config.yaml`). -* `-o`, `--output`: Path to the output CSV file. -* `--github-repo`: GitHub repository URL base for generating links. -* `--branch`: GitHub branch for links (defaults to `main`). +* `-d`, `--dependency`: Name of the dependency (e.g., python, protobuf) +* `-v`, `--version`: Specific version to search for (e.g., 3.7, 4.25.8) +* `-p`, `--path`: Root directory to scan (defaults to current directory) +* `--package`: Specific subdirectory filter (useful for monorepos) +* `--package-file`: Path to a file containing a list of package directories to scan +* `--config`: Path to the regex configuration file (defaults to regex_config.yaml) +* `-o`, `--output`: Path to the output CSV file (defaults to --.csv) +* `--github-repo`: GitHub repository URL base (e.g., https://github.com/googleapis/google-cloud-python) +* `--branch`: GitHub branch for links (defaults to main) ## Configuration diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index 70d04f40c8ed..22bf61d33116 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -395,7 +395,7 @@ def main(): parser.add_argument( "--config", default="regex_config.yaml", - help="Path to the regex configuration file" + help="Path to the regex configuration file (defaults to regex_config.yaml)" ) parser.add_argument( From 14e4dcc9917f303597ae16f83780921021f35615 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 08:26:31 -0400 Subject: [PATCH 26/34] feat(version_scanner): set default for --github-repo --- scripts/version_scanner/README.md | 2 +- scripts/version_scanner/version_scanner.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/version_scanner/README.md b/scripts/version_scanner/README.md index ff032f79f7fe..76c4ede25e75 100644 --- a/scripts/version_scanner/README.md +++ b/scripts/version_scanner/README.md @@ -19,7 +19,7 @@ python3 scripts/version_scanner/version_scanner.py -d -v * `--package-file`: Path to a file containing a list of package directories to scan * `--config`: Path to the regex configuration file (defaults to regex_config.yaml) * `-o`, `--output`: Path to the output CSV file (defaults to --.csv) -* `--github-repo`: GitHub repository URL base (e.g., https://github.com/googleapis/google-cloud-python) +* `--github-repo`: GitHub repository URL base (defaults to https://github.com/googleapis/google-cloud-python) * `--branch`: GitHub branch for links (defaults to main) ## Configuration diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index 22bf61d33116..55197aa08db1 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -405,7 +405,8 @@ def main(): parser.add_argument( "--github-repo", - help="GitHub repository URL base (e.g., https://github.com/googleapis/google-cloud-python)" + default="https://github.com/googleapis/google-cloud-python", + help="GitHub repository URL base (defaults to https://github.com/googleapis/google-cloud-python)" ) parser.add_argument( From 7fc03ca6da2a159156580ecdf2cc555a07473a78 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 08:36:49 -0400 Subject: [PATCH 27/34] feat(version_scanner): default config path to script directory --- scripts/version_scanner/README.md | 2 +- scripts/version_scanner/version_scanner.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/version_scanner/README.md b/scripts/version_scanner/README.md index 76c4ede25e75..85d59aa6dc45 100644 --- a/scripts/version_scanner/README.md +++ b/scripts/version_scanner/README.md @@ -17,7 +17,7 @@ python3 scripts/version_scanner/version_scanner.py -d -v * `-p`, `--path`: Root directory to scan (defaults to current directory) * `--package`: Specific subdirectory filter (useful for monorepos) * `--package-file`: Path to a file containing a list of package directories to scan -* `--config`: Path to the regex configuration file (defaults to regex_config.yaml) +* `--config`: Path to the regex configuration file (defaults to scripts/version_scanner/regex_config.yaml) * `-o`, `--output`: Path to the output CSV file (defaults to --.csv) * `--github-repo`: GitHub repository URL base (defaults to https://github.com/googleapis/google-cloud-python) * `--branch`: GitHub branch for links (defaults to main) diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index 55197aa08db1..67a1cdebad94 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -356,6 +356,9 @@ def scan_repository( def main(): + script_dir = os.path.dirname(os.path.abspath(__file__)) + default_config = os.path.join(script_dir, "regex_config.yaml") + parser = argparse.ArgumentParser( description="Scan repository for references to specific dependency versions." ) @@ -394,8 +397,8 @@ def main(): parser.add_argument( "--config", - default="regex_config.yaml", - help="Path to the regex configuration file (defaults to regex_config.yaml)" + default=default_config, + help="Path to the regex configuration file (defaults to scripts/version_scanner/regex_config.yaml)" ) parser.add_argument( From f64eac4126e68c29957df641a80619b405c419e1 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 09:29:30 -0400 Subject: [PATCH 28/34] feat(version_scanner): support case-insensitive file ignores and add changelog.md --- .scannerignore | 3 +- scripts/version_scanner/version_scanner.py | 80 +++++++++++++++++++++- 2 files changed, 79 insertions(+), 4 deletions(-) diff --git a/.scannerignore b/.scannerignore index 190ff2285404..e36afb521638 100644 --- a/.scannerignore +++ b/.scannerignore @@ -1,4 +1,4 @@ -# Directories to ignore by the version scanner +# Directories and files to ignore by the version scanner .git __pycache__ .tox @@ -9,3 +9,4 @@ venv version_scanner docs samples +changelog.md diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index 67a1cdebad94..a15090339e61 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -12,6 +12,8 @@ import sys from typing import Dict, List, Tuple import yaml +import google.auth +from googleapiclient.discovery import build class ConfigManager: """Handles loading and interpolation of regex configurations.""" @@ -242,6 +244,66 @@ def write_csv_report( print(f"\nReport written to: {output_path}") except IOError as e: print(f"Error writing CSV report: {e}", file=sys.stderr) + + +def upload_to_drive(csv_path: str, matches: List[Dict[str, str]], github_repo: str = None, branch: str = "main") -> str: + """ + Upload matches to a Google Sheet in Drive. + """ + print("\nUploading to Google Drive...") + try: + credentials, project = google.auth.default( + scopes=['https://www.googleapis.com/auth/drive', 'https://www.googleapis.com/auth/spreadsheets'] + ) + + service = build('sheets', 'v4', credentials=credentials) + + # Create a new spreadsheet + title = os.path.basename(csv_path).replace('.csv', '') + spreadsheet = { + 'properties': { + 'title': title + } + } + spreadsheet = service.spreadsheets().create(body=spreadsheet, fields='spreadsheetUrl,spreadsheetId').execute() + url = spreadsheet.get('spreadsheetUrl') + spreadsheet_id = spreadsheet.get('spreadsheetId') + + # Prepare data + values = [["file_path", "package_name", "rule_name", "line_number", "matched_string", "context_line"]] + for m in matches: + formatted_m = format_match_for_csv(m, github_repo=github_repo, branch=branch) + values.append([ + formatted_m.get("file_path", ""), + formatted_m.get("package_name", ""), + formatted_m.get("rule_name", ""), + str(formatted_m.get("line_number", "")), + formatted_m.get("matched_string", ""), + formatted_m.get("context_line", "") + ]) + + body = { + 'values': values + } + + # Update values + service.spreadsheets().values().update( + spreadsheetId=spreadsheet_id, + range='Sheet1!A1', + valueInputOption='USER_ENTERED', + body=body + ).execute() + + print(f"Successfully uploaded to Google Sheet: {url}") + return url + + except Exception as e: + import traceback + traceback.print_exc() + print(f"Error uploading to Google Drive: {e}", file=sys.stderr) + return "" + + def read_package_file(file_path: str) -> List[str]: """ Read package paths from a file. @@ -289,7 +351,7 @@ def scan_repository( Returns: A list of match details. """ - ignore_dirs = set(ignore_dirs) if ignore_dirs else set() + ignore_lower = {i.lower() for i in ignore_dirs} if ignore_dirs else set() results = [] # Compile patterns once here @@ -309,8 +371,11 @@ def scan_repository( print(f"Filtering for packages: {target_packages}") for root, dirs, files in os.walk(root_path): - # Prune ignore directories - dirs[:] = [d for d in dirs if d not in ignore_dirs] + # Prune ignore directories (case-insensitive) + dirs[:] = [d for d in dirs if d.lower() not in ignore_lower] + + # Filter ignore files (case-insensitive) + files = [f for f in files if f.lower() not in ignore_lower] rel_root = os.path.relpath(root, root_path) parts = rel_root.split(os.sep) @@ -418,6 +483,12 @@ def main(): help="GitHub branch for links (defaults to main)" ) + parser.add_argument( + "--upload", + action="store_true", + help="Upload results to a Google Sheet in Drive" + ) + args = parser.parse_args() # Resolve target packages if filtering is requested @@ -478,6 +549,9 @@ def main(): output_path = os.path.join(results_dir, f"{args.dependency}-{args.version}-{timestamp}.csv") write_csv_report(output_path, all_matches, github_repo=args.github_repo, branch=args.branch) + + if args.upload: + upload_to_drive(output_path, all_matches, github_repo=args.github_repo, branch=args.branch) if __name__ == "__main__": main() From fc47dd6551295fde71abccc3619c89f083dcefbf Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 09:29:59 -0400 Subject: [PATCH 29/34] feat(version_scanner): update small package list for demos --- scripts/version_scanner/small_package_list.txt | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 scripts/version_scanner/small_package_list.txt diff --git a/scripts/version_scanner/small_package_list.txt b/scripts/version_scanner/small_package_list.txt new file mode 100644 index 000000000000..06109994b7fa --- /dev/null +++ b/scripts/version_scanner/small_package_list.txt @@ -0,0 +1,5 @@ +packages/google-cloud-access-context-manager +packages/google-cloud-bigtable +packages/google-cloud-biglake-hive +packages/google-cloud-documentai-toolbox +packages/google-cloud-core From 9289c8cc2358687924f653300d630948960877b9 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 30 Apr 2026 10:14:28 -0400 Subject: [PATCH 30/34] feat(version_scanner): add combined_version_string rule and use word boundaries for explicit_version_string --- scripts/version_scanner/regex_config.yaml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/scripts/version_scanner/regex_config.yaml b/scripts/version_scanner/regex_config.yaml index 6322b8a1caa8..07196c63edeb 100644 --- a/scripts/version_scanner/regex_config.yaml +++ b/scripts/version_scanner/regex_config.yaml @@ -9,7 +9,7 @@ rules: - "Python 3.7" rules: - | - (?:['"]|\s|^){major}\.{minor}(\.\d+)?(?:['"]|\s|$) + \b{major}\.{minor}(\.\d+)?\b - name: python_requires description: Finds various forms of python_requires declarations. @@ -89,4 +89,14 @@ rules: - | python3\.{minor} + - name: combined_version_string + description: Finds combined version strings often used in class or variable names. + applies_to: [python] + examples: + - "Python37" + - "Python37DeprecationWarning" + rules: + - | + Python{major}{minor} + From d771258173d96417181b3bd3f40972c360676d89 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Fri, 1 May 2026 08:23:37 -0400 Subject: [PATCH 31/34] feat(scanner): add ability to detect ignore pragma --- .../tests/unit/test_version_scanner.py | 61 +++++++++++++++++++ scripts/version_scanner/version_scanner.py | 2 + 2 files changed, 63 insertions(+) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 28aa652d5e08..1ce84c8de118 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -1,6 +1,7 @@ import csv import os import re +from unittest import mock from unittest.mock import patch import pytest import yaml @@ -55,6 +56,17 @@ def test_scan_file_negative(tmp_path): results = scan_file(str(test_file), rules) assert len(results) == 0 +def test_scan_file_ignores_pragma(tmp_path): + test_file = tmp_path / "test.py" + test_file.write_text("python_requires = '>=3.7' # version-scanner: ignore\n") + + rules = [ + {"name": "python_requires_check", "pattern": re.compile(r"python_requires\s*=\s*['\"]>=3\.7['\"]")} + ] + + results = scan_file(str(test_file), rules) + assert len(results) == 0 + # Test directory scan simulation def test_directory_scan(tmp_path): # Create dummy files @@ -316,6 +328,55 @@ def test_load_ignore_file(tmp_path): assert ignore_dirs == ["dir1", "dir2"] +@mock.patch('version_scanner.build') +@mock.patch('google.auth.default') +def test_upload_to_drive(mock_auth, mock_build): + from unittest import mock + + mock_creds = mock.Mock() + mock_creds.universe_domain = "googleapis.com" + mock_creds.create_scoped.return_value = mock_creds + + mock_auth_http = mock.Mock() + mock_auth_http.credentials = mock_creds + mock_creds.authorize.return_value = mock_auth_http + + mock_auth.return_value = (mock_creds, "project-id") + + mock_sheets = mock.Mock() + mock_build.return_value = mock_sheets + + mock_spreadsheets = mock.Mock() + mock_sheets.spreadsheets.return_value = mock_spreadsheets + + mock_create = mock.Mock() + mock_spreadsheets.create.return_value = mock_create + mock_create.execute.return_value = {"spreadsheetUrl": "http://example.com"} + + mock_values = mock.Mock() + mock_spreadsheets.values.return_value = mock_values + mock_update = mock.Mock() + mock_values.update.return_value = mock_update + mock_update.execute.return_value = {} + + from version_scanner import upload_to_drive + + matches = [{"rule_name": "r1", "package_name": "p1", "file_path": "f1", "line_number": 1, "matched_string": "s1", "context_line": "c1"}] + + url = upload_to_drive("test.csv", matches, github_repo="https://github.com/user/repo") + + assert url == "http://example.com" + mock_spreadsheets.create.assert_called_once() + + # Verify that update was called with hyperlink formula + mock_values.update.assert_called_once() + args, kwargs = mock_values.update.call_args + body = kwargs.get('body', {}) + values = body.get('values', []) + assert len(values) > 1 + assert "HYPERLINK" in values[1][3] # line_number is at index 3 + + def test_regex_examples_from_config(): """Test that examples in config match at least one rule in the group.""" config_path = "regex_config.yaml" diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index a15090339e61..c74dd4df834a 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -107,6 +107,8 @@ def scan_file(file_path: str, compiled_rules: List[Dict[str, re.Pattern]]) -> Li try: with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: for line_num, line in enumerate(f, 1): + if "version-scanner: ignore" in line: + continue for rule in compiled_rules: match = rule["pattern"].search(line) if match: From bafae70ac1b8f1a13ad13bb8da7c3a180ad7b038 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Fri, 1 May 2026 09:53:58 -0400 Subject: [PATCH 32/34] feat(scanner): move .scannerignore to script directory and update lookup logic --- .../version_scanner/.scannerignore | 0 .../tests/unit/test_version_scanner.py | 19 +++++++++++++++++++ scripts/version_scanner/version_scanner.py | 5 +++-- 3 files changed, 22 insertions(+), 2 deletions(-) rename .scannerignore => scripts/version_scanner/.scannerignore (100%) diff --git a/.scannerignore b/scripts/version_scanner/.scannerignore similarity index 100% rename from .scannerignore rename to scripts/version_scanner/.scannerignore diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 1ce84c8de118..51f9b61eaa78 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -327,6 +327,25 @@ def test_load_ignore_file(tmp_path): assert ignore_dirs == ["dir1", "dir2"] +@mock.patch('version_scanner.load_ignore_file') +@mock.patch('version_scanner.scan_repository') +def test_main_loads_ignore_from_script_dir(mock_scan, mock_load_ignore): + mock_load_ignore.return_value = [] + mock_scan.return_value = [] + + import sys + test_args = ["version_scanner.py", "-d", "python", "-v", "3.7"] + + with mock.patch('sys.argv', test_args): + from version_scanner import main + main() + + mock_load_ignore.assert_called_once() + args, kwargs = mock_load_ignore.call_args + path = args[0] + assert ".scannerignore" in path + assert "scripts/version_scanner" in path + @mock.patch('version_scanner.build') @mock.patch('google.auth.default') diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index c74dd4df834a..ad6b0b890d46 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -520,8 +520,9 @@ def main(): - # Load ignore file - ignore_file_path = os.path.join(args.path, ".scannerignore") + # Load ignore file from script directory (Option A) + script_dir = os.path.dirname(os.path.abspath(__file__)) + ignore_file_path = os.path.join(script_dir, ".scannerignore") ignore_dirs = load_ignore_file(ignore_file_path) if ignore_dirs: print(f"Loaded {len(ignore_dirs)} ignore patterns from {ignore_file_path}") From 94174bb21c4277bf591baab0603757c3bd24fd9e Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Fri, 1 May 2026 14:28:16 -0400 Subject: [PATCH 33/34] chore(scanner): ignore repositories.bzl in scanner --- scripts/version_scanner/.scannerignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/version_scanner/.scannerignore b/scripts/version_scanner/.scannerignore index e36afb521638..43ca102e0333 100644 --- a/scripts/version_scanner/.scannerignore +++ b/scripts/version_scanner/.scannerignore @@ -10,3 +10,7 @@ version_scanner docs samples changelog.md +.librarian +goldens +# Ignore pandoc references in repositories.bzl +repositories.bzl From d652dbf30cc9440637c8735fd946e3cc3d06f2f3 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Fri, 1 May 2026 15:18:04 -0400 Subject: [PATCH 34/34] feat(scanner): add filename scanning support --- .../tests/unit/test_version_scanner.py | 24 +++++++++++++++++++ scripts/version_scanner/version_scanner.py | 21 ++++++++++++++-- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 51f9b61eaa78..32042d34b137 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -67,6 +67,30 @@ def test_scan_file_ignores_pragma(tmp_path): results = scan_file(str(test_file), rules) assert len(results) == 0 +def test_scan_file_ignores_next_line(tmp_path): + test_file = tmp_path / "test.py" + test_file.write_text("# version-scanner: ignore-next-line\npython_requires = '>=3.7'\n") + + rules = [ + {"name": "python_requires_check", "pattern": re.compile(r"python_requires\s*=\s*['\"]>=3\.7['\"]")} + ] + + results = scan_file(str(test_file), rules) + assert len(results) == 0 + +def test_scan_repository_flags_filename(tmp_path): + test_file = tmp_path / "test-3.9.txt" + test_file.write_text("clean content\n") + + rules = [] + + from version_scanner import scan_repository + results = scan_repository(str(tmp_path), rules, version_string="3.9") + + assert len(results) == 1 + assert results[0]["rule_name"] == "filename_match" + assert results[0]["matched_string"] == "3.9" + # Test directory scan simulation def test_directory_scan(tmp_path): # Create dummy files diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index ad6b0b890d46..1d3916973467 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -106,7 +106,14 @@ def scan_file(file_path: str, compiled_rules: List[Dict[str, re.Pattern]]) -> Li try: with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + skip_next = False for line_num, line in enumerate(f, 1): + if skip_next: + skip_next = False + continue + if "version-scanner: ignore-next-line" in line: + skip_next = True + continue if "version-scanner: ignore" in line: continue for rule in compiled_rules: @@ -339,7 +346,8 @@ def scan_repository( root_path: str, rules: List[Dict[str, str]], target_packages: List[str] = None, - ignore_dirs: List[str] = None + ignore_dirs: List[str] = None, + version_string: str = None ) -> List[Dict[str, str]]: """ Scan repository for matching patterns. @@ -398,6 +406,15 @@ def scan_repository( file_path = os.path.join(root, file) matches = scan_file(file_path, compiled_rules) + # Add filename match if applicable + if version_string and version_string in file: + matches.append({ + "rule_name": "filename_match", + "line_number": 0, + "matched_string": version_string, + "context_line": f"Filename contains {version_string}" + }) + # Compute display path and package name rel_file_path = os.path.relpath(file_path, root_path) @@ -528,7 +545,7 @@ def main(): print(f"Loaded {len(ignore_dirs)} ignore patterns from {ignore_file_path}") # Scan repository - all_matches = scan_repository(args.path, rules, target_packages, ignore_dirs) + all_matches = scan_repository(args.path, rules, target_packages, ignore_dirs, version_string=args.version) print(f"\nFound {len(all_matches)} matches.") for m in all_matches[:10]: # Show first 10