From eef81103bcd3073a4733576289f699e91960fac6 Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Tue, 12 Oct 2021 17:33:00 -0400 Subject: [PATCH 01/17] WIP: convert issue food chain to generators mypy is happy and unit tests have been fixed up so they pass again (mostly by liberal application of `list()` to force execution of the generator). This still needs work to fix the logic where it is assumed that the `issues` property is a list, namely where we try to take the length of it or consume it more than once. --- tartufo/scanner.py | 49 +++++++++++++-------------- tartufo/util.py | 18 ++++++++-- tests/test_base_scanner.py | 60 ++++++++++++++++------------------ tests/test_folder_scanner.py | 5 +-- tests/test_git_repo_scanner.py | 3 +- 5 files changed, 71 insertions(+), 64 deletions(-) diff --git a/tartufo/scanner.py b/tartufo/scanner.py index 065f2e34..c2c89027 100755 --- a/tartufo/scanner.py +++ b/tartufo/scanner.py @@ -131,7 +131,7 @@ class ScannerBase(abc.ABC): all the individual pieces of content to be scanned. """ - _issues: Optional[List[Issue]] = None + _issue_count: int = -1 _included_paths: Optional[List[Pattern]] = None _excluded_paths: Optional[List[Pattern]] = None _excluded_entropy: Optional[List[Rule]] = None @@ -144,18 +144,16 @@ def __init__(self, options: types.GlobalOptions) -> None: self.logger = logging.getLogger(__name__) @property - def issues(self) -> List[Issue]: + def issues(self) -> Generator[Issue, None, None]: """Get a list of issues found during the scan. If a scan has not yet been run, run it. :return: Any issues found during the scan. - :rtype: List[Issue] """ - if self._issues is None: + if self._issue_count < 0: self.logger.debug("Issues called before scan. Calling scan now.") - self._issues = self.scan() - return self._issues + yield from self.scan() @property def included_paths(self) -> List[Pattern]: @@ -345,7 +343,7 @@ def calculate_entropy(self, data: str, char_set: str) -> float: entropy += -prob_x * math.log2(prob_x) return entropy - def scan(self) -> List[Issue]: + def scan(self) -> Generator[Issue, None, None]: """Run the requested scans against the target data. This will iterate through all chunks of data as provided by the scanner @@ -355,7 +353,7 @@ def scan(self) -> List[Issue]: :raises types.TartufoConfigException: If there were problems with the scanner's configuration """ - issues: List[Issue] = [] + if not any((self.global_options.entropy, self.global_options.regex)): self.logger.error("No analysis requested.") raise types.ConfigException("No analysis requested.") @@ -364,47 +362,44 @@ def scan(self) -> List[Issue]: raise types.ConfigException("Regex checks requested, but no regexes found.") self.logger.info("Starting scan...") + self._issue_count = 0 for chunk in self.chunks: # Run regex scans first to trigger a potential fast fail for bad config if self.global_options.regex and self.rules_regexes: - issues += self.scan_regex(chunk) + yield from self.scan_regex(chunk) if self.global_options.entropy: - issues += self.scan_entropy( + yield from self.scan_entropy( chunk, self.global_options.b64_entropy_score, self.global_options.hex_entropy_score, ) - self._issues = issues - self.logger.info("Found %d issues.", len(self._issues)) - return self._issues + self.logger.info("Found %d issues.", self._issue_count)) def scan_entropy( self, chunk: types.Chunk, b64_entropy_score: float, hex_entropy_score: float - ) -> List[Issue]: + ) -> Generator[Issue, None, None]: """Scan a chunk of data for apparent high entropy. :param chunk: The chunk of data to be scanned :param b64_entropy_score: Base64 entropy score :param hex_entropy_score: Hexadecimal entropy score """ - issues: List[Issue] = [] + for line in chunk.contents.split("\n"): for word in line.split(): b64_strings = util.get_strings_of_set(word, BASE64_CHARS) hex_strings = util.get_strings_of_set(word, HEX_CHARS) for string in b64_strings: - issues += self.evaluate_entropy_string( + yield from self.evaluate_entropy_string( chunk, line, string, BASE64_CHARS, b64_entropy_score ) for string in hex_strings: - issues += self.evaluate_entropy_string( + yield from self.evaluate_entropy_string( chunk, line, string, HEX_CHARS, hex_entropy_score ) - return issues - def evaluate_entropy_string( self, chunk: types.Chunk, @@ -412,7 +407,7 @@ def evaluate_entropy_string( string: str, chars: str, min_entropy_score: float, - ) -> List[Issue]: + ) -> Generator[Issue, None, None]: """ Check entropy string using entropy characters and score. @@ -421,7 +416,7 @@ def evaluate_entropy_string( :param string: String to check :param chars: Characters to calculate score :param min_entropy_score: Minimum entropy score to flag - return: List of issues flagged + return: Iterator of issues flagged """ if not self.signature_is_excluded(string, chunk.file_path): entropy_score = self.calculate_entropy(string, chars) @@ -429,15 +424,15 @@ def evaluate_entropy_string( if self.entropy_string_is_excluded(string, line, chunk.file_path): self.logger.debug("line containing entropy was excluded: %s", line) else: - return [Issue(types.IssueType.Entropy, string, chunk)] - return [] + self._issue_count += 1 + yield Issue(types.IssueType.Entropy, string, chunk) - def scan_regex(self, chunk: types.Chunk) -> List[Issue]: + def scan_regex(self, chunk: types.Chunk) -> Generator[Issue, None, None]: """Scan a chunk of data for matches against the configured regexes. :param chunk: The chunk of data to be scanned """ - issues: List[Issue] = [] + for key, rule in self.rules_regexes.items(): if rule.path_pattern is None or rule.path_pattern.match(chunk.file_path): found_strings = rule.pattern.findall(chunk.contents) @@ -446,8 +441,8 @@ def scan_regex(self, chunk: types.Chunk) -> List[Issue]: if not self.signature_is_excluded(match, chunk.file_path): issue = Issue(types.IssueType.RegEx, match, chunk) issue.issue_detail = key - issues.append(issue) - return issues + self._issue_count += 1 + yield issue @property @abc.abstractmethod diff --git a/tartufo/util.py b/tartufo/util.py index b38c920b..391d6556 100644 --- a/tartufo/util.py +++ b/tartufo/util.py @@ -8,7 +8,17 @@ from datetime import datetime from functools import lru_cache, partial from hashlib import blake2s -from typing import Any, Callable, Dict, Iterable, List, Optional, TYPE_CHECKING, Pattern +from typing import ( + Any, + Callable, + Dict, + Generator, + Iterable, + List, + Optional, + TYPE_CHECKING, + Pattern, +) import click import git @@ -96,10 +106,12 @@ def echo_result( click.echo("\n".join(options.exclude_entropy_patterns)) -def write_outputs(found_issues: "List[Issue]", output_dir: pathlib.Path) -> List[str]: +def write_outputs( + found_issues: "Generator[Issue, None, None]", output_dir: pathlib.Path +) -> List[str]: """Write details of the issues to individual files in the specified directory. - :param found_issues: The list of issues to be written out + :param found_issues: A generator for issues to be written out :param output_dir: The directory where the files should be written """ result_files = [] diff --git a/tests/test_base_scanner.py b/tests/test_base_scanner.py index d28b2336..259d4601 100644 --- a/tests/test_base_scanner.py +++ b/tests/test_base_scanner.py @@ -33,7 +33,7 @@ def test_scan_aborts_when_no_entropy_or_regex(self): self.options.regex = False test_scanner = TestScanner(self.options) with self.assertRaisesRegex(types.ConfigException, "No analysis requested."): - test_scanner.scan() + list(test_scanner.scan()) def test_scan_aborts_when_regex_requested_but_none_found(self): self.options.regex = True @@ -42,7 +42,7 @@ def test_scan_aborts_when_regex_requested_but_none_found(self): with self.assertRaisesRegex( types.ConfigException, "Regex checks requested, but no regexes found." ): - test_scanner.scan() + list(test_scanner.scan()) @mock.patch("tartufo.config.configure_regexes") def test_scan_aborts_due_to_invalid_regex(self, mock_config: mock.MagicMock): @@ -54,7 +54,7 @@ def test_scan_aborts_due_to_invalid_regex(self, mock_config: mock.MagicMock): with self.assertRaisesRegex( types.ConfigException, "Invalid regular expression" ): - test_scanner.scan() + list(test_scanner.scan()) @mock.patch("tartufo.scanner.ScannerBase.scan_entropy") def test_scan_iterates_through_all_chunks(self, mock_entropy: mock.MagicMock): @@ -63,7 +63,7 @@ def test_scan_iterates_through_all_chunks(self, mock_entropy: mock.MagicMock): self.options.b64_entropy_score = 4.5 self.options.hex_entropy_score = 3 test_scanner = TestScanner(self.options) - test_scanner.scan() + list(test_scanner.scan()) mock_entropy.assert_has_calls( ( mock.call("foo", 4.5, 3), @@ -77,7 +77,7 @@ def test_scan_iterates_through_all_chunks(self, mock_entropy: mock.MagicMock): def test_scan_checks_entropy_if_specified(self, mock_entropy: mock.MagicMock): self.options.entropy = True test_scanner = TestScanner(self.options) - test_scanner.scan() + list(test_scanner.scan()) mock_entropy.assert_called() @mock.patch("tartufo.scanner.ScannerBase.scan_regex") @@ -85,7 +85,7 @@ def test_scan_checks_regex_if_specified(self, mock_regex: mock.MagicMock): self.options.regex = True self.options.default_regexes = True test_scanner = TestScanner(self.options) - test_scanner.scan() + list(test_scanner.scan()) mock_regex.assert_called() @@ -93,15 +93,13 @@ class IssuesTests(ScannerTestCase): @mock.patch("tartufo.scanner.ScannerBase.scan") def test_empty_issue_list_causes_scan(self, mock_scan: mock.MagicMock): test_scanner = TestScanner(self.options) - test_scanner.issues # pylint: disable=pointless-statement + list(test_scanner.issues) # pylint: disable=pointless-statement mock_scan.assert_called() @mock.patch("tartufo.scanner.ScannerBase.scan") - def test_populated_issues_list_does_not_rescan(self, mock_scan: mock.MagicMock): + def test_scanner_does_not_rescan(self, mock_scan: mock.MagicMock): test_scanner = TestScanner(self.options) - test_scanner._issues = [ # pylint: disable=protected-access - scanner.Issue(types.IssueType.RegEx, "foo", types.Chunk("foo", "bar", {})) - ] + test_scanner._issue_count = 0 # pylint: disable=protected-access test_scanner.issues # pylint: disable=pointless-statement mock_scan.assert_not_called() @@ -313,7 +311,7 @@ def test_all_regex_rules_are_checked(self): ), } chunk = types.Chunk("foo", "/file/path", {}) - test_scanner.scan_regex(chunk) + list(test_scanner.scan_regex(chunk)) rule_1.findall.assert_called_once_with("foo") rule_2.findall.assert_called_once_with("foo") rule_2_path.match.assert_called_once_with("/file/path") @@ -335,7 +333,7 @@ def test_issue_is_not_created_if_signature_is_excluded( ) } chunk = types.Chunk("foo", "bar", {}) - issues = test_scanner.scan_regex(chunk) + issues = list(test_scanner.scan_regex(chunk)) mock_signature.assert_called_once_with("foo", "bar") self.assertEqual(issues, []) @@ -354,7 +352,7 @@ def test_issue_is_returned_if_signature_is_not_excluded( ) } chunk = types.Chunk("foo", "bar", {}) - issues = test_scanner.scan_regex(chunk) + issues = list(test_scanner.scan_regex(chunk)) mock_signature.assert_called_once_with("foo", "bar") self.assertEqual(len(issues), 1) self.assertEqual(issues[0].issue_detail, "foo") @@ -426,7 +424,7 @@ def test_scan_entropy_find_b64_strings_for_every_word_in_diff( mock_strings.return_value = [] b64_entropy_score = 4.5 hex_entropy_score = 3 - self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score) + list(self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score)) mock_strings.assert_has_calls( ( mock.call("foo", scanner.BASE64_CHARS), @@ -451,9 +449,9 @@ def test_issues_are_not_created_for_b64_string_excluded_signatures( mock_signature.return_value = True b64_entropy_score = 4.5 hex_entropy_score = 3 - issues = self.scanner.scan_entropy( + issues = list(self.scanner.scan_entropy( self.chunk, b64_entropy_score, hex_entropy_score - ) + )) mock_calculate.assert_not_called() self.assertEqual(issues, []) @@ -470,9 +468,9 @@ def test_issues_are_not_created_for_hex_string_excluded_signatures( mock_signature.return_value = True b64_entropy_score = 4.5 hex_entropy_score = 3 - issues = self.scanner.scan_entropy( + issues = list(self.scanner.scan_entropy( self.chunk, b64_entropy_score, hex_entropy_score - ) + )) mock_calculate.assert_not_called() self.assertEqual(issues, []) @@ -490,9 +488,9 @@ def test_issues_are_created_for_high_entropy_b64_strings( mock_calculate.return_value = 9.0 b64_entropy_score = 4.5 hex_entropy_score = 3 - issues = self.scanner.scan_entropy( + issues = list(self.scanner.scan_entropy( self.chunk, b64_entropy_score, hex_entropy_score - ) + )) self.assertEqual(len(issues), 1) self.assertEqual(issues[0].issue_type, types.IssueType.Entropy) self.assertEqual(issues[0].matched_string, "foo") @@ -511,9 +509,9 @@ def test_issues_are_created_for_high_entropy_hex_strings( mock_calculate.return_value = 9.0 b64_entropy_score = 4.5 hex_entropy_score = 3 - issues = self.scanner.scan_entropy( + issues = list(self.scanner.scan_entropy( self.chunk, b64_entropy_score, hex_entropy_score - ) + )) self.assertEqual(len(issues), 1) self.assertEqual(issues[0].issue_type, types.IssueType.Entropy) self.assertEqual(issues[0].matched_string, "foo") @@ -535,9 +533,9 @@ def test_issues_are_not_created_for_high_entropy_hex_strings_given_entropy_is_ex mock_calculate.return_value = 9.0 b64_entropy_score = 4.5 hex_entropy_score = 3 - issues = self.scanner.scan_entropy( + issues = list(self.scanner.scan_entropy( self.chunk, b64_entropy_score, hex_entropy_score - ) + )) self.assertEqual(len(issues), 0) @mock.patch("tartufo.scanner.ScannerBase.calculate_entropy") @@ -557,9 +555,9 @@ def test_issues_are_not_created_for_low_entropy_b64_strings_given_entropy_is_exc mock_calculate.return_value = 9.0 b64_entropy_score = 4.5 hex_entropy_score = 3 - issues = self.scanner.scan_entropy( + issues = list(self.scanner.scan_entropy( self.chunk, b64_entropy_score, hex_entropy_score - ) + )) self.assertEqual(len(issues), 0) @mock.patch("tartufo.scanner.ScannerBase.calculate_entropy") @@ -576,9 +574,9 @@ def test_issues_are_not_created_for_low_entropy_b64_strings( mock_calculate.return_value = 1.0 b64_entropy_score = 4.5 hex_entropy_score = 3 - issues = self.scanner.scan_entropy( + issues = list(self.scanner.scan_entropy( self.chunk, b64_entropy_score, hex_entropy_score - ) + )) self.assertEqual(len(issues), 0) @mock.patch("tartufo.scanner.ScannerBase.calculate_entropy") @@ -595,9 +593,9 @@ def test_issues_are_not_created_for_low_entropy_hex_strings( mock_calculate.return_value = 1.0 b64_entropy_score = 4.5 hex_entropy_score = 3 - issues = self.scanner.scan_entropy( + issues = list(self.scanner.scan_entropy( self.chunk, b64_entropy_score, hex_entropy_score - ) + )) self.assertEqual(len(issues), 0) diff --git a/tests/test_folder_scanner.py b/tests/test_folder_scanner.py index 6090f9da..bc510810 100644 --- a/tests/test_folder_scanner.py +++ b/tests/test_folder_scanner.py @@ -23,7 +23,7 @@ def test_scan_should_detect_entropy_and_not_binary(self): self.global_options.exclude_path_patterns = [r"donotscan\.txt"] test_scanner = scanner.FolderScanner(self.global_options, folder_path) - issues = test_scanner.scan() + issues = list(test_scanner.scan()) self.assertEqual(1, len(issues)) self.assertEqual("KQ0I97OBuPlGB9yPRxoSxnX52zE=", issues[0].matched_string) @@ -37,7 +37,8 @@ def test_scan_should_raise_click_error_on_file_permissions_issues(self): test_scanner = scanner.FolderScanner(self.global_options, folder_path) with patch("pathlib.Path.open", side_effect=OSError()): - self.assertRaises(click.FileError, test_scanner.scan) + with self.assertRaises(click.FileError): + list(test_scanner.scan()) if __name__ == "__main__": diff --git a/tests/test_git_repo_scanner.py b/tests/test_git_repo_scanner.py index 00f7a824..af8e2799 100644 --- a/tests/test_git_repo_scanner.py +++ b/tests/test_git_repo_scanner.py @@ -276,7 +276,8 @@ def test_scan_single_branch_throws_exception_when_branch_not_found( self.global_options, self.git_options, "." ) mock_iter_commits.return_value = [] - self.assertRaises(types.BranchNotFoundException, test_scanner.scan) + with self.assertRaises(types.BranchNotFoundException): + list(test_scanner.scan()) @mock.patch("tartufo.scanner.GitRepoScanner._iter_branch_commits") @mock.patch("git.Repo") From 0da25cf7443777f21ea16febb151786f890fc226 Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Tue, 12 Oct 2021 18:29:13 -0400 Subject: [PATCH 02/17] Generator integration, part 2 Add new `issue_count` property to scanner class. This is incremented as issues are yielded by the low level routines, and allows the caller to tell how many issues we got without inspecting the pipeline multiple times. We needed one non-obvious fix in util.py to ensure we have consumed all issues before checking the count. (Before, it would have been zero nearly always.) --- tartufo/cli.py | 2 +- tartufo/scanner.py | 13 +++++++++++++ tartufo/util.py | 7 +++---- tests/test_cli.py | 3 +++ tests/test_util.py | 14 +++++++++++--- 5 files changed, 31 insertions(+), 8 deletions(-) diff --git a/tartufo/cli.py b/tartufo/cli.py index 25956a3d..bd1ce08f 100644 --- a/tartufo/cli.py +++ b/tartufo/cli.py @@ -302,7 +302,7 @@ def process_issues( if not options.json: click.echo(f"Results have been saved in {output_dir}") - if scan.issues: + if scan.issue_count: ctx.exit(1) ctx.exit(0) diff --git a/tartufo/scanner.py b/tartufo/scanner.py index c2c89027..32fa55aa 100755 --- a/tartufo/scanner.py +++ b/tartufo/scanner.py @@ -143,6 +143,19 @@ def __init__(self, options: types.GlobalOptions) -> None: self.global_options = options self.logger = logging.getLogger(__name__) + @property + def issue_count(self) -> int: + """Return the number of issues found during the scan (so far) + + This may be less that the (eventual) total number of issues if the scan + has not been run yet, or if not all output has been consumed and the + generator functions are still evaluating targets. + + :returns: Number of reported issues + """ + + return self._issue_count or 0 + @property def issues(self) -> Generator[Issue, None, None]: """Get a list of issues found during the scan. diff --git a/tartufo/util.py b/tartufo/util.py index 391d6556..b8dd39f1 100644 --- a/tartufo/util.py +++ b/tartufo/util.py @@ -92,11 +92,10 @@ def echo_result( f"({issue.signature}, {issue.issue_detail})" ) else: - if not scanner.issues: + click.echo(b"\n".join([bytes(issue) for issue in scanner.issues])) + if not scanner.issue_count: if not options.quiet: click.echo(f"Time: {now}\nAll clear. No secrets detected.") - else: - click.echo(b"\n".join([bytes(issue) for issue in scanner.issues])) if options.verbose > 0: click.echo("\nExcluded paths:") click.echo("\n".join([path.pattern for path in scanner.excluded_paths])) @@ -107,7 +106,7 @@ def echo_result( def write_outputs( - found_issues: "Generator[Issue, None, None]", output_dir: pathlib.Path + found_issues: Generator["Issue", None, None], output_dir: pathlib.Path ) -> List[str]: """Write details of the issues to individual files in the specified directory. diff --git a/tests/test_cli.py b/tests/test_cli.py index 32ca2da0..47ceeae3 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -202,6 +202,7 @@ def test_command_exits_with_zero_return_code_when_no_issues_are_found( self, mock_scanner: mock.MagicMock ): mock_scanner.return_value.issues = [] + mock_scanner.return_value.issue_count = 0 runner = CliRunner() with runner.isolated_filesystem(): result = runner.invoke(cli.main, ["scan-local-repo", "."]) @@ -226,6 +227,7 @@ def test_command_returns_with_zero_when_quiet_only( self, mock_scanner: mock.MagicMock ): mock_scanner.return_value.issues = [] + mock_scanner.return_value.issue_count = 0 runner = CliRunner() with runner.isolated_filesystem(): result = runner.invoke(cli.main, ["-q", "scan-local-repo", "."]) @@ -238,6 +240,7 @@ def test_command_returns_with_zero_when_verbose_only( self, mock_scanner: mock.MagicMock ): mock_scanner.return_value.issues = [] + mock_scanner.return_value.issue_count = 0 runner = CliRunner() with runner.isolated_filesystem(): result = runner.invoke(cli.main, ["-v", "scan-local-repo", "."]) diff --git a/tests/test_util.py b/tests/test_util.py index 3c0438b1..f3011474 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -141,10 +141,14 @@ def test_echo_result_echos_message_when_clean( mock_time.now.return_value.isoformat.return_value = "now:now:now" options = generate_options(GlobalOptions, json=False, quiet=False, verbose=0) mock_scanner.exclude_signatures = [] + mock_scanner.issue_count = 0 mock_scanner.issues = [] util.echo_result(options, mock_scanner, "", "") - mock_click.echo.assert_called_with( - "Time: now:now:now\nAll clear. No secrets detected." + mock_click.echo.assert_has_calls( + [ + mock.call(b""), + mock.call("Time: now:now:now\nAll clear. No secrets detected."), + ] ) @mock.patch("tartufo.scanner.ScannerBase") @@ -171,6 +175,7 @@ def test_echo_result_echos_exclusions_verbose( exclude_entropy_patterns=exclude_entropy_patterns, ) mock_scanner.issues = [] + mock_scanner.issue_count = 0 mock_scanner.excluded_paths = [ re.compile("package-lock.json"), re.compile("poetry.lock"), @@ -196,7 +201,9 @@ def test_echo_result_echos_no_message_when_quiet(self, mock_click, mock_scanner) mock_scanner.issues = [] mock_scanner.exclude_signatures = [] util.echo_result(options, mock_scanner, "", "") - mock_click.echo.assert_not_called() + # We unavoidably get an empty line because we have to run the iterator + # to figure out whether we did or didn't find any issues. + mock_click.echo.assert_called_once_with(b"") @mock.patch("tartufo.scanner.ScannerBase") @mock.patch("tartufo.util.click", new=mock.MagicMock()) @@ -216,6 +223,7 @@ def test_echo_result_outputs_proper_json_when_requested( types.IssueType.RegEx, "bar", types.Chunk("foo", "/bar", {}) ) mock_scanner.issues = [issue_1, issue_2] + mock_scanner.issue_count = 2 mock_scanner.excluded_paths = [] options = generate_options( GlobalOptions, json=True, exclude_signatures=[], exclude_entropy_patterns=[] From 0c8610a75939ccf0fda7679df09df2d7d3e292cc Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Wed, 13 Oct 2021 09:27:18 -0400 Subject: [PATCH 03/17] Unwind default console output Previously we marshalled all issues into a single `echo()` call, which resulted in buffering all output until the completion of the scan. Now we use a loop (like "compact") and issue a separate `echo()` for each issue, allowing early output to be produced prior to the generation of later issues. This also gets rid of the unwanted "empty output for no issues" behavior introduced by an earlier commit. --- tartufo/util.py | 3 ++- tests/test_util.py | 21 +++++++++++---------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/tartufo/util.py b/tartufo/util.py index b8dd39f1..2970b8c3 100644 --- a/tartufo/util.py +++ b/tartufo/util.py @@ -92,7 +92,8 @@ def echo_result( f"({issue.signature}, {issue.issue_detail})" ) else: - click.echo(b"\n".join([bytes(issue) for issue in scanner.issues])) + for issue in scanner.issues: + click.echo(bytes(issue)) if not scanner.issue_count: if not options.quiet: click.echo(f"Time: {now}\nAll clear. No secrets detected.") diff --git a/tests/test_util.py b/tests/test_util.py index f3011474..0dc7c020 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -103,9 +103,15 @@ def test_echo_result_echos_all_when_not_json(self, mock_click, mock_scanner): mock_scanner.issues = [1, 2, 3, 4] util.echo_result(options, mock_scanner, "", "") # Ensure that the issues are output as a byte stream - mock_click.echo.assert_called_once_with( - bytes(1) + b"\n" + bytes(2) + b"\n" + bytes(3) + b"\n" + bytes(4) + mock_click.echo.assert_has_calls( + [ + mock.call(bytes(1)), + mock.call(bytes(2)), + mock.call(bytes(3)), + mock.call(bytes(4)), + ] ) + self.assertEqual(mock_click.echo.call_count, 4) @mock.patch("tartufo.scanner.ScannerBase") @mock.patch("tartufo.util.click") @@ -144,11 +150,8 @@ def test_echo_result_echos_message_when_clean( mock_scanner.issue_count = 0 mock_scanner.issues = [] util.echo_result(options, mock_scanner, "", "") - mock_click.echo.assert_has_calls( - [ - mock.call(b""), - mock.call("Time: now:now:now\nAll clear. No secrets detected."), - ] + mock_click.echo.assert_called_once_with( + "Time: now:now:now\nAll clear. No secrets detected." ) @mock.patch("tartufo.scanner.ScannerBase") @@ -201,9 +204,7 @@ def test_echo_result_echos_no_message_when_quiet(self, mock_click, mock_scanner) mock_scanner.issues = [] mock_scanner.exclude_signatures = [] util.echo_result(options, mock_scanner, "", "") - # We unavoidably get an empty line because we have to run the iterator - # to figure out whether we did or didn't find any issues. - mock_click.echo.assert_called_once_with(b"") + mock_click.echo.assert_not_called() @mock.patch("tartufo.scanner.ScannerBase") @mock.patch("tartufo.util.click", new=mock.MagicMock()) From a39b5c276848b321b17764c332111c67755ce39d Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Wed, 13 Oct 2021 10:21:53 -0400 Subject: [PATCH 04/17] Stream JSON-formatted output This works because only one part of the output is generated, so we can output a (partial) static blob to start, then an issue at a time, and finally terminate the object to make it valid. Because this is a little hinky, the corresponding unit tests have been adjusted to actually generate (and capture) the entire stdout stream. We then deserialize (to verify syntax) and compare the resulting dict with the original input data to confirm there was no content corruption. --- tartufo/util.py | 20 ++++++++++++++++---- tests/test_util.py | 37 ++++++++++++++++++++++++------------- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/tartufo/util.py b/tartufo/util.py index 2970b8c3..45e2a4b8 100644 --- a/tartufo/util.py +++ b/tartufo/util.py @@ -79,12 +79,24 @@ def echo_result( "exclude_entropy_patterns": [ str(pattern) for pattern in options.exclude_entropy_patterns ], - "found_issues": [ - issue.as_dict(compact=options.compact) for issue in scanner.issues - ], + # This member is for reference. Read below... + # "found_issues": [ + # issue.as_dict(compact=options.compact) for issue in scanner.issues + # ], } - click.echo(json.dumps(output)) + # Observation: We want to "stream" JSON; the only generator output is the + # "found_issues" list (which is at the top level). Dump the "static" part + # minus the closing "}", then generate issues individually, then emit the + # closing "}". + static_part = json.dumps(output) + click.echo(f'{static_part[:-1]}, "found_issues": [', nl=False) + delimiter = "" + for issue in scanner.issues: + live_part = json.dumps(issue.as_dict(compact=options.compact)) + click.echo(f"{delimiter}{live_part}", nl=False) + delimiter = ", " + click.echo("]}") elif options.compact: for issue in scanner.issues: click.echo( diff --git a/tests/test_util.py b/tests/test_util.py index 0dc7c020..6431ec17 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,5 +1,7 @@ -import unittest +from io import StringIO +import json import re +import unittest from pathlib import Path from unittest import mock @@ -207,13 +209,10 @@ def test_echo_result_echos_no_message_when_quiet(self, mock_click, mock_scanner) mock_click.echo.assert_not_called() @mock.patch("tartufo.scanner.ScannerBase") - @mock.patch("tartufo.util.click", new=mock.MagicMock()) - @mock.patch("tartufo.util.json") @mock.patch("tartufo.util.datetime") def test_echo_result_outputs_proper_json_when_requested( self, mock_time, - mock_json, mock_scanner, ): mock_time.now.return_value.isoformat.return_value = "now:now:now" @@ -229,9 +228,16 @@ def test_echo_result_outputs_proper_json_when_requested( options = generate_options( GlobalOptions, json=True, exclude_signatures=[], exclude_entropy_patterns=[] ) - util.echo_result(options, mock_scanner, "/repo", "/output") - mock_json.dumps.assert_called_once_with( + # We're generating JSON piecemeal, so if we want to be safe we'll recover + # the entire output, deserialize it (to confirm it's valid syntax) and + # compare the result to the original input dictionary. + with mock.patch("sys.stdout", new=StringIO()) as mock_stdout: + util.echo_result(options, mock_scanner, "/repo", "/output") + actual_output = mock_stdout.getvalue() + + self.assertEqual( + json.loads(actual_output), { "scan_time": "now:now:now", "project_path": "/repo", @@ -257,15 +263,13 @@ def test_echo_result_outputs_proper_json_when_requested( "file_path": "/bar", }, ], - } + }, ) @mock.patch("tartufo.scanner.ScannerBase") - @mock.patch("tartufo.util.click", new=mock.MagicMock()) - @mock.patch("tartufo.util.json") @mock.patch("tartufo.util.datetime") def test_echo_result_outputs_proper_json_when_requested_pathtype( - self, mock_time, mock_json, mock_scanner + self, mock_time, mock_scanner ): mock_time.now.return_value.isoformat.return_value = "now:now:now" issue_1 = scanner.Issue( @@ -293,8 +297,15 @@ def test_echo_result_outputs_proper_json_when_requested_pathtype( exclude_signatures=exclude_signatures, exclude_entropy_patterns=exclude_entropy_patterns, ) - util.echo_result(options, mock_scanner, "/repo", Path("/tmp")) - mock_json.dumps.assert_called_once_with( + + # We're generating JSON piecemeal, so if we want to be safe we'll recover + # the entire output, deserialize it (to confirm it's valid syntax) and + # compare the result to the original input dictionary. + with mock.patch("sys.stdout", new=StringIO()) as mock_stdout: + util.echo_result(options, mock_scanner, "/repo", Path("/tmp")) + actual_output = mock_stdout.getvalue() + self.assertEqual( + json.loads(actual_output), { "scan_time": "now:now:now", "project_path": "/repo", @@ -326,7 +337,7 @@ def test_echo_result_outputs_proper_json_when_requested_pathtype( "file_path": "/bar", }, ], - } + }, ) From 0243ba38e3c8532dabaf7581412c43090e0e4c8a Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Wed, 13 Oct 2021 10:40:36 -0400 Subject: [PATCH 05/17] Hold issues for storage to files Address the last remaining wart; make `echo_result()` buffer issues (even as it streams them to console) and return the list so it's possible to write all issues to files at the end if desired. This returns `write_outputs()` to its original semantics, namely consumption of a List rather than a Generator. Arguably it might be nicer to create files as we go, but that would constitute a behavior change (files not all done right at the end) and need some code refactoring. --- tartufo/cli.py | 4 ++-- tartufo/util.py | 14 ++++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tartufo/cli.py b/tartufo/cli.py index bd1ce08f..7b491dd7 100644 --- a/tartufo/cli.py +++ b/tartufo/cli.py @@ -296,9 +296,9 @@ def process_issues( output_dir = pathlib.Path(options.output_dir) / f"tartufo-scan-results-{now}" output_dir.mkdir(parents=True) - util.echo_result(options, scan, repo_path, output_dir) + issues = util.echo_result(options, scan, repo_path, output_dir) if output_dir: - util.write_outputs(scan.issues, output_dir) + util.write_outputs(issues, output_dir) if not options.json: click.echo(f"Results have been saved in {output_dir}") diff --git a/tartufo/util.py b/tartufo/util.py index 45e2a4b8..9ca4dee9 100644 --- a/tartufo/util.py +++ b/tartufo/util.py @@ -12,7 +12,6 @@ Any, Callable, Dict, - Generator, Iterable, List, Optional, @@ -59,13 +58,15 @@ def echo_result( scanner: "ScannerBase", repo_path: str, output_dir: Optional[pathlib.Path], -) -> None: +) -> List["Issue"]: """Print all found issues out to the console, optionally as JSON. :param options: Global options object :param scanner: ScannerBase containing issues and excluded paths from config tree :param repo_path: The path to the repository the issues were found in :param output_dir: The directory that issue details were written out to + :returns: All issues that were displayed """ + all_issues: List["Issue"] = [] now = datetime.now().isoformat("T", "microseconds") if options.json: output = { @@ -93,18 +94,21 @@ def echo_result( click.echo(f'{static_part[:-1]}, "found_issues": [', nl=False) delimiter = "" for issue in scanner.issues: + all_issues.append(issue) live_part = json.dumps(issue.as_dict(compact=options.compact)) click.echo(f"{delimiter}{live_part}", nl=False) delimiter = ", " click.echo("]}") elif options.compact: for issue in scanner.issues: + all_issues.append(issue) click.echo( f"[{issue.issue_type.value}] {issue.chunk.file_path}: {issue.matched_string} " f"({issue.signature}, {issue.issue_detail})" ) else: for issue in scanner.issues: + all_issues.append(issue) click.echo(bytes(issue)) if not scanner.issue_count: if not options.quiet: @@ -117,13 +121,15 @@ def echo_result( click.echo("\nExcluded entropy patterns:") click.echo("\n".join(options.exclude_entropy_patterns)) + return all_issues + def write_outputs( - found_issues: Generator["Issue", None, None], output_dir: pathlib.Path + found_issues: List["Issue"], output_dir: pathlib.Path ) -> List[str]: """Write details of the issues to individual files in the specified directory. - :param found_issues: A generator for issues to be written out + :param found_issues: A list of issues to be written out :param output_dir: The directory where the files should be written """ result_files = [] From b67d23e90453c71cd1e463c6c88430b99767c2ce Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Wed, 13 Oct 2021 10:46:34 -0400 Subject: [PATCH 06/17] Fix issue_count logic Revise test to reflect use of `-1` vs `None` to indicate scan not started --- tartufo/scanner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tartufo/scanner.py b/tartufo/scanner.py index 32fa55aa..6bbd6193 100755 --- a/tartufo/scanner.py +++ b/tartufo/scanner.py @@ -154,7 +154,7 @@ def issue_count(self) -> int: :returns: Number of reported issues """ - return self._issue_count or 0 + return self._issue_count if self._issue_count > 0 else 0 @property def issues(self) -> Generator[Issue, None, None]: From 03e880e54f415c8ec616790ec789dcacbbf4c718 Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Wed, 13 Oct 2021 10:57:12 -0400 Subject: [PATCH 07/17] Add CHANGELOG entry --- CHANGELOG.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7851092c..d506a990 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +vx.x.x - TBD +------------ + +Features: + +* [#227](https://github.com/godaddy/tartufo/pull/227) - Report findings incrementally + as scan progresses instead of holding all of them until it has completed. This + is a reimplementation of [#108](https://github.com/godaddy/tartufo/pull/108); + thanks to @dclayton-godaddy for showing the way. + v2.9.0 - 19 October 2021 ------------------------ From b980f110f5094faad893f5655c9b5ad33ed5304d Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Wed, 13 Oct 2021 11:22:06 -0400 Subject: [PATCH 08/17] black fixup --- tartufo/util.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tartufo/util.py b/tartufo/util.py index 9ca4dee9..c79939df 100644 --- a/tartufo/util.py +++ b/tartufo/util.py @@ -124,9 +124,7 @@ def echo_result( return all_issues -def write_outputs( - found_issues: List["Issue"], output_dir: pathlib.Path -) -> List[str]: +def write_outputs(found_issues: List["Issue"], output_dir: pathlib.Path) -> List[str]: """Write details of the issues to individual files in the specified directory. :param found_issues: A list of issues to be written out From 632a928e0b94b3c24555bd3e447e21094c3b6f43 Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Tue, 19 Oct 2021 11:09:05 -0400 Subject: [PATCH 09/17] rebase fixups --- tartufo/scanner.py | 2 +- tests/test_base_scanner.py | 52 ++++++++++++++++++++------------------ 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/tartufo/scanner.py b/tartufo/scanner.py index 6bbd6193..2bdf2bfc 100755 --- a/tartufo/scanner.py +++ b/tartufo/scanner.py @@ -386,7 +386,7 @@ def scan(self) -> Generator[Issue, None, None]: self.global_options.b64_entropy_score, self.global_options.hex_entropy_score, ) - self.logger.info("Found %d issues.", self._issue_count)) + self.logger.info("Found %d issues.", self._issue_count) def scan_entropy( self, chunk: types.Chunk, b64_entropy_score: float, hex_entropy_score: float diff --git a/tests/test_base_scanner.py b/tests/test_base_scanner.py index 259d4601..4521d587 100644 --- a/tests/test_base_scanner.py +++ b/tests/test_base_scanner.py @@ -424,7 +424,9 @@ def test_scan_entropy_find_b64_strings_for_every_word_in_diff( mock_strings.return_value = [] b64_entropy_score = 4.5 hex_entropy_score = 3 - list(self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score)) + list( + self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score) + ) mock_strings.assert_has_calls( ( mock.call("foo", scanner.BASE64_CHARS), @@ -449,9 +451,9 @@ def test_issues_are_not_created_for_b64_string_excluded_signatures( mock_signature.return_value = True b64_entropy_score = 4.5 hex_entropy_score = 3 - issues = list(self.scanner.scan_entropy( - self.chunk, b64_entropy_score, hex_entropy_score - )) + issues = list( + self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score) + ) mock_calculate.assert_not_called() self.assertEqual(issues, []) @@ -468,9 +470,9 @@ def test_issues_are_not_created_for_hex_string_excluded_signatures( mock_signature.return_value = True b64_entropy_score = 4.5 hex_entropy_score = 3 - issues = list(self.scanner.scan_entropy( - self.chunk, b64_entropy_score, hex_entropy_score - )) + issues = list( + self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score) + ) mock_calculate.assert_not_called() self.assertEqual(issues, []) @@ -488,9 +490,9 @@ def test_issues_are_created_for_high_entropy_b64_strings( mock_calculate.return_value = 9.0 b64_entropy_score = 4.5 hex_entropy_score = 3 - issues = list(self.scanner.scan_entropy( - self.chunk, b64_entropy_score, hex_entropy_score - )) + issues = list( + self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score) + ) self.assertEqual(len(issues), 1) self.assertEqual(issues[0].issue_type, types.IssueType.Entropy) self.assertEqual(issues[0].matched_string, "foo") @@ -509,9 +511,9 @@ def test_issues_are_created_for_high_entropy_hex_strings( mock_calculate.return_value = 9.0 b64_entropy_score = 4.5 hex_entropy_score = 3 - issues = list(self.scanner.scan_entropy( - self.chunk, b64_entropy_score, hex_entropy_score - )) + issues = list( + self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score) + ) self.assertEqual(len(issues), 1) self.assertEqual(issues[0].issue_type, types.IssueType.Entropy) self.assertEqual(issues[0].matched_string, "foo") @@ -533,9 +535,9 @@ def test_issues_are_not_created_for_high_entropy_hex_strings_given_entropy_is_ex mock_calculate.return_value = 9.0 b64_entropy_score = 4.5 hex_entropy_score = 3 - issues = list(self.scanner.scan_entropy( - self.chunk, b64_entropy_score, hex_entropy_score - )) + issues = list( + self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score) + ) self.assertEqual(len(issues), 0) @mock.patch("tartufo.scanner.ScannerBase.calculate_entropy") @@ -555,9 +557,9 @@ def test_issues_are_not_created_for_low_entropy_b64_strings_given_entropy_is_exc mock_calculate.return_value = 9.0 b64_entropy_score = 4.5 hex_entropy_score = 3 - issues = list(self.scanner.scan_entropy( - self.chunk, b64_entropy_score, hex_entropy_score - )) + issues = list( + self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score) + ) self.assertEqual(len(issues), 0) @mock.patch("tartufo.scanner.ScannerBase.calculate_entropy") @@ -574,9 +576,9 @@ def test_issues_are_not_created_for_low_entropy_b64_strings( mock_calculate.return_value = 1.0 b64_entropy_score = 4.5 hex_entropy_score = 3 - issues = list(self.scanner.scan_entropy( - self.chunk, b64_entropy_score, hex_entropy_score - )) + issues = list( + self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score) + ) self.assertEqual(len(issues), 0) @mock.patch("tartufo.scanner.ScannerBase.calculate_entropy") @@ -593,9 +595,9 @@ def test_issues_are_not_created_for_low_entropy_hex_strings( mock_calculate.return_value = 1.0 b64_entropy_score = 4.5 hex_entropy_score = 3 - issues = list(self.scanner.scan_entropy( - self.chunk, b64_entropy_score, hex_entropy_score - )) + issues = list( + self.scanner.scan_entropy(self.chunk, b64_entropy_score, hex_entropy_score) + ) self.assertEqual(len(issues), 0) From 52444871a61643aa3c727361a9740bbc10703326 Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Thu, 21 Oct 2021 15:41:19 -0400 Subject: [PATCH 10/17] Improve backward compatibility This commit restores the `._issues` member (a list) and reverts the `.issues` property to a list instead of a generator. Several instances of caller references to `.issues` have been restored to their original form. Behind the covers, a new `._completed` member (and matching property) tracks whether or not the scan has completed; `.issues` checks this and forces the scan to run to completion if needed. The low level generators now append to `._issues` instead of incrementing `._issue_count` (which is no longer needed), and `scan()` clears state information when it starts and then sets `._completed` when finished. To make it all work, `echo_result()` now loops on `.scan()` instead of `.issues` so output is still "live" although `.issues` is not. --- tartufo/cli.py | 6 ++--- tartufo/scanner.py | 45 +++++++++++++++++++++----------------- tartufo/util.py | 18 +++++---------- tests/test_base_scanner.py | 2 +- tests/test_util.py | 9 ++++---- 5 files changed, 39 insertions(+), 41 deletions(-) diff --git a/tartufo/cli.py b/tartufo/cli.py index 7b491dd7..25956a3d 100644 --- a/tartufo/cli.py +++ b/tartufo/cli.py @@ -296,13 +296,13 @@ def process_issues( output_dir = pathlib.Path(options.output_dir) / f"tartufo-scan-results-{now}" output_dir.mkdir(parents=True) - issues = util.echo_result(options, scan, repo_path, output_dir) + util.echo_result(options, scan, repo_path, output_dir) if output_dir: - util.write_outputs(issues, output_dir) + util.write_outputs(scan.issues, output_dir) if not options.json: click.echo(f"Results have been saved in {output_dir}") - if scan.issue_count: + if scan.issues: ctx.exit(1) ctx.exit(0) diff --git a/tartufo/scanner.py b/tartufo/scanner.py index 2bdf2bfc..19405c61 100755 --- a/tartufo/scanner.py +++ b/tartufo/scanner.py @@ -131,7 +131,8 @@ class ScannerBase(abc.ABC): all the individual pieces of content to be scanned. """ - _issue_count: int = -1 + _issues: List[Issue] = [] + _completed: bool = False _included_paths: Optional[List[Pattern]] = None _excluded_paths: Optional[List[Pattern]] = None _excluded_entropy: Optional[List[Rule]] = None @@ -144,29 +145,30 @@ def __init__(self, options: types.GlobalOptions) -> None: self.logger = logging.getLogger(__name__) @property - def issue_count(self) -> int: - """Return the number of issues found during the scan (so far) + def completed(self) -> bool: + """Return True if scan has completed - This may be less that the (eventual) total number of issues if the scan - has not been run yet, or if not all output has been consumed and the - generator functions are still evaluating targets. - - :returns: Number of reported issues + :returns: True if scan has completed; False if scan is in progress """ - return self._issue_count if self._issue_count > 0 else 0 + return self._completed @property - def issues(self) -> Generator[Issue, None, None]: + def issues(self) -> List[Issue]: """Get a list of issues found during the scan. - If a scan has not yet been run, run it. + If the scan is still in progress, force it to complete first. - :return: Any issues found during the scan. + :returns: Any issues found during the scan. """ - if self._issue_count < 0: - self.logger.debug("Issues called before scan. Calling scan now.") - yield from self.scan() + + if not self.completed: + self.logger.debug( + "Issues called before scan completed. Finishing scan now." + ) + list(self.scan()) + + return self._issues @property def included_paths(self) -> List[Pattern]: @@ -375,7 +377,8 @@ def scan(self) -> Generator[Issue, None, None]: raise types.ConfigException("Regex checks requested, but no regexes found.") self.logger.info("Starting scan...") - self._issue_count = 0 + self._completed = False + self._issues = [] for chunk in self.chunks: # Run regex scans first to trigger a potential fast fail for bad config if self.global_options.regex and self.rules_regexes: @@ -386,7 +389,8 @@ def scan(self) -> Generator[Issue, None, None]: self.global_options.b64_entropy_score, self.global_options.hex_entropy_score, ) - self.logger.info("Found %d issues.", self._issue_count) + self._completed = True + self.logger.info("Found %d issues.", len(self._issues)) def scan_entropy( self, chunk: types.Chunk, b64_entropy_score: float, hex_entropy_score: float @@ -437,8 +441,9 @@ def evaluate_entropy_string( if self.entropy_string_is_excluded(string, line, chunk.file_path): self.logger.debug("line containing entropy was excluded: %s", line) else: - self._issue_count += 1 - yield Issue(types.IssueType.Entropy, string, chunk) + issue = Issue(types.IssueType.Entropy, string, chunk) + self._issues.append(issue) + yield issue def scan_regex(self, chunk: types.Chunk) -> Generator[Issue, None, None]: """Scan a chunk of data for matches against the configured regexes. @@ -454,7 +459,7 @@ def scan_regex(self, chunk: types.Chunk) -> Generator[Issue, None, None]: if not self.signature_is_excluded(match, chunk.file_path): issue = Issue(types.IssueType.RegEx, match, chunk) issue.issue_detail = key - self._issue_count += 1 + self._issues.append(issue) yield issue @property diff --git a/tartufo/util.py b/tartufo/util.py index c79939df..7c23e9dd 100644 --- a/tartufo/util.py +++ b/tartufo/util.py @@ -58,15 +58,14 @@ def echo_result( scanner: "ScannerBase", repo_path: str, output_dir: Optional[pathlib.Path], -) -> List["Issue"]: +) -> None: """Print all found issues out to the console, optionally as JSON. :param options: Global options object :param scanner: ScannerBase containing issues and excluded paths from config tree :param repo_path: The path to the repository the issues were found in :param output_dir: The directory that issue details were written out to - :returns: All issues that were displayed """ - all_issues: List["Issue"] = [] + now = datetime.now().isoformat("T", "microseconds") if options.json: output = { @@ -93,24 +92,21 @@ def echo_result( static_part = json.dumps(output) click.echo(f'{static_part[:-1]}, "found_issues": [', nl=False) delimiter = "" - for issue in scanner.issues: - all_issues.append(issue) + for issue in scanner.scan(): live_part = json.dumps(issue.as_dict(compact=options.compact)) click.echo(f"{delimiter}{live_part}", nl=False) delimiter = ", " click.echo("]}") elif options.compact: - for issue in scanner.issues: - all_issues.append(issue) + for issue in scanner.scan(): click.echo( f"[{issue.issue_type.value}] {issue.chunk.file_path}: {issue.matched_string} " f"({issue.signature}, {issue.issue_detail})" ) else: - for issue in scanner.issues: - all_issues.append(issue) + for issue in scanner.scan(): click.echo(bytes(issue)) - if not scanner.issue_count: + if not scanner.issues: if not options.quiet: click.echo(f"Time: {now}\nAll clear. No secrets detected.") if options.verbose > 0: @@ -121,8 +117,6 @@ def echo_result( click.echo("\nExcluded entropy patterns:") click.echo("\n".join(options.exclude_entropy_patterns)) - return all_issues - def write_outputs(found_issues: List["Issue"], output_dir: pathlib.Path) -> List[str]: """Write details of the issues to individual files in the specified directory. diff --git a/tests/test_base_scanner.py b/tests/test_base_scanner.py index 4521d587..b8f270f7 100644 --- a/tests/test_base_scanner.py +++ b/tests/test_base_scanner.py @@ -99,7 +99,7 @@ def test_empty_issue_list_causes_scan(self, mock_scan: mock.MagicMock): @mock.patch("tartufo.scanner.ScannerBase.scan") def test_scanner_does_not_rescan(self, mock_scan: mock.MagicMock): test_scanner = TestScanner(self.options) - test_scanner._issue_count = 0 # pylint: disable=protected-access + test_scanner._completed = True # pylint: disable=protected-access test_scanner.issues # pylint: disable=pointless-statement mock_scan.assert_not_called() diff --git a/tests/test_util.py b/tests/test_util.py index 6431ec17..6e73950b 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -102,7 +102,7 @@ class OutputTests(unittest.TestCase): def test_echo_result_echos_all_when_not_json(self, mock_click, mock_scanner): options = generate_options(GlobalOptions, json=False, verbose=0) mock_scanner.exclude_signatures = [] - mock_scanner.issues = [1, 2, 3, 4] + mock_scanner.scan.return_value = (1, 2, 3, 4) util.echo_result(options, mock_scanner, "", "") # Ensure that the issues are output as a byte stream mock_click.echo.assert_has_calls( @@ -126,7 +126,7 @@ def test_echo_result_outputs_compact_format(self, mock_click, mock_scanner): types.IssueType.RegEx, "bar", types.Chunk("fullfoobar", "/what/bar", {}) ) issue2.issue_detail = "Meets the bar" - mock_scanner.issues = [issue1, issue2] + mock_scanner.scan.return_value = (issue1, issue2) util.echo_result(options, mock_scanner, "", "") mock_click.echo.assert_has_calls( @@ -222,8 +222,7 @@ def test_echo_result_outputs_proper_json_when_requested( issue_2 = scanner.Issue( types.IssueType.RegEx, "bar", types.Chunk("foo", "/bar", {}) ) - mock_scanner.issues = [issue_1, issue_2] - mock_scanner.issue_count = 2 + mock_scanner.scan.return_value = (issue_1, issue_2) mock_scanner.excluded_paths = [] options = generate_options( GlobalOptions, json=True, exclude_signatures=[], exclude_entropy_patterns=[] @@ -278,7 +277,7 @@ def test_echo_result_outputs_proper_json_when_requested_pathtype( issue_2 = scanner.Issue( types.IssueType.RegEx, "bar", types.Chunk("foo", "/bar", {}) ) - mock_scanner.issues = [issue_1, issue_2] + mock_scanner.scan.return_value = (issue_1, issue_2) mock_scanner.excluded_paths = [ re.compile("package-lock.json"), re.compile("poetry.lock"), From 577aab0cb1c2662915fd7d749daef9a40c2f41c3 Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Thu, 21 Oct 2021 15:51:20 -0400 Subject: [PATCH 11/17] Don't allow rescans If `scan()` is called (again) after having completed a scan, return immediately without doing anything. The proper way to look at issues (again) is to consult `.issues`. This limits, but does not eliminate, the potential for craziness if a caller decides to interleave calls to `.scan()` and references to `.issues`. --- tartufo/scanner.py | 4 +++- tests/test_base_scanner.py | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/tartufo/scanner.py b/tartufo/scanner.py index 19405c61..f0289586 100755 --- a/tartufo/scanner.py +++ b/tartufo/scanner.py @@ -369,6 +369,9 @@ def scan(self) -> Generator[Issue, None, None]: scanner's configuration """ + if self.completed: + return + if not any((self.global_options.entropy, self.global_options.regex)): self.logger.error("No analysis requested.") raise types.ConfigException("No analysis requested.") @@ -377,7 +380,6 @@ def scan(self) -> Generator[Issue, None, None]: raise types.ConfigException("Regex checks requested, but no regexes found.") self.logger.info("Starting scan...") - self._completed = False self._issues = [] for chunk in self.chunks: # Run regex scans first to trigger a potential fast fail for bad config diff --git a/tests/test_base_scanner.py b/tests/test_base_scanner.py index b8f270f7..e0938e47 100644 --- a/tests/test_base_scanner.py +++ b/tests/test_base_scanner.py @@ -73,6 +73,20 @@ def test_scan_iterates_through_all_chunks(self, mock_entropy: mock.MagicMock): any_order=True, ) + @mock.patch("tartufo.scanner.ScannerBase.scan_entropy") + @mock.patch("tartufo.scanner.ScannerBase.scan_regex") + def test_scan_does_not_rescan(self, mock_regex, mock_entropy): + """Make sure scan() does not rescan""" + + self.options.regex = True + self.options.entropy = True + test_scanner = TestScanner(self.options) + test_scanner._completed = True + result = list(test_scanner.scan()) + mock_regex.assert_not_called() + mock_entropy.assert_not_called() + self.assertEqual(result, []) + @mock.patch("tartufo.scanner.ScannerBase.scan_entropy") def test_scan_checks_entropy_if_specified(self, mock_entropy: mock.MagicMock): self.options.entropy = True From e9fb05835b5ab4f948694eb3e1f34fdfd858670a Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Thu, 21 Oct 2021 16:02:16 -0400 Subject: [PATCH 12/17] linter fixups We squelch "too many instance attributes" on `ScannerBase` because we really really need a `_completed` flag so we tell pylint to suck it up. --- tartufo/scanner.py | 2 +- tests/test_base_scanner.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tartufo/scanner.py b/tartufo/scanner.py index f0289586..594d3f55 100755 --- a/tartufo/scanner.py +++ b/tartufo/scanner.py @@ -120,7 +120,7 @@ def __bytes__(self) -> bytes: return self.__str__().encode("utf8") -class ScannerBase(abc.ABC): +class ScannerBase(abc.ABC): # pylint: disable=too-many-instance-attributes """Provide the base, generic functionality needed by all scanners. In fact, this contains all of the actual scanning logic. This part of the diff --git a/tests/test_base_scanner.py b/tests/test_base_scanner.py index e0938e47..bc3b99bc 100644 --- a/tests/test_base_scanner.py +++ b/tests/test_base_scanner.py @@ -81,7 +81,7 @@ def test_scan_does_not_rescan(self, mock_regex, mock_entropy): self.options.regex = True self.options.entropy = True test_scanner = TestScanner(self.options) - test_scanner._completed = True + test_scanner._completed = True # pylint: disable=protected-access result = list(test_scanner.scan()) mock_regex.assert_not_called() mock_entropy.assert_not_called() From 459dbc038f2548e344aca9c4f82b05561f8817fc Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Thu, 21 Oct 2021 16:34:49 -0400 Subject: [PATCH 13/17] Fix scan() re-call Slow burn from @tarkatronic review comment. If `scan()` is called again, yield from the cached `._issues` list instead of doing nothing. This looks like the old behavior except it doesn't actually rescan. --- tartufo/scanner.py | 1 + tests/test_base_scanner.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tartufo/scanner.py b/tartufo/scanner.py index 594d3f55..1a575e37 100755 --- a/tartufo/scanner.py +++ b/tartufo/scanner.py @@ -370,6 +370,7 @@ def scan(self) -> Generator[Issue, None, None]: """ if self.completed: + yield from self._issues return if not any((self.global_options.entropy, self.global_options.regex)): diff --git a/tests/test_base_scanner.py b/tests/test_base_scanner.py index bc3b99bc..f94915fd 100644 --- a/tests/test_base_scanner.py +++ b/tests/test_base_scanner.py @@ -82,10 +82,11 @@ def test_scan_does_not_rescan(self, mock_regex, mock_entropy): self.options.entropy = True test_scanner = TestScanner(self.options) test_scanner._completed = True # pylint: disable=protected-access + test_scanner._issues = [1, 2, 3] result = list(test_scanner.scan()) mock_regex.assert_not_called() mock_entropy.assert_not_called() - self.assertEqual(result, []) + self.assertEqual(result, [1, 2, 3]) @mock.patch("tartufo.scanner.ScannerBase.scan_entropy") def test_scan_checks_entropy_if_specified(self, mock_entropy: mock.MagicMock): From 1f4330714b13451ea1bd09dd69ce893be733258c Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Thu, 21 Oct 2021 16:41:06 -0400 Subject: [PATCH 14/17] linter fixups --- tests/test_base_scanner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_base_scanner.py b/tests/test_base_scanner.py index f94915fd..67da9834 100644 --- a/tests/test_base_scanner.py +++ b/tests/test_base_scanner.py @@ -82,7 +82,7 @@ def test_scan_does_not_rescan(self, mock_regex, mock_entropy): self.options.entropy = True test_scanner = TestScanner(self.options) test_scanner._completed = True # pylint: disable=protected-access - test_scanner._issues = [1, 2, 3] + test_scanner._issues = [1, 2, 3] # pylint: disable=protected-access result = list(test_scanner.scan()) mock_regex.assert_not_called() mock_entropy.assert_not_called() From 9db392a817eca11fb3816af045304c31f2d5b74f Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Mon, 25 Oct 2021 18:07:19 -0400 Subject: [PATCH 15/17] Remove unnecessary mock properties The real object doesn't have `issue_count` any longer. --- tests/test_cli.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 47ceeae3..32ca2da0 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -202,7 +202,6 @@ def test_command_exits_with_zero_return_code_when_no_issues_are_found( self, mock_scanner: mock.MagicMock ): mock_scanner.return_value.issues = [] - mock_scanner.return_value.issue_count = 0 runner = CliRunner() with runner.isolated_filesystem(): result = runner.invoke(cli.main, ["scan-local-repo", "."]) @@ -227,7 +226,6 @@ def test_command_returns_with_zero_when_quiet_only( self, mock_scanner: mock.MagicMock ): mock_scanner.return_value.issues = [] - mock_scanner.return_value.issue_count = 0 runner = CliRunner() with runner.isolated_filesystem(): result = runner.invoke(cli.main, ["-q", "scan-local-repo", "."]) @@ -240,7 +238,6 @@ def test_command_returns_with_zero_when_verbose_only( self, mock_scanner: mock.MagicMock ): mock_scanner.return_value.issues = [] - mock_scanner.return_value.issue_count = 0 runner = CliRunner() with runner.isolated_filesystem(): result = runner.invoke(cli.main, ["-v", "scan-local-repo", "."]) From 1df4c0e897b73eb52997157f73d9fafc67b3f1e9 Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Mon, 25 Oct 2021 19:13:29 -0400 Subject: [PATCH 16/17] Hoist issue accumulation Cache issues inside `scan()` instead of the lower-level routines (which now are generators without side effects). --- tartufo/scanner.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tartufo/scanner.py b/tartufo/scanner.py index 1a575e37..ce227f52 100755 --- a/tartufo/scanner.py +++ b/tartufo/scanner.py @@ -385,13 +385,17 @@ def scan(self) -> Generator[Issue, None, None]: for chunk in self.chunks: # Run regex scans first to trigger a potential fast fail for bad config if self.global_options.regex and self.rules_regexes: - yield from self.scan_regex(chunk) + for issue in self.scan_regex(chunk): + self._issues.append(issue) + yield issue if self.global_options.entropy: - yield from self.scan_entropy( + for issue in self.scan_entropy( chunk, self.global_options.b64_entropy_score, self.global_options.hex_entropy_score, - ) + ): + self._issues.append(issue) + yield issue self._completed = True self.logger.info("Found %d issues.", len(self._issues)) @@ -444,9 +448,7 @@ def evaluate_entropy_string( if self.entropy_string_is_excluded(string, line, chunk.file_path): self.logger.debug("line containing entropy was excluded: %s", line) else: - issue = Issue(types.IssueType.Entropy, string, chunk) - self._issues.append(issue) - yield issue + yield Issue(types.IssueType.Entropy, string, chunk) def scan_regex(self, chunk: types.Chunk) -> Generator[Issue, None, None]: """Scan a chunk of data for matches against the configured regexes. @@ -462,7 +464,6 @@ def scan_regex(self, chunk: types.Chunk) -> Generator[Issue, None, None]: if not self.signature_is_excluded(match, chunk.file_path): issue = Issue(types.IssueType.RegEx, match, chunk) issue.issue_detail = key - self._issues.append(issue) yield issue @property From 361d33d13ed7d647650bb827d29eea46fc2bfbea Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Tue, 26 Oct 2021 10:12:20 -0400 Subject: [PATCH 17/17] Add locking for scan() method Multiple competing calls to `scan()` are now guaranteed to execute sequentially. --- tartufo/scanner.py | 83 +++++++++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 31 deletions(-) diff --git a/tartufo/scanner.py b/tartufo/scanner.py index ce227f52..c5c594c3 100755 --- a/tartufo/scanner.py +++ b/tartufo/scanner.py @@ -1,13 +1,13 @@ # -*- coding: utf-8 -*- import abc +from functools import lru_cache import hashlib import logging import math import pathlib import re -import warnings -from functools import lru_cache +import threading from typing import ( Any, Dict, @@ -19,6 +19,7 @@ Set, Tuple, ) +import warnings import click @@ -139,6 +140,7 @@ class ScannerBase(abc.ABC): # pylint: disable=too-many-instance-attributes _rules_regexes: Optional[Dict[str, Rule]] = None global_options: types.GlobalOptions logger: logging.Logger + _scan_lock: threading.Lock = threading.Lock() def __init__(self, options: types.GlobalOptions) -> None: self.global_options = options @@ -162,6 +164,12 @@ def issues(self) -> List[Issue]: :returns: Any issues found during the scan. """ + # Note there is no locking in this method (which is readonly). If the + # first scan is not completed (or even if we mistakenly believe it is + # not completed, due to a race), we call scan (which is protected) to + # ensure the issues list is complete. By the time we reach the return + # statement here, we know _issues is stable. + if not self.completed: self.logger.debug( "Issues called before scan completed. Finishing scan now." @@ -365,39 +373,52 @@ def scan(self) -> Generator[Issue, None, None]: implementation, and run all requested scans against it, as specified in `self.global_options`. + The scan method is thread-safe; if multiple concurrent scans are requested, + the first will run to completion while other callers are blocked (after + which they will each execute in turn, yielding cached issues without + repeating the underlying repository scan). + :raises types.TartufoConfigException: If there were problems with the scanner's configuration """ - if self.completed: - yield from self._issues - return - - if not any((self.global_options.entropy, self.global_options.regex)): - self.logger.error("No analysis requested.") - raise types.ConfigException("No analysis requested.") - if self.global_options.regex and not self.rules_regexes: - self.logger.error("Regex checks requested, but no regexes found.") - raise types.ConfigException("Regex checks requested, but no regexes found.") - - self.logger.info("Starting scan...") - self._issues = [] - for chunk in self.chunks: - # Run regex scans first to trigger a potential fast fail for bad config - if self.global_options.regex and self.rules_regexes: - for issue in self.scan_regex(chunk): - self._issues.append(issue) - yield issue - if self.global_options.entropy: - for issue in self.scan_entropy( - chunk, - self.global_options.b64_entropy_score, - self.global_options.hex_entropy_score, - ): - self._issues.append(issue) - yield issue - self._completed = True - self.logger.info("Found %d issues.", len(self._issues)) + # I cannot find any written description of the python memory model. The + # correctness of this code in multithreaded environments relies on the + # expectation that the write to _completed at the bottom of the critical + # section cannot be reordered to appear after the implicit release of + # _scan_lock (when viewed from a competing thread). + with self._scan_lock: + if self._completed: + yield from self._issues + return + + if not any((self.global_options.entropy, self.global_options.regex)): + self.logger.error("No analysis requested.") + raise types.ConfigException("No analysis requested.") + if self.global_options.regex and not self.rules_regexes: + self.logger.error("Regex checks requested, but no regexes found.") + raise types.ConfigException( + "Regex checks requested, but no regexes found." + ) + + self.logger.info("Starting scan...") + self._issues = [] + for chunk in self.chunks: + # Run regex scans first to trigger a potential fast fail for bad config + if self.global_options.regex and self.rules_regexes: + for issue in self.scan_regex(chunk): + self._issues.append(issue) + yield issue + if self.global_options.entropy: + for issue in self.scan_entropy( + chunk, + self.global_options.b64_entropy_score, + self.global_options.hex_entropy_score, + ): + self._issues.append(issue) + yield issue + self._completed = True + self.logger.info("Found %d issues.", len(self._issues)) def scan_entropy( self, chunk: types.Chunk, b64_entropy_score: float, hex_entropy_score: float