Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: report issues as discovered instead of buffering #227

Merged
merged 17 commits into from
Oct 26, 2021
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
vx.x.x - TBD
------------

Features:

* [#227](https://github.com/godaddy/tartufo/pull/227) - Report findings incrementally
as the scan progresses instead of holding all of them until the scan has completed. This
is a reimplementation of [#108](https://github.com/godaddy/tartufo/pull/108);
thanks to @dclayton-godaddy for showing the way.

v2.9.0 - 19 October 2021
------------------------

Expand Down
72 changes: 44 additions & 28 deletions tartufo/scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def __bytes__(self) -> bytes:
return self.__str__().encode("utf8")


class ScannerBase(abc.ABC):
class ScannerBase(abc.ABC): # pylint: disable=too-many-instance-attributes
"""Provide the base, generic functionality needed by all scanners.

In fact, this contains all of the actual scanning logic. This part of the
Expand All @@ -131,7 +131,8 @@ class ScannerBase(abc.ABC):
all the individual pieces of content to be scanned.
"""

_issues: Optional[List[Issue]] = None
_issues: List[Issue] = []
_completed: bool = False
_included_paths: Optional[List[Pattern]] = None
_excluded_paths: Optional[List[Pattern]] = None
_excluded_entropy: Optional[List[Rule]] = None
Expand All @@ -143,18 +144,30 @@ def __init__(self, options: types.GlobalOptions) -> None:
self.global_options = options
self.logger = logging.getLogger(__name__)

@property
def completed(self) -> bool:
"""Return True if scan has completed

:returns: True if scan has completed; False if scan is in progress
"""

return self._completed

@property
def issues(self) -> List[Issue]:
"""Get a list of issues found during the scan.

If a scan has not yet been run, run it.
If the scan is still in progress, force it to complete first.

:return: Any issues found during the scan.
:rtype: List[Issue]
:returns: Any issues found during the scan.
"""
if self._issues is None:
self.logger.debug("Issues called before scan. Calling scan now.")
self._issues = self.scan()

if not self.completed:
self.logger.debug(
"Issues called before scan completed. Finishing scan now."
)
list(self.scan())
rbailey-godaddy marked this conversation as resolved.
Show resolved Hide resolved

return self._issues

@property
Expand Down Expand Up @@ -345,7 +358,7 @@ def calculate_entropy(self, data: str, char_set: str) -> float:
entropy += -prob_x * math.log2(prob_x)
return entropy

def scan(self) -> List[Issue]:
def scan(self) -> Generator[Issue, None, None]:
"""Run the requested scans against the target data.

This will iterate through all chunks of data as provided by the scanner
Expand All @@ -355,7 +368,11 @@ def scan(self) -> List[Issue]:
:raises types.TartufoConfigException: If there were problems with the
scanner's configuration
"""
issues: List[Issue] = []

if self.completed:
yield from self._issues
return

if not any((self.global_options.entropy, self.global_options.regex)):
self.logger.error("No analysis requested.")
raise types.ConfigException("No analysis requested.")
Expand All @@ -364,55 +381,53 @@ def scan(self) -> List[Issue]:
raise types.ConfigException("Regex checks requested, but no regexes found.")

self.logger.info("Starting scan...")
self._issues = []
for chunk in self.chunks:
# Run regex scans first to trigger a potential fast fail for bad config
if self.global_options.regex and self.rules_regexes:
issues += self.scan_regex(chunk)
yield from self.scan_regex(chunk)
if self.global_options.entropy:
issues += self.scan_entropy(
yield from self.scan_entropy(
chunk,
self.global_options.b64_entropy_score,
self.global_options.hex_entropy_score,
)
self._issues = issues
self._completed = True
self.logger.info("Found %d issues.", len(self._issues))
return self._issues

def scan_entropy(
self, chunk: types.Chunk, b64_entropy_score: float, hex_entropy_score: float
) -> List[Issue]:
) -> Generator[Issue, None, None]:
"""Scan a chunk of data for apparent high entropy.

:param chunk: The chunk of data to be scanned
:param b64_entropy_score: Base64 entropy score
:param hex_entropy_score: Hexadecimal entropy score
"""
issues: List[Issue] = []

for line in chunk.contents.split("\n"):
for word in line.split():
b64_strings = util.get_strings_of_set(word, BASE64_CHARS)
hex_strings = util.get_strings_of_set(word, HEX_CHARS)

for string in b64_strings:
issues += self.evaluate_entropy_string(
yield from self.evaluate_entropy_string(
chunk, line, string, BASE64_CHARS, b64_entropy_score
)

for string in hex_strings:
issues += self.evaluate_entropy_string(
yield from self.evaluate_entropy_string(
chunk, line, string, HEX_CHARS, hex_entropy_score
)

return issues

def evaluate_entropy_string(
self,
chunk: types.Chunk,
line: str,
string: str,
chars: str,
min_entropy_score: float,
) -> List[Issue]:
) -> Generator[Issue, None, None]:
"""
Check entropy string using entropy characters and score.

Expand All @@ -421,23 +436,24 @@ def evaluate_entropy_string(
:param string: String to check
:param chars: Characters to calculate score
:param min_entropy_score: Minimum entropy score to flag
return: List of issues flagged
return: Iterator of issues flagged
"""
if not self.signature_is_excluded(string, chunk.file_path):
entropy_score = self.calculate_entropy(string, chars)
if entropy_score > min_entropy_score:
if self.entropy_string_is_excluded(string, line, chunk.file_path):
self.logger.debug("line containing entropy was excluded: %s", line)
else:
return [Issue(types.IssueType.Entropy, string, chunk)]
return []
issue = Issue(types.IssueType.Entropy, string, chunk)
self._issues.append(issue)
rbailey-godaddy marked this conversation as resolved.
Show resolved Hide resolved
yield issue

def scan_regex(self, chunk: types.Chunk) -> List[Issue]:
def scan_regex(self, chunk: types.Chunk) -> Generator[Issue, None, None]:
"""Scan a chunk of data for matches against the configured regexes.

:param chunk: The chunk of data to be scanned
"""
issues: List[Issue] = []

for key, rule in self.rules_regexes.items():
if rule.path_pattern is None or rule.path_pattern.match(chunk.file_path):
found_strings = rule.pattern.findall(chunk.contents)
Expand All @@ -446,8 +462,8 @@ def scan_regex(self, chunk: types.Chunk) -> List[Issue]:
if not self.signature_is_excluded(match, chunk.file_path):
issue = Issue(types.IssueType.RegEx, match, chunk)
issue.issue_detail = key
issues.append(issue)
return issues
self._issues.append(issue)
rbailey-godaddy marked this conversation as resolved.
Show resolved Hide resolved
yield issue

@property
@abc.abstractmethod
Expand Down
42 changes: 32 additions & 10 deletions tartufo/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,16 @@
from datetime import datetime
from functools import lru_cache, partial
from hashlib import blake2s
from typing import Any, Callable, Dict, Iterable, List, Optional, TYPE_CHECKING, Pattern
from typing import (
Any,
Callable,
Dict,
Iterable,
List,
Optional,
TYPE_CHECKING,
Pattern,
)

import click
import git
Expand Down Expand Up @@ -56,6 +65,7 @@ def echo_result(
:param repo_path: The path to the repository the issues were found in
:param output_dir: The directory that issue details were written out to
"""

now = datetime.now().isoformat("T", "microseconds")
if options.json:
output = {
Expand All @@ -69,24 +79,36 @@ def echo_result(
"exclude_entropy_patterns": [
str(pattern) for pattern in options.exclude_entropy_patterns
],
"found_issues": [
issue.as_dict(compact=options.compact) for issue in scanner.issues
],
# This member is for reference. Read below...
# "found_issues": [
# issue.as_dict(compact=options.compact) for issue in scanner.issues
# ],
}

click.echo(json.dumps(output))
# Observation: We want to "stream" JSON; the only generator output is the
# "found_issues" list (which is at the top level). Dump the "static" part
# minus the closing "}", then generate issues individually, then emit the
# closing "}".
static_part = json.dumps(output)
click.echo(f'{static_part[:-1]}, "found_issues": [', nl=False)
delimiter = ""
for issue in scanner.scan():
live_part = json.dumps(issue.as_dict(compact=options.compact))
click.echo(f"{delimiter}{live_part}", nl=False)
delimiter = ", "
click.echo("]}")
elif options.compact:
for issue in scanner.issues:
for issue in scanner.scan():
click.echo(
f"[{issue.issue_type.value}] {issue.chunk.file_path}: {issue.matched_string} "
f"({issue.signature}, {issue.issue_detail})"
)
else:
for issue in scanner.scan():
click.echo(bytes(issue))
if not scanner.issues:
if not options.quiet:
click.echo(f"Time: {now}\nAll clear. No secrets detected.")
else:
click.echo(b"\n".join([bytes(issue) for issue in scanner.issues]))
if options.verbose > 0:
click.echo("\nExcluded paths:")
click.echo("\n".join([path.pattern for path in scanner.excluded_paths]))
Expand All @@ -96,10 +118,10 @@ def echo_result(
click.echo("\n".join(options.exclude_entropy_patterns))


def write_outputs(found_issues: "List[Issue]", output_dir: pathlib.Path) -> List[str]:
def write_outputs(found_issues: List["Issue"], output_dir: pathlib.Path) -> List[str]:
"""Write details of the issues to individual files in the specified directory.

:param found_issues: The list of issues to be written out
:param found_issues: A list of issues to be written out
:param output_dir: The directory where the files should be written
"""
result_files = []
Expand Down
Loading