Feature: report issues as discovered instead of buffering #227

Merged (17 commits, Oct 26, 2021)
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,13 @@
vx.x.x - TBD
------------

Features:

* [#227](https://github.com/godaddy/tartufo/pull/227) - Report findings incrementally
as the scan progresses, instead of holding them all until it has completed. This
is a reimplementation of [#108](https://github.com/godaddy/tartufo/pull/108);
thanks to @dclayton-godaddy for showing the way.

v2.9.0 - 19 October 2021
------------------------

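As a usage note: the streaming behavior can be consumed directly from `scan()`, while the `issues` property keeps the old buffered semantics. A minimal sketch, assuming a hypothetical `make_scanner()` helper in place of constructing a concrete scanner class:

```python
# Sketch of consuming the new streaming API. `make_scanner()` is a
# hypothetical stand-in for building a concrete scanner (e.g. one that
# scans a git repository); it is not part of tartufo's API.
scanner = make_scanner()

# scan() is now a generator, so issues are reported as they are found.
for issue in scanner.scan():
    print(bytes(issue).decode("utf8"))

# The buffered view still works: .issues finishes the scan if necessary
# and returns the complete, cached list.
print(f"Total issues found: {len(scanner.issues)}")
```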
130 changes: 84 additions & 46 deletions tartufo/scanner.py
@@ -1,13 +1,13 @@
# -*- coding: utf-8 -*-

import abc
from functools import lru_cache
import hashlib
import logging
import math
import pathlib
import re
import warnings
from functools import lru_cache
import threading
from typing import (
Any,
Dict,
@@ -19,6 +19,7 @@
Set,
Tuple,
)
import warnings

import click

@@ -120,7 +121,7 @@ def __bytes__(self) -> bytes:
return self.__str__().encode("utf8")


class ScannerBase(abc.ABC):
class ScannerBase(abc.ABC): # pylint: disable=too-many-instance-attributes
"""Provide the base, generic functionality needed by all scanners.

In fact, this contains all of the actual scanning logic. This part of the
@@ -131,30 +132,50 @@ class ScannerBase(abc.ABC):
all the individual pieces of content to be scanned.
"""

_issues: Optional[List[Issue]] = None
_issues: List[Issue] = []
_completed: bool = False
_included_paths: Optional[List[Pattern]] = None
_excluded_paths: Optional[List[Pattern]] = None
_excluded_entropy: Optional[List[Rule]] = None
_rules_regexes: Optional[Dict[str, Rule]] = None
global_options: types.GlobalOptions
logger: logging.Logger
_scan_lock: threading.Lock = threading.Lock()

def __init__(self, options: types.GlobalOptions) -> None:
self.global_options = options
self.logger = logging.getLogger(__name__)

@property
def completed(self) -> bool:
"""Return True if scan has completed

:returns: True if scan has completed; False if scan is in progress
"""

return self._completed

@property
def issues(self) -> List[Issue]:
"""Get a list of issues found during the scan.

If a scan has not yet been run, run it.
If the scan is still in progress, force it to complete first.

:return: Any issues found during the scan.
:rtype: List[Issue]
:returns: Any issues found during the scan.
"""
if self._issues is None:
self.logger.debug("Issues called before scan. Calling scan now.")
self._issues = self.scan()

# Note there is no locking in this method (which is read-only). If the
# first scan has not completed (or even if we mistakenly believe it has
# not, due to a race), we call scan() (which is protected) to ensure the
# issues list is complete. By the time we reach the return statement
# here, we know _issues is stable.

if not self.completed:
self.logger.debug(
"Issues called before scan completed. Finishing scan now."
)
list(self.scan())

return self._issues

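The pattern behind this property and the lock-protected `scan()` below can be sketched in isolation. A minimal illustration (not tartufo code), with a trivial stand-in for the actual scanning work:

```python
import threading
from typing import Iterator, List


class CachedScan:
    """Minimal illustration of the lock + completion-flag pattern."""

    def __init__(self) -> None:
        self._results: List[int] = []
        self._completed = False
        self._lock = threading.Lock()

    def scan(self) -> Iterator[int]:
        with self._lock:
            # A second caller blocks here until the first scan finishes,
            # then replays the cached results without rescanning.
            if self._completed:
                yield from self._results
                return
            for item in range(3):  # stand-in for the real scanning work
                self._results.append(item)
                yield item
            # Written inside the critical section so that a competing
            # thread acquiring the lock sees a fully populated cache.
            self._completed = True

    @property
    def results(self) -> List[int]:
        # Lock-free read path: force completion if (possibly) unfinished.
        if not self._completed:
            list(self.scan())
        return self._results
```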
@property
@@ -345,74 +366,93 @@ def calculate_entropy(self, data: str, char_set: str) -> float:
entropy += -prob_x * math.log2(prob_x)
return entropy

def scan(self) -> List[Issue]:
def scan(self) -> Generator[Issue, None, None]:
"""Run the requested scans against the target data.

This will iterate through all chunks of data as provided by the scanner
implementation, and run all requested scans against it, as specified in
`self.global_options`.

The scan method is thread-safe; if multiple concurrent scans are requested,
the first will run to completion while other callers are blocked (after
which they will each execute in turn, yielding cached issues without
repeating the underlying repository scan).

:raises types.TartufoConfigException: If there were problems with the
scanner's configuration
"""
issues: List[Issue] = []
if not any((self.global_options.entropy, self.global_options.regex)):
self.logger.error("No analysis requested.")
raise types.ConfigException("No analysis requested.")
if self.global_options.regex and not self.rules_regexes:
self.logger.error("Regex checks requested, but no regexes found.")
raise types.ConfigException("Regex checks requested, but no regexes found.")

self.logger.info("Starting scan...")
for chunk in self.chunks:
# Run regex scans first to trigger a potential fast fail for bad config
if self.global_options.regex and self.rules_regexes:
issues += self.scan_regex(chunk)
if self.global_options.entropy:
issues += self.scan_entropy(
chunk,
self.global_options.b64_entropy_score,
self.global_options.hex_entropy_score,

# I cannot find any written description of the Python memory model. The
# correctness of this code in multithreaded environments relies on the
# expectation that the write to _completed at the bottom of the critical
# section cannot be reordered to appear after the implicit release of
# _scan_lock (when viewed from a competing thread).
with self._scan_lock:
if self._completed:
yield from self._issues
return

if not any((self.global_options.entropy, self.global_options.regex)):
self.logger.error("No analysis requested.")
raise types.ConfigException("No analysis requested.")
if self.global_options.regex and not self.rules_regexes:
self.logger.error("Regex checks requested, but no regexes found.")
raise types.ConfigException(
"Regex checks requested, but no regexes found."
)
self._issues = issues
self.logger.info("Found %d issues.", len(self._issues))
return self._issues

self.logger.info("Starting scan...")
self._issues = []
for chunk in self.chunks:
# Run regex scans first to trigger a potential fast fail for bad config
if self.global_options.regex and self.rules_regexes:
for issue in self.scan_regex(chunk):
self._issues.append(issue)
yield issue
if self.global_options.entropy:
for issue in self.scan_entropy(
chunk,
self.global_options.b64_entropy_score,
self.global_options.hex_entropy_score,
):
self._issues.append(issue)
yield issue
self._completed = True
self.logger.info("Found %d issues.", len(self._issues))

def scan_entropy(
self, chunk: types.Chunk, b64_entropy_score: float, hex_entropy_score: float
) -> List[Issue]:
) -> Generator[Issue, None, None]:
"""Scan a chunk of data for apparent high entropy.

:param chunk: The chunk of data to be scanned
:param b64_entropy_score: Base64 entropy score
:param hex_entropy_score: Hexadecimal entropy score
"""
issues: List[Issue] = []

for line in chunk.contents.split("\n"):
for word in line.split():
b64_strings = util.get_strings_of_set(word, BASE64_CHARS)
hex_strings = util.get_strings_of_set(word, HEX_CHARS)

for string in b64_strings:
issues += self.evaluate_entropy_string(
yield from self.evaluate_entropy_string(
chunk, line, string, BASE64_CHARS, b64_entropy_score
)

for string in hex_strings:
issues += self.evaluate_entropy_string(
yield from self.evaluate_entropy_string(
chunk, line, string, HEX_CHARS, hex_entropy_score
)

return issues

def evaluate_entropy_string(
self,
chunk: types.Chunk,
line: str,
string: str,
chars: str,
min_entropy_score: float,
) -> List[Issue]:
) -> Generator[Issue, None, None]:
"""
Check entropy string using entropy characters and score.

@@ -421,23 +461,22 @@ def evaluate_entropy_string(
:param string: String to check
:param chars: Characters to calculate score
:param min_entropy_score: Minimum entropy score to flag
return: List of issues flagged
return: Generator of flagged issues
"""
if not self.signature_is_excluded(string, chunk.file_path):
entropy_score = self.calculate_entropy(string, chars)
if entropy_score > min_entropy_score:
if self.entropy_string_is_excluded(string, line, chunk.file_path):
self.logger.debug("line containing entropy was excluded: %s", line)
else:
return [Issue(types.IssueType.Entropy, string, chunk)]
return []
yield Issue(types.IssueType.Entropy, string, chunk)

def scan_regex(self, chunk: types.Chunk) -> List[Issue]:
def scan_regex(self, chunk: types.Chunk) -> Generator[Issue, None, None]:
"""Scan a chunk of data for matches against the configured regexes.

:param chunk: The chunk of data to be scanned
"""
issues: List[Issue] = []

for key, rule in self.rules_regexes.items():
if rule.path_pattern is None or rule.path_pattern.match(chunk.file_path):
found_strings = rule.pattern.findall(chunk.contents)
@@ -446,8 +485,7 @@ def scan_regex(self, chunk: types.Chunk) -> List[Issue]:
if not self.signature_is_excluded(match, chunk.file_path):
issue = Issue(types.IssueType.RegEx, match, chunk)
issue.issue_detail = key
issues.append(issue)
return issues
yield issue

@property
@abc.abstractmethod
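For reference, the score computed in `calculate_entropy` (the `entropy += -prob_x * math.log2(prob_x)` accumulation shown above) is Shannon entropy in bits per character. A standalone worked example, with an illustrative base64 character set:

```python
import math

# Illustrative character set; tartufo defines its own BASE64_CHARS.
BASE64_CHARS = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
)


def shannon_entropy(data: str, char_set: str) -> float:
    """Shannon entropy in bits/char, mirroring calculate_entropy."""
    if not data:
        return 0.0
    entropy = 0.0
    for char in char_set:
        prob = data.count(char) / len(data)
        if prob > 0:
            entropy += -prob * math.log2(prob)
    return entropy


print(shannon_entropy("aaaaaaaa", BASE64_CHARS))          # 0.0: pure repetition
print(shannon_entropy("zWd7xK2pQv9RtB4m", BASE64_CHARS))  # 4.0: 16 distinct chars
```

Strings scoring above the configured threshold (e.g. `b64_entropy_score`) are flagged, subject to the signature and entropy-pattern exclusions applied in `evaluate_entropy_string`.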
42 changes: 32 additions & 10 deletions tartufo/util.py
@@ -8,7 +8,16 @@
from datetime import datetime
from functools import lru_cache, partial
from hashlib import blake2s
from typing import Any, Callable, Dict, Iterable, List, Optional, TYPE_CHECKING, Pattern
from typing import (
Any,
Callable,
Dict,
Iterable,
List,
Optional,
TYPE_CHECKING,
Pattern,
)

import click
import git
@@ -56,6 +65,7 @@ def echo_result(
:param repo_path: The path to the repository the issues were found in
:param output_dir: The directory that issue details were written out to
"""

now = datetime.now().isoformat("T", "microseconds")
if options.json:
output = {
@@ -69,24 +79,36 @@
"exclude_entropy_patterns": [
str(pattern) for pattern in options.exclude_entropy_patterns
],
"found_issues": [
issue.as_dict(compact=options.compact) for issue in scanner.issues
],
# This member is kept for reference; see the streaming logic below.
# "found_issues": [
# issue.as_dict(compact=options.compact) for issue in scanner.issues
# ],
}

click.echo(json.dumps(output))
# Observation: we want to "stream" JSON, and the only generated content
# is the top-level "found_issues" list. Dump the "static" part minus its
# closing "}", then emit each issue as it is discovered, then close the
# list and the object.
static_part = json.dumps(output)
click.echo(f'{static_part[:-1]}, "found_issues": [', nl=False)
delimiter = ""
for issue in scanner.scan():
live_part = json.dumps(issue.as_dict(compact=options.compact))
click.echo(f"{delimiter}{live_part}", nl=False)
delimiter = ", "
click.echo("]}")
elif options.compact:
for issue in scanner.issues:
for issue in scanner.scan():
click.echo(
f"[{issue.issue_type.value}] {issue.chunk.file_path}: {issue.matched_string} "
f"({issue.signature}, {issue.issue_detail})"
)
else:
for issue in scanner.scan():
click.echo(bytes(issue))
if not scanner.issues:
if not options.quiet:
click.echo(f"Time: {now}\nAll clear. No secrets detected.")
else:
click.echo(b"\n".join([bytes(issue) for issue in scanner.issues]))
if options.verbose > 0:
click.echo("\nExcluded paths:")
click.echo("\n".join([path.pattern for path in scanner.excluded_paths]))
@@ -96,10 +118,10 @@ def echo_result(
click.echo("\n".join(options.exclude_entropy_patterns))


def write_outputs(found_issues: "List[Issue]", output_dir: pathlib.Path) -> List[str]:
def write_outputs(found_issues: List["Issue"], output_dir: pathlib.Path) -> List[str]:
"""Write details of the issues to individual files in the specified directory.

:param found_issues: The list of issues to be written out
:param found_issues: A list of issues to be written out
:param output_dir: The directory where the files should be written
"""
result_files = []
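The streaming-JSON approach in `echo_result` generalizes beyond tartufo: serialize the static fields, drop the closing brace, then append array elements as they arrive. A minimal sketch of the same technique with illustrative names (it assumes at least one static field, as `echo_result` always has):

```python
import json
from typing import Iterable, Iterator


def stream_json(static_fields: dict, items: Iterable[dict]) -> Iterator[str]:
    """Yield pieces of a JSON object whose "found_issues" array is
    produced incrementally, mirroring the echo_result approach."""
    static_part = json.dumps(static_fields)
    # Drop the closing "}" so the array can be spliced into the object.
    yield f'{static_part[:-1]}, "found_issues": ['
    delimiter = ""
    for item in items:
        yield f"{delimiter}{json.dumps(item)}"
        delimiter = ", "
    yield "]}"


# Each piece can be echoed to the terminal as soon as it is produced.
for piece in stream_json({"scan_type": "demo"}, [{"id": 1}, {"id": 2}]):
    print(piece, end="")
print()
# -> {"scan_type": "demo", "found_issues": [{"id": 1}, {"id": 2}]}
```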