From d21e003cd3c0df7f626a7eab70d639abda617a46 Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Mon, 8 Feb 2021 15:37:21 -0500 Subject: [PATCH 01/29] Initial changes after cloing repository Swap out instances of skeleton-python-library and do some initial reference updates --- .github/lineage.yml | 2 +- CONTRIBUTING.md | 10 +++++----- README.md | 12 ++++++------ setup.py | 15 +++++++-------- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/.github/lineage.yml b/.github/lineage.yml index 8dfc20b..569fc89 100644 --- a/.github/lineage.yml +++ b/.github/lineage.yml @@ -3,4 +3,4 @@ version: "1" lineage: skeleton: - remote-url: https://github.com/cisagov/skeleton-generic.git + remote-url: https://github.com/cisagov/skeleton-python-library.git diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 24d6e98..71e7997 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -15,7 +15,7 @@ all of which should be in this repository. If you want to report a bug or request a new feature, the most direct method is to [create an -issue](https://github.com/cisagov/skeleton-python-library/issues) in +issue](https://github.com/cisagov/hash-http-content/issues) in this repository. We recommend that you first search through existing issues (both open and closed) to check if your particular issue has already been reported. If it has then you might want to add a comment @@ -25,7 +25,7 @@ one. ## Pull requests ## If you choose to [submit a pull -request](https://github.com/cisagov/skeleton-python-library/pulls), +request](https://github.com/cisagov/hash-http-content/pulls), you will notice that our continuous integration (CI) system runs a fairly extensive set of linters, syntax checkers, system, and unit tests. Your pull request may fail these checks, and that's OK. 
If you want @@ -111,9 +111,9 @@ can create and configure the Python virtual environment with these commands: ```console -cd skeleton-python-library -pyenv virtualenv skeleton-python-library -pyenv local skeleton-python-library +cd hash-http-content +pyenv virtualenv hash-http-content +pyenv local hash-http-content pip install --requirement requirements-dev.txt ``` diff --git a/README.md b/README.md index 7f20bda..902dfa1 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ -# skeleton-python-library # +# hash-http-content # -[![GitHub Build Status](https://github.com/cisagov/skeleton-python-library/workflows/build/badge.svg)](https://github.com/cisagov/skeleton-python-library/actions) -[![Coverage Status](https://coveralls.io/repos/github/cisagov/skeleton-python-library/badge.svg?branch=develop)](https://coveralls.io/github/cisagov/skeleton-python-library?branch=develop) -[![Total alerts](https://img.shields.io/lgtm/alerts/g/cisagov/skeleton-python-library.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/cisagov/skeleton-python-library/alerts/) -[![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/cisagov/skeleton-python-library.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/cisagov/skeleton-python-library/context:python) -[![Known Vulnerabilities](https://snyk.io/test/github/cisagov/skeleton-python-library/develop/badge.svg)](https://snyk.io/test/github/cisagov/skeleton-python-library) +[![GitHub Build Status](https://github.com/cisagov/hash-http-content/workflows/build/badge.svg)](https://github.com/cisagov/hash-http-content/actions) +[![Coverage Status](https://coveralls.io/repos/github/cisagov/hash-http-content/badge.svg?branch=develop)](https://coveralls.io/github/cisagov/hash-http-content?branch=develop) +[![Total alerts](https://img.shields.io/lgtm/alerts/g/cisagov/hash-http-content.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/cisagov/hash-http-content/alerts/) +[![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/cisagov/hash-http-content.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/cisagov/hash-http-content/context:python) +[![Known Vulnerabilities](https://snyk.io/test/github/cisagov/hash-http-content/develop/badge.svg)](https://snyk.io/test/github/cisagov/hash-http-content) This is a generic skeleton project that can be used to quickly get a new [cisagov](https://github.com/cisagov) Python library GitHub diff --git a/setup.py b/setup.py index a458722..eec3899 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ """ -This is the setup module for the example project. +This is the setup module for the hash-http-content project. Based on: @@ -42,16 +42,16 @@ def get_version(version_file): setup( - name="example", + name="hash-http-content", # Versions should comply with PEP440 version=get_version("src/example/_version.py"), - description="Example python library", + description="HTTP content hasher", long_description=readme(), long_description_content_type="text/markdown", # NCATS "homepage" url="https://www.us-cert.gov/resources/ncats", # The project's main homepage - download_url="https://github.com/cisagov/skeleton-python-library", + download_url="https://github.com/cisagov/hash-http-content", # Author details author="Cyber and Infrastructure Security Agency", author_email="ncats@hq.dhs.gov", @@ -77,10 +77,9 @@ def get_version(version_file): ], python_requires=">=3.6", # What does your project relate to? 
- keywords="skeleton", + keywords="hash http requests", packages=find_packages(where="src"), package_dir={"": "src"}, - package_data={"example": ["data/*.txt"]}, py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")], include_package_data=True, install_requires=["docopt", "schema", "setuptools >= 24.2.0"], @@ -99,6 +98,6 @@ def get_version(version_file): "pytest", ] }, - # Conveniently allows one to run the CLI tool as `example` - entry_points={"console_scripts": ["example = example.example:main"]}, + # Conveniently allows one to run the CLI tool as `hash-url` + entry_points={"console_scripts": ["hash-url = example.example:main"]}, ) From 5e656c43ef98b5cac63eef21859c434d810ee2ea Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Mon, 8 Feb 2021 15:48:45 -0500 Subject: [PATCH 02/29] Remove example files and put in framework for this module Get rid of or move any of the example library's files and put in place some files to implement for this package's functionality. --- .coveragerc | 2 +- bump_version.sh | 2 +- setup.py | 4 +- src/example/data/secret.txt | 1 - src/example/example.py | 108 --------------- .../__init__.py | 8 +- .../__main__.py | 2 +- .../_version.py | 0 src/hash_http_content/cli.py | 5 + src/hash_http_content/hasher.py | 1 + tests/test_cli.py | 2 + tests/test_example.py | 128 ------------------ tests/test_hasher.py | 2 + 13 files changed, 20 insertions(+), 245 deletions(-) delete mode 100644 src/example/data/secret.txt delete mode 100644 src/example/example.py rename src/{example => hash_http_content}/__init__.py (72%) rename src/{example => hash_http_content}/__main__.py (73%) rename src/{example => hash_http_content}/_version.py (100%) create mode 100644 src/hash_http_content/cli.py create mode 100644 src/hash_http_content/hasher.py create mode 100644 tests/test_cli.py delete mode 100644 tests/test_example.py create mode 100644 tests/test_hasher.py diff --git a/.coveragerc b/.coveragerc index d315b87..1fe2ed9 100644 --- a/.coveragerc +++ b/.coveragerc @@ -2,7 +2,7 @@ # https://coverage.readthedocs.io/en/latest/config.html [run] -source = src/example +source = src/hash_http_content omit = branch = true diff --git a/bump_version.sh b/bump_version.sh index 861eed0..b5ca161 100755 --- a/bump_version.sh +++ b/bump_version.sh @@ -6,7 +6,7 @@ set -o nounset set -o errexit set -o pipefail -VERSION_FILE=src/example/_version.py +VERSION_FILE=src/hash_http_content/_version.py HELP_INFORMATION="bump_version.sh (show|major|minor|patch|prerelease|build|finalize)" diff --git a/setup.py b/setup.py index eec3899..e7803d7 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ def get_version(version_file): setup( name="hash-http-content", # Versions should comply with PEP440 - version=get_version("src/example/_version.py"), + version=get_version("src/hash_http_content/_version.py"), description="HTTP content hasher", long_description=readme(), long_description_content_type="text/markdown", @@ -99,5 +99,5 @@ def get_version(version_file): ] }, # Conveniently allows one to run the CLI tool as `hash-url` - entry_points={"console_scripts": ["hash-url = example.example:main"]}, + entry_points={"console_scripts": ["hash-url = hash_http_content.cli:main"]}, ) diff --git a/src/example/data/secret.txt b/src/example/data/secret.txt deleted file mode 100644 index c40a49b..0000000 --- a/src/example/data/secret.txt +++ /dev/null @@ -1 +0,0 @@ -Three may keep a secret, if two of them are dead. 
diff --git a/src/example/example.py b/src/example/example.py deleted file mode 100644 index 73faa33..0000000 --- a/src/example/example.py +++ /dev/null @@ -1,108 +0,0 @@ -"""example is an example Python library and tool. - -Divide one integer by another and log the result. Also log some information -from an environment variable and a package resource. - -EXIT STATUS - This utility exits with one of the following values: - 0 Calculation completed successfully. - >0 An error occurred. - -Usage: - example [--log-level=LEVEL] - example (-h | --help) - -Options: - -h --help Show this message. - --log-level=LEVEL If specified, then the log level will be set to - the specified value. Valid values are "debug", "info", - "warning", "error", and "critical". [default: info] -""" - -# Standard Python Libraries -import logging -import os -import sys -from typing import Any, Dict - -# Third-Party Libraries -import docopt -import pkg_resources -from schema import And, Schema, SchemaError, Use - -from ._version import __version__ - -DEFAULT_ECHO_MESSAGE: str = "Hello World from the example default!" - - -def example_div(dividend: float, divisor: float) -> float: - """Print some logging messages.""" - logging.debug("This is a debug message") - logging.info("This is an info message") - logging.warning("This is a warning message") - logging.error("This is an error message") - logging.critical("This is a critical message") - return dividend / divisor - - -def main() -> int: - """Set up logging and call the example function.""" - args: Dict[str, str] = docopt.docopt(__doc__, version=__version__) - # Validate and convert arguments as needed - schema: Schema = Schema( - { - "--log-level": And( - str, - Use(str.lower), - lambda n: n in ("debug", "info", "warning", "error", "critical"), - error="Possible values for --log-level are " - + "debug, info, warning, error, and critical.", - ), - "": Use(int, error=" must be an integer."), - "": And( - Use(int), - lambda n: n != 0, - error=" must be an integer that is not 0.", - ), - str: object, # Don't care about other keys, if any - } - ) - - try: - validated_args: Dict[str, Any] = schema.validate(args) - except SchemaError as err: - # Exit because one or more of the arguments were invalid - print(err, file=sys.stderr) - return 1 - - # Assign validated arguments to variables - dividend: int = validated_args[""] - divisor: int = validated_args[""] - log_level: str = validated_args["--log-level"] - - # Set up logging - logging.basicConfig( - format="%(asctime)-15s %(levelname)s %(message)s", level=log_level.upper() - ) - - logging.info(f"{dividend} / {divisor} == {example_div(dividend, divisor)}") - - # Access some data from an environment variable - message: str = os.getenv("ECHO_MESSAGE", DEFAULT_ECHO_MESSAGE) - logging.info(f'ECHO_MESSAGE="{message}"') - - # Access some data from our package data (see the setup.py) - secret_message: str = ( - pkg_resources.resource_string("example", "data/secret.txt") - .decode("utf-8") - .strip() - ) - logging.info(f'Secret="{secret_message}"') - - # Stop logging and clean up - logging.shutdown() - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/example/__init__.py b/src/hash_http_content/__init__.py similarity index 72% rename from src/example/__init__.py rename to src/hash_http_content/__init__.py index 98b5e04..2714aa2 100644 --- a/src/example/__init__.py +++ b/src/hash_http_content/__init__.py @@ -1,9 +1,11 @@ -"""The example library.""" +"""The hash-http-content library.""" +# Standard Python 
Libraries +from typing import List + # We disable a Flake8 check for "Module imported but unused (F401)" here because # although this import is not directly used, it populates the value # package_name.__version__, which is used to get version information about this # Python package. from ._version import __version__ # noqa: F401 -from .example import example_div -__all__ = ["example_div"] +__all__: List[str] = [] diff --git a/src/example/__main__.py b/src/hash_http_content/__main__.py similarity index 73% rename from src/example/__main__.py rename to src/hash_http_content/__main__.py index 11a3238..c123d36 100644 --- a/src/example/__main__.py +++ b/src/hash_http_content/__main__.py @@ -1,5 +1,5 @@ """Code to run if this package is used as a Python module.""" -from .example import main +from .cli import main main() diff --git a/src/example/_version.py b/src/hash_http_content/_version.py similarity index 100% rename from src/example/_version.py rename to src/hash_http_content/_version.py diff --git a/src/hash_http_content/cli.py b/src/hash_http_content/cli.py new file mode 100644 index 0000000..e5855e3 --- /dev/null +++ b/src/hash_http_content/cli.py @@ -0,0 +1,5 @@ +"""Command line interface to the hash-url-content package.""" + + +def main(): + """Return the hash(es) and information from the requested URL(s).""" diff --git a/src/hash_http_content/hasher.py b/src/hash_http_content/hasher.py new file mode 100644 index 0000000..a72573a --- /dev/null +++ b/src/hash_http_content/hasher.py @@ -0,0 +1 @@ +"""Functionality to get a hash of URL's visible content.""" diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..154e0b3 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,2 @@ +#!/usr/bin/env pytest -vs +"""Tests for hash_url_content command line interface.""" diff --git a/tests/test_example.py b/tests/test_example.py deleted file mode 100644 index 3a22848..0000000 --- a/tests/test_example.py +++ /dev/null @@ -1,128 +0,0 @@ -#!/usr/bin/env pytest -vs -"""Tests for example.""" - -# Standard Python Libraries -import logging -import os -import sys -from unittest.mock import patch - -# Third-Party Libraries -import pytest - -# cisagov Libraries -import example - -div_params = [ - (1, 1, 1), - (2, 2, 1), - (0, 1, 0), - (8, 2, 4), -] - -log_levels = ( - "debug", - "info", - "warning", - "error", - "critical", -) - -# define sources of version strings -RELEASE_TAG = os.getenv("RELEASE_TAG") -PROJECT_VERSION = example.__version__ - - -def test_stdout_version(capsys): - """Verify that version string sent to stdout agrees with the module version.""" - with pytest.raises(SystemExit): - with patch.object(sys, "argv", ["bogus", "--version"]): - example.example.main() - captured = capsys.readouterr() - assert ( - captured.out == f"{PROJECT_VERSION}\n" - ), "standard output by '--version' should agree with module.__version__" - - -def test_running_as_module(capsys): - """Verify that the __main__.py file loads correctly.""" - with pytest.raises(SystemExit): - with patch.object(sys, "argv", ["bogus", "--version"]): - # F401 is a "Module imported but unused" warning. This import - # emulates how this project would be run as a module. The only thing - # being done by __main__ is importing the main entrypoint of the - # package and running it, so there is nothing to use from this - # import. As a result, we can safely ignore this warning. 
- # cisagov Libraries - import example.__main__ # noqa: F401 - captured = capsys.readouterr() - assert ( - captured.out == f"{PROJECT_VERSION}\n" - ), "standard output by '--version' should agree with module.__version__" - - -@pytest.mark.skipif( - RELEASE_TAG in [None, ""], reason="this is not a release (RELEASE_TAG not set)" -) -def test_release_version(): - """Verify that release tag version agrees with the module version.""" - assert ( - RELEASE_TAG == f"v{PROJECT_VERSION}" - ), "RELEASE_TAG does not match the project version" - - -@pytest.mark.parametrize("level", log_levels) -def test_log_levels(level): - """Validate commandline log-level arguments.""" - with patch.object(sys, "argv", ["bogus", f"--log-level={level}", "1", "1"]): - with patch.object(logging.root, "handlers", []): - assert ( - logging.root.hasHandlers() is False - ), "root logger should not have handlers yet" - return_code = example.example.main() - assert ( - logging.root.hasHandlers() is True - ), "root logger should now have a handler" - assert return_code == 0, "main() should return success (0)" - - -def test_bad_log_level(): - """Validate bad log-level argument returns error.""" - with patch.object(sys, "argv", ["bogus", "--log-level=emergency", "1", "1"]): - return_code = example.example.main() - assert return_code == 1, "main() should return failure" - - -@pytest.mark.parametrize("dividend, divisor, quotient", div_params) -def test_division(dividend, divisor, quotient): - """Verify division results.""" - result = example.example_div(dividend, divisor) - assert result == quotient, "result should equal quotient" - - -@pytest.mark.slow -def test_slow_division(): - """Example of using a custom marker. - - This test will only be run if --runslow is passed to pytest. - Look in conftest.py to see how this is implemented. - """ - # Standard Python Libraries - import time - - result = example.example_div(256, 16) - time.sleep(4) - assert result == 16, "result should equal be 16" - - -def test_zero_division(): - """Verify that division by zero throws the correct exception.""" - with pytest.raises(ZeroDivisionError): - example.example_div(1, 0) - - -def test_zero_divisor_argument(): - """Verify that a divisor of zero is handled as expected.""" - with patch.object(sys, "argv", ["bogus", "1", "0"]): - return_code = example.example.main() - assert return_code == 1, "main() should exit with error" diff --git a/tests/test_hasher.py b/tests/test_hasher.py new file mode 100644 index 0000000..2be8cba --- /dev/null +++ b/tests/test_hasher.py @@ -0,0 +1,2 @@ +#!/usr/bin/env pytest -vs +"""Tests for hash_url_content URL hashing functionality.""" From c989d96eae89e02f966a082fc9f6171e511188fa Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Mon, 8 Feb 2021 15:52:37 -0500 Subject: [PATCH 03/29] Add utility methods for hashing Add utility methods to get a desired hashing object and to update and get the digest from a hashing object. 
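For orientation, a minimal usage sketch of the two helpers this patch introduces, with names taken from the diff that follows; the input bytes here are arbitrary illustrations, not part of the patch:

```python
# Illustrative only (not part of the patch): using the helpers added below.
from hash_http_content.hasher import get_hash_digest, get_hasher

# One-shot: hash some bytes with any algorithm name hashlib recognizes.
print(get_hash_digest("sha256", b"cisagov"))

# Incremental: get the underlying hashlib object and feed it in pieces.
hasher = get_hasher("sha256")
hasher.update(b"Commit today, ")
hasher.update(b"secure tomorrow.")
print(hasher.hexdigest())
```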
--- src/hash_http_content/hasher.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/src/hash_http_content/hasher.py b/src/hash_http_content/hasher.py index a72573a..b5b984f 100644 --- a/src/hash_http_content/hasher.py +++ b/src/hash_http_content/hasher.py @@ -1 +1,30 @@ -"""Functionality to get a hash of URL's visible content.""" +"""Functionality to get a hash of an HTTP URL's visible content.""" + +# Standard Python Libraries +import hashlib + + +def get_hasher(hash_algorithm: str) -> "hashlib._Hash": + """Get a hashing object.""" + # Not all implementations support the "usedforsecurity" keyword argument, + # which is used to indicate that the algorithm is being used for non-security + # related tasks. This is required for some algorithms on FIPS systems. + try: + hasher = getattr(hashlib, hash_algorithm)(usedforsecurity=False) + except AttributeError: + # There is no named constructor for the desired hashing algorithm + try: + # Work around typeshed's incorrect type hints + hasher = getattr(hashlib, "new")(hash_algorithm, usedforsecurity=False) + except TypeError: + hasher = hashlib.new(hash_algorithm) + except TypeError: + hasher = getattr(hashlib, hash_algorithm)() + return hasher + + +def get_hash_digest(hash_algorithm: str, contents: bytes) -> str: + """Get a hex digest representing a hash of the given contents.""" + hasher: "hashlib._Hash" = get_hasher(hash_algorithm) + hasher.update(contents) + return hasher.hexdigest() From dde6e1533b3b13796a16dbad40b237330fb10b17 Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Mon, 8 Feb 2021 15:54:51 -0500 Subject: [PATCH 04/29] Add NamedTuples to store expected results --- src/hash_http_content/hasher.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/hash_http_content/hasher.py b/src/hash_http_content/hasher.py index b5b984f..94ec2e9 100644 --- a/src/hash_http_content/hasher.py +++ b/src/hash_http_content/hasher.py @@ -2,6 +2,7 @@ # Standard Python Libraries import hashlib +from typing import NamedTuple def get_hasher(hash_algorithm: str) -> "hashlib._Hash": @@ -28,3 +29,20 @@ def get_hash_digest(hash_algorithm: str, contents: bytes) -> str: hasher: "hashlib._Hash" = get_hasher(hash_algorithm) hasher.update(contents) return hasher.hexdigest() + + +class HandlerResult(NamedTuple): + """Named tuple to store the result of a handler call.""" + + hash: str + contents: bytes + + +class UrlResult(NamedTuple): + """Named tuple to store the result of a SiteHasher.hash_url() call.""" + + status: int + visited_url: str + is_redirect: bool + hash: str + contents: bytes From 9b00ea917b393ff8feeb0defdb3bc2c445b8aac3 Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Mon, 8 Feb 2021 15:56:36 -0500 Subject: [PATCH 05/29] Implement a class to provide URL hashing functionality The class has a primary method UrlHasher.hash_url() for functionality with helper methods. The main content processing is done by _handle* methods that are designed to handle a specific content-type as given by the remote server. These methods will always return a HandlerResult to standardize them, and the UrlHasher.hash_url() method will use the UrlResult named tuple to provide access convenience while making the results immutable. 
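A short sketch of how the class described above is meant to be driven, mirroring the tests and CLI added in later patches; at this point the class is importable from the hasher module only, and the URL matches the one used in the tests:

```python
# Illustrative only: exercising UrlHasher the way the later tests and CLI do.
from hash_http_content.hasher import UrlHasher

hasher = UrlHasher("sha256")
result = hasher.hash_url("https://example.com")

# UrlResult is a NamedTuple, so its fields are read-only attributes.
print(result.status)       # HTTP status code of the final response
print(result.visited_url)  # URL actually retrieved (after any redirects)
print(result.is_redirect)  # True if a 301/307/308 appeared in the history
print(result.hash)         # hex digest of the processed contents
print(result.contents)     # the processed bytes that were hashed
```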
--- setup.py | 9 ++- src/hash_http_content/hasher.py | 136 +++++++++++++++++++++++++++++++- 2 files changed, 143 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index e7803d7..4ba8da8 100644 --- a/setup.py +++ b/setup.py @@ -82,7 +82,14 @@ def get_version(version_file): package_dir={"": "src"}, py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")], include_package_data=True, - install_requires=["docopt", "schema", "setuptools >= 24.2.0"], + install_requires=[ + "beautifulsoup4", + "docopt", + "pyppeteer", + "requests", + "schema", + "setuptools >= 24.2.0", + ], extras_require={ "test": [ "coverage", diff --git a/src/hash_http_content/hasher.py b/src/hash_http_content/hasher.py index 94ec2e9..e61768e 100644 --- a/src/hash_http_content/hasher.py +++ b/src/hash_http_content/hasher.py @@ -1,8 +1,18 @@ """Functionality to get a hash of an HTTP URL's visible content.""" # Standard Python Libraries +import asyncio import hashlib -from typing import NamedTuple +import json +from typing import Any, Callable, Dict, NamedTuple + +# Third-Party Libraries +from bs4 import BeautifulSoup +from bs4.element import Comment, PageElement +from pyppeteer import launch +from pyppeteer.browser import Browser +from pyppeteer.page import Page +import requests def get_hasher(hash_algorithm: str) -> "hashlib._Hash": @@ -46,3 +56,127 @@ class UrlResult(NamedTuple): is_redirect: bool hash: str contents: bytes + + +class UrlHasher: + """Provide functionality to get the hash digest of a given URL.""" + + def __init__( + self, + hash_algorithm: str, + encoding: str = "utf-8", + browser_options: Dict[str, Any] = {}, + ): + """Initialize an instance of this class.""" + default_browser_options = {"headless": True} + self.__browser_options = {**default_browser_options, **browser_options} + self._browser: Browser = None + self._default_encoding = encoding + self._hash_algorithm = hash_algorithm + + self._handlers: Dict[str, Callable] = {} + self._handlers["text/plain"] = self._handle_plaintext + self._handlers["application/json"] = self._handle_json + self._handlers["text/html"] = self._handle_html + + def __init_browser(self): + """Initialize the pyppeteer Browser if it does not exist.""" + if not self._browser: + self._browser = asyncio.get_event_loop().run_until_complete( + launch(**self.__browser_options) + ) + + def _is_visible_element(self, element: PageElement) -> bool: + """Return True if the given website element would be visible.""" + discard_tags = ["[document]", "script", "style"] + if isinstance(element, Comment): + return False + if element.parent.name in discard_tags: + return False + return True + + def _handle_raw_bytes(self, contents: bytes, encoding: str) -> HandlerResult: + """Handle bytes in an unspecified format or encoding.""" + digest: str = get_hash_digest(self._hash_algorithm, contents) + return HandlerResult(digest, contents) + + def _handle_plaintext(self, contents: bytes, encoding: str) -> HandlerResult: + """Handle plaintext contents.""" + if encoding: + contents = bytes(contents.decode(encoding), self._default_encoding) + digest: str = get_hash_digest(self._hash_algorithm, contents) + return HandlerResult(digest, contents) + + def _handle_json(self, contents: bytes, encoding: str) -> HandlerResult: + """Handle JSON contents.""" + # Translate the original encoding to utf-8 + if encoding: + json_str = str(contents, encoding) + else: + json_str = str(contents, self._default_encoding) + + json_data = json.loads(json_str) + # Sort the keys to make this deterministic + 
json_bytes = bytes( + json.dumps(json_data, separators=(",", ":"), sort_keys=True), + self._default_encoding, + ) + + digest: str = get_hash_digest(self._hash_algorithm, json_bytes) + + return HandlerResult(digest, json_bytes) + + def _handle_html(self, contents: bytes, encoding: str) -> HandlerResult: + """Handle an HTML page.""" + self.__init_browser() + + if encoding: + html = str(contents, encoding) + else: + html = str(contents, self._default_encoding) + + page: Page = asyncio.get_event_loop().run_until_complete( + self._browser.newPage() + ) + asyncio.get_event_loop().run_until_complete(page.setContent(html)) + page_contents: str = asyncio.get_event_loop().run_until_complete(page.content()) + asyncio.get_event_loop().run_until_complete(page.close()) + + soup: BeautifulSoup = BeautifulSoup(page_contents, "lxml") + text_elements = soup.find_all(text=True) + visible_text_elements = filter(self._is_visible_element, text_elements) + visible_text = " ".join(t.strip() for t in visible_text_elements if t.strip()) + visible_bytes = bytes(visible_text, self._default_encoding) + + digest: str = get_hash_digest(self._hash_algorithm, visible_bytes) + + return HandlerResult(digest, visible_bytes) + + def hash_url(self, url: str) -> UrlResult: + """Get a hash of the contents of the provided URL.""" + redirect_status_codes = [301, 307, 308] + resp = requests.get(url) + + # https://tools.ietf.org/html/rfc7231#section-3.1.1.5 + content_type = ( + resp.headers.get("content-type", "application/octet-stream").strip().lower() + ) + + # Pull off any parameters included + if ";" in content_type: + content_type = content_type.split(";", 1)[0] + + is_redirect = False + for r in resp.history: + if r.status_code in redirect_status_codes: + is_redirect = True + break + + # Default to processing as raw bytes if no appropriate handler is found + processed: HandlerResult = self._handlers.get( + content_type, self._handle_raw_bytes + )(resp.content, resp.encoding) + + return UrlResult( + resp.status_code, resp.url, is_redirect, processed.hash, processed.contents + ) From 4a27278b9d7101dae229833af55c03fe31044845 Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Mon, 8 Feb 2021 16:22:46 -0500 Subject: [PATCH 06/29] Add initial testing for hash_http_content.hasher Also fix a missing import and update __init__.py to expose the UrlHasher class. 
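The determinism the new JSON tests rely on comes from re-serializing with sorted keys and compact separators, as `_handle_json` does above. A standalone illustration, using the same data as the `testing.json` fixture added below (`normalize` is a hypothetical helper that mirrors the handler's serialization step, not part of the package):

```python
# Illustrative only: two differently formatted JSON documents with the same
# data normalize to identical bytes, so they hash identically.
import json

a = b'{\n  "org": "cisagov",\n  "motto": "Commit today, secure tomorrow."\n}'
b = b'{"motto": "Commit today, secure tomorrow.", "org": "cisagov"}'


def normalize(raw: bytes) -> bytes:
    """Re-serialize JSON the way _handle_json does before hashing."""
    return bytes(
        json.dumps(json.loads(raw), separators=(",", ":"), sort_keys=True), "utf-8"
    )


assert normalize(a) == normalize(b)
```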
--- setup.py | 1 + src/hash_http_content/__init__.py | 3 +- tests/files/testing.bin | 1 + tests/files/testing.json | 4 + tests/files/testing.txt | 1 + tests/files/testing_dynamic.html | 16 ++ tests/files/testing_html_dynamic.bin | 1 + tests/files/testing_html_static.bin | 1 + tests/files/testing_static.html | 10 + tests/test_hasher.py | 290 ++++++++++++++++++++++++++- 10 files changed, 326 insertions(+), 2 deletions(-) create mode 100644 tests/files/testing.bin create mode 100644 tests/files/testing.json create mode 100644 tests/files/testing.txt create mode 100644 tests/files/testing_dynamic.html create mode 100644 tests/files/testing_html_dynamic.bin create mode 100644 tests/files/testing_html_static.bin create mode 100644 tests/files/testing_static.html diff --git a/setup.py b/setup.py index 4ba8da8..445ccbd 100644 --- a/setup.py +++ b/setup.py @@ -85,6 +85,7 @@ def get_version(version_file): install_requires=[ "beautifulsoup4", "docopt", + "lxml", "pyppeteer", "requests", "schema", diff --git a/src/hash_http_content/__init__.py b/src/hash_http_content/__init__.py index 2714aa2..18353cb 100644 --- a/src/hash_http_content/__init__.py +++ b/src/hash_http_content/__init__.py @@ -7,5 +7,6 @@ # package_name.__version__, which is used to get version information about this # Python package. from ._version import __version__ # noqa: F401 +from .hasher import UrlHasher -__all__: List[str] = [] +__all__: List[str] = ["UrlHasher"] diff --git a/tests/files/testing.bin b/tests/files/testing.bin new file mode 100644 index 0000000..3fe7cac --- /dev/null +++ b/tests/files/testing.bin @@ -0,0 +1 @@ +Þ­¾ï diff --git a/tests/files/testing.json b/tests/files/testing.json new file mode 100644 index 0000000..7645927 --- /dev/null +++ b/tests/files/testing.json @@ -0,0 +1,4 @@ +{ + "motto": "Commit today, secure tomorrow.", + "org": "cisagov" +} diff --git a/tests/files/testing.txt b/tests/files/testing.txt new file mode 100644 index 0000000..22b1aea --- /dev/null +++ b/tests/files/testing.txt @@ -0,0 +1 @@ +Commit today, secure tomorrow. diff --git a/tests/files/testing_dynamic.html b/tests/files/testing_dynamic.html new file mode 100644 index 0000000..cde8e65 --- /dev/null +++ b/tests/files/testing_dynamic.html @@ -0,0 +1,16 @@ + + + Example Page + + + +
+ Example text! +
+ + diff --git a/tests/files/testing_html_dynamic.bin b/tests/files/testing_html_dynamic.bin new file mode 100644 index 0000000..0b74935 --- /dev/null +++ b/tests/files/testing_html_dynamic.bin @@ -0,0 +1 @@ +Example Page Dynamic example text! diff --git a/tests/files/testing_html_static.bin b/tests/files/testing_html_static.bin new file mode 100644 index 0000000..02f96f0 --- /dev/null +++ b/tests/files/testing_html_static.bin @@ -0,0 +1 @@ +Example Page Example text! diff --git a/tests/files/testing_static.html b/tests/files/testing_static.html new file mode 100644 index 0000000..09d18b7 --- /dev/null +++ b/tests/files/testing_static.html @@ -0,0 +1,10 @@ + + + Example Page + + +
+ Example text! +
+ + diff --git a/tests/test_hasher.py b/tests/test_hasher.py index 2be8cba..1e3adac 100644 --- a/tests/test_hasher.py +++ b/tests/test_hasher.py @@ -1,2 +1,290 @@ #!/usr/bin/env pytest -vs -"""Tests for hash_url_content URL hashing functionality.""" +"""Tests for hash_http_content URL hashing functionality.""" + +# Standard Python Libraries +import hashlib +import json +import os.path + +# Third-Party Libraries +from bs4 import Comment, Tag +import pytest + +# cisagov Libraries +import hash_http_content + +# Hashing algorithm to use for testing. +HASH_ALGORITHM = "sha256" +# Alternate encoding to verify conversion to utf-8 +ALT_ENCODING = "utf-16" +# Files with test values +TEST_VALUE_SOURCES = { + "html_dynamic": "tests/files/testing_dynamic.html", + "html_dynamic_bytes": "tests/files/testing_html_dynamic.bin", + "html_static": "tests/files/testing_static.html", + "html_static_bytes": "tests/files/testing_html_static.bin", + "json": "tests/files/testing.json", + "plaintext": "tests/files/testing.txt", + "raw_bytes": "tests/files/testing.bin", +} +# Digests expected for each test +EXPECTED_DIGESTS = { + "html_dynamic": "3a6f9739ba635b5bfe57246ebf137f00df890a200b6dca01388f05d81479098a", + "html_static": "206794946ac5783ddbaa03713fe9eba7be069d970731b20a7f6cadb5d845680f", + "json": "25ba3da8ab38c80c8d1e6162caeb1924a777b04b7351ce31176f7bef9cd6584d", + "plaintext": "d09107e7b64bee9d7375d734a7bfc9cc316d7c48695722be3ec2218659d59be5", + "raw_bytes": "5f78c33274e43fa9de5659265c1d917e25c03722dcb0b8d27db8d5feaa813953", +} + + +@pytest.mark.parametrize("algorithm", hashlib.algorithms_available) +def test_get_hasher(algorithm): + """Verify that the desired hashing object is created.""" + assert hash_http_content.hasher.get_hasher(algorithm).name == algorithm + + +def test_hash_hash_digest(): + """Verify that an expected hash digest is generated.""" + expected_digest = "d5f8f30f25636b1f3efc2f52a0a8724c9ffa280875a1fc9a92cfe3f644b7d5c3" + digest = hash_http_content.hasher.get_hash_digest(HASH_ALGORITHM, b"cisagov") + assert digest == expected_digest + + +def test_init_browser(): + """Ensure that a browser object is initialized.""" + hasher = hash_http_content.UrlHasher(HASH_ALGORITHM) + assert hasher._browser is None + # Call through name mangling + hasher._UrlHasher__init_browser() + assert hasher._browser is not None + + +@pytest.mark.parametrize( + "tag,expected", + [ + (Tag(name="html", parent=Tag(name="[document]")), False), + (Tag(name="", parent=Tag(name="script")), False), + (Tag(name="", parent=Tag(name="style")), False), + (Comment("Testing page."), False), + (Tag(name="", parent=Tag(name="title")), True), + (Tag(name="", parent=Tag(name="p")), True), + ], +) +def test__is_visible_element(tag, expected): + """Verify that elements are correctly identified as visible or not.""" + hasher = hash_http_content.UrlHasher(HASH_ALGORITHM) + assert hasher._is_visible_element(tag) == expected + + +def test_handle_raw_bytes(): + """Test the handler for bytes of an unspecified format and encoding.""" + with open(TEST_VALUE_SOURCES["raw_bytes"], "rb") as f: + # Work around the end-of-file-fixer pre-commit hook + test_bytes = f.read().rstrip() + + hasher = hash_http_content.UrlHasher(HASH_ALGORITHM) + result = hasher._handle_raw_bytes(test_bytes, None) + + assert result.hash == EXPECTED_DIGESTS["raw_bytes"] + assert result.contents == test_bytes + + +def test_handle_plaintext(): + """Test the handler with plaintext in utf-8 encoding.""" + with open(TEST_VALUE_SOURCES["plaintext"]) as f: + test_value = f.read() 
+ test_bytes = bytes(test_value, "utf-8") + + hasher = hash_http_content.UrlHasher(HASH_ALGORITHM) + result = hasher._handle_plaintext(test_bytes, None) + + assert result.hash == EXPECTED_DIGESTS["plaintext"] + assert result.contents == test_bytes + + +def test_handle_plaintext_with_encoding(): + """Test the handler converting to utf-8 encoding.""" + with open(TEST_VALUE_SOURCES["plaintext"]) as f: + test_value = f.read() + test_bytes = bytes(test_value, ALT_ENCODING) + expected_bytes = bytes(test_value, "utf-8") + + hasher = hash_http_content.UrlHasher(HASH_ALGORITHM) + result = hasher._handle_plaintext(test_bytes, ALT_ENCODING) + + assert result.hash == EXPECTED_DIGESTS["plaintext"] + assert result.contents == expected_bytes + + +def test_handle_json(): + """Test the handler with JSON in utf-8 encoding.""" + with open(TEST_VALUE_SOURCES["json"]) as f: + test_value = f.read() + test_bytes = bytes(test_value, "utf-8") + expected_bytes = bytes( + json.dumps(json.loads(test_value), separators=(",", ":"), sort_keys=True), + "utf-8", + ) + + hasher = hash_http_content.UrlHasher(HASH_ALGORITHM) + result = hasher._handle_json(test_bytes, None) + + assert result.hash == EXPECTED_DIGESTS["json"] + assert result.contents == expected_bytes + + +def test_handle_json_with_encoding(): + """Test the handler converting JSON to utf-8 encoding.""" + with open(TEST_VALUE_SOURCES["json"]) as f: + test_value = f.read() + test_bytes = bytes(test_value, ALT_ENCODING) + expected_bytes = bytes( + json.dumps(json.loads(test_value), separators=(",", ":"), sort_keys=True), + "utf-8", + ) + + hasher = hash_http_content.UrlHasher(HASH_ALGORITHM) + result = hasher._handle_json(test_bytes, ALT_ENCODING) + + assert result.hash == EXPECTED_DIGESTS["json"] + assert result.contents == expected_bytes + + +def test_handle_html_static(): + """Test the handler with static HTML in utf-8 encoding.""" + with open(TEST_VALUE_SOURCES["html_static"]) as f: + test_value = f.read() + test_bytes = bytes(test_value, "utf-8") + with open(TEST_VALUE_SOURCES["html_static_bytes"], "rb") as f: + # Work around the end-of-file-fixer pre-commit hook + expected_bytes = f.read().rstrip() + + hasher = hash_http_content.UrlHasher(HASH_ALGORITHM) + result = hasher._handle_html(test_bytes, None) + + assert result.hash == EXPECTED_DIGESTS["html_static"] + assert result.contents == expected_bytes + + +def test_handle_html_static_with_encoding(): + """Test the handler converting static HTML to utf-8 encoding.""" + with open(TEST_VALUE_SOURCES["html_static"]) as f: + test_value = f.read() + test_bytes = bytes(test_value, ALT_ENCODING) + with open(TEST_VALUE_SOURCES["html_static_bytes"], "rb") as f: + # Work around the end-of-file-fixer pre-commit hook + expected_bytes = f.read().rstrip() + + hasher = hash_http_content.UrlHasher(HASH_ALGORITHM) + result = hasher._handle_html(test_bytes, ALT_ENCODING) + + assert result.hash == EXPECTED_DIGESTS["html_static"] + assert result.contents == expected_bytes + + +def test_handle_html_dynamic(): + """Test the handler with dynamic HTML in utf-8 encoding.""" + with open(TEST_VALUE_SOURCES["html_dynamic"]) as f: + test_value = f.read() + test_bytes = bytes(test_value, "utf-8") + with open(TEST_VALUE_SOURCES["html_dynamic_bytes"], "rb") as f: + # Work around the end-of-file-fixer pre-commit hook + expected_bytes = f.read().rstrip() + + hasher = hash_http_content.UrlHasher(HASH_ALGORITHM) + result = hasher._handle_html(test_bytes, None) + + assert result.hash == EXPECTED_DIGESTS["html_dynamic"] + assert result.contents 
== expected_bytes + + +def test_handle_html_dynmamic_with_encoding(): + """Test the handler converting dynamic HTML to utf-8 encoding.""" + with open(TEST_VALUE_SOURCES["html_dynamic"]) as f: + test_value = f.read() + test_bytes = bytes(test_value, ALT_ENCODING) + with open(TEST_VALUE_SOURCES["html_dynamic_bytes"], "rb") as f: + # Work around the end-of-file-fixer pre-commit hook + expected_bytes = f.read().rstrip() + + hasher = hash_http_content.UrlHasher(HASH_ALGORITHM) + result = hasher._handle_html(test_bytes, ALT_ENCODING) + + assert result.hash == EXPECTED_DIGESTS["html_dynamic"] + assert result.contents == expected_bytes + + +def test_hash_url_html_status_200(): + """Test againt a URL that returns HTML content from an existing location.""" + test_url = "https://example.com" + expected_digest = "6fba1a7167467b6dd3da090b5ec437c1b811dd2c2133504a448fb7ca59d390c2" + + hasher = hash_http_content.UrlHasher(HASH_ALGORITHM) + result = hasher.hash_url(test_url) + + assert result.status == 200 + assert result.is_redirect is False + assert result.hash == expected_digest + + +def test_hash_url_html_status_400(): + """Test against a URL that returns HTML content from a missing location.""" + test_url = "https://example.com/404" + expected_digest = "6fba1a7167467b6dd3da090b5ec437c1b811dd2c2133504a448fb7ca59d390c2" + + hasher = hash_http_content.UrlHasher(HASH_ALGORITHM) + result = hasher.hash_url(test_url) + + assert result.status == 404 + assert result.is_redirect is False + assert result.hash == expected_digest + + +def test_hash_url_with_redirect(): + """Test against a URL that redirects and has no content-type parameters.""" + test_url = "http://rules.ncats.cyber.dhs.gov" + + hasher = hash_http_content.UrlHasher(HASH_ALGORITHM) + result = hasher.hash_url(test_url) + + assert result.status == 200 + assert result.is_redirect is True + + +def test_browser_additional_options(): + """Verify that additional options are used in invoking the browser.""" + # These options are expected for a lambda style environment + options = { + "headless": True, + "args": [ + "--no-sandbox", + "--single-process", + "--disable-dev-shm-usage", + "--disable-gpu", + "--no-zygote", + ], + "executablePath": "tests/files/serverless-chrome", + } + + hasher = hash_http_content.UrlHasher(HASH_ALGORITHM, browser_options=options) + + assert hasher._UrlHasher__browser_options == options + + +def test_browser_with_specified_executable(): + """Test running with the executablePath option.""" + # If this file does not exist, do not perform this test. + if not os.path.isfile("tests/files/serverless-chrome"): + pytest.skip("no serverless-chrome binary found") + + # options = { + # "headless": True, + # "args": [ + # "--no-sandbox", + # "--single-process", + # "--disable-dev-shm-usage", + # "--disable-gpu", + # "--no-zygote", + # ], + # "executablePath": "tests/files/serverless-chrome", + # } From 6848d0ea9610d3fd38a676adbb79d53a444f2ff8 Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Mon, 8 Feb 2021 16:39:10 -0500 Subject: [PATCH 07/29] Update the README Update the README to include information about this package's functionality. 
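The updated README below summarizes the HTML pipeline: render with pyppeteer, then reduce the result to its visible text with Beautiful Soup. The reduction step alone can be sketched without a browser; this mirrors `_is_visible_element` and `_handle_html` from the earlier patch (assumes beautifulsoup4 and lxml are installed, the markup is illustrative, and `is_visible` is a standalone stand-in for the class method):

```python
# Illustrative only: the visible-text reduction applied to already-rendered HTML.
from bs4 import BeautifulSoup
from bs4.element import Comment

html = (
    "<html><head><title>Example Page</title>"
    "<style>p {color: red}</style></head>"
    "<body><!-- a comment --><p>Example text!</p></body></html>"
)


def is_visible(element) -> bool:
    """Mirror _is_visible_element: drop comments and script/style/document text."""
    if isinstance(element, Comment):
        return False
    return element.parent.name not in ("[document]", "script", "style")


soup = BeautifulSoup(html, "lxml")
visible = filter(is_visible, soup.find_all(text=True))
print(" ".join(t.strip() for t in visible if t.strip()))
# -> "Example Page Example text!"
```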
--- README.md | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 902dfa1..b77aa08 100644 --- a/README.md +++ b/README.md @@ -6,20 +6,34 @@ [![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/cisagov/hash-http-content.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/cisagov/hash-http-content/context:python) [![Known Vulnerabilities](https://snyk.io/test/github/cisagov/hash-http-content/develop/badge.svg)](https://snyk.io/test/github/cisagov/hash-http-content) -This is a generic skeleton project that can be used to quickly get a -new [cisagov](https://github.com/cisagov) Python library GitHub -project started. This skeleton project contains [licensing -information](LICENSE), as well as -[pre-commit hooks](https://pre-commit.com) and -[GitHub Actions](https://github.com/features/actions) configurations -appropriate for a Python library project. - -## New Repositories from a Skeleton ## - -Please see our [Project Setup guide](https://github.com/cisagov/development-guide/tree/develop/project_setup) -for step-by-step instructions on how to start a new repository from -a skeleton. This will save you time and effort when configuring a -new repository! +This is a Python library to retrieve the contents of a given HTTP URL and hash +the processed contents. + +## Content processing ## + +If an encoding is detected, this package will convert content into the UTF-8 +encoding before proceeding. + +Additional content processing is currently implemented for the following types +of content: + +* HTML +* JSON + +### HTML ### + +HTML content is processed by leveraging the +[pyppeteer](https://github.com/pyppeteer/pyppeteer) package to execute any +JavaScript on a retrieved page. The result is then parsed by +[Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/) to reduce the +content to the human visible portions of a page. + +### JSON ### + +JSON content is processed by using the +[`json` library](https://docs.python.org/3/library/json.html) that is part of +the Python standard library. It is read in and then output in a deterministic +manner to adjust for any styling differences between content. ## Contributing ## From c71b71a5e2119d58112a192bb11691d3649d8384 Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Mon, 8 Feb 2021 17:47:25 -0500 Subject: [PATCH 08/29] Add testing for a supplied Chromium binary Add logic to the GitHub Actions workflow to pull down a zipped binary from the https://github.com/adieuadieu/serverless-chrome project and extract it to a specified location. The test_hasher.py file has been updated to add testing using this binary instead of what is detected/used by pyppeteer automatically. This will only take place if the binary is detected in the specified location. 
--- .github/workflows/build.yml | 8 +++++++ tests/test_hasher.py | 45 +++++++++++++++++++++++++++---------- 2 files changed, 41 insertions(+), 12 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0134014..3212988 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,6 +10,9 @@ on: env: PIP_CACHE_DIR: ~/.cache/pip PRE_COMMIT_CACHE_DIR: ~/.cache/pre-commit + SERVERLESS_CHROME_URL: "https://github.com/adieuadieu/serverless-chrome/\ + releases/download/v1.0.0-57/stable-headless-chromium-amazonlinux-2.zip" + SERVERLESS_CHROME_ZIP: /tmp/serverless-chrome.zip jobs: lint: @@ -64,6 +67,11 @@ jobs: ${{ hashFiles('**/requirements.txt') }}" restore-keys: | ${{ env.BASE_CACHE_KEY }} + - name: Download and extract a serverless-chrome binary + run: | + curl -L --output ${{ env.SERVERLESS_CHROME_ZIP }} \ + ${{ env.SERVERLESS_CHROME_URL }} + unzip ${{ env.SERVERLESS_CHROME_ZIP }} -d tests/files/ - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/tests/test_hasher.py b/tests/test_hasher.py index 1e3adac..ff6968a 100644 --- a/tests/test_hasher.py +++ b/tests/test_hasher.py @@ -265,26 +265,47 @@ def test_browser_additional_options(): ], "executablePath": "tests/files/serverless-chrome", } + with open(TEST_VALUE_SOURCES["plaintext"]) as f: + test_value = f.read() + test_bytes = bytes(test_value, ALT_ENCODING) + expected_bytes = bytes(test_value, "utf-8") hasher = hash_http_content.UrlHasher(HASH_ALGORITHM, browser_options=options) + result = hasher._handle_plaintext(test_bytes, ALT_ENCODING) assert hasher._UrlHasher__browser_options == options + assert result.hash == EXPECTED_DIGESTS["plaintext"] + assert result.contents == expected_bytes def test_browser_with_specified_executable(): """Test running with the executablePath option.""" + serverless_chrome_path = "tests/files/headless-chromium" # If this file does not exist, do not perform this test. - if not os.path.isfile("tests/files/serverless-chrome"): + if not os.path.isfile(serverless_chrome_path): pytest.skip("no serverless-chrome binary found") - # options = { - # "headless": True, - # "args": [ - # "--no-sandbox", - # "--single-process", - # "--disable-dev-shm-usage", - # "--disable-gpu", - # "--no-zygote", - # ], - # "executablePath": "tests/files/serverless-chrome", - # } + # These options are expected for a lambda style environment + options = { + "headless": True, + "args": [ + "--no-sandbox", + "--single-process", + "--disable-dev-shm-usage", + "--disable-gpu", + "--no-zygote", + ], + "executablePath": serverless_chrome_path, + } + + with open(TEST_VALUE_SOURCES["plaintext"]) as f: + test_value = f.read() + test_bytes = bytes(test_value, ALT_ENCODING) + expected_bytes = bytes(test_value, "utf-8") + + hasher = hash_http_content.UrlHasher(HASH_ALGORITHM, browser_options=options) + result = hasher._handle_plaintext(test_bytes, ALT_ENCODING) + + assert hasher._UrlHasher__browser_options == options + assert result.hash == EXPECTED_DIGESTS["plaintext"] + assert result.contents == expected_bytes From 4945a2b084160b2262093203f26187d89001a65c Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Mon, 8 Feb 2021 17:54:06 -0500 Subject: [PATCH 09/29] Consolidate handler definitions Declare the dictionary of handlers upfront instead of adding them individually. 
--- src/hash_http_content/hasher.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/hash_http_content/hasher.py b/src/hash_http_content/hasher.py index e61768e..a068ef0 100644 --- a/src/hash_http_content/hasher.py +++ b/src/hash_http_content/hasher.py @@ -74,10 +74,11 @@ def __init__( self._default_encoding = encoding self._hash_algorithm = hash_algorithm - self._handlers: Dict[str, Callable] = {} - self._handlers["text/plain"] = self._handle_plaintext - self._handlers["application/json"] = self._handle_json - self._handlers["text/html"] = self._handle_html + self._handlers: Dict[str, Callable] = { + "application/json": self._handle_json, + "text/html": self._handle_html, + "text/plain": self._handle_plaintext, + } def __init_browser(self): """Initialize the pyppeteer Browser if it does not exist.""" From 8c893c1912a3c724c15ab463e361d7aed62c9029 Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Tue, 9 Feb 2021 11:15:10 -0500 Subject: [PATCH 10/29] Implement a command line interface to the package --- src/hash_http_content/cli.py | 102 ++++++++++++++++++++++++++++++++++- 1 file changed, 101 insertions(+), 1 deletion(-) diff --git a/src/hash_http_content/cli.py b/src/hash_http_content/cli.py index e5855e3..ece8526 100644 --- a/src/hash_http_content/cli.py +++ b/src/hash_http_content/cli.py @@ -1,5 +1,105 @@ -"""Command line interface to the hash-url-content package.""" +"""Command line interface to the hash-url-content package. + +Usage: + site-hash [--hash-algorithm=algorithm] ([--show-content] [--show-redirect] | [--json]) URL ... + site-hash --list-algorithms + site-hash (-v | --version) + site-hash (-h | --help) + +Options: + -h, --help Display this help text. + -a, --hash-algorithm=algorithm Use the provided hash alogorithm. + [default: sha256] + -l, --list-algorithms List available hash algorithms. + -j, --json Output the results as a JSON. + -c, --show-content Output the content after processing. + -r, --show-redirect Output if the requested URL was redirected. + -v, --version Show version information. +""" + +# Standard Python Libraries +import hashlib +from json import dumps +import sys +from typing import Any, Dict +from urllib.parse import urlparse + +# Third-Party Libraries +import docopt +from schema import And, Schema, SchemaError, Use + +from ._version import __version__ +from .hasher import UrlHasher def main(): """Return the hash(es) and information from the requested URL(s).""" + args: Dict[str, str] = docopt.docopt(__doc__, version=__version__) + schema: Schema = Schema( + { + "--hash-algorithm": And( + str, + Use(str.lower), + lambda a: a in hashlib.algorithms_available, + error=f"Invalid algorithm provided. 
Must be one of: {sorted(hashlib.algorithms_available)}", + ), + str: object, + } + ) + + try: + validated_args: Dict[str, Any] = schema.validate(args) + except SchemaError as err: + # Exit because one or more of the arguments were invalid + print(err, file=sys.stderr) + return 1 + + if validated_args["--list-algorithms"]: + print("Algorithms supported for this platform:") + for algo in sorted(hashlib.algorithms_available): + print(f"- {algo}") + return 0 + + if validated_args["--json"]: + results = [] + + for url in validated_args["URL"]: + # Prefer an HTTPS URL + parsed_url = urlparse(url, "https") + if not parsed_url.netloc: + parsed_url = parsed_url._replace(netloc=parsed_url.path, path="") + + hasher = UrlHasher(validated_args["--hash-algorithm"]) + url_results = hasher.hash_url(parsed_url.geturl()) + + if validated_args["--json"]: + # We cannot guarantee that the contents are serializable, so they are + # excluded from JSON results. + results.append( + { + "contents_hash": url_results.hash, + "is_redirected": url_results.is_redirect, + "requested_url": url, + "retrieved_url": url_results.visited_url, + "status_code": url_results.status, + } + ) + else: + print(f"Results for {url}:") + print(f" Retrieved URL - '{url_results.visited_url}'") + print(f" Status code - '{url_results.status}'") + if validated_args["--show-redirect"]: + print(f" Redirect - {url_results.is_redirect}") + print( + f" Hash ({validated_args['--hash-algorithm']}) of contents - {url_results.hash}" + ) + if validated_args["--show-content"]: + print() + print("Contents:") + print(url_results.contents) + print() + + if validated_args["--json"]: + print(dumps(results)) + + return 0 From be15172dd3ce9f533f1d772837ea1b9d238cd492 Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Tue, 9 Feb 2021 11:44:23 -0500 Subject: [PATCH 11/29] Add some debug logging to hash_http_content.hasher Also correct the name of a test for the same file. --- src/hash_http_content/hasher.py | 26 ++++++++++++++++++++++++++ tests/test_hasher.py | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/hash_http_content/hasher.py b/src/hash_http_content/hasher.py index a068ef0..7b4ef38 100644 --- a/src/hash_http_content/hasher.py +++ b/src/hash_http_content/hasher.py @@ -4,6 +4,7 @@ import asyncio import hashlib import json +import logging from typing import Any, Callable, Dict, NamedTuple # Third-Party Libraries @@ -17,6 +18,7 @@ def get_hasher(hash_algorithm: str) -> "hashlib._Hash": """Get a hashing object.""" + logging.debug("Creating a %s hashing object", hash_algorithm) # Not all implementations support the "usedforsecurity" keyword argument, # which is used to indicate that the algorithm is being used for non-security # related tasks. This is required for some algorithms on FIPS systems. 
@@ -36,6 +38,11 @@ def get_hasher(hash_algorithm: str) -> "hashlib._Hash": def get_hash_digest(hash_algorithm: str, contents: bytes) -> str: """Get a hex digest representing a hash of the given contents.""" + logging.debug( + "Generating a %s digest for provided content of length %d", + hash_algorithm, + len(contents), + ) hasher: "hashlib._Hash" = get_hasher(hash_algorithm) hasher.update(contents) return hasher.hexdigest() @@ -68,12 +75,20 @@ def __init__( browser_options: Dict[str, Any] = {}, ): """Initialize an instance of this class.""" + logging.debug("Initializing UrlHasher object") default_browser_options = {"headless": True} + logging.debug("Default browser options: %s", default_browser_options) + self.__browser_options = {**default_browser_options, **browser_options} + logging.debug("Using browser options: %s", self.__browser_options) + self._browser: Browser = None self._default_encoding = encoding self._hash_algorithm = hash_algorithm + logging.debug("Using default encoding '%s'", self._default_encoding) + logging.debug("Using hashing algorithm '%s'", self._hash_algorithm) + self._handlers: Dict[str, Callable] = { "application/json": self._handle_json, "text/html": self._handle_html, @@ -83,6 +98,7 @@ def __init__( def __init_browser(self): """Initialize the pyppeteer Browser if it does not exist.""" if not self._browser: + logging.debug("Initializing Browser object") self._browser = asyncio.get_event_loop().run_until_complete( launch(**self.__browser_options) ) @@ -91,18 +107,22 @@ def _is_visible_element(self, element: PageElement) -> bool: """Return True if the given website element would be visible.""" discard_tags = ["[document]", "script", "style"] if isinstance(element, Comment): + logging.debug("Skipping Comment tag") return False if element.parent.name in discard_tags: + logging.debug("Skipping element in parent tag '%s'", element.parent.name) return False return True def _handle_raw_bytes(self, contents: bytes, encoding: str) -> HandlerResult: """Handle bytes in an unspecified format or encoding.""" + logging.debug("Handling content as raw bytes") digest: str = get_hash_digest(self._hash_algorithm, contents) return HandlerResult(digest, contents) def _handle_plaintext(self, contents: bytes, encoding: str) -> HandlerResult: """Handle plaintext contents.""" + logging.debug("Handling content as plaintext") if encoding: contents = bytes(contents.decode(encoding), self._default_encoding) digest: str = get_hash_digest(self._hash_algorithm, contents) @@ -110,6 +130,7 @@ def _handle_plaintext(self, contents: bytes, encoding: str) -> HandlerResult: def _handle_json(self, contents: bytes, encoding: str) -> HandlerResult: """Handle JSON contents.""" + logging.debug("Handling content as JSON") # Translate the original encoding to utf-8 if encoding: json_str = str(contents, encoding) @@ -129,6 +150,7 @@ def _handle_json(self, contents: bytes, encoding: str) -> HandlerResult: def _handle_html(self, contents: bytes, encoding: str) -> HandlerResult: """Handle an HTML page.""" + logging.debug("Handling content as HTML") self.__init_browser() if encoding: @@ -136,6 +158,7 @@ def _handle_html(self, contents: bytes, encoding: str) -> HandlerResult: else: html = str(contents, self._default_encoding) + logging.debug("Setting page contents and rendering") page: Page = asyncio.get_event_loop().run_until_complete( self._browser.newPage() ) @@ -143,6 +166,7 @@ def _handle_html(self, contents: bytes, encoding: str) -> HandlerResult: page_contents: str = 
asyncio.get_event_loop().run_until_complete(page.content()) asyncio.get_event_loop().run_until_complete(page.close()) + logging.debug("Parsing rendered page contents") soup: BeautifulSoup = BeautifulSoup(page_contents, "lxml") text_elements = soup.find_all(text=True) visible_text_elements = filter(self._is_visible_element, text_elements) @@ -155,6 +179,7 @@ def _handle_html(self, contents: bytes, encoding: str) -> HandlerResult: def hash_url(self, url: str) -> UrlResult: """Get a hash of the contents of the provided URL.""" + logging.debug("Hashing provided URL '%s'", url) redirect_status_codes = [301, 307, 308] resp = requests.get(url) @@ -167,6 +192,7 @@ def hash_url(self, url: str) -> UrlResult: if ";" in content_type: content_type = content_type.split(";", 1)[0] + logging.debug("Checking for a redirect in the request") is_redirect = False for r in resp.history: if r.status_code in redirect_status_codes: diff --git a/tests/test_hasher.py b/tests/test_hasher.py index ff6968a..b98af22 100644 --- a/tests/test_hasher.py +++ b/tests/test_hasher.py @@ -227,7 +227,7 @@ def test_hash_url_html_status_200(): assert result.hash == expected_digest -def test_hash_url_html_status_400(): +def test_hash_url_html_status_404(): """Test against a URL that returns HTML content from a missing location.""" test_url = "https://example.com/404" expected_digest = "6fba1a7167467b6dd3da090b5ec437c1b811dd2c2133504a448fb7ca59d390c2" From fed60c71e209dbcf97133c8d9b7dbe334c4a6840 Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Tue, 9 Feb 2021 11:56:02 -0500 Subject: [PATCH 12/29] Adjust fallback handler usage If the contents appear to be text, use the plaintext handler as a fallback instead. --- src/hash_http_content/hasher.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/hash_http_content/hasher.py b/src/hash_http_content/hasher.py index 7b4ef38..7c245d0 100644 --- a/src/hash_http_content/hasher.py +++ b/src/hash_http_content/hasher.py @@ -191,6 +191,7 @@ def hash_url(self, url: str) -> UrlResult: # Pull off any parameters included if ";" in content_type: content_type = content_type.split(";", 1)[0] + logging.debug("Using content type '%s'", content_type) logging.debug("Checking for a redirect in the request") is_redirect = False @@ -199,10 +200,19 @@ def hash_url(self, url: str) -> UrlResult: is_redirect = True break - # Default to processing as raw bytes if no appropriate handler is found - processed: HandlerResult = self._handlers.get( - content_type, self._handle_raw_bytes - )(resp.content, resp.encoding) + processed: HandlerResult + # If the content appears to be text, we should fall back to processing it + # as plaintext instead of raw bytes. 
+ if resp.apparent_encoding == "ascii": + # Default to processing as plaintext if no appropriate handler is found + processed = self._handlers.get(content_type, self._handle_plaintext)( + resp.content, resp.encoding + ) + else: + # Default to processing as raw bytes if no appropriate handler is found + processed = self._handlers.get(content_type, self._handle_raw_bytes)( + resp.content, resp.encoding + ) return UrlResult( resp.status_code, resp.url, is_redirect, processed.hash, processed.contents From 999e58642078d05975024bac2ab79566cb30928c Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Tue, 9 Feb 2021 13:47:55 -0500 Subject: [PATCH 13/29] Add testing for the command line interface Also adjust the output of the interface slightly. --- src/hash_http_content/cli.py | 4 +- tests/test_cli.py | 225 ++++++++++++++++++++++++++++++++++- 2 files changed, 226 insertions(+), 3 deletions(-) diff --git a/src/hash_http_content/cli.py b/src/hash_http_content/cli.py index ece8526..52a0b31 100644 --- a/src/hash_http_content/cli.py +++ b/src/hash_http_content/cli.py @@ -97,9 +97,9 @@ def main(): print() print("Contents:") print(url_results.contents) - print() + print() if validated_args["--json"]: - print(dumps(results)) + print(dumps(results, separators=(",", ":"), sort_keys=True)) return 0 diff --git a/tests/test_cli.py b/tests/test_cli.py index 154e0b3..e38d571 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,2 +1,225 @@ #!/usr/bin/env pytest -vs -"""Tests for hash_url_content command line interface.""" +"""Tests for hash_http_content command line interface.""" + +# Standard Python Libraries +import hashlib +import json +import os +import sys +from unittest.mock import patch + +# Third-Party Libraries +import pytest + +# cisagov Libraries +from hash_http_content import __version__, cli + +# define sources of version strings +RELEASE_TAG = os.getenv("RELEASE_TAG") +PROJECT_VERSION = __version__ + + +def test_stdout_version(capsys): + """Verify that version string sent to stdout agrees with the module version.""" + with pytest.raises(SystemExit): + with patch.object(sys, "argv", ["bogus", "--version"]): + cli.main() + captured = capsys.readouterr() + assert ( + captured.out == f"{PROJECT_VERSION}\n" + ), "standard output by '--version' should agree with module.__version__" + + +def test_running_as_module(capsys): + """Verify that the __main__.py file loads correctly.""" + with pytest.raises(SystemExit): + with patch.object(sys, "argv", ["bogus", "--version"]): + # F401 is a "Module imported but unused" warning. This import + # emulates how this project would be run as a module. The only thing + # being done by __main__ is importing the main entrypoint of the + # package and running it, so there is nothing to use from this + # import. As a result, we can safely ignore this warning. 
+ # cisagov Libraries + import hash_http_content.__main__ # noqa: F401 + captured = capsys.readouterr() + assert ( + captured.out == f"{PROJECT_VERSION}\n" + ), "standard output by '--version' should agree with module.__version__" + + +@pytest.mark.skipif( + RELEASE_TAG in [None, ""], reason="this is not a release (RELEASE_TAG not set)" +) +def test_release_version(): + """Verify that release tag version agrees with the module version.""" + assert ( + RELEASE_TAG == f"v{PROJECT_VERSION}" + ), "RELEASE_TAG does not match the project version" + + +def test_list_algorithms(capsys): + """Validate a matching list of algorithms is returned.""" + expected_output = "Algorithms supported for this platform:\n" + "\n".join( + f"- {a}" for a in sorted(hashlib.algorithms_available) + ) + with patch.object(sys, "argv", ["bogus", "--list-algorithms"]): + return_code = cli.main() + captured = capsys.readouterr() + assert return_code == 0 + assert captured.out.rstrip() == expected_output + + +def test_invalid_hash_type(capsys): + """Validate that an unsupported hash type causes an error.""" + expected_output = f"Invalid algorithm provided. Must be one of: {sorted(hashlib.algorithms_available)}" + with patch.object( + sys, "argv", ["bogus", "--hash-algorithm", "nonsensical", "localhost"] + ): + return_code = cli.main() + captured = capsys.readouterr() + assert return_code == 1 + assert captured.err.rstrip() == expected_output + + +def test_full_run_no_http_schema(capsys): + """Validate output for a given URL with no schema.""" + expected_output = "\n".join( + [ + "Results for example.com:", + " Retrieved URL - 'https://example.com/'", + " Status code - '200'", + " Hash (sha256) of contents - 6fba1a7167467b6dd3da090b5ec437c1b811dd2c2133504a448fb7ca59d390c2", + ] + ) + with patch.object(sys, "argv", ["bogus", "example.com"]): + return_code = cli.main() + captured = capsys.readouterr() + + assert return_code == 0 + assert captured.out.rstrip() == expected_output + + +def test_full_run_with_http_schema(capsys): + """Validate output for a given URL with a provided schema.""" + expected_output = "\n".join( + [ + "Results for https://example.com:", + " Retrieved URL - 'https://example.com/'", + " Status code - '200'", + " Hash (sha256) of contents - 6fba1a7167467b6dd3da090b5ec437c1b811dd2c2133504a448fb7ca59d390c2", + ] + ) + with patch.object(sys, "argv", ["bogus", "https://example.com"]): + return_code = cli.main() + captured = capsys.readouterr() + + assert return_code == 0 + assert captured.out.rstrip() == expected_output + + +def test_full_run_no_redirect(capsys): + """Validate output for a given URL that has no redirect.""" + expected_output = [ + "Results for http://example.com:", + " Retrieved URL - 'http://example.com/'", + " Status code - '200'", + " Redirect - False", + ] + with patch.object(sys, "argv", ["bogus", "--show-redirect", "http://example.com"]): + return_code = cli.main() + captured = capsys.readouterr() + captured_lines = captured.out.split("\n") + + assert return_code == 0 + + for i, value in enumerate(expected_output): + assert captured_lines[i] == value + + +def test_full_run_with_redirect(capsys): + """Validate output for a given URL that has a redirect.""" + expected_output = [ + "Results for http://rules.ncats.cyber.dhs.gov:", + " Retrieved URL - 'https://rules.ncats.cyber.dhs.gov/'", + " Status code - '200'", + " Redirect - True", + ] + with patch.object( + sys, "argv", ["bogus", "--show-redirect", "http://rules.ncats.cyber.dhs.gov"] + ): + return_code = cli.main() + captured = 
capsys.readouterr() + captured_lines = captured.out.split("\n") + + assert return_code == 0 + + for i, value in enumerate(expected_output): + assert captured_lines[i] == value + + +def test_full_run_with_content(capsys): + """Validate output with content for a given URL.""" + expected_output = "\n".join( + [ + "Results for https://example.com:", + " Retrieved URL - 'https://example.com/'", + " Status code - '200'", + " Hash (sha256) of contents - 6fba1a7167467b6dd3da090b5ec437c1b811dd2c2133504a448fb7ca59d390c2", + "", + "Contents:", + r"b'Example Domain Example Domain This domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission. More information...'", + ] + ) + with patch.object(sys, "argv", ["bogus", "--show-content", "https://example.com"]): + return_code = cli.main() + captured = capsys.readouterr() + + assert return_code == 0 + assert captured.out.rstrip() == expected_output + + +def test_full_run_check_redirect_with_content(capsys): + """Validate output with content for a given URL with redirect check.""" + expected_output = "\n".join( + [ + "Results for https://example.com:", + " Retrieved URL - 'https://example.com/'", + " Status code - '200'", + " Redirect - False", + " Hash (sha256) of contents - 6fba1a7167467b6dd3da090b5ec437c1b811dd2c2133504a448fb7ca59d390c2", + "", + "Contents:", + r"b'Example Domain Example Domain This domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission. More information...'", + ] + ) + with patch.object( + sys, + "argv", + ["bogus", "--show-content", "--show-redirect", "https://example.com"], + ): + return_code = cli.main() + captured = capsys.readouterr() + + assert return_code == 0 + assert captured.out.rstrip() == expected_output + + +def test_full_run_json_output(capsys): + """Validate JSON output for a given URL.""" + expected_result = [ + { + "contents_hash": "6fba1a7167467b6dd3da090b5ec437c1b811dd2c2133504a448fb7ca59d390c2", + "is_redirected": False, + "requested_url": "https://example.com", + "retrieved_url": "https://example.com/", + "status_code": 200, + } + ] + with patch.object(sys, "argv", ["bogus", "--json", "https://example.com"]): + return_code = cli.main() + captured = capsys.readouterr() + + captured_result = json.loads(captured.out) + + assert return_code == 0 + assert captured_result == expected_result From dfb8f93d2330b0ca03882167316c4712488e502c Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Tue, 9 Feb 2021 17:34:14 -0500 Subject: [PATCH 14/29] Add a script to retrieve serverless-chrome binaries Switch the build workflow to use this script instead to retrieve it in CI. 
--- .github/workflows/build.yml | 8 +---- get_serverless_chrome_binary.sh | 52 +++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 7 deletions(-) create mode 100755 get_serverless_chrome_binary.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 3212988..86f45d4 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,9 +10,6 @@ on: env: PIP_CACHE_DIR: ~/.cache/pip PRE_COMMIT_CACHE_DIR: ~/.cache/pre-commit - SERVERLESS_CHROME_URL: "https://github.com/adieuadieu/serverless-chrome/\ - releases/download/v1.0.0-57/stable-headless-chromium-amazonlinux-2.zip" - SERVERLESS_CHROME_ZIP: /tmp/serverless-chrome.zip jobs: lint: @@ -68,10 +65,7 @@ jobs: restore-keys: | ${{ env.BASE_CACHE_KEY }} - name: Download and extract a serverless-chrome binary - run: | - curl -L --output ${{ env.SERVERLESS_CHROME_ZIP }} \ - ${{ env.SERVERLESS_CHROME_URL }} - unzip ${{ env.SERVERLESS_CHROME_ZIP }} -d tests/files/ + run: ./get_serverless_chrome_binary.sh - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/get_serverless_chrome_binary.sh b/get_serverless_chrome_binary.sh new file mode 100755 index 0000000..3255af0 --- /dev/null +++ b/get_serverless_chrome_binary.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash + +set -o nounset +set -o errexit +set -o pipefail + +function usage { + echo "Usage:" + echo " ${0##*/} [options]" + echo + echo "Options:" + echo " -h, --help Show the help message." + echo " -l, --latest Pull down the latest release on GitHub." + exit "$1" +} + +# Defaults to a specific version for use in GitHub Actions +DOWNLOAD_URL="https://github.com/adieuadieu/serverless-chrome/releases/download/v1.0.0-57/stable-headless-chromium-amazonlinux-2.zip" +LOCAL_FILE="serverless-chrome.zip" +LOCAL_DIR="tests/files/" + + +# Get the URL of the latest stable release available +function get_latest_stable_url { + releases_url="https://api.github.com/repos/adieuadieu/serverless-chrome/releases" + # Get the URL for the latest release's assets + latest_assets=$(curl -s "$releases_url" | jq -r '.[0].assets_url') + # Download the zip for the stable branch + DOWNLOAD_URL=$(curl -s "$latest_assets" | jq -r '.[] | select(.browser_download_url | contains("stable")) | .browser_download_url') +} + +while (( "$#" )) +do + case "$1" in + -h|--help) + usage 0 + ;; + -l|--latest) + get_latest_stable_url + shift 1 + ;; + -*) + usage 1 + ;; + esac +done + +# Follow redirects and output as the specified file name +curl -L --output "$LOCAL_FILE" "$DOWNLOAD_URL" +# Extract the specified file to the specified directory and overwrite without +# prompting +unzip -o "$LOCAL_FILE" -d "$LOCAL_DIR" From 00207ee4b049b8ff480b5d43b6a717e163d6081d Mon Sep 17 00:00:00 2001 From: Nick M <50747025+mcdonnnj@users.noreply.github.com> Date: Wed, 10 Feb 2021 15:41:25 -0500 Subject: [PATCH 15/29] Update comments based on feedback Update comments with suggestions from code review. 
Co-authored-by: dav3r --- README.md | 4 ++-- src/hash_http_content/cli.py | 2 +- tests/test_hasher.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index b77aa08..0a3fdd0 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,8 @@ [![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/cisagov/hash-http-content.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/cisagov/hash-http-content/context:python) [![Known Vulnerabilities](https://snyk.io/test/github/cisagov/hash-http-content/develop/badge.svg)](https://snyk.io/test/github/cisagov/hash-http-content) -This is a Python library to retrieve the contents of a given HTTP URL and hash -the processed contents. +This is a Python library to retrieve the contents of a given URL via HTTP (or +HTTPS) and hash the processed contents. ## Content processing ## diff --git a/src/hash_http_content/cli.py b/src/hash_http_content/cli.py index 52a0b31..756bea0 100644 --- a/src/hash_http_content/cli.py +++ b/src/hash_http_content/cli.py @@ -1,4 +1,4 @@ -"""Command line interface to the hash-url-content package. +"""Command line interface to the hash-http-content package. Usage: site-hash [--hash-algorithm=algorithm] ([--show-content] [--show-redirect] | [--json]) URL ... diff --git a/tests/test_hasher.py b/tests/test_hasher.py index b98af22..c6a728a 100644 --- a/tests/test_hasher.py +++ b/tests/test_hasher.py @@ -253,7 +253,7 @@ def test_hash_url_with_redirect(): def test_browser_additional_options(): """Verify that additional options are used in invoking the browser.""" - # These options are expected for a lambda style environment + # These options are expected for an AWS Lambda style environment options = { "headless": True, "args": [ @@ -285,7 +285,7 @@ def test_browser_with_specified_executable(): if not os.path.isfile(serverless_chrome_path): pytest.skip("no serverless-chrome binary found") - # These options are expected for a lambda style environment + # These options are expected for an AWS Lambda style environment options = { "headless": True, "args": [ From 5635e2a9ecdb15b9036558ef2bfa377c7806a784 Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Wed, 10 Feb 2021 18:00:59 -0500 Subject: [PATCH 16/29] Expand comment about mypy workaround Add additional context and explanation about a workaround being used to pass the mypy pre-commit hook. --- src/hash_http_content/hasher.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/hash_http_content/hasher.py b/src/hash_http_content/hasher.py index 7c245d0..75f56ec 100644 --- a/src/hash_http_content/hasher.py +++ b/src/hash_http_content/hasher.py @@ -27,7 +27,17 @@ def get_hasher(hash_algorithm: str) -> "hashlib._Hash": except AttributeError: # There is no named constructor for the desired hashing algorithm try: - # Work around typeshed's incorrect type hints + # mypy relies on typeshed (https://github.com/python/typeshed) for + # stdlib type hinting, but it does not have the correct type hints for + # hashlib.new(). The PR I submitted to fix them + # (https://github.com/python/typeshed/pull/4973) was approved, but I + # am not sure if mypy will still have issues with the usage of this + # keyword in non Python 3.9 (when the usedforsecurity kwarg was added) + # environments. I believe the earliest I can test this will be in mypy + # v0.900, and I have made + # https://github.com/cisagov/hash-http-content/issues/3 to document + # the status of this workaround. 
+ # hasher = hashlib.new(hash_algorithm, usedforsecurity=False) hasher = getattr(hashlib, "new")(hash_algorithm, usedforsecurity=False) except TypeError: hasher = hashlib.new(hash_algorithm) From a988651494d4639adc2d583a2d3095cc1eec369a Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Thu, 11 Feb 2021 15:06:51 -0500 Subject: [PATCH 17/29] Add missing redirect status code and explain choices The 302 status code was missed for the list of redirect_status_codes so it has been added. I have also added an explanation for why these four values are chosen instead of just matching any 3xx status code. --- src/hash_http_content/hasher.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/hash_http_content/hasher.py b/src/hash_http_content/hasher.py index 75f56ec..eb0e266 100644 --- a/src/hash_http_content/hasher.py +++ b/src/hash_http_content/hasher.py @@ -190,7 +190,17 @@ def _handle_html(self, contents: bytes, encoding: str) -> HandlerResult: def hash_url(self, url: str) -> UrlResult: """Get a hash of the contents of the provided URL.""" logging.debug("Hashing provided URL '%s'", url) - redirect_status_codes = [301, 307, 308] + + # These values were chosen to keep in line with the type of redirection + # that indicates the desirted resource is at a different URI per + # https://tools.ietf.org/html/rfc7231#section-6.4 + # 1. Redirects that indicate the resource might be available at a + # different URI, as provided by the Location field, as in the + # status codes 301 (Moved Permanently), 302 (Found), and 307 + # (Temporary Redirect). + # This follows the logic in the creation of status code 308 per + # https://tools.ietf.org/html/rfc7238#section-1 + redirect_status_codes = [301, 302, 307, 308] resp = requests.get(url) # https://tools.ietf.org/html/rfc7231#section-3.1.1.5 From fa02dac1717d7929abf29118b20e524aa38d612a Mon Sep 17 00:00:00 2001 From: Nick M <50747025+mcdonnnj@users.noreply.github.com> Date: Fri, 12 Feb 2021 11:52:27 -0500 Subject: [PATCH 18/29] Fix typo in comment Fix a typo I made in the comment explaining the redirect statuses being used. Co-authored-by: dav3r --- src/hash_http_content/hasher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hash_http_content/hasher.py b/src/hash_http_content/hasher.py index eb0e266..3f9b70c 100644 --- a/src/hash_http_content/hasher.py +++ b/src/hash_http_content/hasher.py @@ -192,7 +192,7 @@ def hash_url(self, url: str) -> UrlResult: logging.debug("Hashing provided URL '%s'", url) # These values were chosen to keep in line with the type of redirection - # that indicates the desirted resource is at a different URI per + # that indicates the desired resource is at a different URI per # https://tools.ietf.org/html/rfc7231#section-6.4 # 1. Redirects that indicate the resource might be available at a # different URI, as provided by the Location field, as in the From 9f0088c51b810a443c18ceec084f71b4f66a4260 Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Sat, 13 Feb 2021 02:23:49 -0500 Subject: [PATCH 19/29] Add content_type member to the UrlResult NamedTuple Add support for the new member to the cli and its output. Update testing to reflect this addition. 
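From the caller's side, the new member is simply another field on the returned NamedTuple. A minimal, hypothetical sketch of reading it (it assumes the hash algorithm name is the first UrlHasher constructor argument and uses example.com as a placeholder URL):

```python
# Hypothetical usage sketch for the new content_type member; the constructor
# argument order and the URL are assumptions for illustration only.
from hash_http_content import UrlHasher

hasher = UrlHasher("sha256")
result = hasher.hash_url("https://example.com")
# UrlResult is a NamedTuple, so the new member reads like any other attribute.
print(f"{result.visited_url} ({result.status}): {result.content_type}")
print(f"sha256 digest: {result.hash}")
```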
--- src/hash_http_content/cli.py | 2 ++ src/hash_http_content/hasher.py | 8 +++++++- tests/test_cli.py | 7 +++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/hash_http_content/cli.py b/src/hash_http_content/cli.py index 756bea0..ddaf6d6 100644 --- a/src/hash_http_content/cli.py +++ b/src/hash_http_content/cli.py @@ -77,6 +77,7 @@ def main(): # excluded from JSON results. results.append( { + "content_type": url_results.content_type, "contents_hash": url_results.hash, "is_redirected": url_results.is_redirect, "requested_url": url, @@ -88,6 +89,7 @@ def main(): print(f"Results for {url}:") print(f" Retrieved URL - '{url_results.visited_url}'") print(f" Status code - '{url_results.status}'") + print(f" Content type - '{url_results.content_type}'") if validated_args["--show-redirect"]: print(f" Redirect - {url_results.is_redirect}") print( diff --git a/src/hash_http_content/hasher.py b/src/hash_http_content/hasher.py index 3f9b70c..09889cf 100644 --- a/src/hash_http_content/hasher.py +++ b/src/hash_http_content/hasher.py @@ -71,6 +71,7 @@ class UrlResult(NamedTuple): status: int visited_url: str is_redirect: bool + content_type: str hash: str contents: bytes @@ -235,5 +236,10 @@ def hash_url(self, url: str) -> UrlResult: ) return UrlResult( - resp.status_code, resp.url, is_redirect, processed.hash, processed.contents + resp.status_code, + resp.url, + is_redirect, + content_type, + processed.hash, + processed.contents, ) diff --git a/tests/test_cli.py b/tests/test_cli.py index e38d571..96b3817 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -88,6 +88,7 @@ def test_full_run_no_http_schema(capsys): "Results for example.com:", " Retrieved URL - 'https://example.com/'", " Status code - '200'", + " Content type - 'text/html'", " Hash (sha256) of contents - 6fba1a7167467b6dd3da090b5ec437c1b811dd2c2133504a448fb7ca59d390c2", ] ) @@ -106,6 +107,7 @@ def test_full_run_with_http_schema(capsys): "Results for https://example.com:", " Retrieved URL - 'https://example.com/'", " Status code - '200'", + " Content type - 'text/html'", " Hash (sha256) of contents - 6fba1a7167467b6dd3da090b5ec437c1b811dd2c2133504a448fb7ca59d390c2", ] ) @@ -123,6 +125,7 @@ def test_full_run_no_redirect(capsys): "Results for http://example.com:", " Retrieved URL - 'http://example.com/'", " Status code - '200'", + " Content type - 'text/html'", " Redirect - False", ] with patch.object(sys, "argv", ["bogus", "--show-redirect", "http://example.com"]): @@ -142,6 +145,7 @@ def test_full_run_with_redirect(capsys): "Results for http://rules.ncats.cyber.dhs.gov:", " Retrieved URL - 'https://rules.ncats.cyber.dhs.gov/'", " Status code - '200'", + " Content type - 'text/plain'", " Redirect - True", ] with patch.object( @@ -164,6 +168,7 @@ def test_full_run_with_content(capsys): "Results for https://example.com:", " Retrieved URL - 'https://example.com/'", " Status code - '200'", + " Content type - 'text/html'", " Hash (sha256) of contents - 6fba1a7167467b6dd3da090b5ec437c1b811dd2c2133504a448fb7ca59d390c2", "", "Contents:", @@ -185,6 +190,7 @@ def test_full_run_check_redirect_with_content(capsys): "Results for https://example.com:", " Retrieved URL - 'https://example.com/'", " Status code - '200'", + " Content type - 'text/html'", " Redirect - False", " Hash (sha256) of contents - 6fba1a7167467b6dd3da090b5ec437c1b811dd2c2133504a448fb7ca59d390c2", "", @@ -208,6 +214,7 @@ def test_full_run_json_output(capsys): """Validate JSON output for a given URL.""" expected_result = [ { + "content_type": "text/html", 
"contents_hash": "6fba1a7167467b6dd3da090b5ec437c1b811dd2c2133504a448fb7ca59d390c2", "is_redirected": False, "requested_url": "https://example.com", From 5d0486a0a76d19b2732d31ac9aca8e3f8dfccb69 Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Sun, 14 Feb 2021 00:58:56 -0500 Subject: [PATCH 20/29] Add a timeout instance variable to UrlHasher This timeout is used for request.get() calls to provide a changeable limit for how long to wait for a request to finish. The default value is five seconds. --- src/hash_http_content/hasher.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/hash_http_content/hasher.py b/src/hash_http_content/hasher.py index 09889cf..72009e3 100644 --- a/src/hash_http_content/hasher.py +++ b/src/hash_http_content/hasher.py @@ -90,6 +90,10 @@ def __init__( default_browser_options = {"headless": True} logging.debug("Default browser options: %s", default_browser_options) + # Timeout in seconds + self._timeout = 5 + logging.debug("Using request timeout limit of '%d' seconds", self._timeout) + self.__browser_options = {**default_browser_options, **browser_options} logging.debug("Using browser options: %s", self.__browser_options) @@ -202,7 +206,7 @@ def hash_url(self, url: str) -> UrlResult: # This follows the logic in the creation of status code 308 per # https://tools.ietf.org/html/rfc7238#section-1 redirect_status_codes = [301, 302, 307, 308] - resp = requests.get(url) + resp = requests.get(url, timeout=self._timeout) # https://tools.ietf.org/html/rfc7231#section-3.1.1.5 content_type = ( From 813be3186c4df4874d6d7b0110c56d9f2ccb5411 Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Sun, 14 Feb 2021 18:19:34 -0500 Subject: [PATCH 21/29] Update UrlHasher._handle_html() to navigate to a local file When testing against different sites, I found that for more complex sites there was not always enough time for everything to process and render. Unfortunately the setContent() coroutine does not allow you to pass additional options. I have switched to using the goto() coroutine instead, as this allows me ot use the waitUntil option to give time for the page to process and fully load. --- src/hash_http_content/hasher.py | 40 ++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/src/hash_http_content/hasher.py b/src/hash_http_content/hasher.py index 72009e3..6d561fe 100644 --- a/src/hash_http_content/hasher.py +++ b/src/hash_http_content/hasher.py @@ -5,6 +5,7 @@ import hashlib import json import logging +import tempfile from typing import Any, Callable, Dict, NamedTuple # Third-Party Libraries @@ -168,18 +169,37 @@ def _handle_html(self, contents: bytes, encoding: str) -> HandlerResult: logging.debug("Handling content as HTML") self.__init_browser() - if encoding: - html = str(contents, encoding) - else: - html = str(contents, self._default_encoding) + # Until the Page.setContent() method allows options, writing the HTML + # document to a temporary file and navigating to it with Page.goto() is + # the only way to leverage the `waitUntil` option to give time for the + # page's contents to load. 
Support for options in Page.setContent() is
+        # expected in pyppeteer when the puppeteer v2.1.1 feature parity rewrite
+        # is completed per:
+        # https://github.com/pyppeteer/pyppeteer/issues/134 for more information
+        with tempfile.NamedTemporaryFile(suffix=".html") as fp:
+            # Output to a temporary file so it's available to the browser
+            fp.write(contents)
+            fp.flush()
+
+            page: Page = asyncio.get_event_loop().run_until_complete(
+                self._browser.newPage()
+            )
+
+            logging.debug("Navigating to temporary file '%s'", fp.name)
+            # Wait for everything to load after navigating to the temporary file
+            asyncio.get_event_loop().run_until_complete(
+                page.goto(f"file://{fp.name}", {"waitUntil": ["load", "networkidle2"]})
+            )
+            page_contents: str = asyncio.get_event_loop().run_until_complete(
+                page.content()
+            )
+
+            asyncio.get_event_loop().run_until_complete(page.close())
-        logging.debug("Setting page contents and rendering")
-        page: Page = asyncio.get_event_loop().run_until_complete(
-            self._browser.newPage()
+        # Try to guarantee our preferred encoding
+        page_contents = bytes(page_contents.encode(self._default_encoding)).decode(
+            self._default_encoding
         )
-        asyncio.get_event_loop().run_until_complete(page.setContent(html))
-        page_contents: str = asyncio.get_event_loop().run_until_complete(page.content())
-        asyncio.get_event_loop().run_until_complete(page.close())

         logging.debug("Parsing rendered page contents")
         soup: BeautifulSoup = BeautifulSoup(page_contents, "lxml")

From 0383c77f6f6a29b6499bc07ffa9de0f70dfb9679 Mon Sep 17 00:00:00 2001
From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com>
Date: Sun, 14 Feb 2021 23:59:57 -0500
Subject: [PATCH 22/29] Switch to reusing a browser page

Previously a new page was created, used, and then closed for every
UrlHasher._handle_html() call. However, since we already reuse the browser
object for a class instance, there's no sense in not reusing a page in the
browser in the same way.
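Because the Browser and the Page both persist on the instance after this change, hashing several HTML pages with one UrlHasher only launches the headless browser once. A rough sketch of that usage (assuming the algorithm name is the first constructor argument; the URLs are placeholders):

```python
# Rough sketch: a single UrlHasher instance reuses its headless browser and
# page across calls, so only the first HTML page pays the launch cost.
from hash_http_content import UrlHasher

hasher = UrlHasher("sha256")
for url in ("https://example.com", "https://example.net"):
    result = hasher.hash_url(url)
    print(url, result.hash)
```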
--- src/hash_http_content/hasher.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/hash_http_content/hasher.py b/src/hash_http_content/hasher.py index 6d561fe..065043d 100644 --- a/src/hash_http_content/hasher.py +++ b/src/hash_http_content/hasher.py @@ -99,6 +99,7 @@ def __init__( logging.debug("Using browser options: %s", self.__browser_options) self._browser: Browser = None + self._browser_page: Page = None self._default_encoding = encoding self._hash_algorithm = hash_algorithm @@ -118,6 +119,9 @@ def __init_browser(self): self._browser = asyncio.get_event_loop().run_until_complete( launch(**self.__browser_options) ) + self._browser_page = asyncio.get_event_loop().run_until_complete( + self._browser.newPage() + ) def _is_visible_element(self, element: PageElement) -> bool: """Return True if the given website element would be visible.""" @@ -181,21 +185,17 @@ def _handle_html(self, contents: bytes, encoding: str) -> HandlerResult: fp.write(contents) fp.flush() - page: Page = asyncio.get_event_loop().run_until_complete( - self._browser.newPage() - ) - logging.debug("Navigating to temporary file '%s'", fp.name) # Wait for everything to load after navigating to the temporary file asyncio.get_event_loop().run_until_complete( - page.goto(f"file://{fp.name}", {"waitUntil": ["load", "networkidle2"]}) + self._browser_page.goto( + f"file://{fp.name}", {"waitUntil": ["load", "networkidle2"]} + ) ) page_contents: str = asyncio.get_event_loop().run_until_complete( - page.content() + self._browser_page.content() ) - asyncio.get_event_loop().run_until_complete(page.close()) - # Try to guarantee our preferred encoding page_contents = bytes(page_contents.encode(self._default_encoding)).decode( self._default_encoding From 11ac606167f7be78a947bf58cc69c70d95bfae1a Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Mon, 15 Feb 2021 02:25:42 -0500 Subject: [PATCH 23/29] Handle browser timeout while waiting for content Adjust the browser's timeout while waiting for the events given in the waitUntil to occur from the default of 30 seconds to five seconds. Wrap everything in a try/except block and pull whatever content is rendered at the end of the timeout as a fallback. If a site has long-polling or similar side activity, then the networkidle2 event may never occur. Handling the timeout will allow us to retrieve a site that has likely loaded already. 
--- src/hash_http_content/hasher.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/hash_http_content/hasher.py b/src/hash_http_content/hasher.py index 065043d..4168bcb 100644 --- a/src/hash_http_content/hasher.py +++ b/src/hash_http_content/hasher.py @@ -13,6 +13,7 @@ from bs4.element import Comment, PageElement from pyppeteer import launch from pyppeteer.browser import Browser +from pyppeteer.errors import TimeoutError from pyppeteer.page import Page import requests @@ -186,12 +187,21 @@ def _handle_html(self, contents: bytes, encoding: str) -> HandlerResult: fp.flush() logging.debug("Navigating to temporary file '%s'", fp.name) - # Wait for everything to load after navigating to the temporary file - asyncio.get_event_loop().run_until_complete( - self._browser_page.goto( - f"file://{fp.name}", {"waitUntil": ["load", "networkidle2"]} + + try: + # Wait for everything to load after navigating to the temporary file + asyncio.get_event_loop().run_until_complete( + self._browser_page.goto( + f"file://{fp.name}", + # Wait for load and networkidle2 events up to the given + # timeout of five seconds (in milliseconds) + {"timeout": 5000, "waitUntil": ["load", "networkidle2"]}, + ) ) - ) + # Waiting for load and networkidle2 events to occur exceeded the + # configured timeout + except TimeoutError: + pass page_contents: str = asyncio.get_event_loop().run_until_complete( self._browser_page.content() ) From 05ce989f46b8c832defda514eab49ad4b2dd258e Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Mon, 15 Feb 2021 15:34:16 -0500 Subject: [PATCH 24/29] Add retry mechanism to requests.get() call Add retries to the requests.get() call for the provided URL in UrlHasher.hash_url(). This will allow a modified number of retries in case there are any network difficulties or similar. After self._retries additional attempts, it will raise the exception that was caught. 
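A later commit in this series notes that these instance variables are expected to be modified if desired, so a caller wanting more patience could override them after construction. A sketch under that assumption (the values shown are arbitrary):

```python
# Sketch of tuning the retry and timeout attributes added in this series;
# assumes they are plain instance attributes meant to be overridden, and the
# values chosen here are arbitrary examples.
from hash_http_content import UrlHasher

hasher = UrlHasher("sha256")
hasher._retries = 5   # up to five additional GET attempts after a failure
hasher._timeout = 10  # wait up to ten seconds for each requests.get() call
result = hasher.hash_url("https://example.com")
```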
--- src/hash_http_content/hasher.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/hash_http_content/hasher.py b/src/hash_http_content/hasher.py index 4168bcb..9cca14f 100644 --- a/src/hash_http_content/hasher.py +++ b/src/hash_http_content/hasher.py @@ -92,6 +92,10 @@ def __init__( default_browser_options = {"headless": True} logging.debug("Default browser options: %s", default_browser_options) + # Number of retries + self._retries = 3 + logging.debug("Using retry value of '%d'", self._retries) + # Timeout in seconds self._timeout = 5 logging.debug("Using request timeout limit of '%d' seconds", self._timeout) @@ -236,7 +240,22 @@ def hash_url(self, url: str) -> UrlResult: # This follows the logic in the creation of status code 308 per # https://tools.ietf.org/html/rfc7238#section-1 redirect_status_codes = [301, 302, 307, 308] - resp = requests.get(url, timeout=self._timeout) + + # Attempt to retrieve the given URL, retrying self._retries times before + # raising an exception + get_tries = 0 + while True: + try: + resp = requests.get(url, timeout=self._timeout) + break + except Exception as err: + get_tries += 1 + if get_tries <= self._retries: + logging.warning( + "Performing retry %d/%d for '%s'", get_tries, self._retries, url + ) + else: + raise err # https://tools.ietf.org/html/rfc7231#section-3.1.1.5 content_type = ( From 9ac5dc5685467b4b6503129d44d83d96910c8e32 Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Mon, 15 Feb 2021 15:38:07 -0500 Subject: [PATCH 25/29] Make Page.goto() timeout configurable Change from a hard coded value to reusing the self._timeout value. --- src/hash_http_content/hasher.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/hash_http_content/hasher.py b/src/hash_http_content/hasher.py index 9cca14f..ce80a48 100644 --- a/src/hash_http_content/hasher.py +++ b/src/hash_http_content/hasher.py @@ -197,9 +197,12 @@ def _handle_html(self, contents: bytes, encoding: str) -> HandlerResult: asyncio.get_event_loop().run_until_complete( self._browser_page.goto( f"file://{fp.name}", - # Wait for load and networkidle2 events up to the given - # timeout of five seconds (in milliseconds) - {"timeout": 5000, "waitUntil": ["load", "networkidle2"]}, + { + # Wait for load and networkidle2 events up to the + # value of self_timeout (in milliseconds) + "timeout": self._timeout * 1000, + "waitUntil": ["load", "networkidle2"], + }, ) ) # Waiting for load and networkidle2 events to occur exceeded the From 7fc99db99be0cef5140b5778b044d00ca6da6354 Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Mon, 15 Feb 2021 16:04:38 -0500 Subject: [PATCH 26/29] Add option to control TLS validation Added a verify kwarg to UrlHasher.hash_url() to allow control over TLS verification in the requests library. The kwarg is directly used as the kwarg of the same name in the requests.get() method. 
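Since the value is handed straight to requests.get(), it follows the requests semantics for verify: True (the default), False to disable TLS verification, or a path to a CA bundle to trust. A hedged usage sketch (the hostnames and bundle path are placeholders):

```python
# Usage sketch for the new verify keyword; the URLs and the CA bundle path
# are illustrative placeholders, not values from this repository.
from hash_http_content import UrlHasher

hasher = UrlHasher("sha256")
# Skip certificate verification, e.g. for a self-signed test host.
insecure = hasher.hash_url("https://self-signed.example.com", verify=False)
# Verify against a specific CA bundle instead of the system trust store.
pinned = hasher.hash_url("https://internal.example.com", verify="/etc/ssl/internal-ca.pem")
```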
---
 src/hash_http_content/hasher.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/hash_http_content/hasher.py b/src/hash_http_content/hasher.py
index ce80a48..04f62b4 100644
--- a/src/hash_http_content/hasher.py
+++ b/src/hash_http_content/hasher.py
@@ -6,7 +6,7 @@
 import json
 import logging
 import tempfile
-from typing import Any, Callable, Dict, NamedTuple
+from typing import Any, Callable, Dict, NamedTuple, Union

 # Third-Party Libraries
 from bs4 import BeautifulSoup
@@ -229,7 +229,7 @@ def _handle_html(self, contents: bytes, encoding: str) -> HandlerResult:
         return HandlerResult(digest, visible_bytes)

-    def hash_url(self, url: str) -> UrlResult:
+    def hash_url(self, url: str, verify: Union[bool, str] = True) -> UrlResult:
         """Get a hash of the contents of the provided URL."""
         logging.debug("Hashing provided URL '%s'", url)
@@ -249,7 +249,7 @@ def hash_url(self, url: str) -> UrlResult:
         get_tries = 0
         while True:
             try:
-                resp = requests.get(url, timeout=self._timeout)
+                resp = requests.get(url, timeout=self._timeout, verify=verify)
                 break
             except Exception as err:
                 get_tries += 1

From b4c49d2fd1b0fc1aa79d5c6798a28761aed73dbb Mon Sep 17 00:00:00 2001
From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com>
Date: Mon, 15 Feb 2021 21:43:16 -0500
Subject: [PATCH 27/29] Narrow scope of try block in UrlHasher.hash_url()

When performing requests.get() we were blanket catching any Exception.
Retries should only be for connection related issues, so this narrows the
caught exceptions down to ConnectionError and Timeout (which will match both
the ConnectTimeout and ReadTimeout errors).
---
 src/hash_http_content/hasher.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/hash_http_content/hasher.py b/src/hash_http_content/hasher.py
index 04f62b4..e89d194 100644
--- a/src/hash_http_content/hasher.py
+++ b/src/hash_http_content/hasher.py
@@ -16,6 +16,7 @@
 from pyppeteer.errors import TimeoutError
 from pyppeteer.page import Page
 import requests
+from requests.exceptions import ConnectionError, Timeout


 def get_hasher(hash_algorithm: str) -> "hashlib._Hash":
@@ -251,7 +252,12 @@ def hash_url(self, url: str, verify: Union[bool, str] = True) -> UrlResult:
             try:
                 resp = requests.get(url, timeout=self._timeout, verify=verify)
                 break
-            except Exception as err:
+            except (ConnectionError, Timeout) as err:
+                logging.debug(
+                    "Encountered a(n) %s exception while attempting to GET from '%s'",
+                    type(err).__name__,
+                    url,
+                )
                 get_tries += 1
                 if get_tries <= self._retries:
                     logging.warning(

From 3499d5c05f103b5346c914d3544b0fc2d6f51ba4 Mon Sep 17 00:00:00 2001
From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com>
Date: Tue, 16 Feb 2021 02:17:41 -0500
Subject: [PATCH 28/29] Add hasher.UrlResult to the public objects

Since the result of a UrlHasher.hash_url() call and the UrlHasher class
itself are the primary interfaces to this class, they should both be
publicly available.
---
 src/hash_http_content/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/hash_http_content/__init__.py b/src/hash_http_content/__init__.py
index 18353cb..9cb1be8 100644
--- a/src/hash_http_content/__init__.py
+++ b/src/hash_http_content/__init__.py
@@ -7,6 +7,6 @@
 # package_name.__version__, which is used to get version information about this
 # Python package.
from ._version import __version__ # noqa: F401 -from .hasher import UrlHasher +from .hasher import UrlHasher, UrlResult -__all__: List[str] = ["UrlHasher"] +__all__: List[str] = ["UrlHasher", "UrlResult"] From 2c51f2b29dc9971127572d86dc089a5947a6d5f9 Mon Sep 17 00:00:00 2001 From: Nicholas McDonnell <50747025+mcdonnnj@users.noreply.github.com> Date: Tue, 16 Feb 2021 13:42:29 -0500 Subject: [PATCH 29/29] Add additional type hints to the UrlHasher class During review type hints were mentioned, and it made me take a second look to see if I had missed any that could be easily added. This was especially important for some of the instance variables that were added, as they are expected to be modified if desired. --- src/hash_http_content/hasher.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/hash_http_content/hasher.py b/src/hash_http_content/hasher.py index e89d194..f858b14 100644 --- a/src/hash_http_content/hasher.py +++ b/src/hash_http_content/hasher.py @@ -94,20 +94,23 @@ def __init__( logging.debug("Default browser options: %s", default_browser_options) # Number of retries - self._retries = 3 + self._retries: int = 3 logging.debug("Using retry value of '%d'", self._retries) # Timeout in seconds - self._timeout = 5 + self._timeout: int = 5 logging.debug("Using request timeout limit of '%d' seconds", self._timeout) - self.__browser_options = {**default_browser_options, **browser_options} + self.__browser_options: Dict[str, Any] = { + **default_browser_options, + **browser_options, + } logging.debug("Using browser options: %s", self.__browser_options) self._browser: Browser = None self._browser_page: Page = None - self._default_encoding = encoding - self._hash_algorithm = hash_algorithm + self._default_encoding: str = encoding + self._hash_algorithm: str = hash_algorithm logging.debug("Using default encoding '%s'", self._default_encoding) logging.debug("Using hashing algorithm '%s'", self._hash_algorithm) @@ -223,8 +226,10 @@ def _handle_html(self, contents: bytes, encoding: str) -> HandlerResult: soup: BeautifulSoup = BeautifulSoup(page_contents, "lxml") text_elements = soup.find_all(text=True) visible_text_elements = filter(self._is_visible_element, text_elements) - visible_text = " ".join(t.strip() for t in visible_text_elements if t.strip()) - visible_bytes = bytes(visible_text, self._default_encoding) + visible_text: str = " ".join( + t.strip() for t in visible_text_elements if t.strip() + ) + visible_bytes: bytes = bytes(visible_text, self._default_encoding) digest: str = get_hash_digest(self._hash_algorithm, visible_bytes) @@ -267,7 +272,7 @@ def hash_url(self, url: str, verify: Union[bool, str] = True) -> UrlResult: raise err # https://tools.ietf.org/html/rfc7231#section-3.1.1.5 - content_type = ( + content_type: str = ( resp.headers.get("content-type", "application/octet-stream").strip().lower() )
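Taken together, the series leaves UrlHasher and UrlResult as the package's public interface. A short end-to-end sketch of how the pieces fit after the final patch (assuming the algorithm name is the first constructor argument; the URL is a placeholder):

```python
# End-to-end sketch of the public interface after this patch series; the
# constructor arguments and the URL are assumptions for illustration only.
from hash_http_content import UrlHasher, UrlResult

hasher = UrlHasher("sha256", browser_options={"headless": True})
result: UrlResult = hasher.hash_url("https://example.com", verify=True)
print(result.status, result.visited_url, result.is_redirect)
print(result.content_type, result.hash)
print(result.contents[:80])
```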