Skip to content

Commit

Permalink
verify digest for cachito archive
Browse files Browse the repository at this point in the history
* STONEBLD-509

Signed-off-by: Robert Cerven <rcerven@redhat.com>
  • Loading branch information
rcerven committed May 16, 2023
1 parent 81f3444 commit 6d63200
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 11 deletions.
5 changes: 5 additions & 0 deletions atomic_reactor/constants.py
Expand Up @@ -218,3 +218,8 @@

# https://raw.githubusercontent.com/CycloneDX/specification/1.4/schema/bom-1.4.schema.json
SBOM_SCHEMA_PATH = 'schemas/sbom-1.4.schema.json'

# hashlib algorithm name used to hash the downloaded cachito archive
CACHITO_HASH_ALG = 'sha256'
# algorithm label expected as the prefix of the cachito response's "Digest"
# header value (compared against '<CACHITO_ALG_STR>=<base64 digest>')
CACHITO_ALG_STR = 'sha-256'
26 changes: 25 additions & 1 deletion atomic_reactor/download.py
Expand Up @@ -5,6 +5,7 @@
This software may be modified and distributed under the terms
of the BSD license. See the LICENSE file for details.
"""
import base64
import hashlib
import logging
import os
Expand All @@ -17,14 +18,16 @@
DEFAULT_DOWNLOAD_BLOCK_SIZE,
HTTP_BACKOFF_FACTOR,
HTTP_MAX_RETRIES,
CACHITO_HASH_ALG,
CACHITO_ALG_STR,
)


logger = logging.getLogger(__name__)


def download_url(url, dest_dir, insecure=False, session=None, dest_filename=None,
expected_checksums=None):
expected_checksums=None, verify_cachito_digest=False):
"""Download file from URL, handling retries
To download to a temporary directory, use:
Expand All @@ -37,6 +40,7 @@ def download_url(url, dest_dir, insecure=False, session=None, dest_filename=None
:param dest_filename: optional filename for downloaded file
:param expected_checksums: optional dictionary of checksum_type and
checksum to verify downloaded files
:param verify_cachito_digest: bool, verify sha digest for cachito archive
:return: str, path of downloaded file
"""

Expand All @@ -52,6 +56,7 @@ def download_url(url, dest_dir, insecure=False, session=None, dest_filename=None
logger.debug('downloading %s', url)

checksums = {algo: hashlib.new(algo) for algo in expected_checksums}
cachito_hasher = hashlib.new(CACHITO_HASH_ALG)

for attempt in range(HTTP_MAX_RETRIES + 1):
response = session.get(url, stream=True, verify=not insecure)
Expand All @@ -62,11 +67,30 @@ def download_url(url, dest_dir, insecure=False, session=None, dest_filename=None
f.write(chunk)
for checksum in checksums.values():
checksum.update(chunk)

if verify_cachito_digest:
cachito_hasher.update(chunk)

for algo, checksum in checksums.items():
if checksum.hexdigest() != expected_checksums[algo]:
raise ValueError(
'Computed {} checksum, {}, does not match expected checksum, {}'
.format(algo, checksum.hexdigest(), expected_checksums[algo]))

if verify_cachito_digest:
logger.info('will verify cachito digest')
if 'Digest' in response.headers:
logger.info('digest is in cachito response header')

digest = base64.b64encode(cachito_hasher.digest()).decode("utf-8")
digest_str = f'{CACHITO_ALG_STR}={digest}'
if digest_str != response.headers['Digest']:
raise ValueError(
'Cachito archive digest "{}" does not match expected digest "{}"'
.format(digest_str, response.headers['Digest']))
else:
logger.info('digest for cachito archive is correct')

break
except requests.exceptions.RequestException:
if attempt < HTTP_MAX_RETRIES:
Expand Down
2 changes: 1 addition & 1 deletion atomic_reactor/utils/cachito.py
Expand Up @@ -183,7 +183,7 @@ def download_sources(self, request, dest_dir='.', dest_filename=REMOTE_SOURCE_TA
url = self.assemble_download_url(request_id)
dest_path = download_url(
url, dest_dir=dest_dir, insecure=not self.session.verify, session=self.session,
dest_filename=dest_filename)
dest_filename=dest_filename, verify_cachito_digest=True)
logger.debug('Sources bundle for request %d downloaded to %s', request_id, dest_path)
return dest_path

Expand Down
31 changes: 31 additions & 0 deletions tests/test_download.py
Expand Up @@ -18,6 +18,7 @@

from atomic_reactor.util import get_retrying_requests_session
from atomic_reactor.download import download_url
from atomic_reactor.constants import CACHITO_ALG_STR


class TestDownloadUrl(object):
Expand All @@ -34,6 +35,36 @@ def test_happy_path(self):
with open(result, 'rb') as f:
assert f.read() == content

@responses.activate
def test_cachito_download_digest_matches(self):
    """A Digest header that agrees with the downloaded bytes must not raise."""
    dest_dir = tempfile.mkdtemp()
    payload = b'abc'
    # base64-encoded sha256 of b'abc'
    expected_digest = 'ungWv48Bz+pBQUDeXa4iI7ADYaOWF3qctBD/YfIAFa0='
    url = 'https://example.com/path/file'

    # stream the body one byte at a time to exercise chunked hashing
    body = BufferedReader(BytesIO(payload), buffer_size=1)
    responses.add(
        responses.GET,
        url,
        body=body,
        headers={'Digest': f'{CACHITO_ALG_STR}={expected_digest}'},
    )

    result = download_url(url, dest_dir, verify_cachito_digest=True)

    assert os.path.basename(result) == 'file'
    with open(result, 'rb') as f:
        assert f.read() == payload

@responses.activate
def test_cachito_download_digest_mismatches(self):
    """A Digest header that disagrees with the content must raise ValueError."""
    dest_dir = tempfile.mkdtemp()
    payload = b'abc'
    url = 'https://example.com/path/file'

    # stream the body one byte at a time to exercise chunked hashing
    body = BufferedReader(BytesIO(payload), buffer_size=1)
    responses.add(
        responses.GET,
        url,
        body=body,
        headers={'Digest': f'{CACHITO_ALG_STR}=wrong'},
    )

    with pytest.raises(ValueError, match='does not match expected digest'):
        download_url(url, dest_dir, verify_cachito_digest=True)

def test_connection_failure(self):
url = 'https://example.com/path/file'
dest_dir = tempfile.mkdtemp()
Expand Down
29 changes: 20 additions & 9 deletions tests/utils/test_cachito.py
Expand Up @@ -20,6 +20,7 @@
from datetime import datetime
from textwrap import dedent

from atomic_reactor.constants import CACHITO_ALG_STR

CACHITO_URL = 'http://cachito.example.com'
CACHITO_REQUEST_ID = 123
Expand Down Expand Up @@ -278,24 +279,34 @@ def test_wait_for_request_bad_request_type():


@responses.activate
@pytest.mark.parametrize(('cachito_request', 'digest_match'), (
    (CACHITO_REQUEST_ID, True),
    ({'id': CACHITO_REQUEST_ID}, False)
))
def test_download_sources(tmpdir, cachito_request, digest_match):
    """download_sources succeeds on a matching Digest header, raises otherwise."""
    blob = 'glop-glop-I\'m-a-blob'
    # base64-encoded sha256 of blob when matching, junk value otherwise
    digest = 'XrN1l765qbGhErVrxe8Cj6+zCfwhqZoldJxOSYrpUlo=' if digest_match else 'wrong'

    responses.add(
        responses.GET,
        '{}/api/v1/requests/{}/download'.format(CACHITO_URL, CACHITO_REQUEST_ID),
        body=blob,
        headers={'Digest': f'{CACHITO_ALG_STR}={digest}'},
    )

    api = CachitoAPI(CACHITO_URL)

    if not digest_match:
        with pytest.raises(ValueError, match='does not match expected digest'):
            api.download_sources(cachito_request, str(tmpdir))
        return

    dest_path = api.download_sources(cachito_request, str(tmpdir))

    assert dest_path == os.path.join(str(tmpdir), 'remote-source.tar.gz')
    with open(dest_path) as f:
        assert f.read() == blob


def test_download_sources_bad_request_type(tmpdir):
Expand Down

0 comments on commit 6d63200

Please sign in to comment.