diff --git a/gcp/workers/cron/generate_sitemap/generate_and_upload.sh b/gcp/workers/cron/generate_sitemap/generate_and_upload.sh
deleted file mode 100755
index 03f48bb1534..00000000000
--- a/gcp/workers/cron/generate_sitemap/generate_and_upload.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-set -e
-
-SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
-
-SITEMAP_OUTPUT="sitemap_output/"
-OUTPUT_BUCKET="${OUTPUT_GCS_BUCKET:=test-osv-dev-sitemap}"
-BASE_URL_PATH="${BASE_URL:=https://test.osv.dev}"
-
-echo "Begin sitemap generation for $BASE_URL_PATH"
-
-"$SCRIPT_PATH/generate_sitemap.py" --base_url "$BASE_URL_PATH"
-
-echo "Begin syncing with cloud to $OUTPUT_BUCKET"
-
-gsutil -m rsync -c -d "$SITEMAP_OUTPUT" "gs://$OUTPUT_BUCKET/"
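Reviewer note: the deleted wrapper is pure glue: run the generator, then mirror `sitemap_output/` into the bucket. `gsutil rsync -c` compares checksums rather than mtimes, and `-d` deletes remote objects with no local counterpart, so the bucket ends up as an exact mirror of the output directory. A minimal Python sketch of that same two-step contract (defaults and commands copied from the script above; a replacement need not look like this):

```python
#!/usr/bin/env python3
"""Sketch: generate sitemaps, then mirror the output directory to GCS."""
import os
import subprocess

# Defaults mirror the deleted shell script; override via the environment.
bucket = os.environ.get('OUTPUT_GCS_BUCKET', 'test-osv-dev-sitemap')
base_url = os.environ.get('BASE_URL', 'https://test.osv.dev')

# Step 1: write sitemap XML files into ./sitemap_output.
subprocess.run(['./generate_sitemap.py', '--base_url', base_url], check=True)

# Step 2: mirror the directory; -c compares checksums instead of mtimes,
# -d removes remote objects that no longer exist locally.
subprocess.run(
    ['gsutil', '-m', 'rsync', '-c', '-d', 'sitemap_output/', f'gs://{bucket}/'],
    check=True)
```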
diff --git a/gcp/workers/cron/generate_sitemap/generate_sitemap.py b/gcp/workers/cron/generate_sitemap/generate_sitemap.py
deleted file mode 100755
index 61a72676b58..00000000000
--- a/gcp/workers/cron/generate_sitemap/generate_sitemap.py
+++ /dev/null
@@ -1,179 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2024 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Generate sitemap."""
-import logging
-import sys
-import os
-import osv
-import osv.logs
-import datetime
-import argparse
-from collections import defaultdict
-from google.cloud import ndb
-
-from xml.etree.ElementTree import Element, SubElement, ElementTree
-
-_OUTPUT_DIRECTORY = './sitemap_output'
-_SITEMAPS_PREFIX = 'sitemap_'
-_SITEMAP_INDEX_PATH = f'./{_SITEMAPS_PREFIX}index.xml'
-_SITEMAP_URL_LIMIT = 49999
-
-
-def epoch() -> datetime.datetime:
-  return datetime.datetime.fromtimestamp(0, tz=datetime.UTC)
-
-
-alias_to_last_modified: defaultdict[str, datetime.datetime] = defaultdict(epoch)
-
-
-def fetch_vulnerabilities_and_dates(
-    ecosystem: str) -> list[tuple[str, datetime.datetime]]:
-  """Fetch vulnerability IDs and modified dates for the given ecosystem."""
-  # Query with a projection to reduce the data returned.
-  # Order does not matter, other than to keep things consistent.
-  bugs = osv.Bug.query(
-      osv.Bug.status == osv.BugStatus.PROCESSED,
-      osv.Bug.public == True,  # pylint: disable=singleton-comparison
-      osv.Bug.ecosystem == ecosystem,
-      projection=[osv.Bug.last_modified]).order(-osv.Bug.last_modified)
-
-  bug_and_dates = []
-  for bug in bugs:
-    key = bug.key.id()
-    last_mod_date = max(bug.last_modified, alias_to_last_modified[bug.key.id()])
-    bug_and_dates.append((key, last_mod_date))
-
-  return bug_and_dates
-
-
-def osv_get_ecosystems():
-  """Get the list of ecosystems."""
-  # This includes ecosystems with only non-processed/non-public entries.
-  query = osv.Bug.query(projection=[osv.Bug.ecosystem], distinct=True)
-  return sorted([bug.ecosystem[0] for bug in query if bug.ecosystem],
-                key=str.lower)
-
-
-def get_sitemap_filename_for_ecosystem(ecosystem: str) -> str:
-  ecosystem_name = ecosystem.replace(' ', '_').replace('.', '__').strip()
-  return f'./{_SITEMAPS_PREFIX}{ecosystem_name}.xml'
-
-
-def get_sitemap_url_for_ecosystem(ecosystem: str, base_url: str) -> str:
-  ecosystem_name = ecosystem.replace(' ', '_').replace('.', '__').strip()
-  return f'{base_url}/{_SITEMAPS_PREFIX}{ecosystem_name}.xml'
-
-
-def generate_sitemap_for_ecosystem(ecosystem: str,
-                                   base_url: str) -> datetime.datetime:
-  """
-  Generate a sitemap for the given ecosystem.
-
-  Returns the latest modified date of its entries.
-  """
-  logging.info('Generating sitemap for ecosystem "%s".', ecosystem)
-  vulnerability_and_dates = fetch_vulnerabilities_and_dates(ecosystem)
-  filename = get_sitemap_filename_for_ecosystem(ecosystem)
-  urlset = Element(
-      'urlset', xmlns='http://www.sitemaps.org/schemas/sitemap/0.9')
-
-  if len(vulnerability_and_dates) > _SITEMAP_URL_LIMIT:
-    logging.warning('Ecosystem "%s" exceeded sitemap size limit', ecosystem)
-
-  # TODO: For large ecosystems with over 50,000 vulnerabilities, generate
-  # multiple sitemaps.
-  for vuln_id, last_modified in vulnerability_and_dates[:_SITEMAP_URL_LIMIT]:
-    url = SubElement(urlset, 'url')
-    loc = SubElement(url, 'loc')
-    loc.text = f'{base_url}/vulnerability/{vuln_id}'
-    lastmod = SubElement(url, 'lastmod')
-    lastmod.text = last_modified.isoformat()
-
-  tree = ElementTree(urlset)
-  tree.write(filename, encoding='utf-8', xml_declaration=True)
-
-  # Include epoch() for the edge case where an ecosystem has no entries.
-  return max([
-      last_mod for _, last_mod in vulnerability_and_dates[:_SITEMAP_URL_LIMIT]
-  ] + [epoch()])
-
-
-def generate_sitemap_index(ecosystems: set[str], base_url: str,
-                           last_mod_dict: dict[str, datetime.datetime]) -> None:
-  """Generate a sitemap index."""
-  logging.info('Generating sitemap index.')
-  sitemapindex = Element(
-      'sitemapindex', xmlns='http://www.sitemaps.org/schemas/sitemap/0.9')
-
-  for ecosystem in ecosystems:
-    sitemap = SubElement(sitemapindex, 'sitemap')
-    loc = SubElement(sitemap, 'loc')
-    loc.text = get_sitemap_url_for_ecosystem(ecosystem, base_url)
-    lastmod = SubElement(sitemap, 'lastmod')
-    lastmod.text = last_mod_dict[ecosystem].isoformat()
-
-  tree = ElementTree(sitemapindex)
-  tree.write(_SITEMAP_INDEX_PATH, encoding='utf-8', xml_declaration=True)
-
-
-def generate_sitemaps(base_url: str) -> None:
-  """Generate sitemaps including all vulnerabilities, split by ecosystem."""
-  logging.info("Begin generating sitemaps.")
-  # Go over the base ecosystems index only. Otherwise we'd have duplicated
-  # vulnerabilities in the sitemap.
-  base_ecosystems = {
-      ecosystem for ecosystem in osv_get_ecosystems() if ':' not in ecosystem
-  }
-
-  ecosystem_last_mod_dates = {}
-  for ecosystem in base_ecosystems:
-    ecosystem_last_mod_dates[ecosystem] = generate_sitemap_for_ecosystem(
-        ecosystem, base_url)
-
-  generate_sitemap_index(base_ecosystems, base_url, ecosystem_last_mod_dates)
-
-
-def preload_alias_groups():
-  """Fetch all alias groups, as we will be querying all of them anyway."""
-  logging.info("Preloading alias groups into memory.")
-  aliases = osv.AliasGroup.query()
-  for al in aliases:
-    al: osv.AliasGroup
-    for bug_id in al.bug_ids:  # type: ignore
-      alias_to_last_modified[bug_id] = al.last_modified
-
-
-def main() -> int:
-  parser = argparse.ArgumentParser(description='Generate sitemaps.')
-  parser.add_argument(
-      '--base_url',
-      required=True,
-      help='The base URL for the sitemap entries (without trailing /).')
-  args = parser.parse_args()
-
-  os.makedirs(_OUTPUT_DIRECTORY, exist_ok=True)
-  os.chdir(_OUTPUT_DIRECTORY)
-
-  preload_alias_groups()
-  generate_sitemaps(args.base_url)
-  return 0
-
-
-if __name__ == '__main__':
-  _ndb_client = ndb.Client()
-  osv.logs.setup_gcp_logging('generate_sitemap')
-  with _ndb_client.context():
-    sys.exit(main())
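For reference while reviewing whatever replaces this generator: the XML it emits is plain sitemap-protocol `urlset`/`url`/`loc`/`lastmod` elements built with ElementTree. A self-contained sketch of one entry, using a made-up vulnerability ID and date (only the element structure is taken from the deleted code):

```python
import datetime
from xml.etree.ElementTree import Element, SubElement, tostring

urlset = Element('urlset', xmlns='http://www.sitemaps.org/schemas/sitemap/0.9')
url = SubElement(urlset, 'url')
loc = SubElement(url, 'loc')
loc.text = 'https://test.osv.dev/vulnerability/OSV-2024-0001'  # hypothetical ID
lastmod = SubElement(url, 'lastmod')
lastmod.text = datetime.datetime(2024, 1, 1, tzinfo=datetime.UTC).isoformat()

# Prints one <url> entry wrapped in the <urlset> root:
# <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><url>
#   <loc>https://test.osv.dev/vulnerability/OSV-2024-0001</loc>
#   <lastmod>2024-01-01T00:00:00+00:00</lastmod></url></urlset>
print(tostring(urlset, encoding='unicode'))
```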
diff --git a/gcp/workers/cron/generate_sitemap/generate_sitemap_test.py b/gcp/workers/cron/generate_sitemap/generate_sitemap_test.py
deleted file mode 100644
index 7925846405b..00000000000
--- a/gcp/workers/cron/generate_sitemap/generate_sitemap_test.py
+++ /dev/null
@@ -1,163 +0,0 @@
-# Copyright 2021 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Sitemap generator tests."""
-
-import unittest
-import tempfile
-from unittest.mock import patch, MagicMock
-import generate_sitemap
-import osv
-from datetime import datetime
-from datetime import UTC
-
-
-class TestSitemapGeneration(unittest.TestCase):
-  """Tests to verify the functionality of the sitemap generator script."""
-
-  def temp_file(self):
-    # Create a temporary file for testing.
-    self.test_file = tempfile.NamedTemporaryFile(delete=False)
-    self.test_file.write(b'This is a test file.')
-    self.test_file.close()
-    return self.test_file.name
-
-  @patch.object(osv.Bug, 'query')
-  def test_fetch_vulnerabilities_and_dates(self, mock_query):
-    """Test it returns the vulnerability IDs for an ecosystem."""
-    # Mock the returned query.
-    mock_query.return_value.order.return_value = [
-        MagicMock(
-            last_modified=datetime.fromtimestamp(10, tz=UTC),
-            key=MagicMock(id=MagicMock(return_value='vuln1'))),
-        MagicMock(
-            last_modified=datetime.fromtimestamp(20, tz=UTC),
-            key=MagicMock(id=MagicMock(return_value='vuln2'))),
-    ]
-
-    result = generate_sitemap.fetch_vulnerabilities_and_dates('Go')
-    self.assertEqual(result, [('vuln1', datetime.fromtimestamp(10, tz=UTC)),
-                              ('vuln2', datetime.fromtimestamp(20, tz=UTC))])
-
-  @patch.object(osv.Bug, 'query')
-  def test_osv_get_ecosystems(self, mock_query):
-    """Test it returns the ecosystems."""
-    # Mock the returned query.
-    mock_query.return_value = [
-        MagicMock(ecosystem=['UVI']),
-        MagicMock(ecosystem=['Go'])
-    ]
-
-    result = generate_sitemap.osv_get_ecosystems()
-    self.assertEqual(result, ['Go', 'UVI'])
-
-  @patch('generate_sitemap.fetch_vulnerabilities_and_dates')
-  @patch('generate_sitemap.ElementTree')
-  def test_generate_sitemap_for_ecosystem(self, mock_element_tree,
-                                          mock_fetch_vulns):
-    """Check it generates the sitemap for an ecosystem."""
-    mock_fetch_vulns.return_value = [
-        ('vuln1', datetime.fromtimestamp(10, tz=UTC)),
-        ('vuln2', datetime.fromtimestamp(20, tz=UTC))
-    ]
-    mock_tree = MagicMock()
-    mock_element_tree.return_value = mock_tree
-
-    generate_sitemap.generate_sitemap_for_ecosystem('Go', 'http://example.com')
-
-    mock_tree.write.assert_called_once_with(
-        './sitemap_Go.xml', encoding='utf-8', xml_declaration=True)
-
-  @patch('generate_sitemap.fetch_vulnerabilities_and_dates')
-  @patch('generate_sitemap.ElementTree')
-  def test_generate_sitemap_for_ecosystem_with_space(self, mock_element_tree,
-                                                     mock_fetch_vulns):
-    """
-    Check it creates the sitemap correctly when there is a space in the
-    ecosystem name.
-    """
-    mock_fetch_vulns.return_value = [
-        ('vuln1', datetime.fromtimestamp(10, tz=UTC)),
-        ('vuln2', datetime.fromtimestamp(20, tz=UTC))
-    ]
-    mock_tree = MagicMock()
-    mock_element_tree.return_value = mock_tree
-
-    generate_sitemap.generate_sitemap_for_ecosystem('Rocky Linux',
-                                                    'http://example.com')
-
-    mock_tree.write.assert_called_once_with(
-        './sitemap_Rocky_Linux.xml', encoding='utf-8', xml_declaration=True)
-
-  @patch('generate_sitemap.fetch_vulnerabilities_and_dates')
-  @patch('generate_sitemap.ElementTree')
-  def test_generate_sitemap_for_ecosystem_with_period(self, mock_element_tree,
                                                      mock_fetch_vulns):
-    """
-    Check it creates the sitemap correctly when there is a period in the
-    ecosystem name.
-    """
-    mock_fetch_vulns.return_value = [
-        ('vuln1', datetime.fromtimestamp(10, tz=UTC)),
-        ('vuln2', datetime.fromtimestamp(20, tz=UTC))
-    ]
-    mock_tree = MagicMock()
-    mock_element_tree.return_value = mock_tree
-
-    generate_sitemap.generate_sitemap_for_ecosystem('crates.io',
-                                                    'http://example.com')
-
-    mock_tree.write.assert_called_once_with(
-        './sitemap_crates__io.xml', encoding='utf-8', xml_declaration=True)
-
-  @patch('generate_sitemap.ElementTree')
-  def test_generate_sitemap_index(self, mock_element_tree):
-    """Check it generates the sitemap index as expected."""
-    mock_tree = MagicMock()
-    mock_element_tree.return_value = mock_tree
-
-    generate_sitemap.generate_sitemap_index(
-        {'Go', 'UVI'}, 'http://example.com', {
-            'Go': datetime.fromtimestamp(10, tz=UTC),
-            'UVI': datetime.fromtimestamp(20, tz=UTC)
-        })
-
-    mock_tree.write.assert_called_once_with(
-        './sitemap_index.xml', encoding='utf-8', xml_declaration=True)
-
-  @patch('generate_sitemap.generate_sitemap_for_ecosystem')
-  @patch('generate_sitemap.generate_sitemap_index')
-  @patch('generate_sitemap.osv_get_ecosystems')
-  def test_generate_sitemap(self, mock_get_ecosystems, mock_generate_index,
-                            mock_generate_sitemap):
-    """
-    Check the outer wrapper generates the ecosystems' sitemaps as well as the
-    sitemap index.
-    """
-    mock_get_ecosystems.return_value = ['Go', 'UVI:Library', 'Android']
-    mock_generate_sitemap.return_value = datetime.fromtimestamp(10, tz=UTC)
-    generate_sitemap.generate_sitemaps('http://example.com')
-
-    self.assertEqual(mock_generate_sitemap.call_count, 2)
-    mock_generate_sitemap.assert_any_call('Go', 'http://example.com')
-    mock_generate_sitemap.assert_any_call('Android', 'http://example.com')
-
-    mock_generate_index.assert_called_once_with(
-        {'Android', 'Go'}, 'http://example.com', {
-            'Android': datetime.fromtimestamp(10, tz=UTC),
-            'Go': datetime.fromtimestamp(10, tz=UTC),
-        })
-
-
-if __name__ == '__main__':
-  unittest.main()
diff --git a/gcp/workers/exporter/Dockerfile b/gcp/workers/exporter/Dockerfile
deleted file mode 100644
index e7e236208a3..00000000000
--- a/gcp/workers/exporter/Dockerfile
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright 2021 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-FROM gcr.io/oss-vdb/worker
-
-COPY exporter.py /usr/local/bin
-COPY export_runner.py /usr/local/bin
-RUN chmod 755 /usr/local/bin/exporter.py
-RUN chmod 755 /usr/local/bin/export_runner.py
-ENTRYPOINT ["export_runner.py"]
- """ - mock_fetch_vulns.return_value = [ - ('vuln1', datetime.fromtimestamp(10, tz=UTC)), - ('vuln2', datetime.fromtimestamp(20, tz=UTC)) - ] - mock_tree = MagicMock() - mock_element_tree.return_value = mock_tree - - generate_sitemap.generate_sitemap_for_ecosystem('crates.io', - 'http://example.com') - - mock_tree.write.assert_called_once_with( - './sitemap_crates__io.xml', encoding='utf-8', xml_declaration=True) - - @patch('generate_sitemap.ElementTree') - def test_generate_sitemap_index(self, mock_element_tree): - """Check it generates the sitemap index as expected""" - mock_tree = MagicMock() - mock_element_tree.return_value = mock_tree - - generate_sitemap.generate_sitemap_index( - {'Go', 'UVI'}, 'http://example.com', { - 'Go': datetime.fromtimestamp(10, tz=UTC), - 'UVI': datetime.fromtimestamp(20, tz=UTC) - }) - - mock_tree.write.assert_called_once_with( - './sitemap_index.xml', encoding='utf-8', xml_declaration=True) - - @patch('generate_sitemap.generate_sitemap_for_ecosystem') - @patch('generate_sitemap.generate_sitemap_index') - @patch('generate_sitemap.osv_get_ecosystems') - def test_generate_sitemap(self, mock_get_ecosystems, mock_generate_index, - mock_generate_sitemap): - """ - Check the outer wrapper generates the ecosystems' sitemaps as well as - sitemap index. - """ - mock_get_ecosystems.return_value = ['Go', 'UVI:Library', 'Android'] - mock_generate_sitemap.return_value = datetime.fromtimestamp(10, tz=UTC) - generate_sitemap.generate_sitemaps('http://example.com') - - self.assertEqual(mock_generate_sitemap.call_count, 2) - mock_generate_sitemap.assert_any_call('Go', 'http://example.com') - mock_generate_sitemap.assert_any_call('Android', 'http://example.com') - - mock_generate_index.assert_called_once_with( - {'Android', 'Go'}, 'http://example.com', { - 'Android': datetime.fromtimestamp(10, tz=UTC), - 'Go': datetime.fromtimestamp(10, tz=UTC), - }) - - -if __name__ == '__main__': - unittest.main() diff --git a/gcp/workers/exporter/Dockerfile b/gcp/workers/exporter/Dockerfile deleted file mode 100644 index e7e236208a3..00000000000 --- a/gcp/workers/exporter/Dockerfile +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -FROM gcr.io/oss-vdb/worker - -COPY exporter.py /usr/local/bin -COPY export_runner.py /usr/local/bin -RUN chmod 755 /usr/local/bin/exporter.py -RUN chmod 755 /usr/local/bin/export_runner.py -ENTRYPOINT ["export_runner.py"] diff --git a/gcp/workers/exporter/build.sh b/gcp/workers/exporter/build.sh deleted file mode 100755 index 58d4729d4ab..00000000000 --- a/gcp/workers/exporter/build.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -x -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
diff --git a/gcp/workers/exporter/export_runner.py b/gcp/workers/exporter/export_runner.py
deleted file mode 100644
index 155a85d0702..00000000000
--- a/gcp/workers/exporter/export_runner.py
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/usr/bin/env python3
-# Copyright 2024 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""OSV Exporter."""
-import argparse
-import concurrent.futures
-import csv
-import glob
-import logging
-import os
-import subprocess
-import tempfile
-import zipfile as z
-
-from google.cloud import ndb, storage
-
-from exporter import safe_upload_single, LAST_MODIFIED_FILE
-import osv
-import osv.logs
-
-DEFAULT_WORK_DIR = '/work'
-DEFAULT_EXPORT_BUCKET = 'osv-vulnerabilities'
-DEFAULT_EXPORT_PROCESSES = 7
-
-
-def main():
-  parser = argparse.ArgumentParser(description='Exporter')
-  parser.add_argument(
-      '--work_dir', help='Working directory', default=DEFAULT_WORK_DIR)
-  parser.add_argument(
-      '--bucket',
-      help='Bucket name to export to',
-      default=DEFAULT_EXPORT_BUCKET)
-  parser.add_argument(
-      '--processes',
-      help='Maximum number of parallel exports; defaults to the CPU count',
-      type=int,
-      # If 0 or None, use the DEFAULT_EXPORT_PROCESSES value.
-      default=os.cpu_count() or DEFAULT_EXPORT_PROCESSES)
-  args = parser.parse_args()
-
-  query = osv.Bug.query(projection=[osv.Bug.ecosystem], distinct=True)
-  ecosystems = [bug.ecosystem[0] for bug in query if bug.ecosystem] + ['list']
-
-  # Set TMPDIR to change the tempfile default directory.
-  tmp_dir = os.path.join(args.work_dir, 'tmp')
-  os.makedirs(tmp_dir, exist_ok=True)
-  os.environ['TMPDIR'] = tmp_dir
-
-  with tempfile.TemporaryDirectory() as export_dir:
-    with concurrent.futures.ThreadPoolExecutor(
-        max_workers=args.processes) as executor:
-      for eco in ecosystems:
-        # Skip exporting data for child ecosystems (e.g., 'Debian:11').
-        if ':' in eco:
-          continue
-        executor.submit(spawn_ecosystem_exporter, export_dir, args.bucket, eco)
-    # Upload a ZIP file containing records from all ecosystems.
-    aggregate_all_vulnerabilities(export_dir, args.bucket)
-
-
-def spawn_ecosystem_exporter(work_dir: str, bucket: str, eco: str):
-  """
-  Spawns the ecosystem-specific exporter.
-  """
-  logging.info('Starting export of ecosystem: %s', eco)
-  proc = subprocess.Popen([
-      # Assume exporter.py is in the same directory as this file.
-      # This is true for local dev and Docker.
-      f'{os.path.dirname(__file__)}/exporter.py',
-      '--work_dir',
-      work_dir,
-      '--bucket',
-      bucket,
-      '--ecosystem',
-      eco
-  ])
-  return_code = proc.wait()
-  if return_code != 0:
-    logging.error(
-        'Export of %s failed with exit code %d. See logs for related errors.',
-        eco, return_code)
-
-
-def aggregate_all_vulnerabilities(work_dir: str, export_bucket: str):
-  """
-  Aggregates vulnerability records from each ecosystem into a single zip
-  file and uploads it to the export bucket.
-  """
-  logging.info('Generating unified all.zip archive.')
-  zip_file_name = 'all.zip'
-  output_zip = os.path.join(work_dir, zip_file_name)
-  all_vulns = {}
-
-  for file_path in glob.glob(
-      os.path.join(work_dir, '**/*.json'), recursive=True):
-    all_vulns[os.path.basename(file_path)] = file_path
-
-  with z.ZipFile(output_zip, 'a', z.ZIP_DEFLATED) as all_zip:
-    for vuln_filename in sorted(all_vulns):
-      file_path = all_vulns[vuln_filename]
-      all_zip.write(file_path, os.path.basename(file_path))
-
-  # Also aggregate the per-ecosystem modified_id.csv files.
-  output_modified_list = os.path.join(work_dir, LAST_MODIFIED_FILE)
-  full_modified_list = []
-  for file_path in glob.glob(
-      os.path.join(work_dir, f'**/{LAST_MODIFIED_FILE}'), recursive=True):
-    dir_from_work_dir = os.path.relpath(os.path.dirname(file_path), work_dir)
-    with open(file_path, 'r') as infile:
-      reader = csv.reader(infile)
-      for line in reader:
-        # Create <modified date>,<ecosystem dir>/<vuln id> entries.
-        full_modified_list.append((line[0], f'{dir_from_work_dir}/{line[1]}'))
-
-  full_modified_list.sort(reverse=True)
-
-  with open(output_modified_list, 'w') as outfile:
-    csv.writer(outfile).writerows(full_modified_list)
-
-  storage_client = storage.Client()
-  bucket = storage_client.get_bucket(export_bucket)
-  safe_upload_single(bucket, output_zip, zip_file_name)
-  logging.info('Unified all.zip uploaded successfully.')
-  safe_upload_single(bucket, output_modified_list, LAST_MODIFIED_FILE)
-  logging.info('Unified %s uploaded successfully.', LAST_MODIFIED_FILE)
-
-
-if __name__ == '__main__':
-  _ndb_client = ndb.Client()
-  osv.logs.setup_gcp_logging('exporter-runner')
-  with _ndb_client.context():
-    main()
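The runner's concurrency model is easy to misread: the ThreadPoolExecutor only bounds how many exporter subprocesses run at once; the actual export work happens in the child processes, so the GIL is not a bottleneck. A stripped-down sketch of that fan-out pattern (the worker command and ecosystem list here are illustrative only, not the real values):

```python
import concurrent.futures
import subprocess


def run_export(ecosystem: str) -> int:
  """Run one exporter subprocess and report its exit code."""
  # The thread merely waits on the child; the export runs out-of-process.
  proc = subprocess.run(['./exporter.py', '--ecosystem', ecosystem],
                        check=False)
  return proc.returncode


ecosystems = ['Go', 'PyPI', 'crates.io']  # illustrative, not the real list
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
  for eco, code in zip(ecosystems, executor.map(run_export, ecosystems)):
    if code != 0:
      print(f'Export of {eco} failed with exit code {code}')
```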
- """ - logging.info('Starting export of ecosystem: %s', eco) - proc = subprocess.Popen([ - # Assume exporter.py is in the same directory as this file - # This is true for local dev and Docker - f'{os.path.dirname(__file__)}/exporter.py', - '--work_dir', - work_dir, - '--bucket', - bucket, - '--ecosystem', - eco - ]) - return_code = proc.wait() - if return_code != 0: - logging.error( - 'Export of %s failed with Exit Code: %d. See logs for related errors.', - eco, return_code) - - -def aggregate_all_vulnerabilities(work_dir: str, export_bucket: str): - """ - Aggregates vulnerability records from each ecosystem into a single zip - file and uploads it to the export bucket. - """ - logging.info('Generating unified all.zip archive.') - zip_file_name = 'all.zip' - output_zip = os.path.join(work_dir, zip_file_name) - all_vulns = {} - - for file_path in glob.glob( - os.path.join(work_dir, '**/*.json'), recursive=True): - all_vulns[os.path.basename(file_path)] = file_path - - with z.ZipFile(output_zip, 'a', z.ZIP_DEFLATED) as all_zip: - for vuln_filename in sorted(all_vulns): - file_path = all_vulns[vuln_filename] - all_zip.write(file_path, os.path.basename(file_path)) - - # Also aggregate modified_list.csv files - output_modified_list = os.path.join(work_dir, LAST_MODIFIED_FILE) - full_modified_list = [] - for file_path in glob.glob( - os.path.join(work_dir, f'**/{LAST_MODIFIED_FILE}'), recursive=True): - dir_from_work_dir = os.path.relpath(os.path.dirname(file_path), work_dir) - with open(file_path, 'r') as infile: - reader = csv.reader(infile) - for line in reader: - # Create ,/ - full_modified_list.append((line[0], f'{dir_from_work_dir}/{line[1]}')) - - full_modified_list.sort(reverse=True) - - with open(output_modified_list, 'w') as outfile: - csv.writer(outfile).writerows(full_modified_list) - - storage_client = storage.Client() - bucket = storage_client.get_bucket(export_bucket) - safe_upload_single(bucket, output_zip, zip_file_name) - logging.info('Unified all.zip uploaded successfully.') - safe_upload_single(bucket, output_modified_list, LAST_MODIFIED_FILE) - logging.info('Unified %s uploaded successfully.', LAST_MODIFIED_FILE) - - -if __name__ == '__main__': - _ndb_client = ndb.Client() - osv.logs.setup_gcp_logging('exporter-runner') - with _ndb_client.context(): - main() diff --git a/gcp/workers/exporter/exporter.py b/gcp/workers/exporter/exporter.py deleted file mode 100755 index 60b083295f9..00000000000 --- a/gcp/workers/exporter/exporter.py +++ /dev/null @@ -1,286 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""OSV Exporter.""" -import argparse -import concurrent.futures -import csv -import json -import logging -import os -import zipfile -from typing import List - -from google.cloud import ndb -from google.cloud import storage -from google.cloud.storage import retry -from google.cloud.storage.bucket import Bucket -from google.protobuf import json_format - -import requests - -import osv -import osv.logs - -DEFAULT_WORK_DIR = '/work' - -DEFAULT_EXPORT_BUCKET = 'osv-vulnerabilities' -DEFAULT_SAFE_DELTA_PCT = 10 -_EXPORT_WORKERS = 32 -ECOSYSTEMS_FILE = 'ecosystems.txt' -LAST_MODIFIED_FILE = 'modified_id.csv' -VANIR_SIGNATURES_KEY = 'vanir_signatures' -OSV_GIT_JSON_FILE_NAME = 'osv_git.json' - - -class Error(Exception): - """Base exception class.""" - - -def modify_storage_client_adapters(storage_client: storage.Client, - pool_connections: int = 128, - max_retries: int = 3, - pool_block: bool = True) -> storage.Client: - """Returns a modified google.cloud.storage.Client object. - - Due to many concurrent GCS connections, the default connection pool can become - overwhelmed, introducing delays. - - Solution described in https://github.com/googleapis/python-storage/issues/253 - - These affect the urllib3.HTTPConnectionPool underpinning the storage.Client's - HTTP requests. - - Args: - storage_client: an existing google.cloud.storage.Client object - pool_connections: number of pool_connections desired - max_retries: maximum retries - pool_block: blocking behaviour when pool is exhausted - - Returns: - the google.cloud.storage.Client appropriately modified. - - """ - adapter = requests.adapters.HTTPAdapter( - pool_connections=pool_connections, - max_retries=max_retries, - pool_block=pool_block) - # pylint: disable=protected-access - storage_client._http.mount('https://', adapter) - storage_client._http._auth_request.session.mount('https://', adapter) - return storage_client - - -class Exporter: - """Exporter.""" - - def __init__(self, work_dir, export_bucket, ecosystem): - self._work_dir = work_dir - self._export_bucket = export_bucket - self._ecosystem = ecosystem - - def run(self): - """Run exporter.""" - if self._ecosystem == "list": - query = osv.Bug.query(projection=[osv.Bug.ecosystem], distinct=True) - # Filter out ecosystems that contain a colon, - # as these represent Linux distro releases. - ecosystems = [ - bug.ecosystem[0] - for bug in query - if bug.ecosystem and ':' not in bug.ecosystem[0] - ] - self._export_ecosystem_list_to_bucket(ecosystems, self._work_dir) - else: - self._export_ecosystem_to_bucket(self._ecosystem, self._work_dir) - - def _export_ecosystem_list_to_bucket(self, ecosystems: List[str], - tmp_dir: str): - """Export an ecosystems.txt file with all of the ecosystem names. - - See https://github.com/google/osv.dev/issues/619 - - Args: - ecosystems: the list of ecosystem names - tmp_dir: temporary directory for scratch - """ - - logging.info('Exporting ecosystem list to %s', ECOSYSTEMS_FILE) - storage_client = storage.Client() - bucket = storage_client.get_bucket(self._export_bucket) - ecosystems_file_path = os.path.join(tmp_dir, ECOSYSTEMS_FILE) - with open(ecosystems_file_path, "w") as ecosystems_file: - ecosystems_file.writelines([e + "\n" for e in ecosystems]) - - upload_single(bucket, ecosystems_file_path, ECOSYSTEMS_FILE) - - def _export_ecosystem_to_bucket(self, ecosystem: str, work_dir: str): - """Export the vulnerabilities in an ecosystem to GCS. 
-
-
-def upload_single(bucket: Bucket, source_path: str, target_path: str):
-  """Upload a single file to a GCS bucket."""
-  logging.info('Uploading %s', target_path)
-  try:
-    blob = bucket.blob(target_path)
-    blob.upload_from_filename(source_path, retry=retry.DEFAULT_RETRY)
-  except Exception as e:
-    logging.exception('Failed to export: %s', e)
-
-
-def safe_upload_single(bucket: Bucket,
-                       source_path: str,
-                       target_path: str,
-                       safe_delta_pct: int = DEFAULT_SAFE_DELTA_PCT):
-  """Upload a single file to a GCS bucket, with a size check.
-
-  This refuses to overwrite the GCS object if the new file is smaller than
-  safe_delta_pct percent of the existing object's size (10% by default).
-
-  NOTE: this intentionally only catches unexpectedly smaller files, not larger
-  ones.
-
-  Args:
-    bucket (Bucket): the GCS bucket object to upload to.
-    source_path (str): the source path of the file to upload.
-    target_path (str): the target path in the bucket to upload to.
-    safe_delta_pct (int): the threshold below which to raise an exception.
-
-  Raises:
-    Error: if the new file fails the size check.
-  """
-  source_size = os.stat(source_path).st_size
-  logging.info('Uploading %s', target_path)
-  try:
-    blob = bucket.get_blob(target_path)
-    if blob and blob.size and (source_size / blob.size) * 100 < safe_delta_pct:
-      raise Error(
-          f'Cowardly refusing to overwrite {blob.name} ({blob.size} bytes) '
-          f'with {source_path} ({source_size} bytes)')
-    if blob:
-      blob.upload_from_filename(source_path, retry=retry.DEFAULT_RETRY)
-    else:
-      bucket.blob(target_path).upload_from_filename(
-          source_path, retry=retry.DEFAULT_RETRY)
-  except Exception as e:
-    logging.exception('Failed to export: %s', e)
-
-
-def main():
-  parser = argparse.ArgumentParser(description='Exporter')
-  parser.add_argument(
-      '--work_dir', help='Working directory', default=DEFAULT_WORK_DIR)
-  parser.add_argument(
-      '--bucket',
-      help='Bucket name to export to',
-      default=DEFAULT_EXPORT_BUCKET)
-  parser.add_argument(
-      '--ecosystem',
-      required=True,
-      help='Ecosystem to export; pass the value "list" '
-      'to export the ecosystems.txt file')
-  args = parser.parse_args()
-
-  exporter = Exporter(args.work_dir, args.bucket, args.ecosystem)
-  exporter.run()
-
-
-if __name__ == '__main__':
-  _ndb_client = ndb.Client()
-  osv.logs.setup_gcp_logging('exporter')
-  with _ndb_client.context():
-    main()
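One subtlety in `safe_upload_single` worth carrying over to any replacement: the guard `(source_size / blob.size) * 100 < safe_delta_pct` does not reject a 10% shrink; it rejects uploads smaller than 10% of the existing object's size, i.e. a roughly 90%-or-more shrink. A standalone restatement of just that check, with worked values:

```python
DEFAULT_SAFE_DELTA_PCT = 10


def is_unsafe(source_size: int, existing_size: int,
              safe_delta_pct: int = DEFAULT_SAFE_DELTA_PCT) -> bool:
  """True if the new file is under safe_delta_pct percent of the old size."""
  return (source_size / existing_size) * 100 < safe_delta_pct


assert not is_unsafe(950, 1000)   # 95% of the old size: allowed
assert is_unsafe(50, 1000)        # 5% of the old size: refused
assert not is_unsafe(2000, 1000)  # growth is never refused
```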