From 6f11c3708425ed41715b397950100403bcd9888b Mon Sep 17 00:00:00 2001 From: Blake McIvor Date: Tue, 20 Dec 2016 14:44:14 +1000 Subject: [PATCH] cache and reuse repodata instead of fetching every time Bug: 1343247 Change-Id: I40f1e9a4931b13a8a36ffe459ecb9d3e67fbc766 --- acceptance_tests/conftest.py | 2 + acceptance_tests/test_check.py | 97 ++++++++++++++++++++++++- acceptance_tests/test_list_deps.py | 35 --------- rpmdeplint/__init__.py | 5 +- rpmdeplint/repodata.py | 113 ++++++++++++++++++++++------- rpmdeplint/tests/test_repodata.py | 9 ++- 6 files changed, 195 insertions(+), 66 deletions(-) diff --git a/acceptance_tests/conftest.py b/acceptance_tests/conftest.py index f3105a0..f5ebd09 100644 --- a/acceptance_tests/conftest.py +++ b/acceptance_tests/conftest.py @@ -60,10 +60,12 @@ class DirServer(WSGIServer): def __init__(self, host='127.0.0.1', port=0): super(DirServer, self).__init__(host, port, self) self.basepath = None + self.num_requests = 0 def __call__(self, environ, start_response): path_info = os.path.normpath(environ['PATH_INFO']) localpath = os.path.join(self.basepath, path_info.lstrip('/')) + self.num_requests += 1 if not os.path.exists(localpath): start_response('404 Not Found', []) diff --git a/acceptance_tests/test_check.py b/acceptance_tests/test_check.py index e533ab6..f9c8e53 100644 --- a/acceptance_tests/test_check.py +++ b/acceptance_tests/test_check.py @@ -4,6 +4,9 @@ # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. 
+import os +import glob +import time import shutil import rpmfluff from data_setup import run_rpmdeplint @@ -19,7 +22,6 @@ def test_finds_all_problems(request, dir_server): p_depending = rpmfluff.SimpleRpmBuild('d', '0.1', '1', ['i386']) p_depending.add_requires('libfoo.so.4') repo_packages = [p_newer, p_with_content, p_old_soname, p_depending] - baserepo = rpmfluff.YumRepoBuild(repo_packages) baserepo.make('i386') dir_server.basepath = baserepo.repoDir @@ -125,3 +127,96 @@ def cleanUp(): assert err == ('Problems with dependency set:\n' 'nothing provides libfoo.so.4 needed by a-0.1-1.noarch\n' 'nothing provides libfoo.so.4 needed by b-0.1-1.i386\n') + + +def test_cache_is_used_when_available(request, dir_server): + p1 = rpmfluff.SimpleRpmBuild('a', '0.1', '1', ['i386']) + baserepo = rpmfluff.YumRepoBuild((p1,)) + baserepo.make('i386') + dir_server.basepath = baserepo.repoDir + + def cleanUp(): + shutil.rmtree(baserepo.repoDir) + shutil.rmtree(p1.get_base_dir()) + request.addfinalizer(cleanUp) + + # Assuming cache is cleaned first + assert dir_server.num_requests == 0 + + run_rpmdeplint(['rpmdeplint', 'check', '--repo=base,{}'.format( + dir_server.url), p1.get_built_rpm('i386')]) + + # A single run of rpmdeplint with a clean cache should expect network + # requests for - repomd.xml, primary.xml.gz and filelists.xml.gz. Requiring + # a total of 3 + assert dir_server.num_requests == 3 + + run_rpmdeplint(['rpmdeplint', 'check', '--repo=base,{}'.format( + dir_server.url), p1.get_built_rpm('i386')]) + + # Executing 2 subprocesses should expect 4 requests if repodata cache is + # functioning correctly. A single request for each file in the repo + # - repomd.xml, primary.xml.gz, filelists.xml.gz, with an additional + # request from the second process checking metadata. 
The additional
+    # single request shows that the files are skipped in the second process
+    assert dir_server.num_requests == 4
+
+
+def test_cache_doesnt_grow_unboundedly(request, dir_server):
+    os.environ['RPMDEPLINT_EXPIRY_SECONDS'] = '1'
+    base_cache_dir = os.path.join(os.environ.get('XDG_CACHE_HOME',
+        os.path.join(os.path.expanduser('~'), '.cache')), 'rpmdeplint')
+
+    p1 = rpmfluff.SimpleRpmBuild('a', '0.1', '1', ['i386'])
+    firstrepo = rpmfluff.YumRepoBuild((p1, ))
+    firstrepo.make('i386')
+    dir_server.basepath = firstrepo.repoDir
+
+    run_rpmdeplint(['rpmdeplint', 'check', '--repo=base,{}'.format(
+        dir_server.url), p1.get_built_rpm('i386')])
+
+    def get_file_cache_path(repodir, file_type):
+        temp_path = glob.iglob(os.path.join(repodir, 'repodata', file_type))
+        primary_fn = os.path.basename(next(temp_path))
+        primary_fn_checksum = primary_fn.split('-')[0]
+        return os.path.join(base_cache_dir, primary_fn_checksum[:1],
+                primary_fn_checksum[1:], primary_fn)
+
+    # files are stored in cache_path /.cache/checksum[:1]/checksum[1:]
+    first_primary_cache_path = get_file_cache_path(firstrepo.repoDir,
+            '*-primary.xml.gz')
+    first_filelists_cache_path = get_file_cache_path(firstrepo.repoDir,
+            '*-filelists.xml.gz')
+
+    assert os.path.exists(first_primary_cache_path)
+    assert os.path.exists(first_filelists_cache_path)
+
+    p2 = rpmfluff.SimpleRpmBuild('b', '0.1', '1', ['i386'])
+    secondrepo = rpmfluff.YumRepoBuild((p2, ))
+    secondrepo.make('i386')
+    dir_server.basepath = secondrepo.repoDir
+
+    # ensure time period of cache has expired
+    time.sleep(2)
+
+    def cleanUp():
+        shutil.rmtree(firstrepo.repoDir)
+        shutil.rmtree(secondrepo.repoDir)
+        shutil.rmtree(p1.get_base_dir())
+        shutil.rmtree(p2.get_base_dir())
+    request.addfinalizer(cleanUp)
+
+    run_rpmdeplint(['rpmdeplint', 'check', '--repo=base,{}'.format(
+        dir_server.url), p2.get_built_rpm('i386')])
+
+    # files are stored in cache_path /.cache/checksum[:1]/checksum[1:]
+    second_primary_cache_path = 
get_file_cache_path(secondrepo.repoDir, + '*-primary.xml.gz') + second_filelists_cache_path = get_file_cache_path(secondrepo.repoDir, + '*-filelists.xml.gz') + + # Ensure the cache only has files from the second one + assert not os.path.exists(first_primary_cache_path) + assert not os.path.exists(first_filelists_cache_path) + assert os.path.exists(second_primary_cache_path) + assert os.path.exists(second_filelists_cache_path) diff --git a/acceptance_tests/test_list_deps.py b/acceptance_tests/test_list_deps.py index dc94e9f..268cb52 100644 --- a/acceptance_tests/test_list_deps.py +++ b/acceptance_tests/test_list_deps.py @@ -6,9 +6,7 @@ import shutil import rpmfluff -import os -from rpmdeplint.repodata import REPO_CACHE_DIR, REPO_CACHE_NAME_PREFIX from data_setup import run_rpmdeplint @@ -101,36 +99,3 @@ def test_erroneous_cli_input_errors(): '--derp']) assert exitcode == 2 - - -def test_rpmdeplint_does_not_leave_repocache_dirs_behind(request, dir_server): - p1 = rpmfluff.SimpleRpmBuild('a', '0.1', '1', ['i386']) - baserepo = rpmfluff.YumRepoBuild([p1]) - baserepo.make('i386') - - # Setup the HTTP server serving our repository needed to create tmp - # directories for the repository information by librepo - dir_server.basepath = baserepo.repoDir - - cache_dirs = len([x for x in os.listdir(REPO_CACHE_DIR) - if x.startswith(REPO_CACHE_NAME_PREFIX)]) - - p2 = rpmfluff.SimpleRpmBuild('b', '0.1', '1', ['i386']) - p2.make() - exitcode, out, err = run_rpmdeplint(['rpmdeplint', - 'list-deps', - '--repo=base,{}'.format(dir_server.url), - p2.get_built_rpm('i386')]) - - assert err == '' - # Note: This could fail if other processes have changed the cache directory - # while rpmdeplint ran. 
- assert cache_dirs == len([x for x in os.listdir(REPO_CACHE_DIR) - if x.startswith(REPO_CACHE_NAME_PREFIX)]) - - def cleanUp(): - shutil.rmtree(baserepo.repoDir) - shutil.rmtree(p1.get_base_dir()) - shutil.rmtree(p2.get_base_dir()) - - request.addfinalizer(cleanUp) diff --git a/rpmdeplint/__init__.py b/rpmdeplint/__init__.py index b2dd788..810095d 100644 --- a/rpmdeplint/__init__.py +++ b/rpmdeplint/__init__.py @@ -132,9 +132,8 @@ def __enter__(self): return self def __exit__(self, type, value, tb): - """Clean up cache directory to prevent from growing unboundedly.""" - for repo in self.repos_by_name.values(): - repo.cleanup_cache() + """ Perform NOP on exit of analyzer to maintain the state of repodata + cache """ def find_packages_that_require(self, name): pkgs = hawkey.Query(self._sack).filter(requires=name, latest_per_arch=True) diff --git a/rpmdeplint/repodata.py b/rpmdeplint/repodata.py index 38734a0..a03170a 100644 --- a/rpmdeplint/repodata.py +++ b/rpmdeplint/repodata.py @@ -9,8 +9,10 @@ import os import logging import tempfile -import shutil +import requests +import errno import glob +import time from six.moves import configparser import librepo import hawkey @@ -124,29 +126,88 @@ def as_hawkey_repo(self): def download_repodata(self): logger.debug('Loading repodata for %s from %s', self.name, - self.baseurl or self.metalink) + self.baseurl or self.metalink) self.librepo_handle = h = librepo.Handle() r = librepo.Result() h.repotype = librepo.LR_YUMREPO - h.setopt(librepo.LRO_YUMDLIST, ["filelists", "primary"]) if self.baseurl: h.urls = [self.baseurl] if self.metalink: h.mirrorlist = self.metalink + h.setopt(librepo.LRO_DESTDIR, tempfile.mkdtemp(self.name, + prefix=REPO_CACHE_NAME_PREFIX, dir=REPO_CACHE_DIR)) h.setopt(librepo.LRO_INTERRUPTIBLE, True) - + h.setopt(librepo.LRO_YUMDLIST, []) if self.baseurl and os.path.isdir(self.baseurl): + self._download_metadata_result(h, r) + self._yum_repomd = r.yum_repomd self._root_path = self.baseurl - h.local = True + 
self.primary_fn = self.primary_url
+            self.filelists_fn = self.filelists_url
         else:
-            self._root_path = h.destdir = tempfile.mkdtemp(
-                self.name, prefix=REPO_CACHE_NAME_PREFIX, dir=REPO_CACHE_DIR)
+            self._root_path = h.destdir = tempfile.mkdtemp(self.name,
+                prefix=REPO_CACHE_NAME_PREFIX, dir=REPO_CACHE_DIR)
+            self._download_metadata_result(h, r)
+            self._yum_repomd = r.yum_repomd
+            self.primary_fn = self._download_repodata_file(
+                self.primary_checksum, self.primary_url)
+            self.filelists_fn = self._download_repodata_file(
+                self.filelists_checksum, self.filelists_url)
+
+    def _download_metadata_result(self, handle, result):
         try:
-            h.perform(r)
+            handle.perform(result)
         except librepo.LibrepoException as ex:
             raise RepoDownloadError(self, str(ex))
-        self._yum_repomd = r.yum_repomd
+    def _download_repodata_file(self, checksum, url):
+        """
+        Each created file in cache becomes immutable, and is referenced in
+        the directory tree within XDG_CACHE_HOME as
+        $XDG_CACHE_HOME/rpmdeplint/<checksum[:1]>/<checksum[1:]>/<filename>
+
+        Both metadata and the files to be cached are written to a tempdir first
+        then renamed to the cache dir atomically to avoid them potentially being
+        accessed before written to cache.
+ """ + filepath_in_cache = os.path.join(os.path.join(self.cache_basedir, + checksum[:1], checksum[1:], os.path.basename(url))) + self.clean_expired_cache(self.cache_basedir) + try: + with open(filepath_in_cache, 'r+'): + logger.debug('Using cached file for %s', self.name) + return filepath_in_cache + except IOError: + pass # download required + try: + os.makedirs(os.path.dirname(filepath_in_cache)) + except OSError as e: + if e.errno != errno.EEXIST: + logger.debug('Cache directory %s already exists', + os.path.dirname(filepath_in_cache)) + raise + fd, temp_path = tempfile.mkstemp(dir=os.path.dirname(filepath_in_cache)) + with requests.Session() as session, os.fdopen(fd, 'wb+') as temp_file: + data = session.get(url, stream=True) + for chunk in data.iter_content(): + temp_file.write(chunk) + os.rename(temp_path, filepath_in_cache) + return filepath_in_cache + + def clean_expired_cache(self, root_path): + """ + Removes any file within the directory tree that is older than the + given value in RPMDEPLINT_EXPIRY_SECONDS environment variable. + """ + current_time = time.time() + for root, dirs, files in os.walk(root_path): + for fd in files: + file_path = os.path.join(root, fd) + modified = os.stat(file_path).st_mtime + if modified < current_time - self.expiry_seconds: + if os.path.isfile(file_path): + os.remove(file_path) def download_package(self, location, checksum_type, checksum): if self.librepo_handle.local: @@ -169,19 +230,6 @@ def download_package(self, location, checksum_type, checksum): logger.debug('Saved as %s', target.local_path) return target.local_path - def cleanup_cache(self): - """Deletes this repository's cache directory from disk. - - In case of an error, the error is logged and no exception is raised. 
- """ - if self.librepo_handle.local: - return - - try: - shutil.rmtree(self._root_path) - except OSError as err: - logger.error(err) - @property def yum_repomd(self): return self._yum_repomd @@ -189,11 +237,24 @@ def yum_repomd(self): def repomd_fn(self): return os.path.join(self._root_path, 'repodata', 'repomd.xml') @property - def primary_fn(self): - return os.path.join(self._root_path, self.yum_repomd['primary']['location_href']) + def primary_url(self): + return os.path.join(self.baseurl, self.yum_repomd['primary']['location_href']) + @property + def primary_checksum(self): + return self.yum_repomd['primary']['checksum'] + @property + def filelists_checksum(self): + return self.yum_repomd['filelists']['checksum'] + @property + def filelists_url(self): + return os.path.join(self.baseurl, self.yum_repomd['filelists']['location_href']) + @property + def cache_basedir(self): + return os.path.join(os.environ.get('XDG_CACHE_HOME', + os.path.join(os.path.expanduser('~'), '.cache')), 'rpmdeplint') @property - def filelists_fn(self): - return os.path.join(self._root_path, self.yum_repomd['filelists']['location_href']) + def expiry_seconds(self): + return float(os.getenv('RPMDEPLINT_EXPIRY_SECONDS', '604800')) def __str__(self): strrep = ["Repo Name: {0}".format(self.name), diff --git a/rpmdeplint/tests/test_repodata.py b/rpmdeplint/tests/test_repodata.py index 8d19ced..f217070 100644 --- a/rpmdeplint/tests/test_repodata.py +++ b/rpmdeplint/tests/test_repodata.py @@ -75,6 +75,7 @@ def test_loads_system_yum_repo_with_substitutions(yumdir, monkeypatch): assert repos[0].name == 'dummy' assert repos[0].baseurl == 'http://example.invalid/21/s390x/' + def test_bad_repo_url_raises_error(yumdir): yumdir.join('yum.repos.d', 'dummy.repo').write( '[dummy]\nname=Dummy\nbaseurl=http://example.invalid/dummy\nenabled=1\n', @@ -85,4 +86,10 @@ def test_bad_repo_url_raises_error(yumdir): with pytest.raises(RepoDownloadError) as rde: repos[0].download_repodata() assert 'Cannot download 
repomd.xml' in str(rde.value) - assert 'Repo Name: dummy' in str(rde.value) \ No newline at end of file + assert 'Repo Name: dummy' in str(rde.value) + + +def test_set_cache_expiry_in_seconds(monkeypatch): + monkeypatch.setenv('RPMDEPLINT_EXPIRY_SECONDS', '100') + repo = Repo(repo_name='Dummy', baseurl='http://example.invalid/dummy') + assert repo.expiry_seconds == 100