Skip to content

Commit

Permalink
cache and reuse repodata instead of fetching every time
Browse files Browse the repository at this point in the history
Bug: 1343247
Change-Id: I40f1e9a4931b13a8a36ffe459ecb9d3e67fbc766
  • Loading branch information
bmcivor committed Mar 3, 2017
1 parent 67b4ed6 commit 6f11c37
Show file tree
Hide file tree
Showing 6 changed files with 195 additions and 66 deletions.
2 changes: 2 additions & 0 deletions acceptance_tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,12 @@ class DirServer(WSGIServer):
def __init__(self, host='127.0.0.1', port=0):
    """Bind a test WSGI server; this instance is its own WSGI app."""
    # Pass self as the application: __call__ below serves each request.
    super(DirServer, self).__init__(host, port, self)
    # Local directory served as the repository root; tests assign this
    # before issuing requests.
    self.basepath = None
    # Incremented once per request in __call__, so tests can assert how
    # many fetches a client actually performed.
    self.num_requests = 0

def __call__(self, environ, start_response):
path_info = os.path.normpath(environ['PATH_INFO'])
localpath = os.path.join(self.basepath, path_info.lstrip('/'))
self.num_requests += 1

if not os.path.exists(localpath):
start_response('404 Not Found', [])
Expand Down
97 changes: 96 additions & 1 deletion acceptance_tests/test_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

import os
import glob
import time
import shutil
import rpmfluff
from data_setup import run_rpmdeplint
Expand All @@ -19,7 +22,6 @@ def test_finds_all_problems(request, dir_server):
p_depending = rpmfluff.SimpleRpmBuild('d', '0.1', '1', ['i386'])
p_depending.add_requires('libfoo.so.4')
repo_packages = [p_newer, p_with_content, p_old_soname, p_depending]

baserepo = rpmfluff.YumRepoBuild(repo_packages)
baserepo.make('i386')
dir_server.basepath = baserepo.repoDir
Expand Down Expand Up @@ -125,3 +127,96 @@ def cleanUp():
assert err == ('Problems with dependency set:\n'
'nothing provides libfoo.so.4 needed by a-0.1-1.noarch\n'
'nothing provides libfoo.so.4 needed by b-0.1-1.i386\n')


def test_cache_is_used_when_available(request, dir_server):
    """A second rpmdeplint run reuses cached repodata instead of re-fetching."""
    package = rpmfluff.SimpleRpmBuild('a', '0.1', '1', ['i386'])
    repo = rpmfluff.YumRepoBuild((package,))
    repo.make('i386')
    dir_server.basepath = repo.repoDir

    def clean_up():
        shutil.rmtree(repo.repoDir)
        shutil.rmtree(package.get_base_dir())
    request.addfinalizer(clean_up)

    command = ['rpmdeplint', 'check',
               '--repo=base,{}'.format(dir_server.url),
               package.get_built_rpm('i386')]

    # Nothing has hit the server yet (assumes the cache starts clean).
    assert dir_server.num_requests == 0

    run_rpmdeplint(command)

    # A cold cache costs exactly three fetches: repomd.xml,
    # primary.xml.gz and filelists.xml.gz.
    assert dir_server.num_requests == 3

    run_rpmdeplint(command)

    # The second run only re-fetches repomd.xml to validate the metadata;
    # primary and filelists are served from the cache, so the total rises
    # by just one request.
    assert dir_server.num_requests == 4


def test_cache_doesnt_grow_unboundedly(request, dir_server):
    """Cached repodata older than RPMDEPLINT_EXPIRY_SECONDS is purged the
    next time rpmdeplint populates the cache, so the cache cannot grow
    without bound."""
    # Shrink the expiry window so the test need not wait the default week.
    # Restore the previous value afterwards -- the original version leaked
    # this setting into the environment of every later test.
    old_expiry = os.environ.get('RPMDEPLINT_EXPIRY_SECONDS')

    def restore_expiry():
        if old_expiry is None:
            del os.environ['RPMDEPLINT_EXPIRY_SECONDS']
        else:
            os.environ['RPMDEPLINT_EXPIRY_SECONDS'] = old_expiry
    request.addfinalizer(restore_expiry)
    os.environ['RPMDEPLINT_EXPIRY_SECONDS'] = '1'

    base_cache_dir = os.path.join(os.environ.get('XDG_CACHE_HOME',
        os.path.join(os.path.expanduser('~'), '.cache')), 'rpmdeplint')

    p1 = rpmfluff.SimpleRpmBuild('a', '0.1', '1', ['i386'])
    firstrepo = rpmfluff.YumRepoBuild((p1, ))
    firstrepo.make('i386')
    dir_server.basepath = firstrepo.repoDir

    # Register cleanup as soon as the artifacts exist, so they are removed
    # even if an assertion below fails (previously cleanup was registered
    # only after both repos were built).
    def clean_up_first():
        shutil.rmtree(firstrepo.repoDir)
        shutil.rmtree(p1.get_base_dir())
    request.addfinalizer(clean_up_first)

    run_rpmdeplint(['rpmdeplint', 'check', '--repo=base,{}'.format(
        dir_server.url), p1.get_built_rpm('i386')])

    def get_file_cache_path(repodir, file_type):
        # Map a repodata file in repodir to its expected cache location:
        # <base_cache_dir>/<checksum[:1]>/<checksum[1:]>/<filename>
        # (the first checksum character is the outer directory).
        temp_path = glob.iglob(os.path.join(repodir, 'repodata', file_type))
        filename = os.path.basename(next(temp_path))
        checksum = filename.split('-')[0]
        return os.path.join(base_cache_dir, checksum[:1], checksum[1:],
                            filename)

    first_primary_cache_path = get_file_cache_path(firstrepo.repoDir,
                                                   '*-primary.xml.gz')
    first_filelists_cache_path = get_file_cache_path(firstrepo.repoDir,
                                                     '*-filelists.xml.gz')

    assert os.path.exists(first_primary_cache_path)
    assert os.path.exists(first_filelists_cache_path)

    p2 = rpmfluff.SimpleRpmBuild('b', '0.1', '1', ['i386'])
    secondrepo = rpmfluff.YumRepoBuild((p2, ))
    secondrepo.make('i386')
    dir_server.basepath = secondrepo.repoDir

    def clean_up_second():
        shutil.rmtree(secondrepo.repoDir)
        shutil.rmtree(p2.get_base_dir())
    request.addfinalizer(clean_up_second)

    # Let the (1 second) expiry window elapse for the first repo's files.
    time.sleep(2)

    run_rpmdeplint(['rpmdeplint', 'check', '--repo=base,{}'.format(
        dir_server.url), p2.get_built_rpm('i386')])

    second_primary_cache_path = get_file_cache_path(secondrepo.repoDir,
                                                    '*-primary.xml.gz')
    second_filelists_cache_path = get_file_cache_path(secondrepo.repoDir,
                                                      '*-filelists.xml.gz')

    # The expired first repo's files were purged; only the second repo's
    # files remain in the cache.
    assert not os.path.exists(first_primary_cache_path)
    assert not os.path.exists(first_filelists_cache_path)
    assert os.path.exists(second_primary_cache_path)
    assert os.path.exists(second_filelists_cache_path)
35 changes: 0 additions & 35 deletions acceptance_tests/test_list_deps.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@

import shutil
import rpmfluff
import os

from rpmdeplint.repodata import REPO_CACHE_DIR, REPO_CACHE_NAME_PREFIX
from data_setup import run_rpmdeplint


Expand Down Expand Up @@ -101,36 +99,3 @@ def test_erroneous_cli_input_errors():
'--derp'])

assert exitcode == 2


def test_rpmdeplint_does_not_leave_repocache_dirs_behind(request, dir_server):
    """Running rpmdeplint must not leave extra librepo cache directories
    on disk afterwards."""
    p1 = rpmfluff.SimpleRpmBuild('a', '0.1', '1', ['i386'])
    baserepo = rpmfluff.YumRepoBuild([p1])
    baserepo.make('i386')

    # Register cleanup as soon as the artifacts exist so they are removed
    # even if the test fails part-way through (previously this was only
    # registered at the very end, leaking files on failure).
    def clean_up_base():
        shutil.rmtree(baserepo.repoDir)
        shutil.rmtree(p1.get_base_dir())
    request.addfinalizer(clean_up_base)

    # Serve the repository over HTTP, which makes librepo create temporary
    # directories for the downloaded repository metadata.
    dir_server.basepath = baserepo.repoDir

    cache_dirs = len([x for x in os.listdir(REPO_CACHE_DIR)
                      if x.startswith(REPO_CACHE_NAME_PREFIX)])

    p2 = rpmfluff.SimpleRpmBuild('b', '0.1', '1', ['i386'])
    p2.make()

    def clean_up_p2():
        shutil.rmtree(p2.get_base_dir())
    request.addfinalizer(clean_up_p2)

    exitcode, out, err = run_rpmdeplint(['rpmdeplint',
                                         'list-deps',
                                         '--repo=base,{}'.format(dir_server.url),
                                         p2.get_built_rpm('i386')])

    assert err == ''
    # Note: This could fail if other processes have changed the cache
    # directory while rpmdeplint ran.
    assert cache_dirs == len([x for x in os.listdir(REPO_CACHE_DIR)
                              if x.startswith(REPO_CACHE_NAME_PREFIX)])
5 changes: 2 additions & 3 deletions rpmdeplint/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,8 @@ def __enter__(self):
return self

def __exit__(self, type, value, tb):
"""Clean up cache directory to prevent from growing unboundedly."""
for repo in self.repos_by_name.values():
repo.cleanup_cache()
""" Perform NOP on exit of analyzer to maintain the state of repodata
cache """

def find_packages_that_require(self, name):
pkgs = hawkey.Query(self._sack).filter(requires=name, latest_per_arch=True)
Expand Down
113 changes: 87 additions & 26 deletions rpmdeplint/repodata.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@
import os
import logging
import tempfile
import shutil
import requests
import errno
import glob
import time
from six.moves import configparser
import librepo
import hawkey
Expand Down Expand Up @@ -124,29 +126,88 @@ def as_hawkey_repo(self):

def download_repodata(self):
logger.debug('Loading repodata for %s from %s', self.name,
self.baseurl or self.metalink)
self.baseurl or self.metalink)
self.librepo_handle = h = librepo.Handle()
r = librepo.Result()
h.repotype = librepo.LR_YUMREPO
h.setopt(librepo.LRO_YUMDLIST, ["filelists", "primary"])
if self.baseurl:
h.urls = [self.baseurl]
if self.metalink:
h.mirrorlist = self.metalink
h.setopt(librepo.LRO_DESTDIR, tempfile.mkdtemp(self.name,
prefix=REPO_CACHE_NAME_PREFIX, dir=REPO_CACHE_DIR))
h.setopt(librepo.LRO_INTERRUPTIBLE, True)

h.setopt(librepo.LRO_YUMDLIST, [])
if self.baseurl and os.path.isdir(self.baseurl):
self._download_metadata_result(h, r)
self._yum_repomd = r.yum_repomd
self._root_path = self.baseurl
h.local = True
self.primary_fn = self.primary_url
self.filelists_fn = self.filelists_url
else:
self._root_path = h.destdir = tempfile.mkdtemp(
self.name, prefix=REPO_CACHE_NAME_PREFIX, dir=REPO_CACHE_DIR)
self._root_path = h.destdir = tempfile.mkdtemp(self.name,
prefix=REPO_CACHE_NAME_PREFIX, dir=REPO_CACHE_DIR)
self._download_metadata_result(h, r)
self._yum_repomd = r.yum_repomd
self.primary_fn = self._download_repodata_file(
self.primary_checksum, self.primary_url)
self.filelists_fn = self._download_repodata_file(
self.filelists_checksum, self.filelists_url)

def _download_metadata_result(self, handle, result):
    """Perform the librepo metadata download on *handle* into *result*.

    :param handle: configured librepo.Handle
    :param result: librepo.Result to populate
    :raises RepoDownloadError: wrapping any librepo failure so callers get
        a rpmdeplint-specific exception.
    """
    # The original body contained a stray leftover line ``h.perform(r)``
    # referencing undefined locals h and r; only the handle/result call
    # belongs here.
    try:
        handle.perform(result)
    except librepo.LibrepoException as ex:
        raise RepoDownloadError(self, str(ex))

self._yum_repomd = r.yum_repomd
def _download_repodata_file(self, checksum, url):
    """Return a local path to the repodata file at *url*, downloading it
    into the cache first if it is not already present.

    Each cached file is immutable and lives at
    ``<cache_basedir>/<checksum-first-letter>/<rest-of-checksum>/<filename>``.
    Downloads are written to a temporary file and renamed into place
    atomically, so a partially-written file can never be observed at the
    final cache path.

    :param checksum: checksum of the repodata file, used to build its
        cache path
    :param url: URL to fetch the file from on a cache miss
    :returns: absolute path of the cached file
    """
    # Note: single os.path.join suffices (the original nested two calls
    # redundantly).
    filepath_in_cache = os.path.join(self.cache_basedir,
            checksum[:1], checksum[1:], os.path.basename(url))
    self.clean_expired_cache(self.cache_basedir)
    try:
        # Opening 'r+' both checks existence and proves it is a regular,
        # writable file.
        with open(filepath_in_cache, 'r+'):
            logger.debug('Using cached file for %s', self.name)
            return filepath_in_cache
    except IOError:
        pass  # not cached yet -- download below
    try:
        os.makedirs(os.path.dirname(filepath_in_cache))
    except OSError as e:
        # EEXIST means another process created the directory first, which
        # is fine; anything else is a real error. (The original logged
        # "already exists" in exactly the non-EEXIST case.)
        if e.errno != errno.EEXIST:
            raise
    fd, temp_path = tempfile.mkstemp(dir=os.path.dirname(filepath_in_cache))
    try:
        with requests.Session() as session, os.fdopen(fd, 'wb+') as temp_file:
            response = session.get(url, stream=True)
            # Fail loudly rather than caching an HTTP error body as
            # repodata.
            response.raise_for_status()
            # Explicit chunk size: the default iter_content() yields
            # byte-at-a-time for decoded streams, which is very slow.
            for chunk in response.iter_content(chunk_size=8192):
                temp_file.write(chunk)
    except Exception:
        # Don't leave a half-written temp file behind on failure.
        os.unlink(temp_path)
        raise
    os.rename(temp_path, filepath_in_cache)
    return filepath_in_cache

def clean_expired_cache(self, root_path):
    """Remove every file under *root_path* whose mtime is older than
    ``self.expiry_seconds`` (settable via the RPMDEPLINT_EXPIRY_SECONDS
    environment variable).

    :param root_path: directory tree to sweep (the repodata cache root)
    """
    # Compute the cutoff once instead of re-deriving it per file.
    cutoff = time.time() - self.expiry_seconds
    for dirpath, _dirnames, filenames in os.walk(root_path):
        # os.walk already guarantees these are files, so no extra
        # isfile() check is needed (the original re-checked redundantly).
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            try:
                if os.path.getmtime(file_path) < cutoff:
                    os.remove(file_path)
            except OSError as e:
                # Another process may clean the cache concurrently; a
                # vanished file is not an error.
                if e.errno != errno.ENOENT:
                    raise

def download_package(self, location, checksum_type, checksum):
if self.librepo_handle.local:
Expand All @@ -169,31 +230,31 @@ def download_package(self, location, checksum_type, checksum):
logger.debug('Saved as %s', target.local_path)
return target.local_path

def cleanup_cache(self):
    """Delete this repository's download directory from disk.

    Errors are logged rather than raised. Local repositories never
    created a temporary download directory, so there is nothing to
    remove for them.
    """
    if not self.librepo_handle.local:
        try:
            shutil.rmtree(self._root_path)
        except OSError as err:
            logger.error(err)

@property
def yum_repomd(self):
    # Parsed repomd metadata, populated by download_repodata().
    return self._yum_repomd
@property
def repomd_fn(self):
    # Path of the downloaded repomd.xml under this repo's root directory.
    return os.path.join(self._root_path, 'repodata', 'repomd.xml')
@property
def primary_fn(self):
return os.path.join(self._root_path, self.yum_repomd['primary']['location_href'])
def primary_url(self):
return os.path.join(self.baseurl, self.yum_repomd['primary']['location_href'])
@property
def primary_checksum(self):
    # Checksum of the primary metadata file, taken from the repo's
    # repomd metadata.
    return self.yum_repomd['primary']['checksum']
@property
def filelists_checksum(self):
    # Checksum of the filelists metadata file, taken from the repo's
    # repomd metadata.
    return self.yum_repomd['filelists']['checksum']
@property
def filelists_url(self):
    # URL of the filelists metadata, joined onto baseurl.
    # NOTE(review): assumes baseurl is set -- a metalink-only repo would
    # join onto None here; confirm callers only use this for baseurl repos.
    return os.path.join(self.baseurl, self.yum_repomd['filelists']['location_href'])
@property
def cache_basedir(self):
    """Root of rpmdeplint's metadata cache: ``$XDG_CACHE_HOME/rpmdeplint``,
    falling back to ``~/.cache/rpmdeplint`` when XDG_CACHE_HOME is unset."""
    fallback = os.path.join(os.path.expanduser('~'), '.cache')
    return os.path.join(os.environ.get('XDG_CACHE_HOME', fallback), 'rpmdeplint')
@property
def filelists_fn(self):
return os.path.join(self._root_path, self.yum_repomd['filelists']['location_href'])
def expiry_seconds(self):
return float(os.getenv('RPMDEPLINT_EXPIRY_SECONDS', '604800'))

def __str__(self):
strrep = ["Repo Name: {0}".format(self.name),
Expand Down
9 changes: 8 additions & 1 deletion rpmdeplint/tests/test_repodata.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def test_loads_system_yum_repo_with_substitutions(yumdir, monkeypatch):
assert repos[0].name == 'dummy'
assert repos[0].baseurl == 'http://example.invalid/21/s390x/'


def test_bad_repo_url_raises_error(yumdir):
yumdir.join('yum.repos.d', 'dummy.repo').write(
'[dummy]\nname=Dummy\nbaseurl=http://example.invalid/dummy\nenabled=1\n',
Expand All @@ -85,4 +86,10 @@ def test_bad_repo_url_raises_error(yumdir):
with pytest.raises(RepoDownloadError) as rde:
repos[0].download_repodata()
assert 'Cannot download repomd.xml' in str(rde.value)
assert 'Repo Name: dummy' in str(rde.value)
assert 'Repo Name: dummy' in str(rde.value)


def test_set_cache_expiry_in_seconds(monkeypatch):
    """expiry_seconds reflects the RPMDEPLINT_EXPIRY_SECONDS env variable."""
    monkeypatch.setenv('RPMDEPLINT_EXPIRY_SECONDS', '100')
    dummy_repo = Repo(repo_name='Dummy', baseurl='http://example.invalid/dummy')
    assert dummy_repo.expiry_seconds == 100

0 comments on commit 6f11c37

Please sign in to comment.