diff --git a/lib/pbench/cli/server/report.py b/lib/pbench/cli/server/report.py index 5490a4cf96..8702dcfc0d 100644 --- a/lib/pbench/cli/server/report.py +++ b/lib/pbench/cli/server/report.py @@ -486,12 +486,12 @@ def report( try: config = config_setup(context) - logger = get_pbench_logger("report-generator", config) + logger = get_pbench_logger("pbench-report-generator", config) if any((all, archive, backup, cache)): cache_m = CacheManager(config, logger) verifier.status("starting discovery") watcher.update("discovering cache") - cache_m.full_discovery() + cache_m.full_discovery(search=False) watcher.update("processing reports") verifier.status("finished discovery") if all or archive: diff --git a/lib/pbench/cli/server/tree_manage.py b/lib/pbench/cli/server/tree_manage.py index 05b8925f8a..4c29ae905d 100644 --- a/lib/pbench/cli/server/tree_manage.py +++ b/lib/pbench/cli/server/tree_manage.py @@ -80,7 +80,7 @@ def tree_manage( logger = None try: config = config_setup(context) - logger = get_pbench_logger("cachemanager", config) + logger = get_pbench_logger("pbench-tree-manager", config) cache_m = CacheManager(config, logger) cache_m.full_discovery() if display: diff --git a/lib/pbench/server/api/resources/datasets_contents.py b/lib/pbench/server/api/resources/datasets_contents.py index 1507c46d61..20f3c590f7 100644 --- a/lib/pbench/server/api/resources/datasets_contents.py +++ b/lib/pbench/server/api/resources/datasets_contents.py @@ -1,5 +1,4 @@ from http import HTTPStatus -from pathlib import Path from flask import current_app, jsonify from flask.wrappers import Request, Response @@ -22,8 +21,6 @@ BadDirpath, CacheExtractBadPath, CacheManager, - CacheObject, - CacheType, TarballNotFound, ) from pbench.server.database.models.datasets import Dataset @@ -65,100 +62,22 @@ def _get(self, params: ApiParams, req: Request, context: ApiContext) -> Response dataset: Dataset = params.uri["dataset"] target = params.uri.get("target") - path = Path("." if target in ("/", None) else target) + path = "." 
if target in ("/", None) else target + + prefix = current_app.server_config.rest_uri + origin = ( + f"{self._get_uri_base(req).host}{prefix}/datasets/{dataset.resource_id}" + ) cache_m = CacheManager(self.config, current_app.logger) try: - info = cache_m.find_entry(dataset.resource_id, path) + info = cache_m.get_contents(dataset.resource_id, path, origin) except (BadDirpath, CacheExtractBadPath, TarballNotFound) as e: raise APIAbort(HTTPStatus.NOT_FOUND, str(e)) except Exception as e: raise APIInternalError(f"Cache find error: {str(e)!r}") - prefix = current_app.server_config.rest_uri - origin = ( - f"{self._get_uri_base(req).host}{prefix}/datasets/{dataset.resource_id}" - ) - - details: CacheObject = info["details"] - if details.type is CacheType.DIRECTORY: - children = info["children"] if "children" in info else {} - dir_list = [] - file_list = [] - - for c, value in children.items(): - d: CacheObject = value["details"] - if d.type is CacheType.DIRECTORY: - dir_list.append( - { - "name": c, - "type": d.type.name, - "uri": f"{origin}/contents/{d.location}", - } - ) - elif d.type is CacheType.SYMLINK: - if d.resolve_type is CacheType.DIRECTORY: - uri = f"{origin}/contents/{d.resolve_path}" - elif d.resolve_type is CacheType.FILE: - uri = f"{origin}/inventory/{d.resolve_path}" - else: - uri = f"{origin}/inventory/{d.location}" - file_list.append( - { - "name": c, - "type": d.type.name, - "link": str(d.resolve_path), - "link_type": d.resolve_type.name, - "uri": uri, - } - ) - else: - r = { - "name": c, - "type": d.type.name, - "uri": f"{origin}/inventory/{d.location}", - } - if d.type is CacheType.FILE: - r["size"] = d.size - file_list.append(r) - - dir_list.sort(key=lambda d: d["name"]) - file_list.sort(key=lambda d: d["name"]) - - # Normalize because we want the "root" directory to be reported as - # "" rather than as Path's favored "." 
- loc = str(details.location) - name = details.name - if loc == ".": - loc = "" - name = "" - val = { - "name": name, - "type": details.type.name, - "directories": dir_list, - "files": file_list, - "uri": f"{origin}/contents/{loc}", - } - else: - access = "inventory" - link = str(details.location) - if details.type is CacheType.SYMLINK: - if details.resolve_type is CacheType.DIRECTORY: - access = "contents" - if details.resolve_type in (CacheType.FILE, CacheType.DIRECTORY): - link = str(details.resolve_path) - val = { - "name": details.name, - "type": details.type.name, - "uri": f"{origin}/{access}/{link}", - } - if details.type is CacheType.SYMLINK: - val["link"] = link - val["link_type"] = details.resolve_type.name - elif details.type is CacheType.FILE: - val["size"] = details.size - try: - return jsonify(val) + return jsonify(info) except Exception as e: - raise APIInternalError(f"JSONIFY {val}: {str(e)!r}") + raise APIInternalError(f"JSONIFY {info}: {str(e)!r}") diff --git a/lib/pbench/server/cache_manager.py b/lib/pbench/server/cache_manager.py index 3c8bb6bacd..2c47401cc8 100644 --- a/lib/pbench/server/cache_manager.py +++ b/lib/pbench/server/cache_manager.py @@ -6,6 +6,7 @@ import fcntl from logging import Logger import math +import os from pathlib import Path import shlex import shutil @@ -14,9 +15,11 @@ from typing import Any, IO, Optional, Union import humanize +from sqlalchemy import and_ from pbench.common import MetadataLog, selinux from pbench.server import JSONOBJECT, OperationCode, PathLike, PbenchServerConfig +from pbench.server.database.database import Database from pbench.server.database.models.audit import Audit, AuditStatus, AuditType from pbench.server.database.models.datasets import Dataset, DatasetNotFound, Metadata from pbench.server.utils import get_tarball_md5 @@ -799,6 +802,131 @@ def extract(tarball_path: Path, path: str) -> Inventory: tarball_path, f"Unexpected error from {tar_path}: {error_text!r}" ) + def get_contents(self, path: str, origin: str) -> JSONOBJECT: + """Return a description of a directory. + + Args: + path: relative path within the tarball + origin: root URI path for the dataset + + Returns: + A "json" dict describing the target. + """ + + with LockManager(self.lock) as lock: + artifact: Path = self.get_results(lock) / path + try: + # NOTE: os.path.abspath() removes ".." but Path.absolute(), + # doesn't, meaning the latter allows a query with path "..", + # and we don't want to allow reaching outside of the tarball. 
+ arel = Path(os.path.abspath(artifact)).relative_to(self.unpacked) + except Exception: + raise CacheExtractError(self.name, path) + if artifact.is_dir() and not artifact.is_symlink(): + dir_list = [] + file_list = [] + for f in artifact.iterdir(): + relative = f.relative_to(self.unpacked) + if f.is_symlink(): + append_to = file_list + target = f.resolve() + try: + link = target.relative_to(self.unpacked) + except Exception: + link = f.readlink() + uri = f"{origin}/inventory/{relative}" + link_type = CacheType.BROKEN + else: + if target.is_dir(): + uri = f"{origin}/contents/{link}" + link_type = CacheType.DIRECTORY + append_to = dir_list + elif target.is_file(): + uri = f"{origin}/inventory/{link}" + link_type = CacheType.FILE + else: + uri = f"{origin}/inventory/{relative}" + link_type = CacheType.OTHER + append_to.append( + { + "name": f.name, + "type": CacheType.SYMLINK.name, + "link": str(link), + "link_type": link_type.name, + "uri": uri, + } + ) + elif f.is_dir(): + dir_list.append( + { + "name": f.name, + "type": CacheType.DIRECTORY.name, + "uri": f"{origin}/contents/{relative}", + } + ) + else: + t = CacheType.FILE if f.is_file() else CacheType.OTHER + r = { + "name": f.name, + "type": t.name, + "uri": f"{origin}/inventory/{relative}", + } + if t is CacheType.FILE: + r["size"] = f.stat().st_size + file_list.append(r) + + # Normalize because we want the "root" directory to be reported as + # "" rather than as Path's favored "." + loc = str(arel) + name = artifact.name + if loc == ".": + loc = "" + name = "" + dir_list.sort(key=lambda d: d["name"]) + file_list.sort(key=lambda d: d["name"]) + val = { + "name": name, + "type": CacheType.DIRECTORY.name, + "directories": dir_list, + "files": file_list, + "uri": f"{origin}/contents/{loc}", + } + else: + access = "inventory" + link_type = CacheType.FILE + ltype = CacheType.OTHER + if artifact.is_symlink(): + link_type = CacheType.SYMLINK + try: + target = artifact.resolve(strict=True) + trel = target.relative_to(self.unpacked) + if target.is_dir(): + access = "contents" + ltype = CacheType.DIRECTORY + arel = trel + elif target.is_file(): + ltype = CacheType.FILE + arel = trel + else: + ltype = CacheType.OTHER + except Exception: + ltype = CacheType.BROKEN + trel = artifact.readlink() + elif not artifact.is_file(): + link_type = CacheType.OTHER + + val = { + "name": artifact.name, + "type": link_type.name, + "uri": f"{origin}/{access}/{arel}", + } + if link_type is CacheType.SYMLINK: + val["link"] = str(trel) + val["link_type"] = ltype.name + if link_type is CacheType.FILE or ltype is CacheType.FILE: + val["size"] = artifact.stat().st_size + return val + def get_inventory(self, path: str) -> dict[str, Any]: """Return a JSON description of a tarball member file. 
@@ -944,6 +1072,8 @@ def get_results(self, lock: LockManager) -> Path: """ if not self.unpacked: + start = time.time() + reclaim = start lock.upgrade() # If necessary, attempt to reclaim some unused cache so we have @@ -952,6 +1082,7 @@ def get_results(self, lock: LockManager) -> Path: self.controller.cache_manager.reclaim_cache( goal_bytes=self.get_unpacked_size() + RECLAIM_BYTES_PAD ) + reclaim = time.time() audit = None error = None @@ -970,6 +1101,7 @@ def get_results(self, lock: LockManager) -> Path: "Unable to audit unpack for {}: '{}'", self.name, e ) + ustart = time.time() try: tar_command = "tar -x --no-same-owner --delay-directory-restore " tar_command += f"--force-local --file='{str(self.tarball_path)}'" @@ -993,17 +1125,19 @@ def get_results(self, lock: LockManager) -> Path: attributes=attributes, ) lock.downgrade() - - # Even if we have an unpacked directory, if it wasn't done under this - # CacheManager instance we may not have a cachemap, so be prepared to - # build one. - if not self.cachemap: - self.build_map() + uend = time.time() + self.logger.info( + "{}: reclaim {:.3f}, unpack {:.3f}", + self.name, + reclaim - start, + uend - ustart, + ) self.last_ref.touch(exist_ok=True) # If we have a Dataset, and haven't already done this, compute the # unpacked size and record it in metadata so we can use it later. + ssize = time.time() if self.dataset and not Metadata.getvalue( self.dataset, Metadata.SERVER_UNPACKED ): @@ -1019,6 +1153,7 @@ def get_results(self, lock: LockManager) -> Path: Metadata.setvalue(self.dataset, Metadata.SERVER_UNPACKED, size) except Exception as e: self.logger.warning("usage check failed: {}", e) + self.logger.info("{}: size update {:.3f}", self.name, time.time() - ssize) return self.unpacked def cache_delete(self): @@ -1083,7 +1218,7 @@ class Controller: but the audit report generator will flag it. """ - def __init__(self, path: Path, cache: Path, logger: Logger): + def __init__(self, path: Path, cache: Path, logger: Logger, discover: bool = True): """Manage the representation of a controller archive on disk. In this context, the path parameter refers to a controller directory @@ -1094,6 +1229,7 @@ def __init__(self, path: Path, cache: Path, logger: Logger): path: Controller ARCHIVE directory path cache: The base of the cache tree logger: Logger object + discover: Discover all tarballs if True """ self.logger = logger @@ -1118,21 +1254,29 @@ def __init__(self, path: Path, cache: Path, logger: Logger): # Discover the tarballs that already exist. # Depends on instance properties and should remain at the end of the # constructor! 
- self._discover_tarballs() + if discover: + self._discover_tarballs() - def _add_if_tarball(self, file: Path, md5: Optional[str] = None): + def _add_if_tarball(self, file: Path, md5: Optional[str] = None) -> bool: """Check for a tar file, and create an object Args: file: path of potential tarball md5: known MD5 hash, or None to compute here + + Returns: + true if the tarball was added """ - if file.is_file() and Dataset.is_tarball(file): - hash = md5 if md5 else get_tarball_md5(file) - tarball = Tarball(file, hash, self) - self.tarballs[tarball.name] = tarball - self.datasets[tarball.resource_id] = tarball - tarball.check_unpacked() + + if not (file.is_file() and Dataset.is_tarball(file)): + return False + + hash = md5 if md5 else get_tarball_md5(file) + tarball = Tarball(file, hash, self) + self.tarballs[tarball.name] = tarball + self.datasets[tarball.resource_id] = tarball + tarball.check_unpacked() + return True def _discover_tarballs(self): """Discover the known tarballs @@ -1288,16 +1432,22 @@ def __init__(self, options: PbenchServerConfig, logger: Logger): # resource_id. self.datasets: dict[str, Tarball] = {} - def full_discovery(self) -> "CacheManager": + def full_discovery(self, search: bool = True) -> "CacheManager": """Discover the ARCHIVE and CACHE trees - NOTE: both _discover_unpacked() and _discover_results() rely on the - results of _discover_archive(), which must run first. + Full discovery is only needed for reporting, and is not required + to find, create, or delete a specific dataset. (See find_dataset.) - Full discovery is not required in order to find, create, or delete a - specific dataset. + Args: + search: search the ARCHIVE tree rather than using the Dataset table + + Returns: + CacheManager instance for chaining """ - self._discover_controllers() + if search: + self._discover_controllers() + else: + self._discover_datasets() return self def __contains__(self, dataset_id: str) -> bool: @@ -1341,7 +1491,7 @@ def _clean_empties(self, controller: str): self.delete_if_empty(archive) del self.controllers[controller] - def _add_controller(self, directory: Path) -> None: + def _add_controller(self, directory: Path, discover: bool = True) -> Controller: """Create a new Controller object Add a new controller to the set of known controllers, and append the @@ -1350,15 +1500,20 @@ def _add_controller(self, directory: Path) -> None: Args: directory: A controller directory within the ARCHIVE tree + discover: Discover tarballs if True + + Returns: + The new controller """ - controller = Controller(directory, self.options.CACHE, self.logger) + controller = Controller(directory, self.options.CACHE, self.logger, discover) controller.cache_manager = self self.controllers[controller.name] = controller self.tarballs.update(controller.tarballs) self.datasets.update(controller.datasets) + return controller def _discover_controllers(self): - """Build a representation of the ARCHIVE tree + """Discover the ARCHIVE tree exhaustively by searching it. Record all controllers (top level directories), and the tarballs that represent datasets within them. @@ -1367,15 +1522,64 @@ def _discover_controllers(self): if file.is_dir() and file.name != CacheManager.TEMPORARY: self._add_controller(file) + def _discover_datasets(self): + """Discover the ARCHIVE tree from the SQL representation of datasets + + Discover all controllers and tarballs with server.tarball-path + metadata. This will find only correct and "live" tarballs, and may + miss incorrectly labeled tarballs. 
(Unless we're trying to diagnose + ARCHIVE tree errors, this is probably what we want.) + """ + rows = ( + Database.db_session.query( + Dataset.name, + Dataset.resource_id, + Metadata.value["tarball-path"].as_string(), + ) + .execution_options(stream_results=True) + .outerjoin( + Metadata, + and_(Dataset.id == Metadata.dataset_ref, Metadata.key == "server"), + ) + .yield_per(1000) + .all() + ) + + for name, resource_id, path in rows: + if not path: + # This runs asynchronously with normal operation, and we might + # find a dataset before the "server.tarball-path" metadata is + # set. Issue a warning in case this becomes a concern, but + # otherwise ignore it and skip the dataset. + self.logger.warning( + "query unexpectedly returned name {}, resource_id {}, path {}", + name, + resource_id, + path, + ) + continue + tarball = Path(path) + controller_dir = tarball.parent + if controller_dir.name == resource_id: + controller_dir = controller_dir.parent + controller = self.controllers.get( + controller_dir.name, self._add_controller(controller_dir, False) + ) + if controller._add_if_tarball(tarball, resource_id): + self.tarballs.update(controller.tarballs) + self.datasets.update(controller.datasets) + else: + # This would be abnormal: log it and continue + self.logger.error( + "Unable to add {}: MD5 {}, path {}", name, resource_id, path + ) + def find_dataset(self, dataset_id: str) -> Tarball: - """Search the ARCHIVE tree for a matching dataset tarball. + """Return a descriptor of the identified tarball. This will build the Controller and Tarball object for that dataset if they do not already exist. - FIXME: This builds the entire Controller, which will discover all - datasets within the controller. This could be streamlined. - This allows a targeted minimal entry for mutation without discovering the entire tree. @@ -1393,16 +1597,35 @@ def find_dataset(self, dataset_id: str) -> Tarball: if dataset_id in self.datasets: return self.datasets[dataset_id] - # The dataset isn't already known; so search for it in the ARCHIVE tree - # and (if found) discover the controller containing that dataset. - for dir_entry in self.archive_root.iterdir(): - if dir_entry.is_dir() and dir_entry.name != self.TEMPORARY: - for file in dir_entry.glob(f"**/*{Dataset.TARBALL_SUFFIX}"): - md5 = get_tarball_md5(file) - if md5 == dataset_id: - self._add_controller(dir_entry) - return self.datasets[dataset_id] - raise TarballNotFound(dataset_id) + # The dataset isn't already known; so follow the tarball-path to build + # just the necessary controller and tarball objects.
+ start = time.time() + sql = start + try: + dataset = Dataset.query(resource_id=dataset_id) + sql = time.time() + tarball = Path(Metadata.getvalue(dataset, Metadata.TARBALL_PATH)) + except Exception as e: + raise TarballNotFound(dataset_id) from e + controller_dir = tarball.parent + if controller_dir.name == dataset_id: + controller_dir = controller_dir.parent + found = time.time() + controller = self.controllers.get( + controller_dir.name, self._add_controller(controller_dir, False) + ) + controller._add_if_tarball(tarball, dataset_id) + self.tarballs.update(controller.tarballs) + self.datasets.update(controller.datasets) + add = time.time() + self.logger.info( + "{}: {:.3f} to find tarball, {:.3f} SQL, {:.3f} to discover controller", + dataset.name, + found - start, + sql - start, + add - found, + ) + return self.datasets[dataset_id] # These are wrappers for controller and tarball operations which need to be # aware of higher-level constructs in the Pbench Server cache manager such as @@ -1413,16 +1636,24 @@ def find_dataset(self, dataset_id: str) -> Tarball: # Alternate constructor to create a Tarball object and move an incoming # tarball and md5 into the proper controller directory. # - # unpack - # Unpack the ARCHIVE tarball file into a new directory under the - # CACHE directory tree. + # find_entry + # Locate a path within a tarball. # - # uncache - # Remove the unpacked directory tree when no longer needed. + # get_contents + # Return metadata about a path within a tarball. + # + # get_inventory + # Return a managed byte stream for a file within a tarball. + # + # get_inventory_bytes + # Return the contents of a file within a tarball as a string. # # delete # Remove the tarball and MD5 file from ARCHIVE after uncaching the # unpacked directory tree. + # + # reclaim_cache + # Remove cached tarball trees to free disk space. def create(self, tarfile_path: Path) -> Tarball: """Bring a new tarball under cache manager management.
@@ -1501,6 +1732,20 @@ def find_entry(self, dataset_id: str, path: Path) -> dict[str, Any]: tarball = self.find_dataset(dataset_id) return tarball.find_entry(path) + def get_contents(self, dataset_id: str, path: str, origin: str) -> dict[str, Any]: + """Get information about a dataset file or directory + + Args: + dataset_id: Dataset resource ID + path: path of requested content + origin: base URI for links + + Returns: + contents metadata + """ + tarball = self.find_dataset(dataset_id) + return tarball.get_contents(path, origin) + def get_inventory(self, dataset_id: str, target: str) -> Optional[JSONOBJECT]: """Return filestream data for a file within a dataset tarball diff --git a/lib/pbench/test/unit/server/test_cache_manager.py b/lib/pbench/test/unit/server/test_cache_manager.py index 6f29a8359f..146f29debf 100644 --- a/lib/pbench/test/unit/server/test_cache_manager.py +++ b/lib/pbench/test/unit/server/test_cache_manager.py @@ -31,7 +31,8 @@ TarballUnpackError, ) from pbench.server.database.models.audit import Audit, AuditStatus, AuditType -from pbench.server.database.models.datasets import Dataset, DatasetBadName +from pbench.server.database.models.datasets import Dataset, DatasetBadName, Metadata +from pbench.server.database.models.users import User from pbench.test.unit.server.conftest import make_tarball @@ -581,6 +582,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path: class MockTarball: def __init__(self, path: Path, resource_id: str, controller: Controller): self.name = Dataset.stem(path) + self.logger = controller.logger self.resource_id = "ABC" self.dataset = None self.tarball_path = path @@ -1552,6 +1554,170 @@ def read(self, size: int = -1) -> bytes: assert stream.lock is None assert my_calls == exp_calls + def test_get_contents(self, server_config, make_logger, tarball, db_session): + """Test the TOC analysis""" + + source_tarball, source_md5, md5 = tarball + cm = CacheManager(server_config, make_logger) + cm.create(source_tarball) + root = f"https://fake/api/v1/datasets/{md5}" + + # Unpack the tarball and show the canonical `tarball` fixture contents + assert cm.get_contents(md5, "", root) == { + "name": "", + "type": "DIRECTORY", + "uri": f"{root}/contents/", + "directories": [], + "files": [ + { + "name": "metadata.log", + "size": 28, + "type": "FILE", + "uri": f"{root}/inventory/metadata.log", + } + ], + } + + assert cm.get_contents(md5, "metadata.log", root) == { + "name": "metadata.log", + "type": "FILE", + "uri": f"{root}/inventory/metadata.log", + "size": 28, + } + + with pytest.raises(CacheExtractError): + cm.get_contents(md5, "../..", root) + + # Now that we have an unpacked directory, manually "enhance" it to test + # more paths.
+ + base: Path = cm.find_dataset(md5).unpacked + subdir = base / "subdir" + subdir.mkdir() + ldir = base / "dir_link" + ldir.symlink_to(subdir) + lfile = base / "file_link" + lfile.symlink_to(Path(base / "metadata.log")) + badlink = base / "bad_link" + badlink.symlink_to("/etc/passwd") + illegallink = base / "illegal_link" + illegallink.symlink_to(base / "..") + linklink = base / "link_link" + linklink.symlink_to(lfile) + fifo = base / "fifo" + os.mkfifo(fifo) + fifolink = base / "fifo_link" + fifolink.symlink_to(fifo) + assert cm.get_contents(md5, "", root) == { + "name": "", + "type": "DIRECTORY", + "directories": [ + { + "name": "dir_link", + "type": "SYMLINK", + "link": "subdir", + "link_type": "DIRECTORY", + "uri": f"{root}/contents/subdir", + }, + { + "name": "subdir", + "type": "DIRECTORY", + "uri": f"{root}/contents/subdir", + }, + ], + "files": [ + { + "name": "bad_link", + "type": "SYMLINK", + "link": "/etc/passwd", + "link_type": "BROKEN", + "uri": f"{root}/inventory/bad_link", + }, + { + "name": "fifo", + "type": "OTHER", + "uri": f"{root}/inventory/fifo", + }, + { + "name": "fifo_link", + "type": "SYMLINK", + "link": "fifo", + "link_type": "OTHER", + "uri": f"{root}/inventory/fifo_link", + }, + { + "name": "file_link", + "type": "SYMLINK", + "link": "metadata.log", + "link_type": "FILE", + "uri": f"{root}/inventory/metadata.log", + }, + { + "name": "illegal_link", + "type": "SYMLINK", + "link": str(base / ".."), + "link_type": "BROKEN", + "uri": f"{root}/inventory/illegal_link", + }, + { + "name": "link_link", + "type": "SYMLINK", + "link": "metadata.log", + "link_type": "FILE", + "uri": f"{root}/inventory/metadata.log", + }, + { + "name": "metadata.log", + "type": "FILE", + "uri": f"{root}/inventory/metadata.log", + "size": 28, + }, + ], + "uri": f"{root}/contents/", + } + + assert cm.get_contents(md5, "subdir", root) == { + "name": "subdir", + "type": "DIRECTORY", + "directories": [], + "files": [], + "uri": f"{root}/contents/subdir", + } + assert cm.get_contents(md5, "dir_link", root) == { + "name": "dir_link", + "type": "SYMLINK", + "uri": f"{root}/contents/subdir", + "link": "subdir", + "link_type": "DIRECTORY", + } + assert cm.get_contents(md5, "file_link", root) == { + "name": "file_link", + "type": "SYMLINK", + "uri": f"{root}/inventory/metadata.log", + "link": "metadata.log", + "link_type": "FILE", + "size": 28, + } + assert cm.get_contents(md5, "bad_link", root) == { + "name": "bad_link", + "type": "SYMLINK", + "uri": f"{root}/inventory/bad_link", + "link": "/etc/passwd", + "link_type": "BROKEN", + } + assert cm.get_contents(md5, "fifo_link", root) == { + "name": "fifo_link", + "type": "SYMLINK", + "uri": f"{root}/inventory/fifo_link", + "link": "fifo", + "link_type": "OTHER", + } + assert cm.get_contents(md5, "fifo", root) == { + "name": "fifo", + "type": "OTHER", + "uri": f"{root}/inventory/fifo", + } + def test_find( self, selinux_enabled, @@ -1566,6 +1732,12 @@ def test_find( through the various supported methods. 
""" + def mock_query(**kwargs) -> Dataset: + id = kwargs.get("resource_id") + if id == md5: + return Dataset(name=dataset_name, resource_id=md5) + raise TarballNotFound(id) + monkeypatch.setattr(Tarball, "_get_metadata", fake_get_metadata) source_tarball, source_md5, md5 = tarball dataset_name = Dataset.stem(source_tarball) @@ -1580,6 +1752,8 @@ def test_find( assert tarball assert tarball.name == dataset_name assert tarball.resource_id == md5 + monkeypatch.setattr(Dataset, "query", mock_query) + monkeypatch.setattr(Metadata, "getvalue", lambda d, k: tarball.tarball_path) # Test __getitem__ assert tarball == cm[md5] @@ -1607,7 +1781,7 @@ def test_find( assert str(exc.value) == "The dataset tarball named 'foobar' is not found" assert exc.value.tarball == "foobar" - # Unpack the dataset, creating INCOMING and RESULTS links + # Unpack the dataset tarball.get_results(FakeLockRef(tarball.lock)) assert tarball.cache == controller.cache / md5 assert tarball.unpacked == controller.cache / md5 / tarball.name @@ -1807,6 +1981,44 @@ def test_compatibility( assert not tar2.exists() assert not tar1.parent.exists() + def test_discover_datasets( + self, + db_session, + selinux_enabled, + create_drb_user, + server_config, + make_logger, + tarball, + monkeypatch, + ): + """Test compatibility with both new "isolated" and old tarballs + + Make sure we can discover and manage (unpack and delete) both new + tarballs with an MD5 isolation directory and pre-existing tarballs + directly in the controller directory. + """ + + monkeypatch.setattr(Tarball, "_get_metadata", fake_get_metadata) + source_tarball, source_md5, md5 = tarball + name = Dataset.stem(source_tarball) + cm = CacheManager(server_config, make_logger) + cm.create(source_tarball) + assert cm[md5].name == name + path = str(cm[md5].tarball_path) + + # Rediscover the cache, which should find the tarball + cm1 = CacheManager(server_config, make_logger).full_discovery() + assert cm1[md5].name == name + + # Rediscover again, using dataset metadata. 
But the cache manager + # doesn't create the Dataset or Metadata, so we do it here) + drb = User.query(username="drb") + ds = Dataset(name=name, resource_id=md5, owner=drb, access="public") + ds.add() + Metadata.setvalue(ds, Metadata.TARBALL_PATH, path) + cm2 = CacheManager(server_config, make_logger).full_discovery(search=False) + assert cm2[md5].name == name + def test_lockref(self, monkeypatch): """Test behavior of the LockRef class""" diff --git a/lib/pbench/test/unit/server/test_datasets_compare.py b/lib/pbench/test/unit/server/test_datasets_compare.py index ed5611cdbd..467946de9e 100644 --- a/lib/pbench/test/unit/server/test_datasets_compare.py +++ b/lib/pbench/test/unit/server/test_datasets_compare.py @@ -10,11 +10,13 @@ from pbench.server.database.models.datasets import Dataset, DatasetNotFound, Metadata from pbench.server.database.models.users import User +real_getvalue = Metadata.getvalue -def mock_get_value(dataset: Dataset, key: str, user: Optional[User] = None) -> str: - if dataset.name == "uperf_3" or dataset.name == "uperf_4": - return "hammerDB" - return "uperf" + +def mock_get_value(dataset: Dataset, key: str, user: Optional[User] = None) -> JSON: + if key == Metadata.SERVER_BENCHMARK: + return "hammerDB" if dataset.name in ("uperf_3", "uperf_4") else "uperf" + return real_getvalue(dataset, key, user) class TestCompareDatasets: @@ -55,8 +57,11 @@ def query_api( return query_api def test_no_postprocessed_data(self, query_get_as, monkeypatch): - monkeypatch.setattr(Metadata, "getvalue", mock_get_value) - + """Test with an unknown benchmark""" + fio_2 = Dataset.query(name="fio_2") + Metadata.setvalue( + fio_2, Metadata.SERVER_BENCHMARK, Metadata.SERVER_BENCHMARK_UNKNOWN + ) query_get_as(["fio_2"], "drb", HTTPStatus.BAD_REQUEST) def test_with_incorrect_data(self, query_get_as, monkeypatch): diff --git a/lib/pbench/test/unit/server/test_datasets_contents.py b/lib/pbench/test/unit/server/test_datasets_contents.py index dc05989ca6..1aaa296f00 100644 --- a/lib/pbench/test/unit/server/test_datasets_contents.py +++ b/lib/pbench/test/unit/server/test_datasets_contents.py @@ -5,11 +5,11 @@ import pytest import requests -from pbench.server.cache_manager import BadDirpath, CacheManager, CacheObject, CacheType +from pbench.server.cache_manager import BadDirpath, CacheManager from pbench.server.database.models.datasets import Dataset, DatasetNotFound -class TestDatasetsAccess: +class TestDatasetsContents: @pytest.fixture() def query_get_as(self, client, server_config, more_datasets, pbench_drb_token): """ @@ -63,220 +63,57 @@ def test_unauthorized_access(self, query_get_as): "message": "User drb is not authorized to READ a resource owned by test with private access" } + def test_contents_error(self, monkeypatch, query_get_as): + """This fails with an internal error from get_contents""" + + def mock_contents(_s, _d, _p, _o): + raise Exception("Nobody expected me") + + monkeypatch.setattr(CacheManager, "get_contents", mock_contents) + query_get_as("drb", "metadata.log", HTTPStatus.INTERNAL_SERVER_ERROR) + + def test_jsonify_error(self, monkeypatch, query_get_as): + """This fails with an internal error from jsonifying bad data""" + + monkeypatch.setattr( + CacheManager, "get_contents", lambda _s, _d, _p, _o: {1: 0, Path("."): "a"} + ) + query_get_as("drb", "metadata.log", HTTPStatus.INTERNAL_SERVER_ERROR) + @pytest.mark.parametrize("key", ("", ".", "subdir1")) def test_path_is_directory(self, query_get_as, monkeypatch, key): - """Mock a directory cache node to check the returned data""" + 
"""Mock a directory cache node to check that data is passed through""" name = "" if key == "." else key - - def mock_find_entry(_s, _d: str, path: Optional[Path]): - file = path if path else Path(".") - return { - "children": {}, - "details": CacheObject( - name, file, None, None, None, CacheType.DIRECTORY - ), - } - - monkeypatch.setattr(CacheManager, "find_entry", mock_find_entry) - response = query_get_as("fio_2", key, HTTPStatus.OK) - assert response.json == { + contents = { "directories": [], "files": [], - "name": name, - "type": "DIRECTORY", "uri": f"https://localhost/api/v1/datasets/random_md5_string4/contents/{name}", } + monkeypatch.setattr(CacheManager, "get_contents", lambda s, d, p, o: contents) + response = query_get_as("fio_2", key, HTTPStatus.OK) + assert response.json == contents + def test_not_a_file(self, query_get_as, monkeypatch): - """When 'find_entry' fails with an exception, we report the text""" + """When 'get_contents' fails with an exception, we report the text""" - def mock_find_entry(_s, _d: str, path: Optional[Path]): + def mock_get_contents(_s, _d: str, _p: Optional[Path], _o: str): raise BadDirpath("Nobody home") - monkeypatch.setattr(CacheManager, "find_entry", mock_find_entry) + monkeypatch.setattr(CacheManager, "get_contents", mock_get_contents) response = query_get_as("fio_2", "subdir1/f1_sym", HTTPStatus.NOT_FOUND) assert response.json == {"message": "Nobody home"} def test_file_info(self, query_get_as, monkeypatch): - """Mock a file cache node to check the returned data""" + """Mock a file cache node to check check that data is passed through""" name = "f1.json" - - def mock_find_entry(_s, _d: str, path: Optional[Path]): - return { - "details": CacheObject(path.name, path, None, None, 16, CacheType.FILE) - } - - monkeypatch.setattr(CacheManager, "find_entry", mock_find_entry) - response = query_get_as("fio_2", name, HTTPStatus.OK) - assert response.status_code == HTTPStatus.OK - assert response.json == { + file_data = { "name": name, "size": 16, "type": "FILE", "uri": f"https://localhost/api/v1/datasets/random_md5_string4/inventory/{name}", } - - def test_link_info(self, query_get_as, monkeypatch): - """Mock a symlink cache node to check the returned data""" - name = "test/slink" - - def mock_find_entry(_s, _d: str, path: Optional[Path]): - return { - "details": CacheObject( - path.name, - path, - Path("test1"), - CacheType.DIRECTORY, - None, - CacheType.SYMLINK, - ) - } - - monkeypatch.setattr(CacheManager, "find_entry", mock_find_entry) + monkeypatch.setattr(CacheManager, "get_contents", lambda s, d, p, o: file_data) response = query_get_as("fio_2", name, HTTPStatus.OK) - assert response.json == { - "name": "slink", - "type": "SYMLINK", - "link": "test1", - "link_type": "DIRECTORY", - "uri": "https://localhost/api/v1/datasets/random_md5_string4/contents/test1", - } - - def test_dir_info(self, query_get_as, monkeypatch): - """Confirm the returned data for a directory with various children""" - - def mock_find_entry(_s, _d: str, path: Optional[Path]): - base = path if path else Path("") - return { - "children": { - "default": { - "details": CacheObject( - "default", - base / "default", - None, - None, - None, - CacheType.DIRECTORY, - ) - }, - "file.txt": { - "details": CacheObject( - "file.txt", - base / "file.txt", - None, - None, - 42, - CacheType.FILE, - ) - }, - "badlink": { - "details": CacheObject( - "badlink", - base / "badlink", - Path("../../../.."), - CacheType.BROKEN, - None, - CacheType.SYMLINK, - ), - }, - "mountlink": { - "details": 
CacheObject( - "mountlink", - base / "mountlink", - Path("mount"), - CacheType.OTHER, - None, - CacheType.SYMLINK, - ), - }, - "symfile": { - "details": CacheObject( - "symfile", - base / "symfile", - Path("metadata.log"), - CacheType.FILE, - None, - CacheType.SYMLINK, - ) - }, - "symdir": { - "details": CacheObject( - "symdir", - base / "symdir", - Path("realdir"), - CacheType.DIRECTORY, - None, - CacheType.SYMLINK, - ) - }, - "mount": { - "details": CacheObject( - "mount", - base / "mount", - None, - None, - 20, - CacheType.OTHER, - ) - }, - }, - "details": CacheObject( - base.name, base, None, None, None, CacheType.DIRECTORY - ), - } - - monkeypatch.setattr(CacheManager, "find_entry", mock_find_entry) - response = query_get_as("fio_2", "sample1", HTTPStatus.OK) - assert response.json == { - "directories": [ - { - "name": "default", - "type": "DIRECTORY", - "uri": "https://localhost/api/v1/datasets/random_md5_string4/contents/sample1/default", - } - ], - "files": [ - { - "name": "badlink", - "type": "SYMLINK", - "link": "../../../..", - "link_type": "BROKEN", - "uri": "https://localhost/api/v1/datasets/random_md5_string4/inventory/sample1/badlink", - }, - { - "name": "file.txt", - "size": 42, - "type": "FILE", - "uri": "https://localhost/api/v1/datasets/random_md5_string4/inventory/sample1/file.txt", - }, - { - "name": "mount", - "type": "OTHER", - "uri": "https://localhost/api/v1/datasets/random_md5_string4/inventory/sample1/mount", - }, - { - "name": "mountlink", - "type": "SYMLINK", - "link": "mount", - "link_type": "OTHER", - "uri": "https://localhost/api/v1/datasets/random_md5_string4/inventory/sample1/mountlink", - }, - { - "name": "symdir", - "type": "SYMLINK", - "link": "realdir", - "link_type": "DIRECTORY", - "uri": "https://localhost/api/v1/datasets/random_md5_string4/contents/realdir", - }, - { - "name": "symfile", - "type": "SYMLINK", - "link": "metadata.log", - "link_type": "FILE", - "uri": "https://localhost/api/v1/datasets/random_md5_string4/inventory/metadata.log", - }, - ], - "name": "sample1", - "type": "DIRECTORY", - "uri": "https://localhost/api/v1/datasets/random_md5_string4/contents/sample1", - } + assert response.status_code == HTTPStatus.OK + assert response.json == file_data
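For reference, a minimal sketch of the ".." containment check that Tarball.get_contents (above) applies before describing a target: os.path.abspath() collapses ".." lexically, so a request path that would escape the unpacked tree fails relative_to() and is rejected. The cache paths below are hypothetical.

import os
from pathlib import Path

unpacked = Path("/srv/pbench/cache/abc123/run")  # hypothetical unpacked root
for target in ("subdir/result.txt", "../../etc/passwd"):
    candidate = unpacked / target
    try:
        # Same pattern as get_contents: normalize, then require containment.
        rel = Path(os.path.abspath(candidate)).relative_to(unpacked)
        print(f"{target!r} -> {rel}")
    except ValueError:
        # get_contents raises CacheExtractError in this situation.
        print(f"{target!r} rejected: outside the unpacked tree")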
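Similarly, a small illustrative sketch of the controller-directory derivation shared by _discover_datasets and find_dataset above: a new-style tarball sits in an MD5-named isolation directory under its controller, while an old-style tarball sits directly in the controller directory. The sample paths and resource ID are hypothetical.

from pathlib import Path

def controller_dir_for(tarball_path: str, resource_id: str) -> Path:
    """Return the controller directory that owns a tarball path."""
    parent = Path(tarball_path).parent
    # An MD5-named parent is the isolation directory; its parent is the
    # controller. Otherwise the parent is the controller itself.
    return parent.parent if parent.name == resource_id else parent

assert controller_dir_for(
    "/srv/pbench/archive/ctrl/abc123/run.tar.xz", "abc123"  # new layout
).name == "ctrl"
assert controller_dir_for(
    "/srv/pbench/archive/ctrl/run.tar.xz", "abc123"  # old layout
).name == "ctrl"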
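Finally, an illustrative client-side sketch of how the reshaped contents responses might be consumed, relying only on the fields exercised in test_get_contents (name, uri, size, and the directories/files lists); the host, dataset ID, and walk() helper are hypothetical.

import requests

def walk(session: requests.Session, contents_uri: str, depth: int = 0) -> None:
    """Print a simple tree by following "contents" URIs recursively."""
    node = session.get(contents_uri).json()
    for d in node.get("directories", []):
        print("  " * depth + d["name"] + "/")
        # DIRECTORY entries (and symlinks resolving to directories) carry a
        # ".../contents/..." URI that can be followed recursively.
        walk(session, d["uri"], depth + 1)
    for f in node.get("files", []):
        size = f" ({f['size']} bytes)" if "size" in f else ""
        print("  " * depth + f["name"] + size)

# Example (hypothetical host and resource ID):
# walk(requests.Session(),
#      "https://pbench.example.com/api/v1/datasets/<md5>/contents/")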