Skip to content

Commit

Permalink
polish call cache docs & logs
Browse files Browse the repository at this point in the history
  • Loading branch information
mlin committed Jul 3, 2020
1 parent 0fdf61a commit 8b1643d
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 22 deletions.
13 changes: 5 additions & 8 deletions WDL/runtime/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@
import json
import os
import logging
import threading
from pathlib import Path
from typing import Iterator, Dict, Any, Optional, Set, List, IO
from typing import Dict, Any, Optional, List
from contextlib import AbstractContextManager
from urllib.parse import urlparse, urlunparse
from fnmatch import fnmatchcase
Expand Down Expand Up @@ -37,7 +36,7 @@ def __init__(self, cfg: config.Loader, logger: logging.Logger):

try:
os.mkdir(self.call_cache_dir)
except Exception as e:
except Exception:
pass

def __enter__(self) -> "CallCache":
Expand Down Expand Up @@ -80,10 +79,10 @@ def get(
with open(file_path, "rb") as file_reader:
contents = file_reader.read()
except FileNotFoundError:
self._logger.info(f"Cache lookup unsuccessful for input_digest: {key}")
self._logger.info(_(f"call cache miss", cache_path=file_path))
return None
contents = json.loads(contents)
self._logger.notice(f"Cache found for input_digest: {key}") # pyre-fixme
self._logger.notice(_(f"call cache hit", cache_path=file_path)) # pyre-fixme
return values_from_json(contents, output_types) # pyre-fixme

def put(self, task_key: str, input_digest: str, outputs: Env.Bindings[Value.Base],) -> None:
Expand All @@ -103,9 +102,7 @@ def put(self, task_key: str, input_digest: str, outputs: Env.Bindings[Value.Base
json.dumps(values_to_json(outputs, namespace=""), indent=2), # pyre-ignore
filename,
)
self._logger.info(
f"Cache created for task_digest: {task_key}, input_digest: {input_digest}"
)
self._logger.info(_(f"call cache insert", cache_path=filename))

# specialized caching logic for file downloads (not sensitive to the downloader task details,
# and looked up in URI-derived folder structure instead of sqlite db)
Expand Down
3 changes: 2 additions & 1 deletion WDL/runtime/config_templates/default.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -97,13 +97,14 @@ ignore_query = false
enable_patterns = ["*"]
disable_patterns = ["*.php", "*.aspx"]


[call_cache]
# When a task in a workflow completes, cache its outputs in the directory configured below (`dir`), where
# they can be found later and reused for calls with the same task definition and inputs
put = false
# enable retrieval of cached outputs
get = false
dir = "~/.cache/miniwdl"
dir = ~/.cache/miniwdl


[download_awscli]
Expand Down
17 changes: 12 additions & 5 deletions WDL/runtime/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
import glob
import time
import random
import multiprocessing
import threading
import shutil
import shlex
Expand All @@ -25,7 +24,7 @@
from contextlib import ExitStack
import docker

from .. import Error, Type, Env, Value, StdLib, Tree, _util, Document
from .. import Error, Type, Env, Value, StdLib, Tree, _util
from .._util import (
write_atomic,
write_values_json,
Expand Down Expand Up @@ -830,9 +829,17 @@ def run_local_task(
key=f"{task.name}_{task_digest}/{input_digest}", output_types=task.effective_outputs
)
if cached:
logger.notice( # pyre-fixme
f"Succesfullly pulled cached outputs for task digest: {task_digest} and input_digest: {input_digest}"
)
for decl in task.outputs:
v = cached[decl.name]
vj = json.dumps(v.json)
logger.info(
_(
"cached output",
name=decl.name,
value=(v.json if len(vj) < 4096 else "(((large)))"),
)
)
logger.notice("done (cached)") # pyre-fixme
return (run_dir, cached)
# start plugin coroutines and process inputs through them
with compose_coroutines(
Expand Down
3 changes: 1 addition & 2 deletions WDL/runtime/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,11 @@
import traceback
import pickle
import threading
import copy
from concurrent import futures
from typing import Optional, List, Set, Tuple, NamedTuple, Dict, Union, Iterable, Callable, Any
from contextlib import ExitStack
import importlib_metadata
from .. import Env, Type, Value, Tree, StdLib, Document
from .. import Env, Type, Value, Tree, StdLib
from ..Error import InputError
from .task import run_local_task, _filenames, link_outputs
from .download import able as downloadable, run_cached as download
Expand Down
34 changes: 28 additions & 6 deletions docs/runner_reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,21 +74,43 @@ MINIWDL__TASK_RUNTIME__DEFAULTS='{"docker":"ubuntu:19.10"}'

## File download cache

Miniwdl automatically downloads input files supplied as URIs instead of locally-mounted filenames. It's also able to cache these downloads in a local directory, so that multiple workflow runs can reference files by URI without downloading them repeatedly. This permits efficient use of WDL input templates referring to public databases by URI (e.g. reference genomes, sequence databases, interval lists), without having to compromise portability by rewriting them with local paths.
Miniwdl automatically downloads input files supplied as URIs instead of locally-mounted filenames. It's also able to cache these downloads in a local directory, so that multiple workflow runs can reference files by URI without downloading them repeatedly. This permits efficient use of WDL input templates referring to public databases by URI (e.g. reference genomes, sequence databases, interval lists), without having to rewrite them with local, non-portable paths.

The download cache functionality must be enabled in the configuration. The relevant options, exemplified in the [`default.cfg`](https://github.com/chanzuckerberg/miniwdl/blob/main/WDL/runtime/config_templates/default.cfg) template, are in the `download_cache` section, especially `put = true`, `get = true`, and `dir`. Additional options such as `ignore_query`, `enable_patterns`, and `disable_patterns` provide control over which URIs will be cached. If the cache is enabled in persistent configuration, then `--no-cache` disables it for one run.
The download cache functionality must be enabled in the configuration; the relevant options are listed in the [`default.cfg`](https://github.com/chanzuckerberg/miniwdl/blob/main/WDL/runtime/config_templates/default.cfg) template, ``[download_cache]`` section. A minimal configuration might include:

```
[download_cache]
put = true
get = true
dir = /tmp/miniwdl_download_cache
```

Details:

* With the cache enabled in persistent configuration, `--no-cache` disables it for one run.
* The cache is **keyed by URI**: when a workflow starts with a URI file input, a cached file is used if previously stored for the same URI. This doesn't depend on which task/workflow is running, and doesn't use checksums or timestamps of the file contents. Therefore, the cache should only be used with immutable remote files, or if there's no need for immediate coherence with remote content changes.
* Enabling the cache changes **where downloaded files are stored**: if the cache is enabled, they're stored in the cache directory; otherwise, they're stored under the triggering run directory.
* URIs excluded from the cache by the enable/disable patterns fall back to being downloaded under the current run directory. Typically, write the patterns to **include reusable reference data while excluding any run-specific inputs** that might be supplied as URIs.
* URIs can be excluded from caching using the "pattern" options, in which case they'll be downloaded under the current run directory. Typically, write the patterns to **include reusable reference data while excluding any run-specific inputs** that might be supplied as URIs.
* If needed, the `miniwdl localize` subcommand can **"prime" the local cache** with URIs found in a given JSON input template (or a simple list of URIs) before actually running any workflow.
* Cached files that are no longer needed can simply be **deleted from the cache directory**, once they're no longer in use by a running workflow.
* Miniwdl itself doesn't delete files from the cache, but to support an **external cleanup process**, it updates the access timestamp (atime) and opens a shared `flock()` on any cached file it's using. The script [examples/clean_download_cache.sh](https://github.com/chanzuckerberg/miniwdl/blob/main/examples/clean_download_cache.sh) illustrates a process to shrink the cache to a desired maximum size, by evicting the least-recently used files that can be exclusively flocked (the latter condition needed only if the cleaner must run alongside concurrent workflows).

# Task output caching
## Task output cache (experimental)

Miniwdl is able to cache the output of tasks in a local directory, so that repeat runs of that task (with matching inputs) can reference those outputs via a digest of the task and its inputs.
Miniwdl can cache the output of task calls in a local directory, so that repeat runs of that task (with matching inputs) can reference those outputs via a digest of the task and its inputs.

The call cache functionality must be enabled in the configuration; the relevant options are listed in the [`default.cfg`](https://github.com/chanzuckerberg/miniwdl/blob/main/WDL/runtime/config_templates/default.cfg) template, ``[call_cache]`` section. A minimal configuration might include:

```
[call_cache]
put = true
get = true
dir = ~/.cache/miniwdl
```

Details:

* With the cache enabled in persistent configuration, `--no-cache` disables it for one run.
* Cached outputs are stored as `*.json` files in the cache directory, which can simply be deleted when no longer needed.
* Cached output reuse currently assumes that output files haven't been modified or moved/deleted from their original locations. This will evolve in subsequent miniwdl versions.

The task cache functionality must be enabled in the configuration. The relevant options, exemplified in the [`default.cfg`](https://github.com/chanzuckerberg/miniwdl/blob/main/WDL/runtime/config_templates/default.cfg) template, are in the `call_cache` section, especially `put = true`, `get = true`, and `dir`. If the cache is enabled in persistent configuration, then `--no-cache` disables it for one run.

0 comments on commit 8b1643d

Please sign in to comment.