ENH: Only cache DataSource outputs by default.
New users often get confused when developing new algorithms: they make changes but keep getting the same result. This happens because the output is fetched from the cache instead of being recomputed (the pipeline definition used for the cache key is not unique to the changed code). Based on #372, we decided to change the default caching behaviour for most nodes.
mpu-creare committed Apr 17, 2020
1 parent f4ba19e commit fc34bb2
Showing 4 changed files with 29 additions and 15 deletions.
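For readers trying out the new behaviour, here is a minimal sketch of restoring the old defaults at runtime. It assumes only what this commit touches: the two settings keys below and the scoped `with podpac.settings:` pattern used in the updated tests.

import podpac

# Scoped override: cache every node's output, as before this commit.
with podpac.settings:
    podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = True        # general Node outputs
    podpac.settings["CACHE_DATASOURCE_OUTPUT_DEFAULT"] = True  # DataSource outputs (still the default)
    # ... build and evaluate nodes here; their outputs are cached and re-used ...
# Leaving the `with` block restores the previous settings.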
18 changes: 9 additions & 9 deletions podpac/core/algorithm/test/test_stats.py
@@ -43,13 +43,13 @@ def test_auto_chunk(self):
node = Min(source=source)

with podpac.settings:
podpac.settings["CACHE_OUTPUT_DEFAULT"] = False
podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False
podpac.settings["CHUNK_SIZE"] = "auto"
node.eval(coords)

def test_chunked_fallback(self):
with podpac.settings:
podpac.settings["CACHE_OUTPUT_DEFAULT"] = False
podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False

class First(Reduce):
def reduce(self, x):
@@ -75,7 +75,7 @@ class BaseTests(object):

def test_full(self):
with podpac.settings:
podpac.settings["CACHE_OUTPUT_DEFAULT"] = False
podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False
podpac.settings["CHUNK_SIZE"] = None

node = self.NodeClass(source=source)
@@ -91,15 +91,15 @@ def test_full(self):
def test_full_chunked(self):
with podpac.settings:
node = self.NodeClass(source=source, dims=coords.dims)
podpac.settings["CACHE_OUTPUT_DEFAULT"] = False
podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False
podpac.settings["CHUNK_SIZE"] = 500
output = node.eval(coords)
# xr.testing.assert_allclose(output, self.expected_full)
np.testing.assert_allclose(output.data, self.expected_full.data)

def test_lat_lon(self):
with podpac.settings:
podpac.settings["CACHE_OUTPUT_DEFAULT"] = False
podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False
podpac.settings["CHUNK_SIZE"] = None
node = self.NodeClass(source=source, dims=["lat", "lon"])
output = node.eval(coords)
@@ -108,7 +108,7 @@ def test_lat_lon(self):

def test_lat_lon_chunked(self):
with podpac.settings:
podpac.settings["CACHE_OUTPUT_DEFAULT"] = False
podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False
podpac.settings["CHUNK_SIZE"] = 500
node = self.NodeClass(source=source, dims=["lat", "lon"])
output = node.eval(coords)
@@ -117,7 +117,7 @@ def test_lat_lon_chunked(self):

def test_time(self):
with podpac.settings:
podpac.settings["CACHE_OUTPUT_DEFAULT"] = False
podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False
podpac.settings["CHUNK_SIZE"] = None
node = self.NodeClass(source=source, dims="time")
output = node.eval(coords)
@@ -126,7 +126,7 @@ def test_time(self):

def test_time_chunked(self):
with podpac.settings:
podpac.settings["CACHE_OUTPUT_DEFAULT"] = False
podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False
podpac.settings["CHUNK_SIZE"] = 500
node = self.NodeClass(source=source, dims="time")
output = node.eval(coords)
@@ -135,7 +135,7 @@ def test_time_chunked(self):

def test_multiple_outputs(self):
with podpac.settings:
podpac.settings["CACHE_OUTPUT_DEFAULT"] = False
podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False
podpac.settings["CHUNK_SIZE"] = None
node = self.NodeClass(source=multisource, dims=["lat", "lon"])
output = node.eval(coords)
10 changes: 9 additions & 1 deletion podpac/core/data/datasource.py
@@ -144,7 +144,10 @@ class DataSource(Node):
Default is 'numpy'
cache_coordinates : bool
Whether to cache coordinates using the podpac ``cache_ctrl``. Default False.
cache_output : bool
Should the node's output be cached? If not provided or None, the default from
settings["CACHE_DATASOURCE_OUTPUT_DEFAULT"] is used. If True, outputs will be cached and retrieved from cache.
If False, outputs will not be cached OR retrieved from cache (even if they exist in cache).

Notes
-----
@@ -156,6 +159,7 @@ class DataSource(Node):

coordinate_index_type = tl.Enum(["slice", "list", "numpy"], default_value="numpy") # , "xarray", "pandas"],
cache_coordinates = tl.Bool(False)
cache_output = tl.Bool()

# privates
_interpolation = tl.Instance(Interpolation)
@@ -173,6 +177,10 @@ def _default_interpolation(self):
self._set_interpolation()
return self._interpolation

@tl.default("cache_output")
def _cache_output_default(self):
return settings["CACHE_DATASOURCE_OUTPUT_DEFAULT"]

# ------------------------------------------------------------------------------------------------------------------
# Properties
# ------------------------------------------------------------------------------------------------------------------
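The new trait can also be set per node rather than globally. A minimal sketch, assuming podpac.data.Array (and its native_coordinates attribute) as a convenient concrete DataSource; everything outside this diff is illustrative.

import numpy as np
import podpac

coords = podpac.Coordinates([podpac.clinspace(0, 1, 10, "lat"), podpac.clinspace(0, 1, 10, "lon")])

# By default, DataSource outputs are cached (settings["CACHE_DATASOURCE_OUTPUT_DEFAULT"] is True).
node = podpac.data.Array(source=np.random.rand(10, 10), native_coordinates=coords)
print(node.cache_output)  # True

# Per-node opt-out, without touching the global settings.
node_nocache = podpac.data.Array(source=np.random.rand(10, 10), native_coordinates=coords, cache_output=False)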
9 changes: 6 additions & 3 deletions podpac/core/node.py
@@ -93,7 +93,10 @@ class Node(tl.HasTraits):
Attributes
----------
cache_output: bool
Should the node's output be cached? If not provided or None, uses default based on settings.
Should the node's output be cached? If not provided or None, the default from settings is used
(CACHE_NODE_OUTPUT_DEFAULT for general Nodes, and CACHE_DATASOURCE_OUTPUT_DEFAULT for DataSource nodes).
If True, outputs will be cached and retrieved from cache. If False, outputs will not be cached OR retrieved
from cache (even if they exist in cache).
cache_update: bool
Default is True. Should the node's cached output be updated from the source data?
cache_ctrl: :class:`podpac.core.cache.cache.CacheCtrl`
@@ -157,7 +160,7 @@ def _validate_units(self, d):

@tl.default("cache_output")
def _cache_output_default(self):
return settings["CACHE_OUTPUT_DEFAULT"]
return settings["CACHE_NODE_OUTPUT_DEFAULT"]

@tl.default("cache_ctrl")
def _cache_ctrl_default(self):
@@ -969,7 +972,7 @@ def wrapper(self, coordinates, output=None):
key = cache_key
cache_coordinates = coordinates.transpose(*sorted(coordinates.dims)) # order agnostic caching

if not self.cache_update and self.has_cache(key, cache_coordinates):
if not self.cache_update and self.cache_output and self.has_cache(key, cache_coordinates):
data = self.get_cache(key, cache_coordinates)
if output is not None:
order = [dim for dim in output.dims if dim not in data.dims] + list(data.dims)
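For clarity, the retrieval guard added above, paraphrased as a standalone sketch (this is not the actual wrapper): the cache is consulted only when the node opts into output caching and is not forcing an update.

def _maybe_reuse_cached_output(node, key, cache_coordinates):
    # Sketch of the guard in node.py's caching wrapper (helper name is illustrative).
    # Skip the cache entirely when cache_output is False or cache_update forces recomputation.
    if not node.cache_update and node.cache_output and node.has_cache(key, cache_coordinates):
        return node.get_cache(key, cache_coordinates)
    return None  # fall through: evaluate the node and, if cache_output is True, cache the result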
7 changes: 5 additions & 2 deletions podpac/core/settings.py
@@ -35,7 +35,8 @@
"UNSAFE_EVAL_HASH": uuid.uuid4().hex, # unique id for running unsafe evaluations
# cache
"DEFAULT_CACHE": ["ram"],
"CACHE_OUTPUT_DEFAULT": True,
"CACHE_DATASOURCE_OUTPUT_DEFAULT": True,
"CACHE_NODE_OUTPUT_DEFAULT": False,
"RAM_CACHE_MAX_BYTES": 1e9, # ~1GB
"DISK_CACHE_MAX_BYTES": 10e9, # ~10GB
"S3_CACHE_MAX_BYTES": 10e9, # ~10GB
@@ -109,8 +110,10 @@ class PodpacSettings(dict):
Notification email for when AWS usage reaches 80% of the `AWS_BUDGET_AMOUNT`
DEFAULT_CACHE : list
Defines a default list of cache stores in priority order. Defaults to `['ram']`.
CACHE_OUTPUT_DEFAULT : bool
CACHE_NODE_OUTPUT_DEFAULT : bool
Default value for node ``cache_output`` trait. If True, the outputs of nodes (eval) will be automatically cached.
CACHE_DATASOURCE_OUTPUT_DEFAULT : bool
Default value for the ``cache_output`` trait of DataSource nodes. If True, the outputs of DataSource nodes (eval) will be automatically cached.
RAM_CACHE_MAX_BYTES : int
Maximum RAM cache size in bytes.
Note, for RAM cache only, the limit is applied to the total amount of RAM used by the python process;
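A quick sanity check of the shipped defaults after this change (assuming no saved settings file overrides them):

import podpac

# DataSource outputs are still cached by default; other node outputs are not.
assert podpac.settings["CACHE_DATASOURCE_OUTPUT_DEFAULT"] is True
assert podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] is False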
