ENH: Only cache DataSource outputs by default.
New users often get confused when developing new algorithms: they make changes but keep getting the same result. This happens because the output is fetched from the cache instead of being recomputed (the pipeline definition used for the cache key is not unique to the changed code). Based on #372, we decided to change the default caching behaviour for most nodes.
mpu-creare committed Apr 17, 2020
1 parent f4ba19e commit fc34bb2
Showing 4 changed files with 29 additions and 15 deletions.
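For readers trying out the new behaviour, here is a minimal sketch of restoring the old defaults at runtime. It assumes only what this commit touches: the two settings keys below and the scoped `with podpac.settings:` pattern used in the updated tests.

import podpac

# Scoped override: cache every node's output, as before this commit.
with podpac.settings:
    podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = True        # general Node outputs
    podpac.settings["CACHE_DATASOURCE_OUTPUT_DEFAULT"] = True  # DataSource outputs (still the default)
    # ... build and evaluate nodes here; their outputs are cached and re-used ...
# Leaving the `with` block restores the previous settings.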
18 changes: 9 additions & 9 deletions podpac/core/algorithm/test/test_stats.py
@@ -43,13 +43,13 @@ def test_auto_chunk(self):
node = Min(source=source)

with podpac.settings:
podpac.settings["CACHE_OUTPUT_DEFAULT"] = False
podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False
podpac.settings["CHUNK_SIZE"] = "auto"
node.eval(coords)

def test_chunked_fallback(self):
with podpac.settings:
podpac.settings["CACHE_OUTPUT_DEFAULT"] = False
podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False

class First(Reduce):
def reduce(self, x):
@@ -75,7 +75,7 @@ class BaseTests(object):

def test_full(self):
with podpac.settings:
podpac.settings["CACHE_OUTPUT_DEFAULT"] = False
podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False
podpac.settings["CHUNK_SIZE"] = None

node = self.NodeClass(source=source)
@@ -91,15 +91,15 @@ def test_full(self):
def test_full_chunked(self):
with podpac.settings:
node = self.NodeClass(source=source, dims=coords.dims)
podpac.settings["CACHE_OUTPUT_DEFAULT"] = False
podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False
podpac.settings["CHUNK_SIZE"] = 500
output = node.eval(coords)
# xr.testing.assert_allclose(output, self.expected_full)
np.testing.assert_allclose(output.data, self.expected_full.data)

def test_lat_lon(self):
with podpac.settings:
podpac.settings["CACHE_OUTPUT_DEFAULT"] = False
podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False
podpac.settings["CHUNK_SIZE"] = None
node = self.NodeClass(source=source, dims=["lat", "lon"])
output = node.eval(coords)
@@ -108,7 +108,7 @@ def test_lat_lon(self):

def test_lat_lon_chunked(self):
with podpac.settings:
podpac.settings["CACHE_OUTPUT_DEFAULT"] = False
podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False
podpac.settings["CHUNK_SIZE"] = 500
node = self.NodeClass(source=source, dims=["lat", "lon"])
output = node.eval(coords)
@@ -117,7 +117,7 @@ def test_lat_lon_chunked(self):

def test_time(self):
with podpac.settings:
podpac.settings["CACHE_OUTPUT_DEFAULT"] = False
podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False
podpac.settings["CHUNK_SIZE"] = None
node = self.NodeClass(source=source, dims="time")
output = node.eval(coords)
@@ -126,7 +126,7 @@ def test_time(self):

def test_time_chunked(self):
with podpac.settings:
podpac.settings["CACHE_OUTPUT_DEFAULT"] = False
podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False
podpac.settings["CHUNK_SIZE"] = 500
node = self.NodeClass(source=source, dims="time")
output = node.eval(coords)
@@ -135,7 +135,7 @@ def test_time_chunked(self):

def test_multiple_outputs(self):
with podpac.settings:
podpac.settings["CACHE_OUTPUT_DEFAULT"] = False
podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] = False
podpac.settings["CHUNK_SIZE"] = None
node = self.NodeClass(source=multisource, dims=["lat", "lon"])
output = node.eval(coords)
10 changes: 9 additions & 1 deletion podpac/core/data/datasource.py
@@ -144,7 +144,10 @@ class DataSource(Node):
Default is 'numpy'
cache_coordinates : bool
Whether to cache coordinates using the podpac ``cache_ctrl``. Default False.
cache_output : bool
Should the node's output be cached? If not provided or None, the default from
settings["CACHE_DATASOURCE_OUTPUT_DEFAULT"] is used. If True, outputs will be cached and retrieved from cache.
If False, outputs will not be cached OR retrieved from cache (even if they exist in cache).

Notes
-----
@@ -156,6 +159,7 @@ class DataSource(Node):

coordinate_index_type = tl.Enum(["slice", "list", "numpy"], default_value="numpy") # , "xarray", "pandas"],
cache_coordinates = tl.Bool(False)
cache_output = tl.Bool()

# privates
_interpolation = tl.Instance(Interpolation)
@@ -173,6 +177,10 @@ def _default_interpolation(self):
self._set_interpolation()
return self._interpolation

@tl.default("cache_output")
def _cache_output_default(self):
return settings["CACHE_DATASOURCE_OUTPUT_DEFAULT"]

# ------------------------------------------------------------------------------------------------------------------
# Properties
# ------------------------------------------------------------------------------------------------------------------
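The new trait can also be set per node rather than globally. A minimal sketch, assuming podpac.data.Array (and its native_coordinates attribute) as a convenient concrete DataSource; everything outside this diff is illustrative.

import numpy as np
import podpac

coords = podpac.Coordinates([podpac.clinspace(0, 1, 10, "lat"), podpac.clinspace(0, 1, 10, "lon")])

# By default, DataSource outputs are cached (settings["CACHE_DATASOURCE_OUTPUT_DEFAULT"] is True).
node = podpac.data.Array(source=np.random.rand(10, 10), native_coordinates=coords)
print(node.cache_output)  # True

# Per-node opt-out, without touching the global settings.
node_nocache = podpac.data.Array(source=np.random.rand(10, 10), native_coordinates=coords, cache_output=False)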
9 changes: 6 additions & 3 deletions podpac/core/node.py
@@ -93,7 +93,10 @@ class Node(tl.HasTraits):
Attributes
----------
cache_output: bool
Should the node's output be cached? If not provided or None, uses default based on settings.
Should the node's output be cached? If not provided or None, the default from settings is used
(CACHE_NODE_OUTPUT_DEFAULT for general Nodes, and CACHE_DATASOURCE_OUTPUT_DEFAULT for DataSource nodes).
If True, outputs will be cached and retrieved from cache. If False, outputs will not be cached OR retrieved
from cache (even if they exist in cache).
cache_update: bool
Default is True. Should the node's cached output be updated from the source data?
cache_ctrl: :class:`podpac.core.cache.cache.CacheCtrl`
@@ -157,7 +160,7 @@ def _validate_units(self, d):

@tl.default("cache_output")
def _cache_output_default(self):
return settings["CACHE_OUTPUT_DEFAULT"]
return settings["CACHE_NODE_OUTPUT_DEFAULT"]

@tl.default("cache_ctrl")
def _cache_ctrl_default(self):
@@ -969,7 +972,7 @@ def wrapper(self, coordinates, output=None):
key = cache_key
cache_coordinates = coordinates.transpose(*sorted(coordinates.dims)) # order agnostic caching

if not self.cache_update and self.has_cache(key, cache_coordinates):
if not self.cache_update and self.cache_output and self.has_cache(key, cache_coordinates):
data = self.get_cache(key, cache_coordinates)
if output is not None:
order = [dim for dim in output.dims if dim not in data.dims] + list(data.dims)
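For clarity, the retrieval guard added above, paraphrased as a standalone sketch (this is not the actual wrapper): the cache is consulted only when the node opts into output caching and is not forcing an update.

def _maybe_reuse_cached_output(node, key, cache_coordinates):
    # Sketch of the guard in node.py's caching wrapper (helper name is illustrative).
    # Skip the cache entirely when cache_output is False or cache_update forces recomputation.
    if not node.cache_update and node.cache_output and node.has_cache(key, cache_coordinates):
        return node.get_cache(key, cache_coordinates)
    return None  # fall through: evaluate the node and, if cache_output is True, cache the result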
7 changes: 5 additions & 2 deletions podpac/core/settings.py
@@ -35,7 +35,8 @@
"UNSAFE_EVAL_HASH": uuid.uuid4().hex, # unique id for running unsafe evaluations
# cache
"DEFAULT_CACHE": ["ram"],
"CACHE_OUTPUT_DEFAULT": True,
"CACHE_DATASOURCE_OUTPUT_DEFAULT": True,
"CACHE_NODE_OUTPUT_DEFAULT": False,
"RAM_CACHE_MAX_BYTES": 1e9, # ~1GB
"DISK_CACHE_MAX_BYTES": 10e9, # ~10GB
"S3_CACHE_MAX_BYTES": 10e9, # ~10GB
@@ -109,8 +110,10 @@ class PodpacSettings(dict):
Notification email for when AWS usage reaches 80% of the `AWS_BUDGET_AMOUNT`
DEFAULT_CACHE : list
Defines a default list of cache stores in priority order. Defaults to `['ram']`.
CACHE_OUTPUT_DEFAULT : bool
CACHE_NODE_OUTPUT_DEFAULT : bool
Default value for node ``cache_output`` trait. If True, the outputs of nodes (eval) will be automatically cached.
CACHE_DATASOURCE_OUTPUT_DEFAULT : bool
Default value for the ``cache_output`` trait of DataSource nodes. If True, the outputs of DataSource nodes (eval) will be automatically cached.
RAM_CACHE_MAX_BYTES : int
Maximum RAM cache size in bytes.
Note, for RAM cache only, the limit is applied to the total amount of RAM used by the python process;
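A quick sanity check of the shipped defaults after this change (assuming no saved settings file overrides them):

import podpac

# DataSource outputs are still cached by default; other node outputs are not.
assert podpac.settings["CACHE_DATASOURCE_OUTPUT_DEFAULT"] is True
assert podpac.settings["CACHE_NODE_OUTPUT_DEFAULT"] is False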
