fix testing issues and adjust to changes in tensorflow api post 2.0
jfischer committed Apr 6, 2020
1 parent 20046c5 commit 2379183
Showing 14 changed files with 104 additions and 42 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -4,7 +4,7 @@ python:
- "3.6"
install:
- pip install -r requirements.txt
- pip install mypy pyflakes jupyter
- pip install mypy pyflakes jupyter joblib sklearn
script:
- pip install --editable `pwd`
- cd tests; make install-rclone-deb test
6 changes: 4 additions & 2 deletions dataworkspaces/commands/config.py
@@ -1,6 +1,8 @@
# Copyright 2018,2019 by MPI-SWS and Data-ken Research. Licensed under Apache 2.0. See LICENSE.txt.
import click
from typing import Optional, Dict, Any, List

assert List # make pyflakes happy
from abc import ABCMeta, abstractmethod

from dataworkspaces.workspace import Workspace, Resource, LocalStateResourceMixin
@@ -96,8 +98,8 @@ class LocalResourceHandler(ParamConfigHandler):
def __init__(self, resource: Resource, workspace: Workspace):
assert isinstance(resource, LocalStateResourceMixin)
super().__init__(resource.get_local_params(), resource.param_defs.local_defs)
self.resource = resource
self.workspace = workspace
self.resource = resource # type: Resource
self.workspace = workspace # type: Workspace

def get_scope(self) -> str:
return "local"
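Note on the `assert List` idiom above: pyflakes cannot see names used only inside `# type:` comments, so a typing-only import is flagged as unused; asserting the name gives it a real use. A minimal standalone illustration (not from this diff):

from typing import List

assert List  # pyflakes does not see the use inside the "# type:" comment below

names = []  # type: List[str]
names.append("resource")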
3 changes: 2 additions & 1 deletion dataworkspaces/kits/scikit_learn.py
@@ -35,10 +35,11 @@
from dataworkspaces.kits.wrapper_utils import _DwsModelState, _add_to_hash

from .jupyter import is_notebook, get_step_name_for_notebook, get_notebook_directory

try:
import joblib
except ImportError as e:
raise ConfigurationError("Please install the joblib package (via \"pip install joblib\")") from e
raise ConfigurationError('Please install the joblib package (via "pip install joblib")') from e


def _load_dataset_file(dataset_path, filename):
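The quoting change above sits inside the kit's standard guard for optional dependencies: fail at import time with an actionable message rather than deep inside a call. As a self-contained sketch (the ConfigurationError import comes from the surrounding file):

from dataworkspaces.errors import ConfigurationError

try:
    import joblib
except ImportError as e:
    raise ConfigurationError('Please install the joblib package (via "pip install joblib")') from e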
63 changes: 46 additions & 17 deletions dataworkspaces/kits/tensorflow.py
@@ -135,6 +135,7 @@ def call(self, inputs):
from os.path import join, isdir, exists, basename
import re
import glob
from types import GeneratorType

import tensorflow

@@ -435,16 +436,14 @@ def add_lineage_to_keras_model_class(
* :func:`~compile` - captures the ``optimizer`` and ``loss_function`` parameter values
* :func:`~fit` - captures the ``epochs`` and ``batch_size`` parameter values;
if input is an API resource, capture hash values of training data, otherwise capture
input resource name.
* :func:`~fit_generator` - captures the ``epochs`` and ``steps_per_epoch`` parameter
values; if input is an API resource, wraps the generator and captures the hashes
of returned values from the generator as it is iterated through.
input resource name. If the input is an API resource and is either a Keras Sequence
or a generator, wraps the generator and captures the hashes of returned values as it
is iterated through.
* :func:`~evaluate` - captures the ``batch_size`` parameter value; if input is an
API resource, capture hash values of test data, otherwise capture input resource
name; capture metrics and write them to results resource.
* :func:`~evaluate_generator` - captures the ``steps`` parameter value; if input is
an API resource, wraps the generator and captures the hashes of returned values
from the generator as it is iterated through.
name; capture metrics and write them to results resource. If the input is an API resource
and is either a Keras Sequence or a generator, wraps the generator and captures
the hashes of returned values as it is iterated through.
"""
if hasattr(Cls, "_dws_model_wrap") and Cls._dws_model_wrap is True: # type: ignore
print("dws>> %s or a superclass is already wrapped" % Cls.__name__)
@@ -506,6 +505,8 @@ def compile(
)

def fit(self, x, y=None, **kwargs):
"""x, y can be arrays or x can be a generator.
"""
if "epochs" in kwargs:
self._dws_state.lineage.add_param("fit.epochs", kwargs["epochs"])
else:
@@ -519,10 +520,23 @@ def fit(self, x, y=None, **kwargs):
_verify_eager_if_dataset(x, y, api_resource)
api_resource.init_hash_state()
hash_state = api_resource.get_hash_state()
_add_to_hash(x, hash_state)
if y is not None:
_add_to_hash(y, hash_state)
api_resource.save_current_hash() # in case we evaluate in a separate process
if isinstance(x, kerasutils.Sequence):
if y is not None:
raise NotSupportedError(
"fit() method does not suppport a generator for x AND a y value"
)
x = _TfKerasSequenceWrapper(x, hash_state)
elif isinstance(x, GeneratorType):
if y is not None:
raise NotSupportedError(
"fit() method does not suppport a generator for x AND a y value"
)
x = _wrap_generator(x, hash_state)
else: # x and y are provided as full arrays
_add_to_hash(x, hash_state)
if y is not None:
_add_to_hash(y, hash_state)
api_resource.save_current_hash() # in case we evaluate in a separate process
if self.checkpoint_cb:
if "callbacks" in kwargs:
kwargs["callbacks"].append(self.checkpoint_cb)
@@ -597,13 +611,28 @@ def evaluate(self, x, y=None, **kwargs):
_verify_eager_if_dataset(x, y, api_resource)
api_resource.dup_hash_state()
hash_state = api_resource.get_hash_state()
_add_to_hash(x, hash_state)
if y is not None:
_add_to_hash(y, hash_state)
api_resource.save_current_hash()
api_resource.pop_hash_state()
if isinstance(x, kerasutils.Sequence):
if y is not None:
raise NotSupportedError(
"evaluate() method does not suppport a generator for x AND a y value"
)
x = _TfKerasSequenceWrapper(x, hash_state)
elif isinstance(x, GeneratorType):
if y is not None:
raise NotSupportedError(
"evaluate() method does not suppport a generator for x AND a y value"
)
x = _wrap_generator(x, hash_state)
else:
_add_to_hash(x, hash_state)
if y is not None:
_add_to_hash(y, hash_state)
results = super().evaluate(x, y, **kwargs)
assert len(results) == len(self.metrics_names)
if api_resource is not None:
api_resource.save_current_hash()
print("saved hash!!!") # XXX
api_resource.pop_hash_state()
self._dws_state.write_metrics_and_complete(
{n: v for (n, v) in zip(self.metrics_names, results)}
)
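Likewise, `_TfKerasSequenceWrapper` is referenced but not shown; a plausible sketch, assuming it delegates to the wrapped Sequence and hashes each batch as it is served (again using the module's `_add_to_hash`):

from tensorflow.keras import utils as kerasutils

class _TfKerasSequenceWrapper(kerasutils.Sequence):
    def __init__(self, seq, hash_state):
        self.seq = seq
        self.hash_state = hash_state

    def __len__(self):
        return len(self.seq)

    def __getitem__(self, idx):
        batch = self.seq[idx]  # typically an (x, y) tuple
        for component in batch:
            _add_to_hash(component, self.hash_state)
        return batch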
3 changes: 3 additions & 0 deletions dataworkspaces/resources/api_resource.py
@@ -1,6 +1,9 @@
import os
from os.path import join, exists
from typing import Tuple, Optional, List, Any

assert List # make pyflakes happy
assert Any
import hashlib

import click
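The `init_hash_state` / `dup_hash_state` / `get_hash_state` / `pop_hash_state` / `save_current_hash` calls used by the tensorflow kit above suggest a stack of hashlib states, which lets evaluate() hash without disturbing fit()'s accumulated state. A hypothetical sketch (names mirror the calls above; the real implementation is not shown in this diff):

import hashlib

class HashStateStack:
    def __init__(self):
        self.stack = []

    def init_hash_state(self):
        self.stack = [hashlib.sha1()]

    def dup_hash_state(self):
        # hashlib objects support copy(), so a duplicate can be hashed
        # and later discarded via pop_hash_state()
        self.stack.append(self.stack[-1].copy())

    def get_hash_state(self):
        return self.stack[-1]

    def pop_hash_state(self):
        self.stack.pop()

    def save_current_hash(self):
        # the real resource persists this to local state; here we just show it
        print("current hash:", self.stack[-1].hexdigest())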
4 changes: 3 additions & 1 deletion dataworkspaces/third_party/git_fat.py
@@ -7,6 +7,7 @@
from os.path import dirname, abspath, expanduser, isfile, join, exists, curdir
import subprocess
import click
from typing import Optional, Dict, Any

from dataworkspaces.errors import ConfigurationError, InternalError

@@ -61,7 +62,8 @@ def run_git_fat(python2_exe, args, cwd=curdir, verbose=False):
cmd = [python2_exe, fat_script]+args
if verbose:
click.echo("%s from %s" % (' '.join(cmd), cwd))
env = os.environ.copy()
env = os.environ.copy() # type: Optional[Dict[str,Any]]
assert env is not None
env['GIT_FAT_VERBOSE'] = "1"
else:
env = None
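The change above is a common mypy pattern: the `# type:` comment declares `env` as Optional so both branches type-check, and the assert narrows it back to a dict before indexing. In isolation:

import os
from typing import Any, Dict, Optional

verbose = True
if verbose:
    env = os.environ.copy()  # type: Optional[Dict[str, Any]]
    assert env is not None   # narrows Optional[...] for mypy before indexing
    env['GIT_FAT_VERBOSE'] = "1"
else:
    env = None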
6 changes: 5 additions & 1 deletion dataworkspaces/utils/lineage_utils.py
@@ -113,7 +113,9 @@ def validate_json_keys(obj, classobj, keys, filename=None):
if key not in obj:
raise JsonKeyError(classobj, key, filename=filename)


# Note: This is using Python 3.6+ syntax and is incompatible for 3.5.
# We could backport to 3.5 if there's demand, but there's other code that
# currently makes 3.6+ assumptions.
class ResourceRef(NamedTuple):
"""A namedtuple that is used to identify an input or output of a step.
The ``name`` parameter is the name of a resource. The optional
@@ -1790,6 +1792,8 @@ def make_simplified_lineage_graph_for_resource(
width=1024,
height=800,
) -> None:
"""In this graph, the nodes are resources and the edges are steps.
"""
nodes = [] # type: List[Dict[str, Any]]
links = [] # type: List[Dict[str, Any]]

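The class-based NamedTuple syntax referenced in the new comment is what keeps this module at 3.6+; a sketch of both forms (the `subpath` field name is an assumption for illustration):

from typing import NamedTuple, Optional

# 3.6+ class syntax, as used by ResourceRef:
class ResourceRef(NamedTuple):
    name: str
    subpath: Optional[str] = None

# 3.5-compatible functional form, if a backport were ever needed:
ResourceRef35 = NamedTuple('ResourceRef35',
                           [('name', str), ('subpath', Optional[str])])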
1 change: 1 addition & 0 deletions setup.py
@@ -18,6 +18,7 @@
url="https://github.com/data-workspaces/data-workspaces-core",
packages=find_packages(),
include_package_data=True, # needed for copying data files at install time
python_requires=">=3.6",
install_requires=[
'click',
'requests',
15 changes: 15 additions & 0 deletions tests/dws-test-environment.yml
@@ -0,0 +1,15 @@
# Defines a conda environment which includes all the dependencies
# needed to run all the tests (none skipped).
# Note that dws itself is not installed, so you can install the
# local in-development copy.
name: dws-test-environment
dependencies:
- python=3.6.*
- jupyter
- matplotlib
- pandas
- scikit-learn
- pip
- pip:
- tensorflow>=2.0
- pytest
2 changes: 1 addition & 1 deletion tests/test_jupyter_kit.py
@@ -30,7 +30,7 @@
ERROR = e
JUPYTER=None

@unittest.skipUnless(JUPYTER is not None, "No Jupyter install found: %s"%ERROR)
@unittest.skipUnless(JUPYTER is not None, "SKIP: No Jupyter install found: %s"%ERROR)
class TestJupyterKit(unittest.TestCase):
def setUp(self):
if exists(TEMPDIR):
4 changes: 2 additions & 2 deletions tests/test_lineage_utils.py
@@ -74,7 +74,7 @@ def test_cert_equality(self):



class TestStoreMixin(metaclass=ABCMeta):
class TstStoreMixin(metaclass=ABCMeta):
"""This is a mixin for testing a LineageStore. It is independent of
the implementation. You can use this as a building block to
implement implementation-specific tests
@@ -379,7 +379,7 @@ def check_for_cert(ref, hashval):
SNAPSHOT1_DIR=os.path.join(SNAPSHOT_DIR, 'snapshot1')
SNAPSHOT2_DIR=os.path.join(SNAPSHOT_DIR, 'snapshot2')

class TestFileLineageStore(unittest.TestCase, TestStoreMixin):
class TestFileLineageStore(unittest.TestCase, TstStoreMixin):
"""Tests for the lineage store api file-based implementation"""
def setUp(self):
if os.path.exists(TEMPDIR):
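The rename from TestStoreMixin to TstStoreMixin matters because pytest collects any class whose name matches Test*, so the abstract mixin was being run on its own. Minimal illustration:

import unittest
from abc import ABCMeta

class TstStoreMixin(metaclass=ABCMeta):  # not collected: name is not Test*
    def test_roundtrip(self):
        self.assertTrue(True)

class TestFileLineageStore(unittest.TestCase, TstStoreMixin):
    pass  # inherits test_roundtrip and runs it exactly once, here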
9 changes: 7 additions & 2 deletions tests/test_sklearn_kit.py
@@ -9,6 +9,11 @@
SKLEARN_INSTALLED=True
except ImportError:
SKLEARN_INSTALLED=False
try:
import joblib
JOBLIB_INSTALLED=True
except ImportError:
JOBLIB_INSTALLED=False

class TestSklearnKit(SimpleCase):
def _add_digits_dataset(self):
@@ -20,7 +25,6 @@ def _add_digits_dataset(self):
def wrapper_tc(self, model_save_file):
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
import dataworkspaces.kits.scikit_learn as skkit
self._setup_initial_repo(git_resources='code,results')
self._add_digits_dataset()
@@ -50,7 +54,8 @@ def wrapper_tc(self, model_save_file):
self.assertAlmostEqual(score, 0.9688, 3,
"Score of %s not almost equal to 0.9688" % score)

@unittest.skipUnless(SKLEARN_INSTALLED, "Sklearn not available")
@unittest.skipUnless(SKLEARN_INSTALLED, "SKIP: Sklearn not available")
@unittest.skipUnless(JOBLIB_INSTALLED, "SKIP: joblib not available")
def test_wrapper(self):
self.wrapper_tc('digits.joblib')

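The switch away from `from sklearn.externals import joblib` tracks scikit-learn's deprecation of its vendored copy; the standalone joblib package has the same dump/load API. A self-contained sketch:

import joblib
from sklearn.datasets import load_digits
from sklearn.svm import SVC

digits = load_digits()
model = SVC(gamma=0.001).fit(digits.data, digits.target)
joblib.dump(model, 'digits.joblib')       # replaces sklearn.externals.joblib
restored = joblib.load('digits.joblib')
print(restored.score(digits.data, digits.target))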
20 changes: 10 additions & 10 deletions tests/test_tensorflow.py
@@ -57,7 +57,7 @@ def tearDown(self):
def _take_snapshot(self):
self._run_dws(['snapshot', 'S1'], cwd=WS_DIR)

@unittest.skipUnless(TF_INSTALLED, "Tensorflow not available")
@unittest.skipUnless(TF_INSTALLED, "SKIP: Tensorflow not available")
def test_wrapper_for_numpy(self):
"""This test follows the basic classification tutorial.
"""
@@ -95,9 +95,9 @@ def test_wrapper_for_numpy(self):
self.assertAlmostEqual(test_loss, data['metrics']['loss'])
self._take_snapshot()

@unittest.skipUnless(TF_INSTALLED, "Tensorflow not available")
@unittest.skipUnless(NUMPY_INSTALLED, "numpy not installed")
@unittest.skipUnless(PANDAS_INSTALLED, 'pandas not available')
@unittest.skipUnless(TF_INSTALLED, "SKIP: Tensorflow not available")
@unittest.skipUnless(NUMPY_INSTALLED, "SKIP: numpy not installed")
@unittest.skipUnless(PANDAS_INSTALLED, 'SKIP: pandas not available')
def test_wrapper_for_dataset(self):
"""This follows the csv tutorial (titanic data set)
"""
@@ -213,7 +213,7 @@ def normalize_numeric_data(data, mean, std):
self.assertAlmostEqual(test_loss, data['metrics']['loss'])
self._take_snapshot()

@unittest.skipUnless(TF_INSTALLED, "Tensorflow not available")
@unittest.skipUnless(TF_INSTALLED, "SKIP: Tensorflow not available")
def test_wrapper_for_generators(self):
"""This test follows the basic classification tutorial, modified for using
the fit_generator() and eval_generator() methods.
@@ -244,9 +244,9 @@ def test_wrapper_for_generators(self):
metrics=['accuracy'])
g = generator_from_arrays(train_images, train_labels)
self.assertTrue(inspect.isgenerator(g))
model.fit_generator(g, epochs=5, steps_per_epoch=2)
model.fit(g, epochs=5, steps_per_epoch=2)
g2 = generator_from_arrays(test_images, test_labels)
test_loss, test_acc = model.evaluate_generator(g2, steps=len(test_labels), verbose=2)
test_loss, test_acc = model.evaluate(g2, steps=len(test_labels), verbose=2)
print("test accuracy: %s" % test_acc)
results_file = join(WS_DIR, 'results/results.json')
self.assertTrue(exists(results_file), "missing file %s" % results_file)
@@ -256,7 +256,7 @@
self.assertAlmostEqual(test_loss, data['metrics']['loss'])
self._take_snapshot()

@unittest.skipUnless(TF_INSTALLED, "Tensorflow not available")
@unittest.skipUnless(TF_INSTALLED, "SKIP: Tensorflow not available")
def test_wrapper_for_keras_sequence(self):
"""This test follows the basic classification tutorial, modified for using
the fit_generator() and eval_generator() methods.
@@ -304,9 +304,9 @@ def __len__(self):
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
g = KSequence(train_images, train_labels)
model.fit_generator(g, epochs=5, steps_per_epoch=2)
model.fit(g, epochs=5, steps_per_epoch=2)
g2 = KSequence(test_images, test_labels)
test_loss, test_acc = model.evaluate_generator(g2, steps=len(test_labels), verbose=2)
test_loss, test_acc = model.evaluate(g2, steps=len(test_labels), verbose=2)
print("test accuracy: %s" % test_acc)
results_file = join(WS_DIR, 'results/results.json')
self.assertTrue(exists(results_file), "missing file %s" % results_file)
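The fit_generator()/evaluate_generator() to fit()/evaluate() change tracks TensorFlow 2.x, where Model.fit() and Model.evaluate() accept generators and keras.utils.Sequence objects directly and the *_generator variants are deprecated. A standalone sketch of the new-style call (dummy data, assumed shapes):

import numpy as np
import tensorflow as tf

def generator_from_arrays(x, y, batch=32):
    while True:  # Keras expects generators to loop indefinitely
        for i in range(0, len(x), batch):
            yield x[i:i + batch], y[i:i + batch]

model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(10, activation='softmax'),
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
g = generator_from_arrays(np.zeros((64, 28, 28)), np.zeros(64))
model.fit(g, epochs=1, steps_per_epoch=2)  # was: model.fit_generator(g, ...)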
8 changes: 4 additions & 4 deletions tests/test_wrapper_utils.py
@@ -44,20 +44,20 @@ def test_pandas_df(self):
_add_to_hash(df, self.hash_state)
print(self.hash_state.hexdigest())

@unittest.skipUnless(pandas is not None, 'Pandas not available')
@unittest.skipUnless(pandas is not None, 'SKIP: Pandas not available')
def test_pandas_series(self):
s = pandas.Series([1,0,0,1,1], name='y')
_add_to_hash(s, self.hash_state)
print(self.hash_state.hexdigest())

@unittest.skipUnless(numpy is not None, "Numpy not available")
@unittest.skipUnless(numpy is not None, "SKIP: Numpy not available")
def test_numpy(self):
a = numpy.arange(45)
_add_to_hash(a, self.hash_state)
print(self.hash_state.hexdigest())

@unittest.skipUnless(tensorflow is not None and TF_VERSION==2, "Rensorflow 2 not availabile")
@unittest.skipUnless(numpy is not None, "Numpy is not available")
@unittest.skipUnless(tensorflow is not None and TF_VERSION==2, "SKIP: Tensorflow 2 not available")
@unittest.skipUnless(numpy is not None, "SKIP: Numpy is not available")
def test_tensorflow_tensor(self):
dataset = tensorflow.data.Dataset.from_tensor_slices(numpy.arange(100).reshape((10,10)))
for i in dataset:
Expand Down
