fix testing issues and adjust to changes in tensorflow api post 2.0
jfischer committed Apr 6, 2020
1 parent 20046c5 commit 2379183
Showing 14 changed files with 104 additions and 42 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -4,7 +4,7 @@ python:
- "3.6"
install:
- pip install -r requirements.txt
- pip install mypy pyflakes jupyter
- pip install mypy pyflakes jupyter joblib sklearn
script:
- pip install --editable `pwd`
- cd tests; make install-rclone-deb test
6 changes: 4 additions & 2 deletions dataworkspaces/commands/config.py
@@ -1,6 +1,8 @@
# Copyright 2018,2019 by MPI-SWS and Data-ken Research. Licensed under Apache 2.0. See LICENSE.txt.
import click
from typing import Optional, Dict, Any, List

assert List # make pyflakes happy
from abc import ABCMeta, abstractmethod

from dataworkspaces.workspace import Workspace, Resource, LocalStateResourceMixin
@@ -96,8 +98,8 @@ class LocalResourceHandler(ParamConfigHandler):
def __init__(self, resource: Resource, workspace: Workspace):
assert isinstance(resource, LocalStateResourceMixin)
super().__init__(resource.get_local_params(), resource.param_defs.local_defs)
self.resource = resource
self.workspace = workspace
self.resource = resource # type: Resource
self.workspace = workspace # type: Workspace

def get_scope(self) -> str:
return "local"
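Note on the `assert List` idiom above: pyflakes cannot see names used only inside `# type:` comments, so a typing-only import is flagged as unused; asserting the name gives it a real use. A minimal standalone illustration (not from this diff):

from typing import List

assert List  # pyflakes does not see the use inside the "# type:" comment below

names = []  # type: List[str]
names.append("resource")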
3 changes: 2 additions & 1 deletion dataworkspaces/kits/scikit_learn.py
@@ -35,10 +35,11 @@
from dataworkspaces.kits.wrapper_utils import _DwsModelState, _add_to_hash

from .jupyter import is_notebook, get_step_name_for_notebook, get_notebook_directory

try:
import joblib
except ImportError as e:
raise ConfigurationError("Please install the joblib package (via \"pip install joblib\")") from e
raise ConfigurationError('Please install the joblib package (via "pip install joblib")') from e


def _load_dataset_file(dataset_path, filename):
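The quoting change above sits inside the kit's standard guard for optional dependencies: fail at import time with an actionable message rather than deep inside a call. As a self-contained sketch (the ConfigurationError import comes from the surrounding file):

from dataworkspaces.errors import ConfigurationError

try:
    import joblib
except ImportError as e:
    raise ConfigurationError('Please install the joblib package (via "pip install joblib")') from e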
63 changes: 46 additions & 17 deletions dataworkspaces/kits/tensorflow.py
@@ -135,6 +135,7 @@ def call(self, inputs):
from os.path import join, isdir, exists, basename
import re
import glob
from types import GeneratorType

import tensorflow

@@ -435,16 +436,14 @@ def add_lineage_to_keras_model_class(
* :func:`~compile` - captures the ``optimizer`` and ``loss_function`` parameter values
* :func:`~fit` - captures the ``epochs`` and ``batch_size`` parameter values;
if input is an API resource, capture hash values of training data, otherwise capture
input resource name.
* :func:`~fit_generator` - captures the ``epochs`` and ``steps_per_epoch`` parameter
values; if input is an API resource, wraps the generator and captures the hashes
of returned values from the generator as it is iterated through.
input resource name. If the input is an API resource and is either a Keras Sequence
or a generator, wraps the generator and captures the hashes of returned values as it
is iterated through.
* :func:`~evaluate` - captures the ``batch_size`` parameter value; if input is an
API resource, capture hash values of test data, otherwise capture input resource
name; capture metrics and write them to results resource.
* :func:`~evaluate_generator` - captures the ``steps`` parameter value; if input is
an API resource, wraps the generator and captures the hashes of returned values
from the generator as it is iterated through.
name; capture metrics and write them to results resource. If the input is an API resource
and is either a Keras Sequence or a generator, wraps the generator and captures
the hashes of returned values as it is iterated through.
"""
if hasattr(Cls, "_dws_model_wrap") and Cls._dws_model_wrap is True: # type: ignore
print("dws>> %s or a superclass is already wrapped" % Cls.__name__)
@@ -506,6 +505,8 @@ def compile(
)

def fit(self, x, y=None, **kwargs):
"""x, y can be arrays or x can be a generator.
"""
if "epochs" in kwargs:
self._dws_state.lineage.add_param("fit.epochs", kwargs["epochs"])
else:
@@ -519,10 +520,23 @@ def fit(self, x, y=None, **kwargs):
_verify_eager_if_dataset(x, y, api_resource)
api_resource.init_hash_state()
hash_state = api_resource.get_hash_state()
_add_to_hash(x, hash_state)
if y is not None:
_add_to_hash(y, hash_state)
api_resource.save_current_hash() # in case we evaluate in a separate process
if isinstance(x, kerasutils.Sequence):
if y is not None:
raise NotSupportedError(
"fit() method does not suppport a generator for x AND a y value"
)
x = _TfKerasSequenceWrapper(x, hash_state)
elif isinstance(x, GeneratorType):
if y is not None:
raise NotSupportedError(
"fit() method does not suppport a generator for x AND a y value"
)
x = _wrap_generator(x, hash_state)
else: # x and y are provided as full arrays
_add_to_hash(x, hash_state)
if y is not None:
_add_to_hash(y, hash_state)
api_resource.save_current_hash() # in case we evaluate in a separate process
if self.checkpoint_cb:
if "callbacks" in kwargs:
kwargs["callbacks"].append(self.checkpoint_cb)
@@ -597,13 +611,28 @@ def evaluate(self, x, y=None, **kwargs):
_verify_eager_if_dataset(x, y, api_resource)
api_resource.dup_hash_state()
hash_state = api_resource.get_hash_state()
_add_to_hash(x, hash_state)
if y is not None:
_add_to_hash(y, hash_state)
api_resource.save_current_hash()
api_resource.pop_hash_state()
if isinstance(x, kerasutils.Sequence):
if y is not None:
raise NotSupportedError(
"evaluate() method does not suppport a generator for x AND a y value"
)
x = _TfKerasSequenceWrapper(x, hash_state)
elif isinstance(x, GeneratorType):
if y is not None:
raise NotSupportedError(
"evaluate() method does not suppport a generator for x AND a y value"
)
x = _wrap_generator(x, hash_state)
else:
_add_to_hash(x, hash_state)
if y is not None:
_add_to_hash(y, hash_state)
results = super().evaluate(x, y, **kwargs)
assert len(results) == len(self.metrics_names)
if api_resource is not None:
api_resource.save_current_hash()
print("saved hash!!!") # XXX
api_resource.pop_hash_state()
self._dws_state.write_metrics_and_complete(
{n: v for (n, v) in zip(self.metrics_names, results)}
)
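Likewise, `_TfKerasSequenceWrapper` is referenced but not shown; a plausible sketch, assuming it delegates to the wrapped Sequence and hashes each batch as it is served (again using the module's `_add_to_hash`):

from tensorflow.keras import utils as kerasutils

class _TfKerasSequenceWrapper(kerasutils.Sequence):
    def __init__(self, seq, hash_state):
        self.seq = seq
        self.hash_state = hash_state

    def __len__(self):
        return len(self.seq)

    def __getitem__(self, idx):
        batch = self.seq[idx]  # typically an (x, y) tuple
        for component in batch:
            _add_to_hash(component, self.hash_state)
        return batch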
3 changes: 3 additions & 0 deletions dataworkspaces/resources/api_resource.py
@@ -1,6 +1,9 @@
import os
from os.path import join, exists
from typing import Tuple, Optional, List, Any

assert List # make pyflakes happy
assert Any
import hashlib

import click
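The `init_hash_state` / `dup_hash_state` / `get_hash_state` / `pop_hash_state` / `save_current_hash` calls used by the tensorflow kit above suggest a stack of hashlib states, which lets evaluate() hash without disturbing fit()'s accumulated state. A hypothetical sketch (names mirror the calls above; the real implementation is not shown in this diff):

import hashlib

class HashStateStack:
    def __init__(self):
        self.stack = []

    def init_hash_state(self):
        self.stack = [hashlib.sha1()]

    def dup_hash_state(self):
        # hashlib objects support copy(), so a duplicate can be hashed
        # and later discarded via pop_hash_state()
        self.stack.append(self.stack[-1].copy())

    def get_hash_state(self):
        return self.stack[-1]

    def pop_hash_state(self):
        self.stack.pop()

    def save_current_hash(self):
        # the real resource persists this to local state; here we just show it
        print("current hash:", self.stack[-1].hexdigest())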
4 changes: 3 additions & 1 deletion dataworkspaces/third_party/git_fat.py
@@ -7,6 +7,7 @@
from os.path import dirname, abspath, expanduser, isfile, join, exists, curdir
import subprocess
import click
from typing import Optional, Dict, Any

from dataworkspaces.errors import ConfigurationError, InternalError

@@ -61,7 +62,8 @@ def run_git_fat(python2_exe, args, cwd=curdir, verbose=False):
cmd = [python2_exe, fat_script]+args
if verbose:
click.echo("%s from %s" % (' '.join(cmd), cwd))
env = os.environ.copy()
env = os.environ.copy() # type: Optional[Dict[str,Any]]
assert env is not None
env['GIT_FAT_VERBOSE'] = "1"
else:
env = None
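The change above is a common mypy pattern: the `# type:` comment declares `env` as Optional so both branches type-check, and the assert narrows it back to a dict before indexing. In isolation:

import os
from typing import Any, Dict, Optional

verbose = True
if verbose:
    env = os.environ.copy()  # type: Optional[Dict[str, Any]]
    assert env is not None   # narrows Optional[...] for mypy before indexing
    env['GIT_FAT_VERBOSE'] = "1"
else:
    env = None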
6 changes: 5 additions & 1 deletion dataworkspaces/utils/lineage_utils.py
@@ -113,7 +113,9 @@ def validate_json_keys(obj, classobj, keys, filename=None):
if key not in obj:
raise JsonKeyError(classobj, key, filename=filename)


# Note: This is using Python 3.6+ syntax and is incompatible for 3.5.
# We could backport to 3.5 if there's demand, but there's other code that
# currently makes 3.6+ assumptions.
class ResourceRef(NamedTuple):
"""A namedtuple that is used to identify an input or output of a step.
The ``name`` parameter is the name of a resource. The optional
@@ -1790,6 +1792,8 @@ def make_simplified_lineage_graph_for_resource(
width=1024,
height=800,
) -> None:
"""In this graph, the nodes are resources and the edges are steps.
"""
nodes = [] # type: List[Dict[str, Any]]
links = [] # type: List[Dict[str, Any]]

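The class-based NamedTuple syntax referenced in the new comment is what keeps this module at 3.6+; a sketch of both forms (the `subpath` field name is an assumption for illustration):

from typing import NamedTuple, Optional

# 3.6+ class syntax, as used by ResourceRef:
class ResourceRef(NamedTuple):
    name: str
    subpath: Optional[str] = None

# 3.5-compatible functional form, if a backport were ever needed:
ResourceRef35 = NamedTuple('ResourceRef35',
                           [('name', str), ('subpath', Optional[str])])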
1 change: 1 addition & 0 deletions setup.py
@@ -18,6 +18,7 @@
url="https://github.com/data-workspaces/data-workspaces-core",
packages=find_packages(),
include_package_data=True, # needed for copying data files at install time
python_requires=">=3.6",
install_requires=[
'click',
'requests',
15 changes: 15 additions & 0 deletions tests/dws-test-environment.yml
@@ -0,0 +1,15 @@
# Defines a conda environment which includes all the dependencies
# needed to run all the tests (none skipped).
# Note that dws itself is not installed, so you can install the
# local in-development copy.
name: dws-test-environment
dependencies:
- python=3.6.*
- jupyter
- matplotlib
- pandas
- scikit-learn
- pip
- pip:
- tensorflow>=2.0
- pytest
2 changes: 1 addition & 1 deletion tests/test_jupyter_kit.py
@@ -30,7 +30,7 @@
ERROR = e
JUPYTER=None

@unittest.skipUnless(JUPYTER is not None, "No Jupyter install found: %s"%ERROR)
@unittest.skipUnless(JUPYTER is not None, "SKIP: No Jupyter install found: %s"%ERROR)
class TestJupyterKit(unittest.TestCase):
def setUp(self):
if exists(TEMPDIR):
4 changes: 2 additions & 2 deletions tests/test_lineage_utils.py
@@ -74,7 +74,7 @@ def test_cert_equality(self):



class TestStoreMixin(metaclass=ABCMeta):
class TstStoreMixin(metaclass=ABCMeta):
"""This is a mixin for testing a LineageStore. It is independent of
the implementation. You can use this as a building block to
implement implementation-specific tests
@@ -379,7 +379,7 @@ def check_for_cert(ref, hashval):
SNAPSHOT1_DIR=os.path.join(SNAPSHOT_DIR, 'snapshot1')
SNAPSHOT2_DIR=os.path.join(SNAPSHOT_DIR, 'snapshot2')

class TestFileLineageStore(unittest.TestCase, TestStoreMixin):
class TestFileLineageStore(unittest.TestCase, TstStoreMixin):
"""Tests for the lineage store api file-based implementation"""
def setUp(self):
if os.path.exists(TEMPDIR):
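The rename from TestStoreMixin to TstStoreMixin matters because pytest collects any class whose name matches Test*, so the abstract mixin was being run on its own. Minimal illustration:

import unittest
from abc import ABCMeta

class TstStoreMixin(metaclass=ABCMeta):  # not collected: name is not Test*
    def test_roundtrip(self):
        self.assertTrue(True)

class TestFileLineageStore(unittest.TestCase, TstStoreMixin):
    pass  # inherits test_roundtrip and runs it exactly once, here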
9 changes: 7 additions & 2 deletions tests/test_sklearn_kit.py
@@ -9,6 +9,11 @@
SKLEARN_INSTALLED=True
except ImportError:
SKLEARN_INSTALLED=False
try:
import joblib
JOBLIB_INSTALLED=True
except ImportError:
JOBLIB_INSTALLED=False

class TestSklearnKit(SimpleCase):
def _add_digits_dataset(self):
@@ -20,7 +25,6 @@ def _add_digits_dataset(self):
def wrapper_tc(self, model_save_file):
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
import dataworkspaces.kits.scikit_learn as skkit
self._setup_initial_repo(git_resources='code,results')
self._add_digits_dataset()
@@ -50,7 +54,8 @@ def wrapper_tc(self, model_save_file):
self.assertAlmostEqual(score, 0.9688, 3,
"Score of %s not almost equal to 0.9688" % score)

@unittest.skipUnless(SKLEARN_INSTALLED, "Sklearn not available")
@unittest.skipUnless(SKLEARN_INSTALLED, "SKIP: Sklearn not available")
@unittest.skipUnless(JOBLIB_INSTALLED, "SKIP: joblib not available")
def test_wrapper(self):
self.wrapper_tc('digits.joblib')

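The switch away from `from sklearn.externals import joblib` tracks scikit-learn's deprecation of its vendored copy; the standalone joblib package has the same dump/load API. A self-contained sketch:

import joblib
from sklearn.datasets import load_digits
from sklearn.svm import SVC

digits = load_digits()
model = SVC(gamma=0.001).fit(digits.data, digits.target)
joblib.dump(model, 'digits.joblib')       # replaces sklearn.externals.joblib
restored = joblib.load('digits.joblib')
print(restored.score(digits.data, digits.target))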
20 changes: 10 additions & 10 deletions tests/test_tensorflow.py
@@ -57,7 +57,7 @@ def tearDown(self):
def _take_snapshot(self):
self._run_dws(['snapshot', 'S1'], cwd=WS_DIR)

@unittest.skipUnless(TF_INSTALLED, "Tensorflow not available")
@unittest.skipUnless(TF_INSTALLED, "SKIP: Tensorflow not available")
def test_wrapper_for_numpy(self):
"""This test follows the basic classification tutorial.
"""
@@ -95,9 +95,9 @@ def test_wrapper_for_numpy(self):
self.assertAlmostEqual(test_loss, data['metrics']['loss'])
self._take_snapshot()

@unittest.skipUnless(TF_INSTALLED, "Tensorflow not available")
@unittest.skipUnless(NUMPY_INSTALLED, "numpy not installed")
@unittest.skipUnless(PANDAS_INSTALLED, 'pandas not available')
@unittest.skipUnless(TF_INSTALLED, "SKIP: Tensorflow not available")
@unittest.skipUnless(NUMPY_INSTALLED, "SKIP: numpy not installed")
@unittest.skipUnless(PANDAS_INSTALLED, 'SKIP: pandas not available')
def test_wrapper_for_dataset(self):
"""This follows the csv tutorial (titanic data set)
"""
@@ -213,7 +213,7 @@ def normalize_numeric_data(data, mean, std):
self.assertAlmostEqual(test_loss, data['metrics']['loss'])
self._take_snapshot()

@unittest.skipUnless(TF_INSTALLED, "Tensorflow not available")
@unittest.skipUnless(TF_INSTALLED, "SKIP: Tensorflow not available")
def test_wrapper_for_generators(self):
"""This test follows the basic classification tutorial, modified for using
the fit_generator() and eval_generator() methods.
@@ -244,9 +244,9 @@ def test_wrapper_for_generators(self):
metrics=['accuracy'])
g = generator_from_arrays(train_images, train_labels)
self.assertTrue(inspect.isgenerator(g))
model.fit_generator(g, epochs=5, steps_per_epoch=2)
model.fit(g, epochs=5, steps_per_epoch=2)
g2 = generator_from_arrays(test_images, test_labels)
test_loss, test_acc = model.evaluate_generator(g2, steps=len(test_labels), verbose=2)
test_loss, test_acc = model.evaluate(g2, steps=len(test_labels), verbose=2)
print("test accuracy: %s" % test_acc)
results_file = join(WS_DIR, 'results/results.json')
self.assertTrue(exists(results_file), "missing file %s" % results_file)
@@ -256,7 +256,7 @@
self.assertAlmostEqual(test_loss, data['metrics']['loss'])
self._take_snapshot()

@unittest.skipUnless(TF_INSTALLED, "Tensorflow not available")
@unittest.skipUnless(TF_INSTALLED, "SKIP: Tensorflow not available")
def test_wrapper_for_keras_sequence(self):
"""This test follows the basic classification tutorial, modified for using
the fit_generator() and eval_generator() methods.
@@ -304,9 +304,9 @@ def __len__(self):
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
g = KSequence(train_images, train_labels)
model.fit_generator(g, epochs=5, steps_per_epoch=2)
model.fit(g, epochs=5, steps_per_epoch=2)
g2 = KSequence(test_images, test_labels)
test_loss, test_acc = model.evaluate_generator(g2, steps=len(test_labels), verbose=2)
test_loss, test_acc = model.evaluate(g2, steps=len(test_labels), verbose=2)
print("test accuracy: %s" % test_acc)
results_file = join(WS_DIR, 'results/results.json')
self.assertTrue(exists(results_file), "missing file %s" % results_file)
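The fit_generator()/evaluate_generator() to fit()/evaluate() change tracks TensorFlow 2.x, where Model.fit() and Model.evaluate() accept generators and keras.utils.Sequence objects directly and the *_generator variants are deprecated. A standalone sketch of the new-style call (dummy data, assumed shapes):

import numpy as np
import tensorflow as tf

def generator_from_arrays(x, y, batch=32):
    while True:  # Keras expects generators to loop indefinitely
        for i in range(0, len(x), batch):
            yield x[i:i + batch], y[i:i + batch]

model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(10, activation='softmax'),
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
g = generator_from_arrays(np.zeros((64, 28, 28)), np.zeros(64))
model.fit(g, epochs=1, steps_per_epoch=2)  # was: model.fit_generator(g, ...)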
8 changes: 4 additions & 4 deletions tests/test_wrapper_utils.py
@@ -44,20 +44,20 @@ def test_pandas_df(self):
_add_to_hash(df, self.hash_state)
print(self.hash_state.hexdigest())

@unittest.skipUnless(pandas is not None, 'Pandas not available')
@unittest.skipUnless(pandas is not None, 'SKIP: Pandas not available')
def test_pandas_series(self):
s = pandas.Series([1,0,0,1,1], name='y')
_add_to_hash(s, self.hash_state)
print(self.hash_state.hexdigest())

@unittest.skipUnless(numpy is not None, "Numpy not available")
@unittest.skipUnless(numpy is not None, "SKIP: Numpy not available")
def test_numpy(self):
a = numpy.arange(45)
_add_to_hash(a, self.hash_state)
print(self.hash_state.hexdigest())

@unittest.skipUnless(tensorflow is not None and TF_VERSION==2, "Rensorflow 2 not availabile")
@unittest.skipUnless(numpy is not None, "Numpy is not available")
@unittest.skipUnless(tensorflow is not None and TF_VERSION==2, "SKIP: Tensorflow 2 not available")
@unittest.skipUnless(numpy is not None, "SKIP: Numpy is not available")
def test_tensorflow_tensor(self):
dataset = tensorflow.data.Dataset.from_tensor_slices(numpy.arange(100).reshape((10,10)))
for i in dataset:
Expand Down
