Additional checkpoint tests with distribution strategy.

Summary:
- Use virtual devices API to do multi-GPU testing even with a single GPU.
- Make assert_consumed explicit in TestCheckpoint#restore_latest.
- Test restore from golden.
- Test restore from non-distributed model.
- Test save/restore cycle.
- Test restore on create.
- Test restore on create in replica context.

PiperOrigin-RevId: 247986699
Change-Id: I46f3da2ae1201eddd779a5634ef147be32e38f92
google-deepmind · May 13, 2019 · 1a4e14e · 1a4e14e
1 parent 9956bbf
commit 1a4e14e
Show file tree

Hide file tree

Showing 2 changed files with 153 additions and 31 deletions.
diff --git a/sonnet/golden_checkpoints/BUILD b/sonnet/golden_checkpoints/BUILD
@@ -37,6 +37,7 @@ snt_py_test(
     data = glob(["*/**"]),
     deps = [
         ":goldens",
+        # absl/logging dep,
         # absl/testing:absltest dep,
         # absl/testing:parameterized dep,
         # six dep,

diff --git a/sonnet/golden_checkpoints/goldens_test.py b/sonnet/golden_checkpoints/goldens_test.py
@@ -23,6 +23,7 @@
 import os
 import pickle
 
+from absl import logging
 from absl.testing import absltest
 from absl.testing import parameterized
 import sonnet as snt
@@ -34,17 +35,20 @@
 class TestCheckpoint(object):
   """Wraps a tf.train.Checkpoint to make it more convenient for testing."""
 
-  def __init__(self, root=None, **kwargs):
-    if root is None:
+  def __init__(self, golden=None, **kwargs):
+    if golden is None:
       root = absltest.get_default_test_tmpdir()
+    else:
+      root = os.path.join("sonnet/golden_checkpoints/",
+                          golden.name)
     self._root = root
     self._prefix = os.path.join(self._root, "checkpoint")
     self._checkpoint = tf.train.Checkpoint(**kwargs)
 
   def save(self):
     self._checkpoint.save(file_prefix=self._prefix)
 
-  def restore_latest(self, assert_consumed=True):
+  def restore_latest(self, assert_consumed):
     status = self._checkpoint.restore(tf.train.latest_checkpoint(self._root))
     if assert_consumed:
       # Ensures that all values in the checkpoint have been consumed by some
@@ -58,13 +62,14 @@ def all_goldens(test_method):
   return parameterized.named_parameters(cases)(test_method)
 
 
-def mirrored_all_devices():
-  # NOTE: We avoid the default constructor so we mirror over CPU, CPU + GPU and
-  # all TPU cores (the default ctor currently does all CPUs or all GPUs).
-  all_visible_devices = [
-      d.name for d in tf.config.experimental.list_logical_devices()
-      if d.device_type != "TPU_SYSTEM"]
-  return tf.distribute.MirroredStrategy(devices=all_visible_devices)
+def mirrored_all_devices(device_type):
+  # NOTE: The explicit device list is required since currently MirroredStrategy
+  # only considers CPU and GPU devices. This means on TPU by default we only
+  # mirror on the local CPU.
+  devices = tf.config.experimental.list_logical_devices(device_type=device_type)
+  devices = [d.name for d in devices]
+  logging.info("Mirroring over %s", devices)
+  return tf.distribute.MirroredStrategy(devices=devices)
 
 
 def with_soft_placement(f):
@@ -100,7 +105,7 @@ def test_save_load(self, golden):
       variable.assign(tf.ones_like(variable))
 
     # Check restored values match the saved values.
-    checkpoint.restore_latest()
+    checkpoint.restore_latest(assert_consumed=True)
     for variable in all_variables:
       self.assertAllClose(variable.read_value(), goldens.range_like(variable))
 
@@ -125,7 +130,7 @@ def test_save_then_load_new_instance(self, golden):
       v2.assign(tf.ones_like(v2))
 
     checkpoint_1.save()
-    checkpoint_2.restore_latest()
+    checkpoint_2.restore_latest(assert_consumed=True)
 
     # Assert the parameters in both modules are the same.
     for variable in variables_2:
@@ -155,7 +160,7 @@ def test_restore_on_create(self, golden):
     variables_2 = golden.create_all_variables(module_2)
     status.assert_consumed()
     for variable in variables_2:
-      self.assertAllClose(variable.read_value(), goldens.range_like(variable))
+      self.assertAllEqual(variable.read_value(), goldens.range_like(variable))
 
     # Assert the output from both modules is the same.
     # TODO(tomhennigan) Handle modules with nested outputs.
@@ -166,40 +171,46 @@ def test_restore_on_create(self, golden):
   def test_restore_golden(self, golden):
     """Test restoring from a golden checkpoint still works."""
     module = golden.create_module()
-    root = os.path.join(
-        "sonnet/golden_checkpoints/",
-        golden.name)
-    checkpoint = TestCheckpoint(root=root, module=module)
+    checkpoint = TestCheckpoint(golden=golden, module=module)
     variables = golden.create_all_variables(module)
     for variable in variables:
       variable.assign(tf.zeros_like(variable))
-    checkpoint.restore_latest()
+    checkpoint.restore_latest(assert_consumed=True)
     for variable in variables:
-      self.assertAllClose(variable.read_value(), goldens.range_like(variable))
+      self.assertAllEqual(variable.read_value(), goldens.range_like(variable))
+
+
+class DistributionStrategyCheckpointTest(test_utils.TestCase,
+                                         parameterized.TestCase):
 
   @all_goldens
   def test_checkpoint_mirrored_strategy(self, golden):
-    strategy = mirrored_all_devices()
-    self.assertCheckpointDistributionStrategy(golden, strategy,
-                                              use_function=False)
+    strategy = mirrored_all_devices(self.primary_device)
+    self.assertCheckpointWithStrategy(golden, strategy, use_function=False)
 
   @all_goldens
   def test_checkpoint_mirrored_strategy_function(self, golden):
-    strategy = mirrored_all_devices()
-    self.assertCheckpointDistributionStrategy(golden, strategy,
-                                              use_function=True)
+    strategy = mirrored_all_devices(self.primary_device)
+    self.assertCheckpointWithStrategy(golden, strategy, use_function=True)
 
   @all_goldens
   def test_checkpoint_tpu_strategy(self, golden):
     if self.primary_device != "TPU":
       self.skipTest("Test requires a TPU")
 
     strategy = tf.distribute.experimental.TPUStrategy()
-    self.assertCheckpointDistributionStrategy(golden, strategy,
-                                              use_function=True)
+    self.assertCheckpointWithStrategy(golden, strategy, use_function=True)
 
-  def assertCheckpointDistributionStrategy(self, golden, strategy,
-                                           use_function=True):
+  def assertCheckpointWithStrategy(self, golden, strategy, use_function):
+    self.assertSaveRestore(golden, strategy, use_function)
+    self.assertRestoreFromGolden(golden, strategy)
+    self.assertRestoreFromNonDistributed(golden, strategy, use_function)
+    self.assertRestoreOnCreate(golden, strategy)
+    if self.primary_device != "TPU":
+      # TODO(b/130555244) Enable on TPU when functions can create variables.
+      self.assertRestoreOnCreateInReplicaContext(golden, strategy, use_function)
+
+  def assertSaveRestore(self, golden, strategy, use_function):
     with strategy.scope():
       module = golden.create_module()
       variables = golden.create_all_variables(module)
@@ -233,8 +244,7 @@ def forward():
       self.assertNotAllClose(y, before_save_ys)
 
     # Restore from the checkpoint and assert the module is in the same state.
-    status = checkpoint.restore_latest()
-    status.assert_consumed()
+    checkpoint.restore_latest(assert_consumed=True)
 
     for index, variable in enumerate(variables):
       # Parameters should be restored to their previous values.
@@ -244,6 +254,107 @@ def forward():
     if golden.deterministic:
       self.assertAllEqual(forward(), before_save_ys)
 
+  def assertRestoreFromGolden(self, golden, strategy):
+    with strategy.scope():
+      module = golden.create_module()
+      variables = golden.create_all_variables(module)
+    checkpoint = TestCheckpoint(golden=golden, module=module)
+    checkpoint.restore_latest(assert_consumed=True)
+    for variable in variables:
+      self.assertAllEqual(variable.read_value(), goldens.range_like(variable))
+
+  def assertRestoreFromNonDistributed(self, golden, strategy, use_function):
+    # Save a checkpoint from a non-distributed model.
+    module = golden.create_module()
+    normal_variables = golden.create_all_variables(module)
+    for index, variable in enumerate(normal_variables):
+      variable.assign(goldens.range_like(variable, start=(index + 1)))
+    checkpoint = TestCheckpoint(module=module)
+    checkpoint.save()
+
+    def run_forward(module):
+      forward = lambda: golden.forward(module)
+      if use_function:
+        forward = tf.function(forward)
+        if self.primary_device == "TPU":
+          # TODO(b/132329316) Remove when `xla.compile` allows tf.device(TPU).
+          forward = with_soft_placement(forward)
+      return forward()
+
+    if golden.deterministic:
+      y_before = run_forward(module)
+
+    # Create the same model (new params) in the strategy scope.
+    with strategy.scope():
+      module = golden.create_module()
+      strategy_variables = golden.create_all_variables(module)
+
+    # Ensure the distributed params are != the values in the checkpoint.
+    for normal, distributed in zip(normal_variables, strategy_variables):
+      distributed.assign(tf.zeros_like(distributed))
+      self.assertNotAllClose(normal.read_value(), distributed.read_value())
+
+    # Restore the checkpoint and ensure the parameters are the same.
+    checkpoint = TestCheckpoint(module=module)
+    checkpoint.restore_latest(assert_consumed=True)
+
+    for normal, distributed in zip(normal_variables, strategy_variables):
+      self.assertAllEqual(normal.read_value(), distributed.read_value())
+
+    if golden.deterministic:
+      y_after = run_forward(module)
+      self.assertAllEqual(y_before, y_after)
+
+  def assertRestoreOnCreate(self, golden, strategy):
+    # Save a checkpoint from a non-distributed model.
+    module = golden.create_module()
+    normal_variables = golden.create_all_variables(module)
+    for index, variable in enumerate(normal_variables):
+      variable.assign(goldens.range_like(variable, start=(index + 1)))
+    checkpoint = TestCheckpoint(module=module)
+    checkpoint.save()
+
+    # Create the same model (new params) in the strategy scope.
+    with strategy.scope():
+      module = golden.create_module()
+      checkpoint = TestCheckpoint(module=module)
+      status = checkpoint.restore_latest(assert_consumed=False)
+      golden.forward(module)
+      status.assert_consumed()
+      strategy_variables = golden.create_all_variables(module)
+
+    for normal, distributed in zip(normal_variables, strategy_variables):
+      self.assertAllEqual(normal.read_value(), distributed.read_value())
+
+  def assertRestoreOnCreateInReplicaContext(self, golden, strategy,
+                                            use_function):
+    with strategy.scope():
+      module = golden.create_module()
+
+    def forward():
+      return strategy.experimental_run_v2(lambda: golden.forward(module))
+
+    if use_function:
+      forward = tf.function(forward)
+      if self.primary_device == "TPU":
+        # TODO(b/132329316) Remove when `xla.compile` allows tf.device(TPU).
+        forward = with_soft_placement(forward)
+
+    checkpoint = TestCheckpoint(golden=golden, module=module)
+    status = checkpoint.restore_latest(assert_consumed=False)
+    result = forward()
+    status.assert_consumed()
+
+    if golden.deterministic:
+      result_iter = iter(strategy.experimental_local_results(result))
+      first_replica = next(result_iter)
+      for next_replica in result_iter:
+        self.assertAllEqual(first_replica, next_replica)
+
+    variables = golden.create_all_variables(module)
+    for variable in variables:
+      self.assertAllEqual(variable.read_value(), goldens.range_like(variable))
+
 
 class SavedModelTest(test_utils.TestCase, parameterized.TestCase):
 
@@ -369,6 +480,16 @@ def test_all_modules_covered(self):
     self.assertEqual(tested_modules | no_checkpoint_whitelist, all_sonnet_types)
 
 
+def setUpModule():
+  # If a physical GPU is available make sure TF sees at least two.
+  gpus = tf.config.experimental.list_physical_devices(device_type="GPU")
+  if len(gpus) == 1:
+    logging.info("Splitting one physical GPU into two logical GPUs.")
+    tf.config.experimental.set_virtual_device_configuration(
+        gpus[0],
+        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024),
+         tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])
+
 if __name__ == "__main__":
   # tf.enable_v2_behavior()
   tf.test.main()