In [1]:
import warnings
warnings.filterwarnings('ignore')
import os
import time
import tensorflow as tf
import numpy as np

In [2]:
print(tf.__version__)

1.14.0


In [3]:
import sys
print(sys.executable)

/home/seung/.venv/py36tf/bin/python


In [3]:
os.environ["CUDA_VISIBLE_DEVICES"] = "2, 3"

In [4]:
BATCH_SIZE = 512
EPOCHS = 100
NUM_GPUS = 2

In [5]:
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.fashion_mnist.load_data() #fashion_mnist

In [6]:
TRAINING_SIZE = len(train_images)
TEST_SIZE = len(test_images)
train_images = np.asarray(train_images, dtype=np.float32) / 255
# Convert the train images and add channels
train_images = train_images.reshape((TRAINING_SIZE, 28, 28, 1))
test_images = np.asarray(test_images, dtype=np.float32) / 255
# Convert the test images and add channels
test_images = test_images.reshape((TEST_SIZE, 28, 28, 1))

In [7]:
# How many categories we are predicting from (0-9)
LABEL_DIMENSIONS = 10

train_labels  = tf.keras.utils.to_categorical(train_labels, LABEL_DIMENSIONS)
test_labels = tf.keras.utils.to_categorical(test_labels, LABEL_DIMENSIONS)

# Cast the labels to floats, needed later
train_labels = train_labels.astype(np.float32)
test_labels = test_labels.astype(np.float32)

In [8]:
inputs = tf.keras.Input(shape=(28,28,1))  # Returns a placeholder tensor
x = tf.keras.layers.Conv2D(filters=32, kernel_size=(3, 3), activation=tf.nn.relu)(inputs)
x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=2)(x)
x = tf.keras.layers.Conv2D(filters=64, kernel_size=(3, 3), activation=tf.nn.relu)(x)
x = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=2)(x)
x = tf.keras.layers.Conv2D(filters=64, kernel_size=(3, 3), activation=tf.nn.relu)(x)
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(64, activation=tf.nn.relu)(x)
predictions = tf.keras.layers.Dense(LABEL_DIMENSIONS, activation=tf.nn.softmax)(x)

In [9]:
model = tf.keras.Model(inputs=inputs, outputs=predictions)

In [10]:
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)

In [11]:
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

In [12]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 28, 28, 1)         0         
_________________________________________________________________
conv2d (Conv2D)              (None, 26, 26, 32)        320       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 13, 13, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 11, 11, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 5, 5, 64)          0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 3, 3, 64)          36928     
_________________________________________________________________
flatten (Flatten)            (None, 576)               0         
__________

In [13]:
576*64+64

36928

### Create an Estimator

In [14]:
strategy = tf.contrib.distribute.MirroredStrategy(num_gpus=NUM_GPUS)
config = tf.estimator.RunConfig(train_distribute=strategy)

estimator = tf.keras.estimator.model_to_estimator(model, config=config)

INFO:tensorflow:Initializing RunConfig with distribution strategies.
INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Using the Keras model provided.
INFO:tensorflow:Using config: {'_keep_checkpoint_every_n_hours': 10000, '_num_worker_replicas': 1, '_num_ps_replicas': 0, '_device_fn': None, '_global_id_in_cluster': 0, '_distribute_coordinator_mode': None, '_save_checkpoints_secs': 600, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_experimental_distribute': None, '_log_step_count_steps': 100, '_keep_checkpoint_max': 5, '_protocol': None, '_tf_random_seed': None, '_service': None, '_is_chief': True, '_task_type': 'worker', '_master': '', '_train_distribute': <tensorflow.contrib.distribute.python.mirrored_strategy.MirroredStrategy object at 0x7fe6df5dfdd8>, '_eval_distribute': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fe6df5dfeb8>, '_model_dir': '/tmp/tmplgy62idn', '_evaluation_master': '', '_task_id': 0, '_ses

### Create an Estimator input function
To pipe data into Estimators we need to define a data importing function which returns a tf.data dataset of  (images, labels) batches of our data.

In [15]:
def input_fn(images, labels, epochs, batch_size):
    # Convert the inputs to a Dataset. (E)
    dataset = tf.data.Dataset.from_tensor_slices((images, labels))

    # Shuffle, repeat, and batch the examples. (T)
    SHUFFLE_SIZE = 5000
    dataset = dataset.shuffle(SHUFFLE_SIZE).repeat(epochs).batch(batch_size)
    dataset = dataset.prefetch(None)

    # Return the dataset. (L)
    return dataset

In [16]:
test1 = input_fn(train_images, train_labels,epochs=EPOCHS, batch_size=BATCH_SIZE)

In [17]:
test1

<PrefetchDataset shapes: ((?, 28, 28, 1), (?, 10)), types: (tf.float32, tf.float32)>

### Train the Estimator

In [18]:
class TimeHistory(tf.train.SessionRunHook):
    def begin(self):
        self.times = []

    def before_run(self, run_context):
        self.iter_time_start = time.time()

    def after_run(self, run_context, run_values):
        self.times.append(time.time() - self.iter_time_start)

In [19]:
time_hist = TimeHistory()

estimator.train(input_fn=lambda:input_fn(train_images,
                                         train_labels,
                                         epochs=EPOCHS,
                                         batch_size=BATCH_SIZE), 
                hooks=[time_hist])

INFO:tensorflow:Device is available but not used by distribute strategy: /device:CPU:0
INFO:tensorflow:Device is available but not used by distribute strategy: /device:XLA_GPU:0
INFO:tensorflow:Device is available but not used by distribute strategy: /device:XLA_CPU:0
INFO:tensorflow:Configured nccl all-reduce.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:batch_all_reduce invoked for batches size = 10 with algorithm = nccl, num_packs = 1, agg_small_grads_max_bytes = 0 and agg_small_grads_max_group = 10
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Warm-starting with WarmStartSettings: WarmStartSettings(ckpt_to_initialize_from='/tmp/tmplgy62idn/keras/keras_model.ckpt', vars_to_warm_start='.*', var_name_to_vocab_info={}, var_name_to_prev_var_name={})
INFO:tensorflow:Warm-starting from: ('/tmp/tmplgy62idn/keras/keras_model.ckpt',)
INFO:tensorflow:Warm-starting variable: conv2d/kernel; prev_var_name: Unc

UnknownError: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[node conv2d/Conv2D (defined at /home/seung/.venv/py35Keras2/lib/python3.5/site-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py:795)  = Conv2D[T=DT_FLOAT, data_format="NCHW", dilations=[1, 1, 1, 1], padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](conv2d/Conv2D-0-TransposeNHWCToNCHW-LayoutOptimizer, conv2d/Conv2D/ReadVariableOp)]]
	 [[{{node tower_1/loss/mul/_589}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:1", send_device_incarnation=1, tensor_name="edge_302_tower_1/loss/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'conv2d/Conv2D', defined at:
  File "/usr/lib/python3.5/threading.py", line 882, in _bootstrap
    self._bootstrap_inner()
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/home/seung/.venv/py35Keras2/lib/python3.5/site-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py", line 795, in run
    self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
  File "/home/seung/.venv/py35Keras2/lib/python3.5/site-packages/tensorflow/python/estimator/estimator.py", line 1195, in _call_model_fn
    model_fn_results = self._model_fn(features=features, **kwargs)
  File "/home/seung/.venv/py35Keras2/lib/python3.5/site-packages/tensorflow/python/estimator/keras.py", line 278, in model_fn
    labels)
  File "/home/seung/.venv/py35Keras2/lib/python3.5/site-packages/tensorflow/python/estimator/keras.py", line 201, in _clone_and_build_model
    optimizer_iterations=global_step)
  File "/home/seung/.venv/py35Keras2/lib/python3.5/site-packages/tensorflow/python/keras/models.py", line 437, in clone_and_build_model
    clone = clone_model(model, input_tensors=input_tensors)
  File "/home/seung/.venv/py35Keras2/lib/python3.5/site-packages/tensorflow/python/keras/models.py", line 256, in clone_model
    return _clone_functional_model(model, input_tensors=input_tensors)
  File "/home/seung/.venv/py35Keras2/lib/python3.5/site-packages/tensorflow/python/keras/models.py", line 159, in _clone_functional_model
    **kwargs))
  File "/home/seung/.venv/py35Keras2/lib/python3.5/site-packages/tensorflow/python/keras/engine/base_layer.py", line 757, in __call__
    outputs = self.call(inputs, *args, **kwargs)
  File "/home/seung/.venv/py35Keras2/lib/python3.5/site-packages/tensorflow/python/keras/layers/convolutional.py", line 194, in call
    outputs = self._convolution_op(inputs, self.kernel)
  File "/home/seung/.venv/py35Keras2/lib/python3.5/site-packages/tensorflow/python/ops/nn_ops.py", line 868, in __call__
    return self.conv_op(inp, filter)
  File "/home/seung/.venv/py35Keras2/lib/python3.5/site-packages/tensorflow/python/ops/nn_ops.py", line 520, in __call__
    return self.call(inp, filter)
  File "/home/seung/.venv/py35Keras2/lib/python3.5/site-packages/tensorflow/python/ops/nn_ops.py", line 204, in __call__
    name=self.name)
  File "/home/seung/.venv/py35Keras2/lib/python3.5/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 957, in conv2d
    data_format=data_format, dilations=dilations, name=name)
  File "/home/seung/.venv/py35Keras2/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/seung/.venv/py35Keras2/lib/python3.5/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "/home/seung/.venv/py35Keras2/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
    op_def=op_def)
  File "/home/seung/.venv/py35Keras2/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
    self._traceback = tf_stack.extract_stack()

UnknownError (see above for traceback): Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[node conv2d/Conv2D (defined at /home/seung/.venv/py35Keras2/lib/python3.5/site-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py:795)  = Conv2D[T=DT_FLOAT, data_format="NCHW", dilations=[1, 1, 1, 1], padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](conv2d/Conv2D-0-TransposeNHWCToNCHW-LayoutOptimizer, conv2d/Conv2D/ReadVariableOp)]]
	 [[{{node tower_1/loss/mul/_589}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:1", send_device_incarnation=1, tensor_name="edge_302_tower_1/loss/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]


In [None]:
total_time =  sum(time_hist.times)
print('total time with {} GPUs: {} seconds'.format(NUM_GPUS,total_time))
avg_time_per_batch = np.mean(time_hist.times)
print('{} images/second with {} GPUs'.format(BATCH_SIZE*NUM_GPUS/avg_time_per_batch,NUM_GPUS))

```
total time with 2 GPUs: 7.6623451709747314 seconds
39156.68027284038 images/second with 2 GPUs

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.130                Driver Version: 384.130                   |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  TITAN Xp            Off  | 00000000:05:00.0 Off |                  N/A |
| 23%   42C    P0    61W / 250W |      0MiB / 12188MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  TITAN Xp            Off  | 00000000:06:00.0 Off |                  N/A |
| 23%   41C    P0    58W / 250W |      0MiB / 12189MiB |      2%      Default |
+-------------------------------+----------------------+----------------------+
|   2  TITAN Xp            Off  | 00000000:09:00.0 Off |                  N/A |
| 27%   51C    P2   176W / 250W |  11767MiB / 12189MiB |     60%      Default |
+-------------------------------+----------------------+----------------------+
|   3  TITAN Xp            Off  | 00000000:0A:00.0 Off |                  N/A |
| 23%   32C    P2   164W / 250W |  11767MiB / 12189MiB |     58%      Default |
+-------------------------------+----------------------+----------------------+

```

In [None]:
estimator.evaluate(lambda:input_fn(test_images, 
                                   test_labels,
                                   epochs=1,
                                   batch_size=BATCH_SIZE))

### References

* Multi-GPU training with Estimators, tf.keras and tf.data https://medium.com/tensorflow/multi-gpu-training-with-estimators-tf-keras-and-tf-data-ba584c3134db