Based on the Keras distributed training guide: https://keras.io/guides/distributed_training/

In [8]:
import time
import tensorflow as tf
from tensorflow import keras
# Print the TensorFlow version in use so the recorded outputs can be tied to it.
print(tf.__version__)

2.1.0


Background reading (TensorFlow GitHub issues):
- https://github.com/tensorflow/tensorflow/issues/28372
- https://github.com/tensorflow/tensorflow/issues/21470
- https://github.com/tensorflow/tensorflow/issues/33656

API documentation for the `tf.distribute` classes referenced in this notebook:
- https://www.tensorflow.org/api_docs/python/tf/distribute/HierarchicalCopyAllReduce
- https://www.tensorflow.org/versions/r2.1/api_docs/python/tf/distribute/MirroredStrategy

In [3]:
# Switch every visible GPU to on-demand memory allocation, then report how many
# physical and logical GPUs TensorFlow sees.
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
  try:
    # The memory-growth setting has to match on all GPUs, so apply it to each one.
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    physical_count = len(gpus)
    logical_count = len(logical_gpus)
    print(physical_count, "Physical GPUs,", logical_count, "Logical GPUs")
  except RuntimeError as err:
    # Memory growth can only be set before the GPUs are initialized;
    # if it is too late, show the error instead of aborting the cell.
    print(err)

2 Physical GPUs, 2 Logical GPUs


In [19]:
# Pick the distribution strategy used by the rest of the notebook.
# Only one option is active at a time; the others are kept commented out
# so they can be swapped in to compare timings.

# Option 0:
# Single GPU: run everything on one device (here "/gpu:1").
#strategy = tf.distribute.OneDeviceStrategy(device="/gpu:1")

# Option 1:
# MirroredStrategy using hierarchical copy as the cross-device op.
#cto = tf.distribute.HierarchicalCopyAllReduce(num_packs=2)
#strategy = tf.distribute.MirroredStrategy(cross_device_ops=cto)

# Option 2 (currently active):
# MirroredStrategy that reduces to the first GPU.

cto = tf.distribute.ReductionToOneDevice()
strategy = tf.distribute.MirroredStrategy(cross_device_ops=cto)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


In [5]:
def get_compiled_model():
    """Build and compile a small fully-connected classifier.

    The network takes 784-feature inputs, passes them through two 256-unit
    ReLU layers and ends in a 10-unit layer with no activation, so it emits
    raw logits (the loss is configured with ``from_logits=True`` to match).

    Returns:
        A ``keras.Model`` compiled with the Adam optimizer, sparse categorical
        cross-entropy loss and a sparse categorical accuracy metric.
    """
    inputs = keras.Input(shape=(784,))

    # Stack the two identical hidden layers in order.
    hidden = inputs
    for _ in range(2):
        hidden = keras.layers.Dense(256, activation="relu")(hidden)

    # Final layer has no activation: its outputs are logits.
    logits = keras.layers.Dense(10)(hidden)
    model = keras.Model(inputs, logits)

    optimizer = keras.optimizers.Adam()
    loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    accuracy_metric = keras.metrics.SparseCategoricalAccuracy()
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy_metric])
    return model


def get_dataset(batch_size=32, num_val_samples=10000):
    """Load MNIST and return batched train, validation and test datasets.

    Images are flattened to 784 features, cast to float32 and divided by 255;
    labels are cast to float32. The last ``num_val_samples`` training examples
    are held out as the validation set.

    Args:
        batch_size: Number of examples per batch, applied to all three
            datasets. Defaults to 32. NOTE: under ``MirroredStrategy`` each
            batch is divided among the replicas (see the Keras distributed
            training guide), so callers may want to scale this with
            ``strategy.num_replicas_in_sync``.
        num_val_samples: How many examples to take from the end of the
            training set for validation. Defaults to 10000. May be 0, in
            which case the validation dataset is empty.

    Returns:
        A ``(train, validation, test)`` tuple of batched ``tf.data.Dataset``
        objects, each yielding ``(features, labels)`` pairs.

    Raises:
        ValueError: If ``batch_size`` is less than 1, or ``num_val_samples``
            is negative or not smaller than the training-set size.
    """
    # Validate the cheap argument first, before downloading/loading any data.
    if batch_size < 1:
        raise ValueError(
            "batch_size must be a positive integer, got {!r}".format(batch_size)
        )

    # MNIST arrives as Numpy arrays.
    (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

    # Preprocess the data (these are Numpy arrays)
    x_train = x_train.reshape(-1, 784).astype("float32") / 255
    x_test = x_test.reshape(-1, 784).astype("float32") / 255
    y_train = y_train.astype("float32")
    y_test = y_test.astype("float32")

    num_train_total = len(x_train)
    if not 0 <= num_val_samples < num_train_total:
        raise ValueError(
            "num_val_samples must be in [0, {}), got {!r}".format(
                num_train_total, num_val_samples
            )
        )

    # Split on an explicit index rather than negative slices: `[-0:]` / `[:-0]`
    # would hand the whole set to validation and leave training empty.
    split = num_train_total - num_val_samples
    x_val, y_val = x_train[split:], y_train[split:]
    x_train, y_train = x_train[:split], y_train[:split]

    def _batched(features, labels):
        # Wrap one (features, labels) pair as a batched tf.data.Dataset.
        return tf.data.Dataset.from_tensor_slices((features, labels)).batch(batch_size)

    return (
        _batched(x_train, y_train),
        _batched(x_val, y_val),
        _batched(x_test, y_test),
    )

In [20]:
# The strategy is created in an earlier cell; report how many replicas it keeps in sync.
# strategy = tf.distribute.MirroredStrategy() #see strategy definition above
replica_count = strategy.num_replicas_in_sync
print("Number of devices: {}".format(replica_count))

# Anything that creates variables belongs inside the strategy scope.
# In practice that means building the model and calling `compile()`.
scope = strategy.scope()
with scope:
    model = get_compiled_model()


# The datasets are built outside the scope and unpacked into their three splits.
datasets = get_dataset()
train_dataset, val_dataset, test_dataset = datasets

Number of devices: 2


In [21]:
# Time the full training run. perf_counter() is a monotonic, high-resolution
# clock, so the measured interval is not distorted by system clock adjustments
# (which can affect time.time()).
start = time.perf_counter()
# Train the model on all available devices.
model.fit(train_dataset, epochs=10, validation_data=val_dataset)
print("Ran in {} seconds".format(time.perf_counter() - start))

Train for 1563 steps, validate for 313 steps
Epoch 1/10
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1').
INFO:tens

Single GPU (RTX 2060): 29.36 seconds
Single GPU (GTX 1060): 40.60 seconds

Dual GPU, hierarchical copy (HierarchicalCopyAllReduce): 76.26 seconds
Dual GPU, reduce to one device (ReductionToOneDevice): 86.83 seconds

In [None]:
# Test the model on all available devices.
# `test_dataset` is the third dataset returned by get_dataset() (built from the
# MNIST test arrays). As the last expression in the cell, the value returned by
# evaluate() is shown as the cell output.
model.evaluate(test_dataset)