In [1]:
import ipcmagic

In [35]:
%ipcluster start -n 2 --mpi

IPCluster is ready! (6 seconds)


In [36]:
%%px
import os
import math
import tensorflow as tf
from datetime import datetime
from tensorflow import keras

In [56]:
%%px
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
    cluster_resolver=tf.distribute.cluster_resolver.SlurmClusterResolver(),
    communication=tf.distribute.experimental.CollectiveCommunication.NCCL,
)

def decode(serialized_example):
    """Parses an image and label from the given `serialized_example`."""
    features = tf.io.parse_single_example(
        serialized_example,
        features={
            'image_raw': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([], tf.int64),
        })
    label = tf.cast(features['label'], tf.int32)
    image = tf.io.decode_raw(features['image_raw'], tf.uint8)
    image = tf.reshape(image, (28, 28, 1))
    return image, label


def normalize(image, label):
    """Convert `image` from [0, 255] -> [-0.5, 0.5] floats."""
    image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
    return image, label


def get_train_set(filename, batch_size):
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA

    dataset = tf.data.TFRecordDataset(filename)
    dataset = dataset.map(decode)
    # dataset = dataset.shuffle(128)
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.map(normalize)
    dataset = dataset.with_options(options)
    return dataset

def get_val_set(filename, batch_size):
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA

    dataset = tf.data.TFRecordDataset(filename)
    dataset = dataset.map(decode)
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.map(normalize)
    dataset = dataset.with_options(options)
    return dataset

with strategy.scope():
    model = keras.Sequential([keras.layers.Conv2D(32, kernel_size=(3, 3),
                                                 activation='relu',
                                                 input_shape=(28, 28, 1)),
                              keras.layers.Conv2D(64, (3, 3), activation='relu'),
                              keras.layers.MaxPooling2D(pool_size=(2, 2)),
                              keras.layers.Dropout(0.25),
                              keras.layers.Flatten(),
                              keras.layers.Dense(128, activation='relu'),
                              keras.layers.Dropout(0.5),
                              keras.layers.Dense(10, activation='softmax')])

    opt = keras.optimizers.Adam(0.001)

    model.compile(optimizer=opt,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

tb_callback = tf.keras.callbacks.TensorBoard(
    log_dir = os.path.join('cnn_tfdistr_logs',
                           datetime.now().strftime("%d-%H%M")),
    histogram_freq = 1,
    profile_batch = '700,730')

In [57]:
%%px
BATCH_SIZE = 64
NUM_EPOCS = 2

fit = model.fit(get_train_set('../input_pipelines/tfrecords/train.tfrecords', BATCH_SIZE),
                # validation_data=get_val_set('../input_pipelines/tfrecords/test.tfrecords', BATCH_SIZE),
                epochs=NUM_EPOCS,
                verbose=2,
                steps_per_epoch=math.ceil(60000 / BATCH_SIZE),
                # validation_steps=math.ceil(10000 / BATCH_SIZE),
                callbacks=[tb_callback])

[stdout:0] 
Epoch 1/2
937/938 - 5s - loss: 0.2378 - accuracy: 0.9283
[stdout:1] 
Epoch 1/2
937/938 - 5s - loss: 0.2378 - accuracy: 0.9283


`tf.distributed` adds automatically `dataset = strategy.experimental_distribute_dataset(dataset)` to the dataset.

(!) Note that the validation losses and accuracies must be the same for both ranks.

In [14]:
%load_ext tensorboard

In [58]:
%tensorboard --logdir=cnn_tfdistr_logs

In [59]:
%ipcluster stop