In [15]:
import ipcmagic
from tb_cscs import tensorboard

In [2]:
# Start the IPython parallel cluster used by the %%px cells below.
# NOTE(review): `-n 2 --mpi` presumably requests two MPI-launched engines
# (one Horovod rank each) — confirm against the ipcmagic documentation.
%ipcluster start -n 2 --mpi

IPCluster is ready! (5 seconds)


In [3]:
%%px
import os
import tensorflow as tf
import horovod.tensorflow.keras as hvd
from datetime import datetime
from tb_cscs import tensorboard

In [16]:
%%px
# Initialise Horovod on this engine; must run before the hvd.size() /
# hvd.rank() calls and the Horovod optimizer/callback created later in
# this cell.
hvd.init()


def decode(serialized_example):
    """Turn one serialized record into an ``(image, label)`` training pair.

    The record is parsed for two scalar features: ``image/encoded`` (string,
    decoded here as a 3-channel JPEG) and ``image/class/label`` (int64).

    Args:
        serialized_example: a single serialized example, as yielded by the
            TFRecord dataset this function is mapped over.

    Returns:
        A tuple ``(image, label)`` where ``image`` is the decoded picture
        resized to 224x224 and ``label`` is a one-hot vector of depth 1001.

    NOTE(review): no pixel scaling/normalisation is applied after the
    resize — confirm that feeding unscaled pixel values is intended.
    """
    feature_spec = {
        'image/encoded': tf.io.FixedLenFeature([], tf.string),
        'image/class/label': tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(serialized_example,
                                        features=feature_spec)

    # Image branch: JPEG bytes -> 3-channel tensor -> fixed spatial size.
    pixels = tf.image.decode_jpeg(parsed['image/encoded'], channels=3)
    pixels = tf.image.resize(pixels, (224, 224))

    # Label branch: integer class index -> one-hot vector over 1001 classes.
    class_index = tf.cast(parsed['image/class/label'], tf.int64)
    return pixels, tf.one_hot(class_index, 1001)


# Input pipeline: TFRecord files -> per-rank file shard -> decoded, batched
# (image, label) pairs.
data_dir = '/scratch/snx3000/stud50/imagenet/'
# os.listdir() returns entries in arbitrary order; sort so that every rank
# builds exactly the same file list before it is sharded below.
list_of_files = sorted(os.path.join(data_dir, f) for f in os.listdir(data_dir))

# shuffle=False: by default list_files() shuffles with an unseeded, per-process
# order, so a subsequent shard() would NOT give the ranks disjoint subsets.
dataset = tf.data.Dataset.list_files(list_of_files, shuffle=False)
# Shard at file level, before any record is read or decoded, so each rank only
# does I/O and JPEG decoding for its own share of the data instead of
# processing everything and throwing most of it away.
# Assumes there are at least hvd.size() files in data_dir.
dataset = dataset.shard(hvd.size(), hvd.rank())
# Rank-local shuffle of the file order; safe now that the shard is fixed.
dataset = dataset.shuffle(len(list_of_files))
dataset = dataset.interleave(tf.data.TFRecordDataset,
                             cycle_length=120,
                             block_length=1,
                             num_parallel_calls=12)
# Decode/resize in parallel; a serial map is typically the pipeline bottleneck.
dataset = dataset.map(decode,
                      num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(128)
# Overlap input preparation with the training step.
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

# InceptionV3 trained from scratch (weights=None) on 224x224 RGB inputs with
# 1001 output classes, matching the one-hot depth produced by decode().
model = tf.keras.applications.InceptionV3(weights=None,
                                          input_shape=(224, 224, 3),
                                          classes=1001)

# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases reject.
# NOTE(review): Horovod's guidance is to scale the learning rate with the
# number of workers (e.g. 0.01 * hvd.size()) — left unchanged here; confirm
# whether scaling is wanted for this exercise.
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)
# Wrap the optimizer so gradients are combined across ranks by Horovod.
optimizer = hvd.DistributedOptimizer(optimizer)

model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# TensorBoard logging: one sub-directory per run under 'inceptionv3_logs',
# named after the day of month and the HHMM time at which this cell ran.
# NOTE(review): this runs on every engine, so all ranks log into the same
# directory tree (and the timestamp is taken independently on each rank) —
# confirm that multi-rank logging/profiling is intended.
run_stamp = datetime.now().strftime("%d-%H%M")
log_dir = os.path.join('inceptionv3_logs', run_stamp)
tb_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,        # write histograms every epoch
    profile_batch='85,95',   # profile batches 85 through 95
)

# Horovod callback constructed with root rank 0; presumably synchronises the
# initial variable values from rank 0 to all other ranks — see Horovod docs.
hvd_callback = hvd.callbacks.BroadcastGlobalVariablesCallback(0)

In [17]:
%%px
# Short training run: a single epoch over the first 100 batches of this
# rank's dataset, with the Horovod broadcast and TensorBoard callbacks.
train_subset = dataset.take(100)
fit = model.fit(
    train_subset,
    epochs=1,
    callbacks=[hvd_callback, tb_callback],
)



In [12]:
# Load the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [13]:
# Open TensorBoard on the parent log directory that the training cell's
# TensorBoard callback writes its per-run sub-directories into.
%tensorboard --logdir=inceptionv3_logs

Reusing TensorBoard on port 6006 (pid 12324), started 0:29:57 ago. (Use '!kill 12324' to kill it.)

In [18]:
# Shut down the IPython parallel cluster started at the top of the notebook.
%ipcluster stop