# Testing `ipcmagic` with TensorFlow-1 and Horovod

 * The two nodes have different names
 * Both nodes are using the GPU (the GPU usage is about 3%).
 * The training time lasts just a couple of seconds.
 * After running `ipcluster stop` the GPU memory usage goes to zero.
 
 > To run this notebook it's necessary to have the Horovod module loaded. Please add the following to the `$HOME/.jupyterhub.env` file:
 ```bash
  module load Horovod/0.16.4-CrayGNU-19.10-tf-1.14.0
 ```
 Please make sure that it doesn't conflict with any other line that you might have in your `$HOME/.jupyterhub.env` file.

In [None]:
import ipcmagic.local
import ipyparallel as ipp

In [None]:
%ipcluster --version

In [None]:
%ipcluster start -n 2 --mpi

In [None]:
# Connect to the ipyparallel cluster using the default profile.
# The trailing comment shows how a specific profile could be selected instead.
c = ipp.Client()   # (profile='job_17669451')

In [None]:
# Engine ids visible to the client; two are expected after `%ipcluster start -n 2`.
c.ids

In [None]:
%%px
# Report the host each engine runs on; the two engines are expected to
# return different names when they sit on different nodes.
import socket
socket.gethostname()

In [None]:
%%px
import numpy as np
import tensorflow as tf
import horovod.tensorflow as hvd

In [None]:
%%px
# Initialize Horovod on every engine; this has to happen before
# hvd.size() / hvd.rank() are queried in the cells below.
hvd.init()

In [None]:
%%px
# Synthetic linear-regression data: y = ref_slope * x + ref_offset + noise.
# Note that the generated random data is different from one node to the other
# (every engine draws its own samples and no seed is set here).
nsamples = 1000
# Reference parameters of the line that the training should recover.
ref_slope = 2.0
ref_offset = 0.0
# np.random.random draws from [0, 1); subtracting 0.5 centers the values on zero.
# Both arrays have shape (nsamples, 1).
noise = np.random.random((nsamples, 1)) - 0.5
x_train = np.random.random((nsamples, 1)) - 0.5
y_train = ref_slope * x_train + ref_offset + noise

In [None]:
%%px
# Input pipeline: a dataset of (x, y) pairs converted to float32.
dataset = tf.data.Dataset.from_tensor_slices((x_train.astype(np.float32),
                                              y_train.astype(np.float32)))
# Keep only every hvd.size()-th sample, offset by this worker's rank, so each
# worker trains on its own slice of its locally generated data.
dataset = dataset.shard(hvd.size(), hvd.rank())
# Assuming the 2-engine cluster used in this notebook, a shard holds 500 of the
# 1000 samples, i.e. exactly one batch per pass over the data.
dataset = dataset.batch(500)
# 100 passes over the shard; training stops once the iterator is exhausted.
dataset = dataset.repeat(100)
iterator = dataset.make_one_shot_iterator()
# Tensor pair (x_batch, y_batch); every evaluation advances the iterator.
next_item = iterator.get_next()

In [None]:
%%px
# Define the model: a straight line y_hat = slope * x + offset with two
# scalar trainable parameters. Each worker draws its own random initial
# values here; they only become identical if the initial state is
# broadcast from one rank before training starts.
slope = tf.Variable(np.random.randn())
offset = tf.Variable(np.random.randn())

x, y = next_item  # The model is the continuation of the pipeline

y_hat = slope * x + offset

# The TF1 signature is mean_squared_error(labels, predictions, ...).
# Keyword arguments make the roles explicit instead of relying on the
# symmetry of the squared difference.
loss = tf.losses.mean_squared_error(labels=y, predictions=y_hat)

opt = tf.train.GradientDescentOptimizer(.5)
# hvd.DistributedOptimizer wraps the optimizer so that gradients are
# combined across all workers before being applied.
train = hvd.DistributedOptimizer(opt).minimize(loss)

In [None]:
%%px
# Broadcast the initial values of all global variables from rank 0 to the
# other workers, so every worker starts from the same slope and offset
# even though each one initialized them randomly.
hooks = [hvd.BroadcastGlobalVariablesHook(0)]

In [None]:
%%px
# Per-step record of [slope, offset, loss] on this worker.
history = []

with tf.train.MonitoredTrainingSession(hooks=hooks) as sess:
    # Initialization of the variables `slope` and `offset`
    # is done automatically by tf.train.MonitoredTrainingSession
    print('rank', hvd.rank(),
          'inital slope   = %12.6f\n       initial offset = %12.6f' %
          sess.run((slope, offset)))
    # The loop is left once the input pipeline has no more batches
    # (the session treats the iterator's end-of-data as a normal stop).
    while not sess.should_stop():
        # One optimization step; only the loss value is needed from this run.
        _, loss_val = sess.run((train, loss))
        # Read both parameters together in a separate run, so the recorded
        # values are the ones that result from the step above.
        slope_val, offset_val = sess.run((slope, offset))
        history.append([slope_val, offset_val, loss_val])

### Plotting the SGD

Plot the path taken by SGD during training. This shows the path taken by both workers. They must be identical; if they aren't, something in the distributed SGD algorithm went wrong.

>To plot the path individually, the options `--target 0` and `--target 1` of `%%px` can be used.

In [None]:
%%px --target 0
%matplotlib inline
import numpy as np
import matplotlib
import matplotlib.pyplot as plt


def loss_function_field(m, n, xref, yref):
    """Return the mean squared error of the line ``m * x + n`` on the data.

    ``m`` is the slope and ``n`` the offset of the candidate line;
    ``xref`` and ``yref`` are the reference inputs and targets.
    """
    residuals = yref - m * xref - n
    squared_residuals = np.square(residuals)
    return np.mean(squared_residuals)


# Training trajectory recorded on this engine: one row per step with
# columns [slope, offset, loss]. Convert once and slice the columns.
trajectory = np.array(history)
slope_hist = trajectory[:, 0]
offset_hist = trajectory[:, 1]

# Create [slope x offset] grid for contour plot
_m = np.arange(0, 4.01, 0.1)
_n = np.arange(-0.5, 0.51, 0.1)
M, N = np.meshgrid(_m, _n)

# Evaluate the loss on this engine's training data at every grid point.
Z = np.zeros(M.shape)
for idx in np.ndindex(M.shape):
    Z[idx] = loss_function_field(M[idx], N[idx], x_train, y_train)

# Uncomment to enlarge the figure:
# matplotlib.rcParams['figure.figsize'] = (10.0, 10.0)

# Loss landscape as faint contour lines, labelling only the six lowest levels.
cp = plt.contour(M, N, Z, 50, vmin=Z.min(), vmax=Z.max(), alpha=0.4)
plt.clabel(cp, cp.levels[:6])
plt.colorbar()
# Path followed by the optimizer and, as a red cross, the reference solution.
plt.plot(slope_hist, offset_hist, '.-', lw=1)
plt.plot([ref_slope], [ref_offset], 'rx', ms=10)
plt.xlim([_m.min(), _m.max()])
plt.ylim([_n.min(), _n.max()])
plt.xlabel('Slope')
plt.ylabel('Offset')
plt.show()

In [None]:
%ipcluster stop