In [1]:
from __future__ import print_function

import os
from time import time

import mxnet as mx
import numpy as np
from mxnet import nd, autograd, gluon
# Fix the seed of MXNet's random number generator.
mx.random.seed(1)

In [2]:
# Notebook-wide settings: device count, optimisation settings, problem dimensions.
num_gpus = 4          # number of GPU contexts created for training
batch_size = 256      # samples per mini-batch (each batch is split across the GPUs)
learning_rate = .1    # learning rate handed to the SGD trainer
num_inputs = 784      # 28 * 28; not referenced elsewhere in the visible notebook
num_outputs = 10      # width of the final Dense layer

In [3]:
# One context per GPU: mx.gpu(0) ... mx.gpu(num_gpus - 1).
ctx = list(map(mx.gpu, range(num_gpus)))

In [4]:
def transform(data, label):
    """Prepare one (image, label) sample for the network.

    The image is cast to float32, its axes are reordered to (2, 0, 1)
    (presumably height-width-channel to channel-first -- the loaded batches
    have shape (N, 1, 28, 28)), and it is divided by 255. The label is cast
    to float32.

    Returns:
        A ``(image, label)`` tuple.
    """
    image = nd.transpose(data.astype(np.float32), (2, 0, 1)) / 255
    target = label.astype(np.float32)
    return image, target

# MNIST training split, served in shuffled mini-batches by 4 worker processes.
mnist_train = gluon.data.vision.MNIST(train=True, transform=transform)
train_data = gluon.data.DataLoader(mnist_train, batch_size,
                                   shuffle=True, num_workers=4)

# MNIST test split, served in its stored order.
mnist_test = gluon.data.vision.MNIST(train=False, transform=transform)
test_data = gluon.data.DataLoader(mnist_test, batch_size,
                                  shuffle=False, num_workers=4)

  label = np.fromstring(fin.read(), dtype=np.uint8).astype(np.int32)
  data = np.fromstring(fin.read(), dtype=np.uint8)


In [5]:
num_fc = 512  # units in the hidden fully connected layer
net = gluon.nn.Sequential()
with net.name_scope():
    # Layers are created inside the name scope, in the order they are stacked.
    conv_pool_fc_stack = [
        gluon.nn.Conv2D(channels=20, kernel_size=5, activation='relu'),
        gluon.nn.MaxPool2D(pool_size=2, strides=2),
        gluon.nn.Conv2D(channels=50, kernel_size=5, activation='relu'),
        gluon.nn.MaxPool2D(pool_size=2, strides=2),
        # The Flatten layer collapses all axes, except the first one, into one axis.
        gluon.nn.Flatten(),
        gluon.nn.Dense(num_fc, activation="relu"),
        gluon.nn.Dense(num_outputs),
    ]
    for layer in conv_pool_fc_stack:
        net.add(layer)

In [6]:
# (Re-)initialise every parameter of the network on all contexts in `ctx`.
xavier_init = mx.init.Xavier(magnitude=2.24)
net.collect_params().initialize(xavier_init, ctx=ctx, force_reinit=True)

In [7]:
# Loss object applied to the network outputs and labels in the training loop.
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()

In [8]:
# SGD trainer over all network parameters.
sgd_options = {'learning_rate': learning_rate}
trainer = gluon.Trainer(net.collect_params(), 'sgd', sgd_options)

In [9]:
# Pre-load the training set onto the GPUs. Every entry of train_data_l /
# train_label_l is the list of per-device shards of one batch, placed on the
# contexts in `ctx` (gpu(0) .. gpu(3)).
# There will be 235 loop iterations.
train_data_l, train_label_l = [], []
for batch_images, batch_targets in train_data:
    train_data_l.append(gluon.utils.split_and_load(batch_images, ctx_list=ctx))
    train_label_l.append(gluon.utils.split_and_load(batch_targets, ctx_list=ctx))

In [10]:
# Pre-load the test set onto the GPUs, one list of per-device shards per batch.
test_data_l, test_label_l = [], []
for batch_images, batch_targets in test_data:
    test_data_l.append(gluon.utils.split_and_load(batch_images, ctx_list=ctx))
    test_label_l.append(gluon.utils.split_and_load(batch_targets, ctx_list=ctx))

In [11]:
# Number of pre-split batches held in train_data_l.
len(train_data_l)

235

In [12]:
# Number of per-device shards in the last batch. Index -1 is used instead of a
# hard-coded position so the cell keeps working if the number of batches changes.
len(train_data_l[-1])

4

In [13]:
# Length (first-axis size) of the first shard of the first batch.
len(train_data_l[0][0])

64

In [14]:
# Type of a single shard.
type(train_data_l[0][0])

mxnet.ndarray.ndarray.NDArray

In [15]:
# Full shape of the first shard of the first batch.
train_data_l[0][0].shape

(64, 1, 28, 28)

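Indexing layout: `data_l[0-234][0-3][0-63]` (batch, GPU shard, sample within the shard).

`data_l` is a list with 235 elements (one per batch). Each element of `data_l` is
a list of 4 elements (one per GPU), and each of these 4 elements is
an NDArray of shape (64, 1, 28, 28).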


In [16]:
# Accuracy over a data iterator; works whether `ctx` is one context or a list of contexts.
def evaluate_accuracy(data_iterator, net):
    """Return the classification accuracy of ``net`` over ``data_iterator``.

    Each batch is split across the contexts in the notebook-level ``ctx``
    (a single context is treated as a one-element list), the network is run
    on every shard, and all shards are accumulated into a single metric.

    Args:
        data_iterator: iterable yielding ``(data, label)`` batches.
        net: the network to evaluate; called as ``net(shard)``.

    Returns:
        The accuracy value reported by ``mx.metric.Accuracy``.
    """
    devices = ctx if isinstance(ctx, (list, tuple)) else [ctx]
    acc = mx.metric.Accuracy()
    for data, label in data_iterator:
        # Same splitting call as the training loop, so batches are handled alike.
        data_shards = gluon.utils.split_and_load(data, devices)
        label_shards = gluon.utils.split_and_load(label, devices)
        # Predicted class per sample, one NDArray per device.
        predictions = [nd.argmax(net(shard), axis=1) for shard in data_shards]
        # The metric takes parallel lists of predictions and labels.
        acc.update(preds=predictions, labels=label_shards)
    return acc.get()[1]

In [17]:
# Suggestion by feevos: run all inference on gpu(0) and keep a single accuracy metric.
def eval_acc_feevos1(net, data_l, label_l):
    """Accuracy over pre-loaded batches, with every forward pass on ``mx.gpu(0)``.

    Args:
        net: the network to evaluate.
        data_l: list of batches; each batch is either one NDArray or a list
            of per-device NDArray shards.
        label_l: labels laid out the same way as ``data_l``.

    Returns:
        The accuracy value reported by ``mx.metric.Accuracy``.
    """
    acc = mx.metric.Accuracy()  # single accuracy
    for data, label in zip(data_l, label_l):
        # Entries may be a bare NDArray or a list of shards; normalise to a list.
        data_shards = data if isinstance(data, (list, tuple)) else [data]
        label_shards = label if isinstance(label, (list, tuple)) else [label]
        for shard, target in zip(data_shards, label_shards):
            shard = shard.as_in_context(mx.gpu(0))
            # Predictions and labels are handed to the metric in the CPU
            # context, as in the original suggestion.
            pred = nd.argmax(net(shard), axis=1).as_in_context(mx.cpu())
            acc.update(preds=pred, labels=target.as_in_context(mx.cpu()))
    return (acc.get()[1])

In [18]:
# Suggestion by feevos: run inference on each GPU, gather the results on the CPU.
def eval_acc_feevos2(net, data_l, label_l):
    """Accuracy over pre-split batches, gathering predictions on the CPU.

    Args:
        net: the network to evaluate.
        data_l: list of batches; each batch is a list of per-device NDArray shards.
        label_l: list of batches of labels; each entry is a list of shards
            matching ``data_l``, or a single NDArray.

    Returns:
        The accuracy value reported by ``mx.metric.Accuracy``.
    """
    acc = mx.metric.Accuracy()  # single accuracy
    for data, label in zip(data_l, label_l):
        label_shards = label if isinstance(label, (list, tuple)) else [label]
        # Perform inference on each separate GPU. axis=1 selects the predicted
        # class of every sample, so predictions line up one-to-one with labels.
        pred = [nd.argmax(net(X), axis=1).as_in_context(mx.cpu()) for X in data]
        truth = [y.as_in_context(mx.cpu()) for y in label_shards]
        # Collect the per-device results into one array each.
        pred = nd.concat(*pred, dim=0)
        truth = nd.concat(*truth, dim=0)
        acc.update(preds=pred, labels=truth)  # update single accuracy

    return (acc.get()[1])

In [19]:
# Multi-GPU accuracy over pre-split batches.
# As seen in the epoch timings below, accuracy calculation adds ~20 seconds to an epoch.
# See more at: https://discuss.mxnet.io/t/evaluate-accuracy-on-multi-gpu-machine/1972
def eval_acc(net, data_l, label_l):
    """Return the accuracy of ``net`` over batches already split per device.

    All shards are accumulated into one metric, so the result is the overall
    fraction of correct predictions. This equals the mean of per-device
    accuracies whenever every device sees the same number of samples.

    Args:
        net: the network to evaluate.
        data_l: list of batches; each batch is a list of NDArray shards, one
            per context in the notebook-level ``ctx``.
        label_l: list of batches of label shards, parallel to ``data_l``.

    Returns:
        The accuracy value reported by ``mx.metric.Accuracy``.
    """
    acc = mx.metric.Accuracy()
    for data, label in zip(data_l, label_l):  # loop over the batches
        # Make sure shard n sits on the n-th context before the forward pass.
        preds = [nd.argmax(net(shard.as_in_context(device)), axis=1)
                 for shard, device in zip(data, ctx)]
        # Labels are passed as they are; they are not copied to another device here.
        acc.update(preds=preds, labels=list(label[:len(preds)]))
    return acc.get()[1]

In [20]:
epochs = 11
smoothing_constant = .01  # not referenced anywhere in the visible notebook
test_acc = train_acc = 0

for e in range(epochs):
    train_loss = 0.
    samples_seen = 0
    tic = time()
    for data, label in train_data:  # one mini-batch per iteration (see batch_size in DataLoader)
        data_list = gluon.utils.split_and_load(data, ctx)  # split the batch across the devices
        label_list = gluon.utils.split_and_load(label, ctx)

        with autograd.record():
            losses = [softmax_cross_entropy(net(X), y)
                      for X, y in zip(data_list, label_list)]
        for l in losses:
            l.backward()

        # Use the real size of this batch: the last batch of an epoch can hold
        # fewer than batch_size samples.
        current_batch_size = data.shape[0]
        trainer.step(current_batch_size)
        # Sum losses over all devices
        train_loss += sum(l.sum().asscalar() for l in losses)
        samples_seen += current_batch_size

    if (e % 5 == 0):  # calculate accuracy every 5th epoch
        test_acc = eval_acc(net, test_data_l, test_label_l)
        train_acc = eval_acc(net, train_data_l, train_label_l)

    # In epochs without an evaluation the accuracies printed are the most
    # recently computed ones. The loss is averaged over the samples actually seen.
    print("Epoch %d: Loss: %.3f, train_accuracy %.3f, test_accuracy %.3f, Time %.1f sec" %
          (e, train_loss / max(samples_seen, 1), train_acc, test_acc, time() - tic))

Epoch 0: Loss: 0.513, train_accuracy 0.963, test_accuracy 0.967, Time 27.5 sec
Epoch 1: Loss: 0.104, train_accuracy 0.963, test_accuracy 0.967, Time 7.1 sec
Epoch 2: Loss: 0.073, train_accuracy 0.963, test_accuracy 0.967, Time 7.2 sec
Epoch 3: Loss: 0.057, train_accuracy 0.963, test_accuracy 0.967, Time 7.2 sec
Epoch 4: Loss: 0.048, train_accuracy 0.963, test_accuracy 0.967, Time 7.2 sec
Epoch 5: Loss: 0.041, train_accuracy 0.989, test_accuracy 0.987, Time 27.4 sec
Epoch 6: Loss: 0.035, train_accuracy 0.989, test_accuracy 0.987, Time 7.0 sec
Epoch 7: Loss: 0.032, train_accuracy 0.989, test_accuracy 0.987, Time 7.2 sec
Epoch 8: Loss: 0.028, train_accuracy 0.989, test_accuracy 0.987, Time 7.1 sec
Epoch 9: Loss: 0.025, train_accuracy 0.989, test_accuracy 0.987, Time 7.1 sec
Epoch 10: Loss: 0.022, train_accuracy 0.994, test_accuracy 0.989, Time 27.6 sec


In [None]:
# Save the trained parameters. The target directory is created first so the
# cell does not fail at the very end of a full run when `models/` is missing.
params_path = "models/cnn_4gpu_mnist.par"
params_dir = os.path.dirname(params_path)
if params_dir and not os.path.isdir(params_dir):
    os.makedirs(params_dir)
net.save_params(params_path)