<a href="https://colab.research.google.com/github/gmihaila/machine_learning_toolbox/blob/master/test_mxnet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Print the installed CUDA compiler version (the install cell further down uses a cu92 MXNet build).
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Tue_Jun_12_23:07:04_CDT_2018
Cuda compilation tools, release 9.2, V9.2.148


In [2]:
# Query the NVIDIA driver to see whether a GPU is visible to this runtime.
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



# Handwritten Digit Recognition [GPU use]

Inspiration: 

* https://gluon.mxnet.io/chapter07_distributed-learning/multiple-gpus-scratch.html


In this tutorial, we'll give you a step-by-step walk-through of how to build a handwritten digit classifier using the [MNIST](https://en.wikipedia.org/wiki/MNIST_database) dataset. For someone new to deep learning, this exercise is arguably the "Hello World" equivalent.

In [0]:
# mxnet-cu92 and mxnet-cu92mkl are alternative builds that both provide the
# `mxnet` package, so installing both makes the second overwrite the first.
# Install only one; the MKL build is kept because it was the one installed last.
!pip install mxnet-cu92mkl

In [5]:
from mxnet.test_utils import get_mnist
from mxnet.io import NDArrayIter
from mxnet import gpu
from mxnet import gluon
from mxnet import nd
from mxnet import autograd
from time import time



# Given a list of data that spans multiple GPUs, we then define a function to sum the data and broadcast the results to each GPU.
def allreduce(data):
    """Sum a list of arrays in place and give every entry the total.

    The arrays after the first are copied to the first array's context and
    accumulated into it; the accumulated result is then copied back into
    each of the other arrays. Nothing is returned.
    """
    if len(data) < 2:
        # nothing to aggregate or broadcast
        return
    root, peers = data[0], data[1:]
    # reduce: accumulate everything on the first array's device
    for peer in peers:
        root[:] += peer.copyto(root.context)
    # broadcast: overwrite every other array with the total
    for peer in peers:
        root.copyto(peer)
        
        

# Given a data batch, we define a function that splits this batch and copies each part into the corresponding GPU.
def split_and_load(data, ctx):
    """Cut ``data`` into equal consecutive slices, one per context.

    ``data`` is sliced along its first axis into ``len(ctx)`` pieces and the
    i-th piece is moved to ``ctx[i]`` with ``as_in_context``. The number of
    examples must be an exact multiple of the number of devices.
    """
    num_examples = data.shape[0]
    num_devices = len(ctx)
    shard = num_examples // num_devices
    assert shard * num_devices == num_examples, '# examples is not divided by # devices'
    # slice boundaries: 0, shard, 2*shard, ..., num_examples
    bounds = list(range(0, num_examples + 1, shard))
    return [data[lo:hi].as_in_context(device)
            for lo, hi, device in zip(bounds, bounds[1:], ctx)]
  
  
def train_batch(batch, params, ctx, lr):
    """Run one data-parallel training step on a single mini-batch.

    batch  -- data batch; ``batch.data[0]`` and ``batch.label[0]`` are used
    params -- one parameter list per device, in the same order as ``ctx``
    ctx    -- list of device contexts
    lr     -- learning rate; it is divided by the batch size before the update
    """
    inputs = batch.data[0]

    # shard inputs and labels across the devices
    data_shards = split_and_load(inputs, ctx)
    label_shards = split_and_load(batch.label[0], ctx)

    # forward pass on every device, recorded for autograd
    with autograd.record():
        losses = []
        for X, Y, W in zip(data_shards, label_shards, params):
            losses.append(loss(lenet(X, W), Y))

    # backward pass on every device
    for device_loss in losses:
        device_loss.backward()

    # sum each parameter's gradient over all devices
    num_devices = len(ctx)
    for i in range(len(params[0])):
        allreduce([params[dev][i].grad for dev in range(num_devices)])

    # SGD step on every device; the rate is scaled by 1/batch_size
    # (presumably because the gradients are summed over the batch -- TODO confirm)
    step = lr / inputs.shape[0]
    for device_params in params:
        SGD(device_params, step)



# Initial model parameters: small Gaussian weights, zero biases.
scale = .01

# weights -- conv kernels first, then the two dense matrices
W1 = nd.random_normal(shape=(20, 1, 3, 3)) * scale
W2 = nd.random_normal(shape=(50, 20, 5, 5)) * scale
W3 = nd.random_normal(shape=(800, 128)) * scale
W4 = nd.random_normal(shape=(128, 10)) * scale

# biases -- one zero vector per layer, sized to that layer's output width
b1, b2, b3, b4 = (nd.zeros(shape=width) for width in (20, 50, 128, 10))

# flat list in the order lenet() indexes it
params = [W1, b1, W2, b2, W3, b3, W4, b4]


# network and loss
def lenet(X, params):
    """Forward pass of the two-conv, two-dense network.

    ``params`` is indexed 0..7 as: conv1 weight, conv1 bias, conv2 weight,
    conv2 bias, dense1 weight, dense1 bias, dense2 weight, dense2 bias.
    Returns the output of the last dense layer; no softmax is applied here.
    """
    # conv block 1: 3x3 convolution (20 filters) -> ReLU -> 2x2 max-pool, stride 2
    conv1 = nd.Convolution(data=X, weight=params[0], bias=params[1],
                           kernel=(3, 3), num_filter=20)
    pooled1 = nd.Pooling(data=nd.relu(conv1), pool_type="max",
                         kernel=(2, 2), stride=(2, 2))

    # conv block 2: 5x5 convolution (50 filters) -> ReLU -> 2x2 max-pool, stride 2
    conv2 = nd.Convolution(data=pooled1, weight=params[2], bias=params[3],
                           kernel=(5, 5), num_filter=50)
    pooled2 = nd.Pooling(data=nd.relu(conv2), pool_type="max",
                         kernel=(2, 2), stride=(2, 2))

    # dense head on the flattened feature maps
    flat = nd.flatten(pooled2)
    hidden = nd.relu(nd.dot(flat, params[4]) + params[5])
    return nd.dot(hidden, params[6]) + params[7]

# Loss object called in train_batch as loss(lenet(X, W), Y), i.e. on the network's raw outputs.
loss = gluon.loss.SoftmaxCrossEntropyLoss()

# plain SGD
def SGD(params, lr):
    """Apply one in-place gradient-descent step to every parameter.

    Each parameter is overwritten with ``param - lr * param.grad``.
    """
    for param in params:
        step = lr * param.grad
        param[:] = param - step
        
        

# The following function copies the parameters into a particular GPU and initializes the gradients.
def get_params(params, ctx):
    """Return copies of ``params`` placed on ``ctx`` with gradients attached.

    Every parameter is first copied to ``ctx``; ``attach_grad()`` is then
    called on each copy. The input parameters themselves are not modified.
    """
    replicas = []
    for source in params:
        replicas.append(source.copyto(ctx))
    for replica in replicas:
        replica.attach_grad()
    return replicas

  
  
# For inference, we simply let it run on the first GPU. We leave a data parallelism implementation as an exercise.  
def valid_batch(batch, params, ctx):
    """Count the correct predictions for one batch on the first device.

    Only ``ctx[0]`` and its parameter list ``params[0]`` are used. The
    predicted class is the argmax of the network output along axis 1.
    """
    device = ctx[0]
    images = batch.data[0].as_in_context(device)
    predicted = nd.argmax(lenet(images, params[0]), axis=1)
    labels = batch.label[0].as_in_context(device)
    hits = predicted == labels
    return nd.sum(hits).asscalar()
  
  
  

def run(num_gpus, batch_size, lr, num_epochs=5):
    """Train the network on MNIST across GPUs and print per-epoch results.

    Parameters
    ----------
    num_gpus : int
        Number of GPUs to use; contexts gpu(0) .. gpu(num_gpus - 1) are created.
    batch_size : int
        Mini-batch size given to both data iterators. ``split_and_load``
        requires it to be divisible by ``num_gpus``.
    lr : float
        Learning rate forwarded to ``train_batch``.
    num_epochs : int, optional
        Number of passes over the training data. Defaults to 5, the value
        that used to be hard-coded.

    Raises
    ------
    ValueError
        If ``num_gpus`` is smaller than 1.
    """
    # Fail fast: with no device the first training batch would otherwise die
    # with a ZeroDivisionError inside split_and_load, after the data is loaded.
    if num_gpus < 1:
        raise ValueError('num_gpus must be at least 1, got {}'.format(num_gpus))

    # the list of GPUs will be used
    ctx = [gpu(i) for i in range(num_gpus)]
    print('Running on {}'.format(ctx))

    # data iterator
    mnist = get_mnist()
    train_data = NDArrayIter(mnist["train_data"], mnist["train_label"], batch_size)
    valid_data = NDArrayIter(mnist["test_data"], mnist["test_label"], batch_size)
    print('Batch size is {}'.format(batch_size))

    # copy parameters to all GPUs
    dev_params = [get_params(params, c) for c in ctx]
    for epoch in range(num_epochs):
        # train
        start = time()
        train_data.reset()
        for batch in train_data:
            train_batch(batch, dev_params, ctx, lr)
        nd.waitall()  # wait until all computations are finished so the timing is meaningful
        print('Epoch %d, training time = %.1f sec'%(epoch, time()-start))

        # validating
        # NOTE(review): NDArrayIter appears to pad the final batch by default, so
        # padded samples would be counted in both `correct` and `num` when
        # batch_size does not divide the test set -- confirm whether batch.pad
        # should be excluded.
        valid_data.reset()
        correct, num = 0.0, 0.0
        for batch in valid_data:
            correct += valid_batch(batch, dev_params, ctx)
            num += batch.data[0].shape[0]
        print('         validation accuracy = %.4f'%(correct/num))
        

        
GPU_COUNT = 1  # number of GPUs to train on; raise it if more are available

# Launch training: 64 samples per GPU in each global batch, learning rate 0.3.
run(num_gpus=GPU_COUNT, batch_size=64 * GPU_COUNT, lr=.3)


Running on [gpu(0)]
Batch size is 64
Epoch 0, training time = 5.1 sec
         validation accuracy = 0.9633
Epoch 1, training time = 5.0 sec
         validation accuracy = 0.9807
Epoch 2, training time = 4.9 sec
         validation accuracy = 0.9828
Epoch 3, training time = 4.9 sec
         validation accuracy = 0.9856
Epoch 4, training time = 4.9 sec
         validation accuracy = 0.9839


# Handwritten Digit Recognition [CPU use]

Inspiration: 
* https://mxnet.incubator.apache.org/versions/master/tutorials/python/mnist.html
* https://mxnet.incubator.apache.org/versions/master/tutorials/python/mnist.ipynb

In this tutorial, we'll give you a step-by-step walk-through of how to build a handwritten digit classifier using the [MNIST](https://en.wikipedia.org/wiki/MNIST_database) dataset. For someone new to deep learning, this exercise is arguably the "Hello World" equivalent.

In [0]:
# Install the plain `mxnet` package used by the CPU section.
# NOTE(review): if a CUDA build from the GPU section is already installed in this
# runtime, this puts a second `mxnet` distribution on top of it -- confirm the two
# sections are meant to run in separate sessions.
!pip install mxnet

In [2]:
import logging

import mxnet as mx
import numpy as np


# Load MNIST via MXNet's test helper; the result is indexed below by the keys
# 'train_data', 'train_label', 'test_data' and 'test_label'.
mnist = mx.test_utils.get_mnist()



# Fix the seeds. mx.random.seed covers MXNet's own random number generators;
# NumPy's global generator is seeded as well because NDArrayIter(shuffle=True)
# shuffles with it, so without this the batch order differs between runs.
mx.random.seed(42)
np.random.seed(42)

# Compute context: this section deliberately runs on the CPU.
# To use a GPU when one is available instead:
#   ctx = mx.gpu() if mx.test_utils.list_gpus() else mx.cpu()
ctx = mx.cpu()



# Mini-batch iterators: the training data is shuffled, the validation data is not.
batch_size = 100
train_iter = mx.io.NDArrayIter(data=mnist['train_data'],
                               label=mnist['train_label'],
                               batch_size=batch_size,
                               shuffle=True)
val_iter = mx.io.NDArrayIter(data=mnist['test_data'],
                             label=mnist['test_label'],
                             batch_size=batch_size)



# Symbolic definition of a three-layer perceptron: 128 -> 64 -> 10 units.
# Symbolic input variable named 'data'.
data = mx.sym.var('data')
# Flatten the data from 4-D shape into 2-D (batch_size, num_channel*width*height)
data = mx.sym.flatten(data=data)



# The first fully-connected layer (128 hidden units) and its ReLU activation
fc1  = mx.sym.FullyConnected(data=data, num_hidden=128)
act1 = mx.sym.Activation(data=fc1, act_type="relu")

# The second fully-connected layer (64 hidden units) and its ReLU activation
fc2  = mx.sym.FullyConnected(data=act1, num_hidden = 64)
act2 = mx.sym.Activation(data=fc2, act_type="relu")


# Output layer: MNIST has 10 classes, so 10 units
fc3  = mx.sym.FullyConnected(data=act2, num_hidden=10)
# Softmax with cross entropy loss; `mlp` is the symbol handed to the Module below
mlp  = mx.sym.SoftmaxOutput(data=fc3, name='softmax')



# Lower the root logger's threshold so the per-batch and per-epoch INFO
# messages emitted during fit() are shown. This only sets the level; it does
# not choose where the log records are written.
logging.getLogger().setLevel(logging.DEBUG)
# create a trainable module on compute context
mlp_model = mx.mod.Module(symbol=mlp, context=ctx)
mlp_model.fit(train_iter,  # train data
              eval_data=val_iter,  # validation data
              optimizer='sgd',  # use SGD to train
              optimizer_params={'learning_rate':0.1},  # use fixed learning rate
              eval_metric='acc',  # report accuracy during training
              batch_end_callback = mx.callback.Speedometer(batch_size, 100), # output progress for each 100 data batches
              num_epoch=10)  # train for at most 10 dataset passes



# Class scores for every test image; no labels are passed for prediction.
test_iter = mx.io.NDArrayIter(data=mnist['test_data'], label=None, batch_size=batch_size)
prob = mlp_model.predict(test_iter)
# one row per test image, one column per class
assert prob.shape == (10000, 10)


# Accuracy on the labelled test set, accumulated into `acc`.
test_iter = mx.io.NDArrayIter(data=mnist['test_data'],
                              label=mnist['test_label'],
                              batch_size=batch_size)
acc = mx.metric.Accuracy()
mlp_model.score(eval_data=test_iter, eval_metric=acc)
print(acc)

INFO:root:Epoch[0] Batch [100]	Speed: 44518.15 samples/sec	accuracy=0.114059
INFO:root:Epoch[0] Batch [200]	Speed: 42243.82 samples/sec	accuracy=0.112900
INFO:root:Epoch[0] Batch [300]	Speed: 44004.01 samples/sec	accuracy=0.108800
INFO:root:Epoch[0] Batch [400]	Speed: 42883.56 samples/sec	accuracy=0.112100
INFO:root:Epoch[0] Batch [500]	Speed: 43843.43 samples/sec	accuracy=0.137200
INFO:root:Epoch[0] Train-accuracy=0.221111
INFO:root:Epoch[0] Time cost=1.396
INFO:root:Epoch[0] Validation-accuracy=0.297000
INFO:root:Epoch[1] Batch [100]	Speed: 44087.78 samples/sec	accuracy=0.452970
INFO:root:Epoch[1] Batch [200]	Speed: 43527.35 samples/sec	accuracy=0.699900
INFO:root:Epoch[1] Batch [300]	Speed: 44139.42 samples/sec	accuracy=0.772100
INFO:root:Epoch[1] Batch [400]	Speed: 43953.38 samples/sec	accuracy=0.807200
INFO:root:Epoch[1] Batch [500]	Speed: 43395.98 samples/sec	accuracy=0.821900
INFO:root:Epoch[1] Train-accuracy=0.840000
INFO:root:Epoch[1] Time cost=1.387
INFO:root:Epoch[1] Validat

EvalMetric: {'accuracy': 0.9666}


EvalMetric: {'accuracy': 0.9674}
