The default device is the CPU.

In [None]:
from mxnet import nd, cpu, gpu, gluon, autograd
from mxnet.gluon import nn
from mxnet.gluon.data.vision import datasets, transforms
import time

In [13]:
# Allocate a 3x4 array of ones directly on the GPU; the repr below reports
# the context it landed on.
# With a second GPU the array could be copied over via x.copyto(gpu(1)),
# but this machine has only one GPU, so that call is left out.
x = nd.ones(shape=(3, 4), ctx=gpu())
x


[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]
<NDArray 3x4 @gpu(0)>

In [14]:
# Draw a 3x4 array of uniform random values on the GPU as well, so that
# x and y sit in the same context, then display their element-wise sum.
y = nd.random.uniform(shape=(3, 4), ctx=gpu())
x + y


[[1.3925502 1.8212544 1.0012403 1.2185879]
 [1.307889  1.3717465 1.2730181 1.3112395]
 [1.2925439 1.5859082 1.8407545 1.7850714]]
<NDArray 3x4 @gpu(0)>

In [17]:
# Rebuild the network layer by layer: two convolution + max-pooling stages,
# a flatten, and three dense layers ending in 10 outputs.
net = nn.Sequential()
net.add(nn.Conv2D(channels=6, kernel_size=5, activation='relu'))
net.add(nn.MaxPool2D(pool_size=2, strides=2))
net.add(nn.Conv2D(channels=16, kernel_size=3, activation='relu'))
net.add(nn.MaxPool2D(pool_size=2, strides=2))
net.add(nn.Flatten())
net.add(nn.Dense(120, activation="relu"))
net.add(nn.Dense(84, activation="relu"))
net.add(nn.Dense(10))

# Restore previously saved weights straight into GPU memory.
# NOTE(review): 'net.params' must already exist on disk; it is not created
# anywhere in this notebook.
net.load_parameters('net.params', ctx=gpu(0))

# Smoke-test the loaded network on one random input living on the same GPU.
# (presumably laid out as batch x channel x height x width -- confirm)
x = nd.random.uniform(shape=(1, 1, 28, 28), ctx=gpu(0))
output = net(x)
print(output)
output.shape


[[ 1.2280461  -0.8499762   1.1064829   0.8627089   0.10542075 -1.4364507
   1.9387282  -1.2366142   0.01517534 -1.6967722 ]]
<NDArray 1x10 @gpu(0)>


(1, 10)

Multi-GPU training
I have only one GPU, so the code below runs on a single device.

In [18]:
# Number of samples per mini-batch, shared by both loaders below.
batch_size = 256

# Per-image preprocessing: convert to a tensor, then normalize with
# mean 0.13 and standard deviation 0.31.
transformer = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=0.13, std=0.31),
])

# Training split: shuffled, loaded by 4 worker processes.
train_data = gluon.data.DataLoader(
    datasets.FashionMNIST(train=True).transform_first(transformer),
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
)

# Validation split: same preprocessing, kept in its original order.
valid_data = gluon.data.DataLoader(
    datasets.FashionMNIST(train=False).transform_first(transformer),
    batch_size=batch_size,
    shuffle=False,
    num_workers=4,
)

In [23]:
# Diff 1: list every device to train on.
# With two GPUs this would be: devices = [gpu(0), gpu(1)]
devices = [gpu(0)]  # this machine has only one GPU
# Diff 2: reinitialize the parameters and place a copy on every listed device
net.collect_params().initialize(force_reinit=True, ctx=devices)
# Loss and trainer are the same as before
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1})
for epoch in range(10):
    train_loss = 0.
    num_samples = 0  # samples actually processed during this epoch
    tic = time.time()
    for data, label in train_data:
        # The last batch of an epoch can hold fewer than `batch_size` samples
        # when the dataset size is not a multiple of it, so use the real size.
        current_batch_size = data.shape[0]
        # Diff 3: split the batch and load each slice onto its device
        data_list = gluon.utils.split_and_load(data, devices)
        label_list = gluon.utils.split_and_load(label, devices)
        # Diff 4: run forward and backward on each device.
        # MXNet will automatically run them in parallel
        with autograd.record():
            losses = [softmax_cross_entropy(net(X), y)
                      for X, y in zip(data_list, label_list)]
        # Backpropagate every per-device loss
        for l in losses:
            l.backward()
        # Gradients are rescaled by 1/current_batch_size; passing the nominal
        # batch_size here would under-weight a partial final batch.
        trainer.step(current_batch_size)
        # Diff 5: sum losses over all devices
        train_loss += sum([l.sum().asscalar() for l in losses])
        num_samples += current_batch_size
    # Average over the samples really seen, not len(train_data) * batch_size,
    # which overcounts whenever the final batch is partial.
    print("Epoch %d: loss %.3f, in %.1f sec" % (
        epoch, train_loss / max(num_samples, 1), time.time()-tic))

Epoch 0: loss 1.659, in 5.9 sec
Epoch 1: loss 0.702, in 6.2 sec
Epoch 2: loss 0.558, in 5.9 sec
Epoch 3: loss 0.496, in 5.9 sec
Epoch 4: loss 0.449, in 6.0 sec
Epoch 5: loss 0.416, in 6.4 sec
Epoch 6: loss 0.389, in 6.2 sec
Epoch 7: loss 0.369, in 5.9 sec
Epoch 8: loss 0.350, in 5.9 sec
Epoch 9: loss 0.337, in 5.9 sec
