In [44]:
from __future__ import print_function
import mxnet as mx
import numpy as np
from mxnet import nd, autograd
# Prefer the GPU, but fall back to the CPU so the notebook still runs end to end
# on machines without a usable GPU.
try:
    ctx = mx.gpu()
    # mx.gpu() only builds a context handle; allocate (and read back) a tiny
    # array so an unavailable device is reported here rather than further down.
    nd.zeros((1,), ctx=ctx).asnumpy()
except mx.base.MXNetError:
    ctx = mx.cpu()

In [45]:
# Problem dimensions and mini-batch size used throughout the notebook.
num_inputs = 28 * 28  # one flattened 28 x 28 single-channel image
num_outputs = 10
batch_size = 64
def transform(data, label):
    """Convert one raw sample to float32.

    The image is cast to float32 and divided by 255 (presumably mapping 8-bit
    pixel values into [0, 1]); the label is cast to float32 unchanged.

    Returns:
        A ``(data, label)`` tuple of the converted values.
    """
    pixels = data.astype(np.float32)
    target = label.astype(np.float32)
    return pixels / 255, target
# Mini-batch iterators over MNIST; only the training split is reshuffled.
train_data = mx.gluon.data.DataLoader(
    mx.gluon.data.vision.MNIST(train=True, transform=transform),
    batch_size=batch_size,
    shuffle=True,
)
test_data = mx.gluon.data.DataLoader(
    mx.gluon.data.vision.MNIST(train=False, transform=transform),
    batch_size=batch_size,
    shuffle=False,
)

每个训练批次：数据的形状为 64 x 28 x 28 x 1，标签的形状为 64（批大小为 64）。

In [46]:

# Hyperparameters shared by the layers below; change them here to resize or
# re-initialise the network.
num_hidden = 256
weight_scale = .01

# First hidden layer: maps the num_inputs features to num_hidden units.
# Weights and bias are both drawn with nd.random_normal (scale=weight_scale)
# and allocated on ctx.
W1 = nd.random_normal(shape=(num_inputs, num_hidden), scale=weight_scale, ctx=ctx)
b1 = nd.random_normal(shape=num_hidden, scale=weight_scale, ctx=ctx)

# Second hidden layer: num_hidden -> num_hidden, initialised the same way.
W2 = nd.random_normal(shape=(num_hidden, num_hidden), scale=weight_scale, ctx=ctx)
b2 = nd.random_normal(shape=num_hidden, scale=weight_scale, ctx=ctx)

# Output layer: num_hidden -> num_outputs (one column per class).
W3 = nd.random_normal(shape=(num_hidden, num_outputs), scale=weight_scale, ctx=ctx)
b3 = nd.random_normal(shape=num_outputs, scale=weight_scale, ctx=ctx)



In [47]:
# Every trainable tensor, in layer order (weight, bias per layer).
params = [
    W1, b1,
    W2, b2,
    W3, b3,
]

求导之前要先注册：为每个参数的梯度（导数）申请内存。

In [48]:
# Allocate a gradient buffer on each parameter so autograd can fill in .grad.
for tensor in params:
    tensor.attach_grad()

定义relu函数

In [49]:
def relu(X):
    """Rectified linear unit: element-wise maximum of ``X`` and zero."""
    floor = nd.zeros_like(X)
    return nd.maximum(X, floor)

In [50]:
def softmax(y_liner):
    """Row-wise softmax of a 2-D array of scores.

    Args:
        y_liner: array of shape ``(batch, classes)`` holding the linear outputs.

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    # Shift every row by its OWN maximum before exponentiating. Softmax is
    # invariant to a per-row shift, and this keeps the largest exponent at
    # exp(0) = 1. Shifting by the global maximum instead lets a row far below
    # it underflow to all zeros, which then divides 0 by 0.
    row_max = nd.max(y_liner, axis=1, keepdims=True)
    exps = nd.exp(y_liner - row_max)
    partitions = nd.nansum(exps, axis=1).reshape((-1, 1))
    return nd.divide(exps, partitions)

In [51]:
def cross_entropy(yhat, y):
    """Per-example cross-entropy between predictions ``yhat`` and targets ``y``.

    Computes ``-sum(log(yhat) * y)`` along axis 1; the sum is taken with
    ``nansum``, so NaN terms are skipped rather than propagated.
    """
    weighted_log_probs = nd.log(yhat) * y
    per_example = nd.nansum(weighted_log_probs, axis=1)
    return -per_example

# Define the model

In [52]:
def net(X):
    """Forward pass: two ReLU hidden layers followed by a softmax output layer.

    Reads the module-level parameters W1/b1, W2/b2 and W3/b3.
    """
    hidden1_linear = nd.dot(X, W1) + b1
    hidden1 = relu(hidden1_linear)

    hidden2_linear = nd.dot(hidden1, W2) + b2
    hidden2 = relu(hidden2_linear)

    logits = nd.dot(hidden2, W3) + b3
    return softmax(logits)

In [53]:
def SGD(params, lr):
    """Apply one plain gradient-descent step to every tensor in ``params``.

    Each tensor is overwritten in place (slice assignment) with
    ``tensor - lr * tensor.grad``; nothing is returned.
    """
    for tensor in params:
        step = lr * tensor.grad
        tensor[:] = tensor - step

In [54]:

def evaluate_accuracy(data_iterator, net):
    """Return the fraction of examples that ``net`` classifies correctly.

    Args:
        data_iterator: iterable yielding ``(data, label)`` batches.
        net: callable mapping a ``(batch, num_inputs)`` array to per-class
            scores; the predicted class is the argmax along axis 1.

    Returns:
        The accuracy as a scalar (obtained with ``asscalar()``), or ``nan``
        when the iterator yields no examples.
    """
    correct = 0.
    seen = 0
    for data, label in data_iterator:
        # Move the batch to the compute context and flatten each sample to a row.
        data = data.as_in_context(ctx).reshape((-1, num_inputs))
        label = label.as_in_context(ctx)
        predictions = nd.argmax(net(data), axis=1)
        correct += nd.sum(predictions == label)
        seen += data.shape[0]
    if seen == 0:
        # Accuracy is undefined without examples; avoid dividing 0 by 0.
        return float("nan")
    return (correct / seen).asscalar()

In [55]:
# Training hyperparameters.
epochs = 10
learning_rate = .001
smoothing_constant = .01  # weight of the newest batch in the moving-average loss

for e in range(epochs):
    for i, (data, label) in enumerate(train_data):
        # Move the batch to the compute context and flatten each image to a
        # num_inputs-long row; labels become one-hot rows of width num_outputs.
        data = data.as_in_context(ctx).reshape((-1, num_inputs))
        label = label.as_in_context(ctx)
        label_one_hot = nd.one_hot(label, num_outputs)
        with autograd.record():
            output = net(data)
            loss = cross_entropy(output, label_one_hot)
        # loss holds one value per example; backward() on it accumulates the
        # gradient over the whole batch (a sum, not a mean), so the effective
        # step size grows with batch_size.
        loss.backward()
        SGD(params, learning_rate)

        # Exponential moving average of the mean batch loss, seeded with the
        # very first batch of the first epoch.
        curr_loss = nd.mean(loss).asscalar()
        if i == 0 and e == 0:
            moving_loss = curr_loss
        else:
            moving_loss = ((1 - smoothing_constant) * moving_loss
                           + smoothing_constant * curr_loss)

    # Report accuracy on both splits once per epoch.
    test_accuracy = evaluate_accuracy(test_data, net)
    train_accuracy = evaluate_accuracy(train_data, net)
    print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" %
          (e, moving_loss, train_accuracy, test_accuracy))

Epoch 0. Loss: 0.469116926087, Train_acc 0.877217, Test_acc 0.8763


Epoch 1. Loss: 0.276336041597, Train_acc 0.925617, Test_acc 0.9254


Epoch 2. Loss: 0.195274604596, Train_acc 0.9486, Test_acc 0.9476


Epoch 3. Loss: 0.153054735652, Train_acc 0.959067, Test_acc 0.9572


Epoch 4. Loss: 0.128155154713, Train_acc 0.9674, Test_acc 0.9631


Epoch 5. Loss: 0.100336893821, Train_acc 0.975433, Test_acc 0.9689


Epoch 6. Loss: 0.0922036774503, Train_acc 0.978983, Test_acc 0.9701


Epoch 7. Loss: 0.0799359683752, Train_acc 0.981417, Test_acc 0.97


Epoch 8. Loss: 0.0661243833132, Train_acc 0.984933, Test_acc 0.9732


Epoch 9. Loss: 0.0566082565396, Train_acc 0.984133, Test_acc 0.9743
