In [16]:
import matplotlib.pyplot as plt
from mxnet import autograd, nd, gluon
from mxnet.gluon.data.vision import transforms

In [17]:
def get_dataloader(batch_size):
    """Build Fashion-MNIST data loaders for the training and test splits.

    Args:
        batch_size: Number of samples per mini-batch for both loaders.

    Returns:
        A ``(train_iter, test_iter)`` tuple of ``gluon.data.DataLoader``
        objects. The training loader shuffles its samples; the test loader
        does not.
    """
    # transform_first applies the transform to the image only, not the label.
    transformer = transforms.Compose([
        transforms.ToTensor()
    ])
    train = gluon.data.vision.datasets.FashionMNIST(train=True)
    train = train.transform_first(transformer)
    train_iter = gluon.data.DataLoader(train, batch_size, shuffle=True, num_workers=4)

    # train=False selects the held-out split; loading the training split here
    # would make any "test" metric a second measurement of training data.
    test = gluon.data.vision.datasets.FashionMNIST(train=False)
    test = test.transform_first(transformer)
    test_iter = gluon.data.DataLoader(test, batch_size, shuffle=False, num_workers=4)
    return train_iter, test_iter

In [18]:
def show_images(X, nrows, ncols):
    """Draw images on an ``nrows`` x ``ncols`` grid of subplots.

    Args:
        X: Iterable of images in a form ``Axes.imshow`` accepts. Images are
            paired with axes in order; pairing stops at whichever runs out
            first.
        nrows: Number of subplot rows.
        ncols: Number of subplot columns.

    Returns:
        The flattened array of all ``nrows * ncols`` axes.
    """
    # squeeze=False makes subplots always return a 2-D array of axes, so
    # flatten() also works for a 1x1 grid (otherwise a bare Axes is returned).
    _, axes = plt.subplots(nrows, ncols, squeeze=False)
    axes = axes.flatten()
    for img, ax in zip(X, axes):
        ax.imshow(img)
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
    return axes

In [19]:
def init_params(num_inputs, num_hidden1, num_hidden2, num_output):
    """Create the parameters of a three-layer fully connected network.

    Weights are drawn from a normal distribution with scale 0.01 and biases
    start at zero. Every parameter has a gradient buffer attached.

    Returns:
        The list ``[W1, b1, W2, b2, W3, b3]``.
    """
    layer_sizes = [num_inputs, num_hidden1, num_hidden2, num_output]
    params = []
    # Walk consecutive (fan_in, fan_out) pairs: inputs->h1, h1->h2, h2->output.
    for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
        params.append(nd.random.normal(scale=0.01, shape=(fan_in, fan_out)))
        params.append(nd.zeros(fan_out))
    for tensor in params:
        tensor.attach_grad()
    return params

In [20]:
def accuracy(net, params, num_inputs, drop_probs, loss, data_iter):
    """Evaluate classification accuracy and mean loss over a data iterator.

    Args:
        net: Callable invoked as ``net(X, params, num_inputs, drop_probs)``
            that returns one row of class scores per sample.
        params: Model parameters, forwarded to ``net``.
        num_inputs: Flattened input size, forwarded to ``net``.
        drop_probs: Dropout probabilities, forwarded to ``net``.
        loss: Callable invoked as ``loss(scores, y)`` returning per-sample
            losses.
        data_iter: Iterable yielding ``(X, y)`` batches.

    Returns:
        A tuple ``(accuracy, mean_loss)``, both averaged over all samples.

    Raises:
        ZeroDivisionError: If ``data_iter`` yields no samples.
    """
    correct = 0
    total_loss = 0
    num_samples = 0
    for X, y in data_iter:
        scores = net(X, params, num_inputs, drop_probs)
        total_loss += loss(scores, y).sum().asscalar()
        # Softmax is monotonic within each row, so the arg-max of the raw
        # scores is already the predicted class; skipping the exponentiation
        # also avoids overflow turning large scores into NaN probabilities.
        predicted = scores.argmax(axis=1)
        correct += (predicted == y.astype('float32')).sum().asscalar()
        num_samples += len(y)
    return correct / num_samples, total_loss / num_samples

In [21]:
def softmax(X):
    """Compute a row-wise softmax of a 2-D array of scores.

    Args:
        X: Array whose axis 1 indexes the classes.

    Returns:
        An array of the same shape in which every row sums to 1.
    """
    # Subtracting each row's maximum does not change the result
    # mathematically, but it keeps exp() from overflowing to inf (which would
    # produce inf / inf = NaN) when the scores are large.
    shifted = X - X.max(axis=1, keepdims=True)
    X_exp = shifted.exp()
    normalization = X_exp.sum(axis=1, keepdims=True)
    return X_exp / normalization

In [22]:
def dropout(h, drop_prob):
    """Apply inverted dropout to ``h``.

    Each element is zeroed with probability ``drop_prob`` and the survivors
    are divided by ``1 - drop_prob``.

    Args:
        h: Activations to drop from.
        drop_prob: Probability of dropping each element, in ``[0, 1]``.

    Returns:
        ``h`` itself when ``drop_prob`` is 0, an all-zero array shaped like
        ``h`` when ``drop_prob`` is 1, otherwise the masked and rescaled
        activations.

    Raises:
        ValueError: If ``drop_prob`` is outside ``[0, 1]``.
    """
    if not 0 <= drop_prob <= 1:
        raise ValueError('drop_prob must be in [0, 1], got %r' % (drop_prob,))
    if drop_prob == 0:
        # Nothing to drop; also avoids losing an element to a sample of exactly 0.
        return h
    if drop_prob == 1:
        # Everything is dropped; the general formula would compute 0 / 0 = NaN.
        return h.zeros_like()
    mask = nd.random.uniform(0, 1, shape=h.shape) > drop_prob
    return (mask * h) / (1.0 - drop_prob)

In [23]:
def sgd(params, lr, batch_size):
    """Take one mini-batch SGD step, updating each parameter in place.

    Every parameter moves against its ``.grad`` by ``lr / batch_size`` times
    that gradient.
    """
    for tensor in params:
        scaled_grad = (lr / batch_size) * tensor.grad
        # Slice assignment writes into the existing array rather than
        # rebinding the name to a new one.
        tensor[:] = tensor - scaled_grad

In [31]:
def net(X, params, num_inputs, drop_probs):
    """Forward pass of a two-hidden-layer MLP with optional dropout.

    Args:
        X: Input batch; reshaped to ``(-1, num_inputs)`` before the first layer.
        params: Sequence ``[W1, b1, W2, b2, W3, b3]``.
        num_inputs: Number of features per flattened sample.
        drop_probs: Dropout probabilities for the first and second hidden
            layers; only read while autograd is in training mode.

    Returns:
        The output-layer scores ``H2 . W3 + b3`` (no softmax applied).
    """
    W1, b1, W2, b2, W3, b3 = params
    hidden = X.reshape((-1, num_inputs))
    for layer, (weight, bias) in enumerate(((W1, b1), (W2, b2))):
        hidden = nd.dot(hidden, weight) + bias
        # Dropout is applied to the pre-activation, and only in training mode.
        if autograd.is_training():
            hidden = dropout(hidden, drop_probs[layer])
        hidden = hidden.relu()
    return nd.dot(hidden, W3) + b3

In [25]:
# --- Data ---
batch_size = 256
train_iter, test_iter = get_dataloader(batch_size)

# --- Model dimensions ---
num_inputs = 28 * 28  # net() flattens every sample to this many features
num_hidden1 = 256
num_hidden2 = 256
num_output = 10

# One dropout probability per hidden layer; 0.0 drops nothing.
drop_probs = [0.0, 0.0]

params = init_params(num_inputs, num_hidden1, num_hidden2, num_output)

In [32]:
# Train the MLP with mini-batch SGD and report training metrics after each epoch.
epochs = 5
lr = 0.1
loss = gluon.loss.SoftmaxCrossEntropyLoss()
for epoch in range(epochs):
    for X, y in train_iter:
        # Record the forward pass so autograd can differentiate it
        # (this also switches net() into training mode).
        with autograd.record():
            result = net(X, params, num_inputs, drop_probs)
            l = loss(result, y)
        l.backward()
        # Scale by the number of samples actually in this batch: the last
        # batch of an epoch can be smaller than batch_size.
        sgd(params, lr, X.shape[0])
    # Evaluated outside autograd.record(), so net() skips dropout here.
    epoch_acc, epoch_loss = accuracy(net, params, num_inputs, drop_probs, loss, train_iter)
    print('Epoch %d, acc: %f loss: %f' % (epoch, epoch_acc, epoch_loss))

Epoch 0, acc: 0.798250 loss: 0.545852
Epoch 1, acc: 0.827333 loss: 0.483884
Epoch 2, acc: 0.840633 loss: 0.448361
Epoch 3, acc: 0.854817 loss: 0.406807
Epoch 4, acc: 0.861367 loss: 0.389544


In [None]:
# Inspect a single training batch: show the first images, count the correct
# predictions in that batch, then report accuracy and mean loss over the
# whole training set.
for X, y in train_iter:
    # Drop the channel axis so each image is a 2-D array for imshow.
    show_images(X.squeeze(axis=1).asnumpy(), 2, 5)
    # Use the net/params defined in this notebook; no `model` object exists
    # on a fresh kernel.
    result = net(X, params, num_inputs, drop_probs)
    y_hat = softmax(result)
    print((y_hat.argmax(axis=1) == y.astype('float32')).sum())
    # accuracy() returns (accuracy, mean_loss).
    acc = accuracy(net, params, num_inputs, drop_probs,
                   gluon.loss.SoftmaxCrossEntropyLoss(), train_iter)
    print(acc)
    break