In [126]:
import matplotlib.pyplot as plt
import mxnet
from mxnet import autograd, nd, gluon
from mxnet.gluon.data.vision import transforms

In [127]:
def get_dataloader(batch_size, num_workers):
    """Build FashionMNIST data loaders for training and testing.

    Args:
        batch_size: Number of samples per mini-batch.
        num_workers: Passed through to ``gluon.data.DataLoader`` as
            ``num_workers``.

    Returns:
        A ``(train_iter, test_iter)`` tuple of ``DataLoader`` objects. The
        training loader reshuffles its samples; the test loader iterates
        in a fixed order.
    """
    transformer = transforms.Compose([
        transforms.ToTensor()
    ])
    train = gluon.data.vision.datasets.FashionMNIST(train=True)
    # transform_first applies the transform to the image only, not the label.
    train = train.transform_first(transformer)
    train_iter = gluon.data.DataLoader(train, batch_size, shuffle=True, num_workers=num_workers)
    test = gluon.data.vision.datasets.FashionMNIST(train=False)
    test = test.transform_first(transformer)
    # Evaluation does not benefit from shuffling; a fixed order keeps
    # per-batch inspection of the test set repeatable.
    test_iter = gluon.data.DataLoader(test, batch_size, shuffle=False, num_workers=num_workers)
    return train_iter, test_iter

In [128]:
def softmax(X):
    """Row-wise softmax of a 2-D array of scores.

    Args:
        X: 2-D array with one row of scores per sample (reductions are
            taken along ``axis=1``).

    Returns:
        An array of the same shape as ``X`` whose rows are non-negative and
        sum to 1.
    """
    # Subtracting each row's maximum leaves the result mathematically
    # unchanged but keeps exp() from overflowing to inf (and the division
    # from producing NaN) when the scores are large.
    shifted = X - X.max(axis=1, keepdims=True)
    X_exp = shifted.exp()
    normalization_constant = X_exp.sum(axis=1, keepdims=True)
    return X_exp / normalization_constant

In [168]:
def cross_entropy(y_hat, y):
    """Per-sample cross-entropy loss.

    Picks from each row of ``y_hat`` the entry indexed by the matching
    element of ``y`` and returns the negated logarithm of those entries.
    """
    true_class_prob = nd.pick(y_hat, y)
    return -true_class_prob.log()

In [169]:
def sgd(params, lr, batch_size):
    """Apply one mini-batch SGD step to every parameter, in place.

    Each parameter is overwritten with
    ``param - (lr / batch_size) * param.grad``.

    Args:
        params: Iterable of arrays that expose a ``.grad`` attribute.
        lr: Learning rate.
        batch_size: Divisor applied to the gradient.
    """
    for weights in params:
        scaled_grad = weights.grad * (lr / batch_size)
        # Slice assignment writes into the existing array instead of
        # rebinding the local name, so the caller's parameter is updated.
        weights[:] = weights - scaled_grad

In [170]:
def accuracy(y_hat, y):
    """Count the samples whose highest-scoring class equals the label.

    Args:
        y_hat: 2-D array of per-class scores, one row per sample.
        y: 1-D array of integer class labels.

    Returns:
        The number (not the fraction) of correct predictions.
    """
    predicted = y_hat.argmax(axis=1)
    # Labels are cast to float32 before the element-wise comparison.
    labels = y.astype('float32')
    return (predicted == labels).sum()

In [171]:
def evaluate_acc(net,W, b, num_features, data_iter):
    """Fraction of correctly classified samples over a whole data iterator.

    Args:
        net: Callable invoked as ``net(X, W, b, num_features)`` that returns
            per-class scores for a batch.
        W, b, num_features: Forwarded unchanged to ``net``.
        data_iter: Iterable yielding ``(X, y)`` batches.

    Returns:
        Total number of correct predictions divided by the total number of
        samples seen.
    """
    correct = 0
    seen = 0
    for features, labels in data_iter:
        predictions = net(features, W, b, num_features)
        correct += accuracy(predictions, labels)
        seen += len(labels)
    return correct / seen

In [172]:
def net(X, W, b, num_features):
    """Softmax-regression forward pass.

    Flattens each sample of ``X`` to ``num_features`` values, applies the
    affine map ``X W + b`` and returns the row-wise softmax of the result.
    """
    flattened = X.reshape(-1, num_features)
    logits = nd.dot(flattened, W) + b
    return softmax(logits)

In [173]:
# Mini-batch size and DataLoader worker count used for both loaders.
batch_size, num_workers = 256, 4
train_iter, test_iter = get_dataloader(batch_size=batch_size, num_workers=num_workers)

In [174]:
# Seed MXNet's random number generator so the weight initialisation below is
# the same on every Restart & Run All.
mxnet.random.seed(42)

# Each 28x28 image is flattened to 784 features; there are 10 output classes.
num_inputs = 28 * 28
num_outputs = 10
# Small random weights and zero biases for the linear layer.
W = nd.random.normal(scale=0.01, shape=(num_inputs, num_outputs))
b = nd.zeros(num_outputs)
# Allocate gradient buffers so autograd can fill W.grad and b.grad.
W.attach_grad()
b.attach_grad()

In [175]:
num_epochs = 10
lr = 0.1
for epochs in range(num_epochs):
    for X, y in train_iter:
        # Record the forward pass so autograd can differentiate the loss
        # with respect to W and b.
        with autograd.record():
            y_hat = net(X, W, b, num_inputs)
            l = cross_entropy(y_hat, y)
        l.backward()
        # Use the shared sgd() helper instead of a hand-copied update rule,
        # and divide by the number of samples actually in this batch: the
        # last batch of an epoch can be smaller than `batch_size`.
        sgd([W, b], lr, X.shape[0])
    # Accuracy is measured on the training set after each epoch.
    epoch_acc = evaluate_acc(net, W, b, num_inputs, train_iter)
    print("Epoch %d, acc: %f" % (epochs, epoch_acc.asscalar()))
        

Epoch 0, acc: 0.805417
Epoch 1, acc: 0.820217
Epoch 2, acc: 0.829133
Epoch 3, acc: 0.834200
Epoch 4, acc: 0.839000
Epoch 5, acc: 0.841550
Epoch 6, acc: 0.844400
Epoch 7, acc: 0.845233
Epoch 8, acc: 0.846100
Epoch 9, acc: 0.846767


In [154]:
# Sanity check on a single mini-batch: print the input and label shapes, the
# shape of the per-sample loss, and the fraction of correct predictions.
for X, y in train_iter:
    print(X.shape)
    print(y.shape)
    y_hat = net(X, W, b, num_inputs)
    print(cross_entropy(y_hat, y).shape)
    # accuracy() returns a count of correct predictions; dividing by the
    # batch length turns it into a rate.
    print(accuracy(y_hat, y)/len(y))
    # Only the first batch is inspected.
    break

(256, 1, 28, 28)
(256,)
(256,)

[0.11328125]
<NDArray 1 @cpu(0)>
