In [95]:
import matplotlib.pyplot as plt
import mxnet
from mxnet import autograd, nd, gluon
from mxnet.gluon.data.vision import transforms

In [96]:
def get_dataloader(batch_size, num_workers):
    transformer = transforms.Compose([
        transforms.ToTensor()
    ])
    train = gluon.data.vision.datasets.FashionMNIST(train=True)
    train = train.transform_first(transformer)
    train_iter = gluon.data.DataLoader(train, batch_size, shuffle=True, num_workers=num_workers)
    test = gluon.data.vision.datasets.FashionMNIST(train=False)
    test = test.transform_first(transformer)
    test_iter = gluon.data.DataLoader(test, batch_size, shuffle=True, num_workers=num_workers)
    return train_iter, test_iter

In [97]:
def softmax(X):
    X_exp = X.exp()
    normalization_constant = X_exp.sum(axis=1, keepdims=True)
    return X_exp / normalization_constant

In [98]:
def cross_entropy(y_hat, y):
    return -nd.pick(y_hat, y, axis=1)

In [99]:
def sgd(params, lr, batch_size):
    for param in params:
        param[:] = param - (lr / batch_size) * param.grad

In [100]:
def accuracy(y_hat, y):
    return (y_hat.argmax(axis=1)==y.astype('float32')).sum()

In [101]:
def evaluate_acc(net,W, b, num_features, data_iter):
    accumulator = 0
    size = 0
    for X, y in data_iter:
        y_hat = net(X, W, b, num_features)
        accumulator += accuracy(y_hat, y)
        size += len(y)
    print(accumulator)
    return accumulator / size

In [102]:
def net(X, W, b, num_features):
    return softmax(nd.dot(X.reshape(-1, num_features), W) + b)

In [103]:
batch_size = 256
num_workers = 4
train_iter, test_iter = get_dataloader(batch_size, num_workers)

In [104]:
num_inputs = 784
num_outputs = 10
W = nd.random.normal(scale=0.01, shape=(num_inputs, num_outputs))
b = nd.zeros(num_outputs)
W.attach_grad()
b.attach_grad()

In [124]:
num_epochs = 10
lr = 0.1
for epochs in range(num_epochs):
    for X, y in train_iter:
        with autograd.record():
            y_hat = net(X, W, b, num_inputs)
            l = cross_entropy(y_hat, y)
        l.backward()
#         sgd([W, b], lr, batch_size)
        W[:] = W - (lr / batch_size) * W.grad
        b[:] = b - (lr / batch_size) * b.grad
    epoch_acc = evaluate_acc(net, W, b, num_inputs, train_iter)
    print("Epoch %d, acc: %f" % (epochs, epoch_acc.asscalar()))
        


[40504.]
<NDArray 1 @cpu(0)>
Epoch 0, acc: 0.675067

[41122.]
<NDArray 1 @cpu(0)>
Epoch 1, acc: 0.685367

[42817.]
<NDArray 1 @cpu(0)>
Epoch 2, acc: 0.713617

[44973.]
<NDArray 1 @cpu(0)>
Epoch 3, acc: 0.749550

[45543.]
<NDArray 1 @cpu(0)>
Epoch 4, acc: 0.759050

[45997.]
<NDArray 1 @cpu(0)>
Epoch 5, acc: 0.766617

[46272.]
<NDArray 1 @cpu(0)>
Epoch 6, acc: 0.771200

[46486.]
<NDArray 1 @cpu(0)>
Epoch 7, acc: 0.774767

[46629.]
<NDArray 1 @cpu(0)>
Epoch 8, acc: 0.777150

[46824.]
<NDArray 1 @cpu(0)>
Epoch 9, acc: 0.780400


In [123]:
for X, y in train_iter:
    print(X.shape)
    print(y.shape)
    y_hat = net(X, W, b, num_inputs)
    print(cross_entropy(y_hat, y).shape)
    print(accuracy(y_hat, y)/len(y))
    break

(256, 1, 28, 28)
(256,)
(256,)

[0.08203125]
<NDArray 1 @cpu(0)>
