In [212]:
from mxnet import np, npx, autograd
from d2l import mxnet as d2l
npx.set_np()
from pdb import set_trace

## Define Dataset

In [213]:
def get_iterator(ds, batch_size=100):
    features, labels = ds
    num_observations = len(labels)
    for start_idx in range(0, num_observations, batch_size):
        end_idx = min(num_observations, (start_idx + batch_size))
        yield features[start_idx:end_idx], labels[start_idx:end_idx]

In [214]:
batch_size = 200
train_dl, test_dl = d2l.load_data_fashion_mnist(batch_size)

In [215]:
abc = next(iter(train_dl))

In [33]:
28*28

784

## Define Model

In [216]:
num_inputs, num_hidden1, num_hidden2, num_outputs = 28*28, 256, 256, 10
W1 = np.random.normal(scale=0.01, size=(num_inputs, num_hidden1))
W2 = np.random.normal(scale=0.01, size=(num_hidden1, num_hidden2))
W3 = np.random.normal(scale=0.01, size=(num_hidden2, num_outputs))
b1 = np.zeros(shape=(1, num_hidden1))
b2 = np.zeros(shape=(1, num_hidden2))
b3 = np.zeros(shape=(1, num_outputs))
params = [(W1, b1), (W2, b2), (W3, b3)]
for W, b in params:
    W.attach_grad()
    b.attach_grad()

In [217]:
def dropout_layer(activations, dropout_prob=0.1):
    if dropout_prob == 1:
        return np.zeros_like(activations)
    elif dropout_prob == 0:
        return activations
    mask = (np.random.uniform(size=activations.shape) > dropout_prob).astype(int)
    return (activations*mask)/(1 - dropout_prob)

In [218]:
def model(X, params, dropout_prob=[0]*(len(params) - 1), is_train=False):
    final_layer_idx = len(params) - 1
    activations = X.reshape(len(X), -1)
    for idx, (W, b) in enumerate(params):
        activations = np.dot(activations, W)
        if idx == final_layer_idx:
            break
        activations = npx.relu(activations)
        if is_train:
            activations = dropout_layer(activations, dropout_prob[idx])
    return activations

## Helpers for Labels

In [199]:
def softmax(output_activations):
    exponentiated_activations = np.exp(output_activations)
    partition_function = exponentiated_activations.sum(axis=1, keepdims=True)
    return exponentiated_activations/partition_function

In [93]:
def get_labels_from_softmax(probs):
    return np.argmax(probs, axis=1)

## Define Loss

In [226]:
def softmax_from_logits(yhat, y):
    num_samples = y.shape[0]
    yhat = softmax(yhat)
    return -np.log(yhat[range(num_samples), y])

In [194]:
yhat = softmax(act)
loss_fn(yhat, np.array([0, 1]))

array([2.302482 , 2.3025331])

## Define SGD

In [227]:
def sgd(params, batch_size, lr):
    for W, b in params:
        W[:] = W - W.grad*lr/batch_size
        b[:] = b - b.grad*lr/batch_size

## Training Loop

In [236]:
num_epochs = 3
dropout_prob = [0.1, 0.3]
lr = 0.1
for epoch in range(num_epochs):
    for X, y in iter(train_dl):
        batch_size = X.shape[0]
        with autograd.record():
            yhat = model(X, params, dropout_prob, True)
            loss = softmax_from_logits(yhat, y)
        loss.backward()
        sgd(params, batch_size, lr)
    print(f'Epoch: {epoch}, Loss: {loss.mean()}')

Epoch: 0, Loss: 0.4073313
Epoch: 1, Loss: 0.39693454
Epoch: 2, Loss: 0.32485893


## Accuracy

In [210]:
def get_accuracy(dl, model):
    num_samples = 0
    num_correct = 0
    for X, y in iter(dl):
        logit_yhat = model(X, params)
        yhat = softmax(logit_yhat)
        preds = get_labels_from_softmax(yhat)
        num_correct += (preds.astype('int32') == y).sum()
        num_samples += X.shape[0]
    return num_correct/num_samples

In [237]:
get_accuracy(test_dl, model)

array(0.8595)