# Simple NN for MNIST (NumPy) **Only train top layer**
Sam Greydanus. 27 April 2017. MIT License.

### Comments
When I explore alternatives to training neural nets with backprop (e.g. the U loss method), I will use this notebook as a baseline for NN performance on MNIST when only the top layer is trained with backprop. Max accuracy is usually ~60-70% if you train it for 20000+ steps.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

from tensorflow.examples.tutorials.mnist import input_data
# Load the MNIST dataset from the 'MNIST_data' directory. one_hot=False: the labels are
# used downstream as integer class indices (array indexing and `==` comparison).
mnist = input_data.read_data_sets('MNIST_data', one_hot=False)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [2]:
# --- optimisation settings ---
lr = 1e-3            # step size of the gradient-descent update
batch_size = 32      # examples per minibatch
total_steps = 10000  # how far past `global_step` the train loop runs
print_every = 250    # report the running loss every this many steps
global_step = 0      # step counter; the train loop starts counting from this value

# --- network dimensions ---
D_side = 28              # images are D_side x D_side pixels
D_img = D_side * D_side  # dimensionality of image
D_hidden = 128           # hidden layer size
D_labels = 10            # number of labels

## Model Initialization

In [3]:
model = {}
# (layer index, input width, output width) of the three affine layers
layer_dims = [(1, D_img, D_hidden),     # first layer
              (2, D_hidden, D_hidden),  # second layer
              (3, D_hidden, D_labels)]  # third layer
for idx, fan_in, fan_out in layer_dims:
    scale = np.sqrt(fan_out)  # Xavier-style scaling, here by the layer's output width
    # weights are drawn before the bias so the random stream is consumed layer by layer
    model['W{}'.format(idx)] = np.random.randn(fan_in, fan_out) / scale
    model['b{}'.format(idx)] = np.random.randn(1, fan_out) / scale

# list every parameter together with its shape
for name, param in model.items():
    print("{} : {}".format(name, param.shape))

W1 : (784, 128)
b1 : (1, 128)
W2 : (128, 128)
b2 : (1, 128)
W3 : (128, 10)
b3 : (1, 10)


## Forward functions

In [4]:
def xW_plus_b(x, W, b):
    """Affine layer: project `x` through the weights `W`, then add the bias `b`."""
    projected = np.dot(x, W)
    return projected + b # in some cases you can even drop the bias b

def relu(x):
    """Rectified linear unit: element-wise max(x, 0).

    Returns a new array; the input `x` is not modified, so callers can keep
    using their pre-activation values after the call.
    """
    return np.maximum(x, 0)

def softmax(x):
    """Row-wise softmax of a 2-D array of scores (each row sums to 1)."""
    row_max = np.amax(x, axis=1, keepdims=True)
    exps = np.exp(x - row_max) # shifting by the row max improves numerics
    normaliser = np.sum(exps, axis=1, keepdims=True)
    return exps / normaliser

## Backward functions

In [5]:
def dsoftmax(h, y, batch_size):
    """Gradient of softmax + cross-entropy with respect to the softmax inputs.

    h          : softmax probabilities, one row per example
    y          : integer class label of each example
    batch_size : number of rows of `h` that are indexed

    Returns (h - onehot(y)) / len(y) as a new array; `h` is not modified.
    """
    dz = h.copy() # work on a copy so the caller's probabilities stay intact
    dz[range(batch_size),y] -= 1
    return dz/y.shape[0] # divide by batch size

def drelu(dz, h):
    """Backprop through a ReLU.

    Passes the gradient `dz` through wherever the activation `h` is positive
    and zeroes it elsewhere. Returns a new array; `dz` is not modified.
    """
    return np.where(h <= 0, 0, dz) # backprop relu

def dxW_plus_b(dh, W):
    """Backprop through an affine layer: map the output gradient `dh` back to the layer's input."""
    W_transposed = W.T
    return np.dot(dh, W_transposed)

## Forward pass

In [6]:
def forward(X, model):
    """Run the three-layer network on a batch `X`.

    Returns (probs, hs): the softmax class probabilities [N x K] and the list
    [h1, h2, h3] of layer outputs, which are needed for computing gradients.
    """
    hs = []
    activation = X

    # two hidden layers: affine transform followed by a ReLU
    for layer in ('1', '2'):
        pre_activation = xW_plus_b(activation, model['W' + layer], model['b' + layer])
        activation = relu(pre_activation)
        hs.append(activation)

    # output layer: affine only, its raw scores are stored as h3
    scores = xW_plus_b(activation, model['W3'], model['b3'])
    hs.append(scores)

    return softmax(scores), hs

In [7]:
# evaluate test set accuracy
def test_accuracy(model):
    """Fraction of the MNIST test images whose arg-max prediction equals the label."""
    probs, _ = forward(mnist.test.images, model)
    hits = np.argmax(probs, axis=1) == mnist.test.labels
    return np.mean(hits)

## Backward pass

In [8]:
def backward(y, probs, X, hs, model):
    """Gradients for every parameter of `model`, with only the top layer trained.

    y     : integer class labels of the batch
    probs : softmax probabilities for the batch
    X     : input batch (unused while the lower layers are frozen; kept so the
            signature stays the same)
    hs    : list of layer outputs [h1, h2, h3] for the batch
    model : dict of parameter arrays

    Returns a dict with the same keys and shapes as `model`. Only 'W3' and 'b3'
    carry real gradients; 'W1', 'b1', 'W2' and 'b2' are all zeros, so an update
    with these gradients leaves the lower layers unchanged.
    """
    # start from all-zero gradients: the frozen layers simply keep these zeros
    grads = { k : np.zeros_like(v) for k,v in model.items() }

    # take the batch size from the labels themselves rather than from a global,
    # so the function works for any batch it is handed
    dh3 = dsoftmax(probs, y, y.shape[0])

    # third (output) layer -- the only one that receives a gradient
    grads['W3'] = np.dot(hs[-2].T, dh3)
    grads['b3'] = np.sum(dh3, axis=0, keepdims=True)

    # The first and second hidden layers are frozen. Backpropagating through them
    # (dxW_plus_b / drelu) only to zero the result would be wasted work, so the
    # backward pass stops at the output layer.
    return grads

## Train loop

In [9]:
# generic train loop
# `global_step` doubles as the loop variable, so re-running this cell keeps counting
# from the step where the previous run stopped.
running_loss = None ; interp = 0.99 # interp: smoothing factor of the running-average loss
for global_step in range(global_step, total_steps+global_step+1):
    
    # forward
    X, y = mnist.train.next_batch(batch_size)
    probs, hs = forward(X, model)
    
    # cross-entropy loss; floor the probabilities at the smallest positive float so an
    # underflowed 0 cannot turn the loss (and from then on the running loss) into inf
    y_probs = np.maximum(probs[range(batch_size),y], np.finfo(probs.dtype).tiny)
    y_neglogprobs = -np.log(y_probs)
    loss = np.sum(y_neglogprobs)/batch_size
    running_loss = loss if running_loss is None else interp*running_loss + (1-interp)*loss
    
    # backward
    grads = backward(y, probs, X, hs, model) # data gradients
    model = {k : model[k] - lr*g for (k,g) in grads.items()} # update parameters

    # ======== DISPLAY PROGRESS ======== #
    if global_step % print_every == 0:
        # test accuracy is evaluated on every 4th report; integer modulus avoids float division
        if global_step % (4*print_every) == 0:
            print('accuracy: {:.2f}%'.format(100*test_accuracy(model)))
        print('\tstep {}: loss: {:.4f}'
              .format(global_step, running_loss))

accuracy: 10.38%
	step 0: loss: 3.4576
	step 250: loss: 2.8574
	step 500: loss: 2.6067
	step 750: loss: 2.5110
accuracy: 13.13%
	step 1000: loss: 2.4569
	step 1250: loss: 2.4019
	step 1500: loss: 2.3503
	step 1750: loss: 2.3251
accuracy: 15.67%
	step 2000: loss: 2.2984
	step 2250: loss: 2.2698
	step 2500: loss: 2.2469
	step 2750: loss: 2.2201
accuracy: 18.51%
	step 3000: loss: 2.1846
	step 3250: loss: 2.1728
	step 3500: loss: 2.1402
	step 3750: loss: 2.1044
accuracy: 22.74%
	step 4000: loss: 2.0981
	step 4250: loss: 2.0762
	step 4500: loss: 2.0554
	step 4750: loss: 2.0238
accuracy: 27.45%
	step 5000: loss: 1.9995
	step 5250: loss: 1.9687
	step 5500: loss: 1.9518
	step 5750: loss: 1.9590
accuracy: 32.61%
	step 6000: loss: 1.9202
	step 6250: loss: 1.9119
	step 6500: loss: 1.8824
	step 6750: loss: 1.8732
accuracy: 37.22%
	step 7000: loss: 1.8654
	step 7250: loss: 1.8353
	step 7500: loss: 1.8161
	step 7750: loss: 1.7960
accuracy: 41.05%
	step 8000: loss: 1.7919
	step 8250: loss: 1.7736
	st