In [1]:
import keras
from keras.datasets import mnist

Using TensorFlow backend.


In [2]:
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [3]:
np.arange(4)

array([0, 1, 2, 3])

In [3]:
(x_train, y_train),(x_valid, y_valid) = mnist.load_data()

In [4]:
y_train = keras.utils.to_categorical(y_train, 10)
y_valid = keras.utils.to_categorical(y_valid, 10)

In [5]:
y_valid.shape

(10000, 10)

In [6]:
from sklearn.model_selection import train_test_split
x_t, x_v, y_t, y_v = x_train, x_valid, y_train, y_valid
#x_t, x_v, y_t, y_v = train_test_split(x_valid, y_valid, test_size=0.2, random_state=42) 

In [7]:
print(x_t.shape, y_t.shape, x_v.shape, y_v.shape)

(60000, 28, 28) (60000, 10) (10000, 28, 28) (10000, 10)


In [8]:
x_t = x_t.astype('float32')
x_v = x_v.astype('float32')
x_t /= 255
x_v /= 255

In [10]:
x_t = x_t.reshape(x_t.shape[0], 784)
x_v = x_v.reshape(x_v.shape[0], 784)

In [11]:
def sigmoid(z):
    return (1 / (1 + np.exp(-z)))

def sigmoid_(z):
    return (sigmoid(z)*(1-sigmoid(z)))


In [33]:
class Neuralnet(object):
    def __init__(self, size):
        self.layers = len(size)
        self.size = size
        self.weights = [np.random.randn(y, x) for x, y in zip(size[:-1], size[1:])]
        self.biases = [np.random.randn(y, 1) for y in size[1:]]
        
    def forward(self, a):
        a = a.T
        for w, b in zip(self.weights, self.biases):
            a = sigmoid((w @ a) + b)
        return a.T
    
    def SGD(self, X, y, epochs, lr, bs, test_data = False):
        
        if test_data: 
            X_t, y_t = test_data
            n_test = len(X_t)
        n = len(X)
        for e in range(epochs):
            X, y = shuffle(X, y, random_state=42)
            X_batches = [X[k:k+bs] for k in range(0, n, bs)]
            y_batches = [y[k:k+bs] for k in range(0, n, bs)]
            for x_, y_ in zip(X_batches, y_batches):
                self.update_minibatch(lr/bs, x_, y_)
            
            if test_data:
                print ("Epoch {0}: {1} / {2}".format(e, self.evaluate(X_t, y_t), n_test))
            else:
                print ("Epoch {} complete".format(e))
                
    def update_minibatch(self,lr, x, y):
        dW, db = self.backprop(x,y)
        for i in range(len(self.weights)):
            self.weights[i] = self.weights[i] - lr * dW[i]
            self.biases[i] = self.biases[i] - lr * db[i]
            
            
        
    def backprop(self,x,y):
        dw = [np.zeros(w.shape) for w in self.weights]
        db = [np.zeros(b.shape) for b in self.biases]
        a = x.T
        activations = [a]
        zs = []
        for w, b in zip(self.weights, self.biases):
            z = w @ a + b
            zs.append(z)
            a = sigmoid(z)
            activations.append(a)
        delta = (a - y.T) * sigmoid_(z)
        #print(delta.shape)
        dw[-1] = delta @ activations[-2].T
        db[-1] = np.reshape(np.average(delta, axis=1),(db[-1].shape[0],1))
        
        for l in range(2, self.layers):
            delta = self.weights[-l+1].T @ delta * sigmoid_(zs[-l])
            #print(delta.shape)
            dw[-l] = delta @ activations[-l-1].T
            db[-l] = np.reshape(np.average(delta, axis=1),(db[-l].shape[0],1))
        
        return dw, db
        
        
    def evaluate(self, x, y):
        y_ = [np.argmax(a) for a in self.forward(x)]
        y = [np.argmax(l) for l in y]
        return sum(int(a==l) for a,l in zip(y_, y))

In [37]:
net = Neuralnet([784, 10])

In [39]:
net.SGD(x_t, y_t, 30, 3.0, 16, test_data=(x_v, y_v))

Epoch 0: 4939 / 10000
Epoch 1: 4963 / 10000
Epoch 2: 4991 / 10000
Epoch 3: 5015 / 10000
Epoch 4: 5094 / 10000
Epoch 5: 5672 / 10000
Epoch 6: 5685 / 10000
Epoch 7: 5689 / 10000
Epoch 8: 5692 / 10000
Epoch 9: 5703 / 10000
Epoch 10: 5698 / 10000
Epoch 11: 5717 / 10000
Epoch 12: 5715 / 10000
Epoch 13: 5715 / 10000
Epoch 14: 5714 / 10000
Epoch 15: 5709 / 10000
Epoch 16: 5730 / 10000
Epoch 17: 5723 / 10000
Epoch 18: 5724 / 10000
Epoch 19: 5716 / 10000
Epoch 20: 5736 / 10000
Epoch 21: 5730 / 10000
Epoch 22: 5739 / 10000
Epoch 23: 5734 / 10000
Epoch 24: 5740 / 10000
Epoch 25: 5714 / 10000
Epoch 26: 5724 / 10000
Epoch 27: 5736 / 10000
Epoch 28: 5739 / 10000
Epoch 29: 5732 / 10000


# Single fwd-pass and backprop for 1 batch

In [41]:
bs = 32 #batch size

In [42]:
x = x_t[:bs].T; x.shape # take batch and transpose of ease of calculations - Features

(784, 32)

In [43]:
y = y_t[:bs].T; y.shape # take batch and transpose of ease of calculations - Labels

(10, 32)

In [44]:
w1 = np.random.randn(128, 784) # Initialise weights for 1st layer-128 number of neurons, -784 features incoming
b1 = np.zeros((128,1))         # Initialise biases for 1st layer  -128 number of neurons
w2 = np.random.randn(10, 128)  # Initialise weights for 2nd layer -10 number of neurons, -128 features incoming
b2 = np.zeros((10,1))          # Initialise biases for 2nd layer -10 number of neurons

## Learning loop

### Forward pass

In [45]:
z1 = w1 @ x + b1; z1.shape # Vectorize calculations for 1st layer

(128, 32)

In [46]:
a1 = sigmoid(z1); a1.shape # Adding the non-linearity - sigmoid activation

(128, 32)

In [47]:
z2 = w2 @ a1 + b2; z2.shape # Vectorize calculations for 2nd/output layer

(10, 32)

In [48]:
a2 = sigmoid(z2); a2.shape # Adding the non-linearity - sigmoid activation

(10, 32)

### Quadratic cost

In [49]:
c = (a2 - y); c.shape # Quadratic cost derivative

(10, 32)

In [50]:
zp2 = sigmoid_(z2); zp2.shape # Sigmoid prime for output activations

(10, 32)

### Backprop

In [51]:
delta2 = c * zp2; delta2.shape # Error for last layer

(10, 32)

In [None]:
db2 = np.reshape(np.average(delta2, axis=1),(10,1)); db2.shape # Derivative of cost w.r.t biases 2nd/output layer

In [None]:
dw2 = delta2 @ a1.T; dw2.shape # Derivative of cost w.r.t weights 2nd/output layer

In [None]:
delta1 = (w2.T @ delta2) * sigmoid_(z1); delta1.shape # Cost/Error w.r.t 1st layer

In [None]:
db1 = np.reshape(np.average(delta1, axis=1),(10,1)); db1.shape # Derivative of cost w.r.t biases 1st hidden layer

In [None]:
dw1 = delta1 @ x.T; dw1.shape # # Derivative of cost w.r.t weights for 1st hidden layer  layer

### Update weights

In [None]:
w1 = w1 - 0.01*dw1/bs
b1 = b1 - 0.01*db1/bs
w2 = w2 - 0.01*dw2/bs
b2 = b2 - 0.01*db2/bs