In [None]:
""" 
    By github.com/vzki (Vadim Tschernezki).
    Low-level implementation of a Multi Layer Perceptron (MLP) in Python.
    Utilizes batch matrix operations (forward and backward passes process a whole batch of samples at once, not one sample at a time).
"""

In [15]:
import numpy as np
import time

In [16]:
# Load the MNIST data set through TensorFlow's tutorial helper, using the local
# directory 'MNIST_data'. one_hot=True presumably returns every label as a
# one-hot vector (later cells recover the class index with np.argmax).
# NOTE(review): this tutorial module is believed to ship only with TensorFlow 1.x --
# confirm the kernel's TensorFlow version before re-running.
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [3]:
# how many samples to take from the head of the training / test split
nb_samples_tr, nb_samples_te = 5000, 1000

# training subset: inputs (xs) and labels (ys)
ds_tr = mnist.train
xs_tr, ys_tr = ds_tr.images[:nb_samples_tr], ds_tr.labels[:nb_samples_tr]

# test subset: inputs (xs) and labels (ys)
ds_te = mnist.test
xs_te, ys_te = ds_te.images[:nb_samples_te], ds_te.labels[:nb_samples_te]

In [4]:
def sigmoid(a, derived = False):
    """ Element-wise logistic sigmoid, or its derivative.

        Arguments:
            a:       scalar or array-like of pre-activations.
            derived: if truthy, return the derivative
                     sigmoid(a) * (1 - sigmoid(a)) instead of sigmoid(a).

        Returns:
            A NumPy scalar for scalar input, otherwise an array shaped like `a`.
    """

    a = np.asarray(a)

    # exp(-|a|) always lies in (0, 1], so it can never overflow; the naive
    # 1 / (1 + exp(-a)) overflows inside exp() for large negative inputs.
    e = np.exp(-np.abs(a))

    # a >= 0: 1 / (1 + exp(-a))      a < 0: exp(a) / (1 + exp(a))
    # both expressions are the same function, each one stable on its half.
    s = np.where(a >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

    # [()] unwraps a 0-d result to a NumPy scalar and leaves real arrays as they are
    s = s[()]

    if derived:

        # the sigmoid is evaluated once and reused for the derivative
        return s * (1 - s)

    return s

In [5]:
def calc_accuracy(pred, targ):
    """ Fraction of rows for which `pred` and `targ` have their maximum
        in the same column.

        Arguments:
            pred: 2-d array, one row of scores per sample.
            targ: 2-d array with the same number of rows, e.g. one-hot labels.

        Returns:
            Number of matching rows divided by the number of rows in `pred`.
    """

    total = float(pred.shape[0])

    # row-wise comparison of the winning column indices
    hits = np.argmax(pred, axis = 1) == np.argmax(targ, axis = 1)

    return np.sum(hits) / total
    

In [6]:
class Dense:
    
    def __init__(self, nb_inputs, nb_neurons):
        """ Fully connected layer with a sigmoid activation.

            Note that a dense layer is different from an input layer.
            An input layer does not have any activation function or similar.
            In terms of classical architecture definition, this dense layer is the
            "second" layer of a neural network.

            Arguments:
                nb_inputs:  number of values each sample feeds into the layer.
                nb_neurons: number of neurons (= outputs per sample) of the layer.
        """
        
        # shapes for a batch of n samples:
        #   [n, nb_inputs] (a_prev) x [nb_inputs, nb_neurons] (w) = [n, nb_neurons] (z)
        # the random weights are scaled down by sqrt(nb_inputs)
        self._weights = np.random.randn(nb_inputs, nb_neurons) / np.sqrt(nb_inputs)
        
        # one bias per neuron, broadcast over the batch dimension; starts at zero
        self._biases  = np.zeros((1, nb_neurons))
        
        # cached by forward(): layer input, pre-activation and activation
        self._a_prev = None
        self._z = None
        self._a = None
        
        # error term of this layer; assigned from outside during backpropagation
        self._delta = None
        
    def forward(self, a_prev):
        """ Compute the activations for a batch and cache the intermediates.

            Arguments:
                a_prev: array of shape [n, nb_inputs] (output of the previous layer).

            Returns:
                Array of shape [n, nb_neurons] with the sigmoid activations.
        """
        
        self._a_prev = a_prev
        
        self._z = (a_prev).dot(self._weights) + self._biases
        
        self._a = sigmoid(self._z)
        
        return self._a
    
    def backward(self, delta_next, weights_next):
        """ Propagate the error term of the following layer back to this one.

            Arguments:
                delta_next:   error term of the next layer, [n, nb_neurons_next].
                weights_next: weights of the next layer, [nb_neurons, nb_neurons_next].

            Returns:
                Error term of this layer, [n, nb_neurons].

            Raises:
                RuntimeError: if forward() has not been called yet.
        """
        
        if self._a is None:
            
            raise RuntimeError("Dense.backward() called before Dense.forward()")
        
        # sigmoid'(z) = a * (1 - a): reuse the activation cached by forward()
        # instead of evaluating the sigmoid of z two more times
        return (delta_next).dot(weights_next.T) * (self._a * (1 - self._a))

In [7]:
class MultiLayerPerceptron:
    """ Sequential stack of layers trained with backpropagation.

        Layers are appended with add(); forward() feeds a batch through all of
        them, backward() stores an error term (`_delta`) on every layer and
        update() applies one gradient-descent step to all weights and biases.
    """
    
    def __init__(self):
        
        self._layers = list()
        
    def add(self, dense):
        """ Append `dense` as the new last layer of the network. """
        
        self._layers.append(dense)
        
    def build(self):
        """ Allocate one (empty) delta slot per layer added so far.
            None of the other methods of this class reads `_deltas`. """
        
        self._deltas = [None for _ in self._layers]
        
    def forward(self, x):
        """ Feed the batch `x` through every layer in insertion order and
            return the output of the last one. """
        
        activation = np.array(x)
        
        for layer in self._layers:
            
            activation = layer.forward(activation)
            
        return activation
    
    def backward(self, targ, pred):
        """ Compute the error term of every layer, from the last to the first.

            Arguments:
                targ: target outputs for the batch last passed to forward().
                pred: the network output forward() returned for that batch.
        """
        
        last = self._layers[-1]
        
        # output layer: (pred - targ) scaled by the slope of the activation
        error = - (targ - pred)
        last._delta = error * sigmoid(last._z, derived = True)
        
        # hidden layers: every layer derives its delta from the layer after it
        downstream = last
        
        for layer in reversed(self._layers[:-1]):
            
            layer._delta = layer.backward(downstream._delta, downstream._weights)
            
            downstream = layer
        
    def update(self, learning_rate):
        """ Apply one gradient-descent step to every layer, using the deltas
            stored by backward() (gradients are summed over the batch). """
        
        for layer in self._layers:
            
            grad_weights = layer._a_prev.T.dot(layer._delta)
            grad_biases = np.sum(layer._delta, axis = 0, keepdims = True)
            
            layer._weights -= learning_rate * grad_weights
            layer._biases -= learning_rate * grad_biases

In [8]:
# Seed NumPy's global RNG before the layers are created: Dense draws its
# initial weights from np.random.randn, so without a seed every kernel run
# starts from different weights and reports different accuracies.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# network layout: 784 inputs -> 24 -> 12 -> 10 outputs
mlp = MultiLayerPerceptron()

mlp.add(Dense(784, 24))
mlp.add(Dense(24, 12))
mlp.add(Dense(12, 10))

mlp.build()

In [9]:
# Baseline: run the test subset through the freshly initialised network.
pred = mlp.forward(xs_te)

# display first 20 predictions and labels without training
# (print is written as a single-argument call so the cell runs unchanged
#  on Python 2 and Python 3)

print("Predicting labels without training.")

print(np.argmax(pred, axis = 1)[:20])
print(np.argmax(ys_te, axis = 1)[:20])

Predicting labels without training.
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[7 2 1 0 4 1 4 9 5 9 0 6 9 0 1 5 9 7 3 4]


In [11]:
# mini-batch gradient descent: the weights are updated once per slice of
# sz_batch training samples, and the whole training subset is visited
# nb_epochs times (always in the same order)

sz_batch      = 40
nb_epochs     = 20
learning_rate = 0.1

time_start = time.time()

for j in range(nb_epochs):

    for i in range(0, nb_samples_tr, sz_batch):

        batch = slice(i, i + sz_batch)

        pred = mlp.forward(xs_tr[batch])

        mlp.backward(ys_tr[batch], pred)

        mlp.update(learning_rate = learning_rate)
        
    # evaluate on the test subset after every epoch
    # (this evaluation is part of the measured training time)
    pred_te = mlp.forward(xs_te)
    
    # single-argument print(...) runs unchanged on Python 2 and Python 3
    print("Epoch {}. Testing accurcacy: {}".format(j + 1, calc_accuracy(pred_te, ys_te)))

time_end = time.time()

print("Training took {} seconds.".format(time_end - time_start))

Epoch 1. Testing accurcacy: 0.379
Epoch 2. Testing accurcacy: 0.712
Epoch 3. Testing accurcacy: 0.811
Epoch 4. Testing accurcacy: 0.851
Epoch 5. Testing accurcacy: 0.867
Epoch 6. Testing accurcacy: 0.877
Epoch 7. Testing accurcacy: 0.887
Epoch 8. Testing accurcacy: 0.895
Epoch 9. Testing accurcacy: 0.897
Epoch 10. Testing accurcacy: 0.897
Epoch 11. Testing accurcacy: 0.895
Epoch 12. Testing accurcacy: 0.899
Epoch 13. Testing accurcacy: 0.899
Epoch 14. Testing accurcacy: 0.899
Epoch 15. Testing accurcacy: 0.899
Epoch 16. Testing accurcacy: 0.9
Epoch 17. Testing accurcacy: 0.903
Epoch 18. Testing accurcacy: 0.903
Epoch 19. Testing accurcacy: 0.907
Epoch 20. Testing accurcacy: 0.907
Training took 2.61773300171 seconds.


In [12]:
# Run the test subset through the trained network.
pred = mlp.forward(xs_te)

# display first 20 predictions and labels with training
# (print is written as a single-argument call so the cell runs unchanged
#  on Python 2 and Python 3)

print("Predicting labels after training.")

print(np.argmax(pred, axis = 1)[:20])
print(np.argmax(ys_te, axis = 1)[:20])

Predicting labels after training.
[7 2 1 0 4 1 4 9 6 9 0 6 9 0 1 5 9 7 3 4]
[7 2 1 0 4 1 4 9 5 9 0 6 9 0 1 5 9 7 3 4]


In [13]:
# `pred` holds the test-subset predictions computed in the previous cell.
# The message is built with format() and passed as one argument so the output
# is identical on Python 2 and Python 3 (a parenthesised two-argument print
# would display a tuple on Python 2).
print("Accuracy for test dataset: {}".format(calc_accuracy(pred, ys_te)))

Accuracy for test dataset: 0.907
