# Numpy Neural Network

Inspired by code [here](https://github.com/MichalDanielDobrzanski/DeepLearningPython).

In [None]:
import random
import numpy as np
import pickle
import gzip
import json
import sys

## network.py

In [1]:
class Network:
    def __init__(self, sizes):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]
        
    def feedforward(self, a):
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a)+b)
        return a
    
    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        training_data = list(training_data)
        n = len(training_data)
        
        if test_data:
            test_data = list(test_data)
            n_test = len(test_data)
            
        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [training_data[k:k+mini_batch_size] for k in range(0, n, mini_batch_size)]
            
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
                
            if test_data:
                print(f"Epoch {j} : {self.evaluate(test_data)} / {n_test}")
            else:
                print(f"Epoch {j} complete")
        
    def update_mini_batch(self, mini_batch, eta):
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        self.weights = [w-(eta/len(mini_batch))*nw for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-(eta/len(mini_batch))*nb for b, nb in zip(self.biases, nabla_b)]
        
    def backprop(self, x, y):
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        
        # feedforward
        activation = x 
        activations = [x]
        zs = []
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
            
        # backward pass
        delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
            
        return (nabla_b, nabla_w)
    
    def evaluate(self, test_data):
        test_results = [(np.argmax(self.feedforward(x)), y) for (x, y) in test_data]
        return sum(int(x==y) for (x, y) in test_results)
    
    def cost_derivative(self, output_activations, y):
        return (output_activations-y)          

## mnist_loader.py

In [2]:
def load_data():
    f = gzip.open('mnist.pkl.gz', 'rb')
    training_data, validation_data, test_data = pickle.load(f, encoding="latin1")
    f.close()
    return (training_data, validation_data, test_data)

def load_data_wrapper():
    tr_d, va_d, te_d = load_data()
    training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]
    training_results = [vectorized_result(y) for y in tr_d[1]]
    training_data = zip(training_inputs, training_results)
    validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]]
    validation_data = zip(validation_inputs, va_d[1])
    test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]]
    test_data = zip(test_inputs, te_d[1])
    return (training_data, validation_data, test_data)

def vectorized_result(j):
    e = np.zeros((10, 1))
    e[j] = 1.0
    return e        

def sigmoid(z):
    return 1.0/(1.0+np.exp(-z))

def sigmoid_prime(z):
    return sigmoid(z)*(1-sigmoid(z))    

In [3]:
training_data, validation_data, test_data = load_data_wrapper()
training_data = list(training_data)

In [4]:
net = Network([784, 30, 10])
net.SGD(training_data, 30, 10, 3.0, test_data=test_data)

Epoch 0 : 9099 / 10000
Epoch 1 : 9177 / 10000
Epoch 2 : 9314 / 10000
Epoch 3 : 9374 / 10000
Epoch 4 : 9348 / 10000
Epoch 5 : 9424 / 10000
Epoch 6 : 9436 / 10000
Epoch 7 : 9453 / 10000
Epoch 8 : 9455 / 10000
Epoch 9 : 9485 / 10000
Epoch 10 : 9508 / 10000
Epoch 11 : 9476 / 10000
Epoch 12 : 9486 / 10000
Epoch 13 : 9523 / 10000
Epoch 14 : 9509 / 10000
Epoch 15 : 9500 / 10000
Epoch 16 : 9511 / 10000
Epoch 17 : 9503 / 10000
Epoch 18 : 9492 / 10000
Epoch 19 : 9513 / 10000
Epoch 20 : 9510 / 10000
Epoch 21 : 9534 / 10000
Epoch 22 : 9534 / 10000
Epoch 23 : 9532 / 10000
Epoch 24 : 9528 / 10000
Epoch 25 : 9549 / 10000
Epoch 26 : 9520 / 10000
Epoch 27 : 9510 / 10000
Epoch 28 : 9539 / 10000
Epoch 29 : 9531 / 10000


## network2.py

In [5]:
class QuadraticCost:
    @staticmethod
    def fn(a, y):
        return 0.5*np.linalg.norm(a-y)**2
    
    @staticmethod
    def delta(z, a, y):
        return (a-y) * sigmoid_prime(z)
    
    
class CrossEntropyCost:
    @staticmethod
    def fn(a, y):
        return np.sum(np.nan_to_num(-y*np.log(a)-(1-y)*np.log(1-a)))
    
    @staticmethod
    def delta(z, a, y):
        return (a-y)
    
    
class Network:
    def __init__(self, sizes, cost=CrossEntropyCost):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.default_weight_initializer()
        self.cost=cost
        
    def default_weight_initializer(self):
        self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
        self.weights = [np.random.randn(y, x)/np.sqrt(x) for x, y in zip(self.sizes[:-1], self.sizes[1:])]
        
    def large_weight_initializer(self):
        self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
        self.weights = [np.random.randn(y, x) for x, y in zip(self.sizes[:-1], self.sizes[1:])]    
        
    def feedforward(self, a):
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a)+b)
        return a
    
    def SGD(self, training_data, epochs, mini_batch_size, eta, 
           lmbda = 0.0,
           evaluation_data=None,
           monitor_evaluation_cost=False,
           monitor_evaluation_accuracy=False,
           monitor_training_cost=False,
           monitor_training_accuracy=False,
           early_stopping_n=0):
        best_accuracy=1
        
        training_data = list(training_data)
        n = len(training_data)
        
        if evaluation_data:
            evaluation_data = list(evaluation_data)
            n_data = len(evaluation_data)
            
            
        best_accuracy=0
        no_accuracy_change=0
        
        evaluation_cost, evaluation_accuracy = [], []
        training_cost, training_accuracy = [], []
        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [training_data[k:k+mini_batch_size] for k in range(0,n,mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta, lmbda, len(training_data))
                
            print(f"Epoch {j} training complete")
            
            if monitor_training_cost:
                cost = self.total_cost(training_data, lmbda)
                training_cost.append(cost)
                print(f"Cost on training data: {cost}")
            if monitor_training_accuracy:
                accuracy = self.accuracy(training_data, convert=True)
                training_accuracy.append(accuracy)
                print(f"Accuracy on training data: {accuracy} / {n}")
            if monitor_evaluation_cost:
                cost = self.total_cost(evaluation_data, lmbda, convert=True)
                evaluation_cost.append(cost)
                print(f"Cost on evaluation data: {cost}")
            if monitor_evaluation_accuracy:
                accuracy = self.accuracy(evaluation_data)
                evaluation_accuracy.append(accuracy)
                print(f"Accuracy on evaluation data: {accuracy} / {n_data}")
                
            # Early stopping
            if early_stopping_n > 0:
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    no_accuracy_change = 0
                else:
                    no_accuracy_change +=1
                    
                if no_accuracy_change == early_stopping_n:
                    return evaluation_cost, evaluation_accuracy, training_cost, training_accuracy
                
        return evaluation_cost, evaluation_accuracy, training_cost, training_accuracy
    
    def update_mini_batch(self, mini_batch, eta, lmbda, n):
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb +dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        self.weights = [(1-eta*(lmbda/n))*w-(eta/len(mini_batch))*nw for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-(eta/len(mini_batch))*nb for b, nb in zip(self.biases, nabla_b)]
        
    def backprop(self, x, y):
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        
        # feedforward
        activation = x 
        activations = [x]
        zs = []
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
            
        # backward pass
        delta = (self.cost).delta(zs[-1], activations[-1], y)
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
            
        return (nabla_b, nabla_w)
    
    def accuracy(self, data, convert=False):
        if convert:
            results = [(np.argmax(self.feedforward(x)), np.argmax(y)) for (x, y) in data]
        else:
            results = [(np.argmax(self.feedforward(x)), y) for (x, y) in data]
        result_accuracy = sum(int(x == y) for (x, y) in results)
        return result_accuracy
    
    def total_cost(self, data, lmbda, convert=False):
        cost = 0.0
        for x, y in data:
            a = self.feedforward(x)
            if convert: y = vectorized_result(y)
            cost += self.cost.fn(a, y)/len(data)
            cost += 0.5*(lmbda/len(data))*sum(np.linalg.norm(w)**2 for w in self.weights)
        return cost
    
    def save(self, filename):
        data = {
            "sizes":self.sizes,
            "weights":[w.tolist() for w in self.weights],
            "biases": [b.tolist() for b in self.biases],
            "cost": str(self.cost.__name__)
        }
        f = open(filename, "w")
        json.dump(data, f)
        f.close()
        
    def load(filename):
        f = open(filename, "r")
        data = json.load(f)
        f.close()
        cost = getattr(sys.modules[__name__], data["cost"])
        net = Network(data["sizes"], cost=cost)
        net.weights = [np.array(w) for w in data["weights"]]
        net.biases = [np.array(b) for b in data["biases"]]
        return net

### Example

In [6]:
net = Network([784, 30, 10], cost=CrossEntropyCost)
# net.large_weight_initializer()
net.SGD(training_data, 30, 10, 0.1, lmbda=5.0, evaluation_data=validation_data, monitor_evaluation_accuracy=True)

Epoch 0 training complete
Accuracy on evaluation data: 9237 / 10000
Epoch 1 training complete
Accuracy on evaluation data: 9417 / 10000
Epoch 2 training complete
Accuracy on evaluation data: 9465 / 10000
Epoch 3 training complete
Accuracy on evaluation data: 9508 / 10000
Epoch 4 training complete
Accuracy on evaluation data: 9503 / 10000
Epoch 5 training complete
Accuracy on evaluation data: 9507 / 10000
Epoch 6 training complete
Accuracy on evaluation data: 9569 / 10000
Epoch 7 training complete
Accuracy on evaluation data: 9579 / 10000
Epoch 8 training complete
Accuracy on evaluation data: 9562 / 10000
Epoch 9 training complete
Accuracy on evaluation data: 9573 / 10000
Epoch 10 training complete
Accuracy on evaluation data: 9588 / 10000
Epoch 11 training complete
Accuracy on evaluation data: 9574 / 10000
Epoch 12 training complete
Accuracy on evaluation data: 9595 / 10000
Epoch 13 training complete
Accuracy on evaluation data: 9610 / 10000
Epoch 14 training complete
Accuracy on evalu

([],
 [9237,
  9417,
  9465,
  9508,
  9503,
  9507,
  9569,
  9579,
  9562,
  9573,
  9588,
  9574,
  9595,
  9610,
  9614,
  9612,
  9598,
  9607,
  9620,
  9607,
  9629,
  9638,
  9625,
  9624,
  9621,
  9632,
  9594,
  9628,
  9607,
  9615,
  9632,
  9634,
  9632,
  9625,
  9653,
  9629,
  9621,
  9622,
  9627,
  9627,
  9650,
  9658,
  9632,
  9652,
  9644,
  9623,
  9642,
  9643,
  9641,
  9647,
  9653,
  9635,
  9644,
  9624,
  9607,
  9627,
  9638,
  9654,
  9632,
  9648,
  9646,
  9648,
  9663,
  9647,
  9628,
  9657,
  9650,
  9647,
  9653,
  9653,
  9637,
  9645,
  9635,
  9635,
  9651,
  9655,
  9656,
  9665,
  9657,
  9629,
  9655,
  9659,
  9650,
  9660,
  9627,
  9646,
  9650,
  9664,
  9653,
  9652,
  9657,
  9652,
  9634,
  9641,
  9654,
  9640,
  9658,
  9658,
  9655,
  9650],
 [],
 [])

### Overfitting

In [7]:
net = Network([784, 30, 10], cost=CrossEntropyCost)
net.large_weight_initializer()
net.SGD(training_data[:1000], 400, 10, 0.5, evaluation_data=test_data, monitor_evaluation_accuracy=True, monitor_training_cost=True)

Epoch 0 training complete
Cost on training data: 2.318560164532979
Accuracy on evaluation data: 0 / 0
Epoch 1 training complete
Cost on training data: 1.5914227178620113
Accuracy on evaluation data: 0 / 0
Epoch 2 training complete
Cost on training data: 1.2534895290552646
Accuracy on evaluation data: 0 / 0
Epoch 3 training complete
Cost on training data: 1.0253958025712793
Accuracy on evaluation data: 0 / 0
Epoch 4 training complete
Cost on training data: 0.8671122704172909
Accuracy on evaluation data: 0 / 0
Epoch 5 training complete
Cost on training data: 0.8080035917493397
Accuracy on evaluation data: 0 / 0
Epoch 6 training complete
Cost on training data: 0.7041638021491149
Accuracy on evaluation data: 0 / 0
Epoch 7 training complete
Cost on training data: 0.6100652205318042
Accuracy on evaluation data: 0 / 0
Epoch 8 training complete
Cost on training data: 0.5730831263121682
Accuracy on evaluation data: 0 / 0
Epoch 9 training complete
Cost on training data: 0.49952649874465677
Accur

([],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


### Regularization

In [8]:
net = Network([784, 30, 10], cost=CrossEntropyCost)
net.large_weight_initializer()
net.SGD(training_data[:1000], 400, 10, 0.5, 
        evaluation_data=test_data, 
        lmbda=0.1, 
        monitor_evaluation_cost=True,
        monitor_evaluation_accuracy=True, 
        monitor_training_cost=True,
        monitor_training_accuracy=True
       )

Epoch 0 training complete
Cost on training data: 1175.3274228735127
Accuracy on training data: 533 / 1000
Cost on evaluation data: 0.0
Accuracy on evaluation data: 0 / 0
Epoch 1 training complete
Cost on training data: 1165.2704373173228
Accuracy on training data: 766 / 1000
Cost on evaluation data: 0.0
Accuracy on evaluation data: 0 / 0
Epoch 2 training complete
Cost on training data: 1156.0327073403653
Accuracy on training data: 809 / 1000
Cost on evaluation data: 0.0
Accuracy on evaluation data: 0 / 0
Epoch 3 training complete
Cost on training data: 1147.121681797924
Accuracy on training data: 836 / 1000
Cost on evaluation data: 0.0
Accuracy on evaluation data: 0 / 0
Epoch 4 training complete
Cost on training data: 1138.3169487488597
Accuracy on training data: 880 / 1000
Cost on evaluation data: 0.0
Accuracy on evaluation data: 0 / 0
Epoch 5 training complete
Cost on training data: 1129.5787235914045
Accuracy on training data: 902 / 1000
Cost on evaluation data: 0.0
Accuracy on eval

([0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,