## Monkeynet: Order of operations:
- Create Layers
- Create Network from Layers
- Create Loss
- Create gradients
- Create a crappy backprop
- Create a good backprop
- Create a data loader


### Fooling around with the notebook. I typically experiment in a notebook before switching to code files.
### I do not recommend using this, although it might be easier to understand for some.

In [1]:
import numpy as np
import random
from numpy import ndarray as Tensor
from typing import Callable, Sequence, Iterator, List, Tuple

In [24]:
class Layer:
    """Abstract base class for network layers.

    Subclasses are expected to override ``feedforward`` and ``backprop``;
    both raise ``NotImplementedError`` in this base class.
    """

    def __init__(self) -> None:
        pass

    def feedforward(self, inputs):
        """Compute the layer output for ``inputs``.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError

    def backprop(self, inputs, outputs):
        """Compute the layer gradients for one ``inputs``/``outputs`` pair.

        Raises:
            NotImplementedError: always, in the base class.
        """
        # Raise (rather than return) the exception class so that calling an
        # un-overridden method fails loudly instead of handing back a value.
        raise NotImplementedError

In [25]:
class FullyConnected(Layer):
    """Fully connected layer followed by a sigmoid activation.

    Args:
        input_size: number of rows expected in the input column vector.
        output_size: number of units (rows of the output column vector).
    """

    def __init__(self, input_size, output_size)->None:
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size

        # Parameters are drawn from a standard normal distribution.
        self.weights = np.random.randn(self.output_size, self.input_size)
        self.biases = np.random.randn(self.output_size, 1)

        # Gradients of the most recent sample.
        self.grad_w = np.zeros(self.weights.shape)
        self.grad_b = np.zeros(self.biases.shape)

        # Per-sample gradients accumulated between two optimizer updates.
        self.grad_ws = []
        self.grad_bs = []

    def feedforward(self, inputs: Tensor) -> Tensor:
        """Implement the ``Layer`` interface by delegating to ``activate``."""
        return self.activate(inputs)

    def activate(self, inputs: Tensor)->Tensor:
        """Return ``sigmoid(weights @ inputs + biases)``.

        The pre-activation and the activation are cached on the instance as
        ``self.z`` and ``self.activation``.
        """
        self.z = self.weights @ inputs + self.biases
        self.activation = self.sigmoid(self.z)
        return self.activation

    def sigmoid(self, z: Tensor) -> Tensor:
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise."""
        # exp(-log(1 + exp(-z))) is the same function, but logaddexp never
        # overflows for large negative z the way a bare np.exp(-z) does.
        return np.exp(-np.logaddexp(0, -z))

    def sigmoid_deriv(self, z: Tensor) -> Tensor:
        """Return the derivative of the sigmoid, ``s * (1 - s)``."""
        s = self.sigmoid(z)  # evaluate the sigmoid only once
        return s * (1 - s)

In [26]:
class Network:
    """Feed-forward network built from an ordered sequence of layers.

    Each layer must provide ``activate(inputs)``, cache its pre-activation
    as ``z``, and expose ``weights``, ``grad_ws`` and ``grad_bs``.

    TODO: Need to account for the bias in the first layer, as in
    I need to remove it.
    """

    def __init__(self, layers: Sequence["FullyConnected"])->None:
        self.layers = layers
        # Filled by forward_pass: activations[0] is the input, activations[i]
        # the output of layer i-1; zs[i] the pre-activation of layer i.
        self.activations = []
        self.zs = []

    def forward_pass(self, inputs: Tensor) -> Tensor:
        """Propagate ``inputs`` through every layer and return the output.

        The intermediate activations and pre-activations are recorded on the
        instance so that ``backprop`` can use them afterwards.
        """
        self.activations = [inputs]
        self.zs = []
        for layer in self.layers:
            inputs = layer.activate(inputs)
            self.zs.append(layer.z)
            self.activations.append(inputs)
        return inputs

    def backprop(self, inputs: Tensor, outputs: Tensor) -> None:
        """Compute per-layer gradients for the most recent forward pass.

        ``forward_pass`` must have been called first; this method reads the
        cached ``activations`` and ``zs``. ``inputs`` is accepted for
        interface symmetry but is not read. Each layer's ``grad_w`` /
        ``grad_b`` is overwritten and also appended to its ``grad_ws`` /
        ``grad_bs`` list.

        Args:
            inputs: the sample that was fed to ``forward_pass`` (unused).
            outputs: the target vector for that sample.
        """
        # Output layer: error term of the quadratic cost.
        output_layer = self.layers[-1]
        self.delta = (self.activations[-1] - outputs) * self.sigmoid_deriv(self.zs[-1])
        output_layer.delta = self.delta
        output_layer.grad_b = self.delta
        output_layer.grad_w = self.delta @ self.activations[-2].T
        output_layer.grad_bs.append(output_layer.grad_b)
        output_layer.grad_ws.append(output_layer.grad_w)

        # Remaining layers, walking from the second-to-last back to the first.
        for depth in range(2, len(self.layers) + 1):
            layer = self.layers[-depth]
            next_weights = self.layers[-depth + 1].weights
            self.delta = (next_weights.T @ self.delta) * self.sigmoid_deriv(self.zs[-depth])

            layer.grad_b = self.delta
            layer.grad_bs.append(layer.grad_b)

            layer.grad_w = self.delta @ self.activations[-depth - 1].T
            layer.grad_ws.append(layer.grad_w)

    def sigmoid(self, z: Tensor) -> Tensor:
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise."""
        # exp(-log(1 + exp(-z))) is the same function, but logaddexp never
        # overflows for large negative z the way a bare np.exp(-z) does.
        return np.exp(-np.logaddexp(0, -z))

    def sigmoid_deriv(self, z: Tensor) -> Tensor:
        """Return the derivative of the sigmoid, ``s * (1 - s)``."""
        s = self.sigmoid(z)  # evaluate the sigmoid only once
        return s * (1 - s)

    def loss(self, predicted: Tensor, actual: Tensor)-> float:
        """Return the squared distance between the one-hot argmax vectors.

        Both arguments are reduced to a one-hot array marking their largest
        entry, so the result is 0 when the argmax positions agree and 2
        otherwise.
        NOTE(review): this is not the quadratic cost that ``backprop``
        differentiates — confirm that this is intended.
        """
        pred_arr = np.zeros(predicted.shape)
        # argmax returns an index into the flattened array, so write through
        # .flat; this works for column, row and 1-D vectors alike.
        pred_arr.flat[np.argmax(predicted)] = 1

        actual_arr = np.zeros(predicted.shape)
        actual_arr.flat[np.argmax(actual)] = 1

        return np.sum((actual_arr - pred_arr) ** 2)

    def accuracy(self, predicted: Tensor, actual: Tensor)-> int:
        """Return 1 if the argmax of ``predicted`` matches ``actual``, else 0."""
        return 1 if np.argmax(predicted) == np.argmax(actual) else 0
        
        

In [27]:
class Optimizer:
    """Base class for optimizers; subclasses implement ``update``."""

    def __init__(self) -> None:
        pass

    def update(self) ->None:
        """Apply one parameter update (must be overridden)."""
        raise NotImplementedError


class SGD(Optimizer):
    """Mini-batch stochastic gradient descent over a network's layers.

    Args:
        batch_size: stored on the instance; not read by ``update``.
        learning_rate: step size ``eta``.
        network: object whose ``layers`` each expose ``weights``, ``biases``,
            ``grad_ws`` and ``grad_bs``.
    """

    def __init__(self, batch_size: int, learning_rate: float, network: "Network") -> None:
        super().__init__()
        self.eta = learning_rate
        self.network = network
        self.batch_size = batch_size

    def update(self) -> None:
        """Step every layer along its averaged accumulated gradients.

        For each layer:

            w_l = w_l - eta*(1/m)*sum(grad_w)
            b_l = b_l - eta*(1/m)*sum(grad_b)

        where ``m`` is the number of gradients accumulated since the last
        update. The accumulated lists are cleared afterwards. A layer with
        no accumulated gradients is left untouched.
        """
        for layer in self.network.layers:
            # Skipping empty lists avoids a division by zero when update()
            # is called before any backprop has run.
            if layer.grad_ws:
                layer.weights = layer.weights - self.eta * np.mean(layer.grad_ws, axis=0)
                layer.grad_ws = []

            if layer.grad_bs:
                layer.biases = layer.biases - self.eta * np.mean(layer.grad_bs, axis=0)
                layer.grad_bs = []
            

In [30]:
class DataLoader:
    """Split paired inputs/outputs into batches served in random order."""

    def __init__(self, batch_size: int=1) -> None:
        self.batch_size = batch_size

    def get_batches(self, inputs: Tensor, outputs: Tensor) -> Iterator[Tuple]:
        """Return a list of ``(inputs_slice, outputs_slice)`` batches.

        Only the order of the batches is shuffled; each batch is a
        contiguous slice of at most ``batch_size`` samples.
        """
        size = self.batch_size
        # Start offset of every batch, visited in random order.
        starts = np.arange(0, len(inputs), size)
        np.random.shuffle(starts)

        batches = []
        for start in starts:
            stop = start + size
            batches.append((inputs[start:stop], outputs[start:stop]))
        return batches
    

In [46]:
class Trainer:
    """Run the epoch / mini-batch training loop for a network."""

    def __init__(self) -> None:
        pass

    def train(self,
              network: Network,
              inputs: Tensor,
              outputs: Tensor,
              optimizer: Optimizer,
              batch_size,
              num_epochs,
             ) -> None:
        """Train ``network`` and print the accuracy after every epoch.

        Every sample is fed forward and back-propagated individually; the
        optimizer is stepped once per batch.

        Args:
            network: provides ``forward_pass``, ``backprop``, ``accuracy``
                and ``loss``.
            inputs: training samples, indexed along the first axis.
            outputs: targets aligned with ``inputs``.
            optimizer: its ``update()`` is called after each batch.
            batch_size: number of samples per batch.
            num_epochs: number of passes over the data.
        """
        loader = DataLoader(batch_size)
        num_samples = len(inputs)

        for epoch in range(num_epochs):

            loss_epoch = 0.0
            acc_epoch = 0.0

            # get_batches reshuffles the batch order on every call.
            for batch_inputs, batch_outputs in loader.get_batches(inputs, outputs):
                for inp, out in zip(batch_inputs, batch_outputs):
                    # The weights only change in optimizer.update(), so one
                    # forward pass serves backprop, accuracy and loss alike.
                    predicted = network.forward_pass(inp)
                    network.backprop(inp, out)
                    acc_epoch += network.accuracy(predicted, out)
                    loss_epoch += network.loss(predicted, out)
                optimizer.update()

            # print("epoch : {} | loss : {}".format(epoch, loss_epoch))
            # Guard against an empty data set when computing the percentage.
            acc_percent = 100*acc_epoch/num_samples if num_samples else 0.0
            print("epoch : {} | acc : {}".format(epoch, acc_percent))
            
        


In [38]:
import pickle

In [2]:
# Unpack the three values returned by mnist_loader.load_data_wrapper().
# NOTE(review): no `import mnist_loader` is visible in this notebook export —
# confirm that it is imported before this cell runs.
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()

In [35]:
# Materialise training_data as a list so it can be iterated over below.
trd= list(training_data)

In [36]:
# Two separate, empty accumulators: one for sample inputs, one for targets.
trd_inp, trd_out = [], []

In [37]:
# Split every training entry into its first element (input) and its second
# element (target), preserving the order of trd.
trd_inp.extend(pair[0] for pair in trd)
trd_out.extend(pair[1] for pair in trd)

In [38]:
# Stack the per-sample input arrays along a new leading axis.
mnist_inputs = np.stack(trd_inp)

In [39]:
# Stack the per-sample target arrays along a new leading axis.
mnist_outputs = np.stack(trd_out)

In [40]:
# Inspect the shape of the stacked inputs (the recorded output follows).
mnist_inputs.shape

(50000, 784, 1)

In [41]:
# Inspect the shape of the stacked targets (the recorded output follows).
mnist_outputs.shape

(50000, 10, 1)

In [32]:
# Unpack the three values returned by mnist_loader.load_data(); note that
# this rebinds `trd`, which an earlier cell used for the list of training pairs.
# NOTE(review): no `import mnist_loader` is visible in this notebook export —
# confirm that it is imported before this cell runs.
trd, vd, tsd = mnist_loader.load_data()

In [None]:
# this does the trick!

In [33]:
def label_to_vector(label, num_classes=10):
    """Convert an integer class label into a one-hot column vector.

    The individual number classes (5, 4, 3, 7, 8, ...) in the downloaded
    MNIST database become a ``(num_classes, 1)`` vector ``X`` with
    ``X[label] = 1`` and zeros elsewhere.

    Args:
        label: index of the class, i.e. the number itself.
        num_classes: length of the returned vector (10 by default).

    Returns:
        Array of shape ``(num_classes, 1)``.
    """
    vec = np.zeros((num_classes, 1))
    vec[label] = 1
    return vec

In [34]:
# Inputs: reshape every entry of each split into a (784, 1) column vector.
train_input = np.array([np.reshape(image, (784, 1)) for image in trd[0]])
test_input = np.array([np.reshape(image, (784, 1)) for image in tsd[0]])
vd_input = np.array([np.reshape(image, (784, 1)) for image in vd[0]])

# Targets: turn every label of each split into its one-hot vector.
train_output = np.array(list(map(label_to_vector, trd[1])))
test_output = np.array(list(map(label_to_vector, tsd[1])))
vd_output = np.array(list(map(label_to_vector, vd[1])))

In [35]:
# Inspect the shape of the reshaped test inputs (the recorded output follows).
test_input.shape

(10000, 784, 1)

In [39]:
# Read the three data splits straight from the pickled file.
# NOTE: unpickling can execute arbitrary code — only load a file you trust.
with open('mnist.pkl', 'rb') as f:
    trd, vd, tsd = pickle.load(f, encoding="latin1")

# NOTE(review): label_to_vector is defined further down in this cell, so the
# lines below rely on an earlier cell having defined it already.

# Inputs: reshape every entry of each split into a (784, 1) column vector.
train_input = np.array([np.reshape(image, (784, 1)) for image in trd[0]])
test_input = np.array([np.reshape(image, (784, 1)) for image in tsd[0]])
vd_input = np.array([np.reshape(image, (784, 1)) for image in vd[0]])

# Targets: turn every label of each split into its one-hot vector.
train_output = np.array(list(map(label_to_vector, trd[1])))
test_output = np.array(list(map(label_to_vector, tsd[1])))
vd_output = np.array(list(map(label_to_vector, vd[1])))

def label_to_vector(label, num_classes=10):
    """Convert an integer class label into a one-hot column vector.

    The individual number classes (5, 4, 3, 7, 8, ...) in the downloaded
    MNIST database become a ``(num_classes, 1)`` vector ``X`` with
    ``X[label] = 1`` and zeros elsewhere.

    Args:
        label: index of the class, i.e. the number itself.
        num_classes: length of the returned vector (10 by default).

    Returns:
        Array of shape ``(num_classes, 1)``.
    """
    vec = np.zeros((num_classes, 1))
    vec[label] = 1
    return vec

In [43]:
# Inspect the shape of the one-hot validation targets (the recorded output follows).
vd_output.shape

(10000, 10, 1)

In [None]:
# Define your layers: 784 inputs -> 30 units -> 10 outputs
fc1 = FullyConnected(input_size=784, output_size=30)
fc2 = FullyConnected(input_size=30, output_size=10)
# fc3 = FullyConnected(50,10)

# Create your network from the layers, in feed-forward order
nn = Network(layers=[fc1, fc2])

# Ready the optimizer
sgd = SGD(30, learning_rate=3, network=nn)

# Train on the stacked MNIST arrays
trainer = Trainer()
trainer.train(nn, mnist_inputs, mnist_outputs, sgd, batch_size=30, num_epochs=30)
