## Network implementation

In [None]:
from datetime import datetime
import gzip, pickle
import numpy as np

# Load the pickled MNIST splits: data[0], data[1], data[2] are the
# (inputs, integer labels) pairs for train / validation / test.
# NOTE(security): unpickling runs arbitrary code from the file, so only load
# an mnist.pkl.gz obtained from a trusted source.
# The context manager closes the gzip handle (the original left it open).
with gzip.open('mnist.pkl.gz', 'rb') as f:
    data = pickle.load(f, encoding='latin1')

train_data = data[0][0]
valid_data, valid_labels = data[1]
test_data, test_labels = data[2]

# One-hot encode the *training* labels (10 classes); the validation and test
# labels are kept as integer class ids for the accuracy checks.
train_labels = np.zeros((train_data.shape[0], 10))
train_labels[np.arange(train_labels.shape[0]), data[0][1]] = 1

In [None]:
class NN(object):
    """Fully connected classifier: 784 inputs -> hidden layers -> 10-way softmax.

    Hidden layers use ReLU and the network is trained with cross-entropy and
    plain gradient descent.  ``self.W[i]`` has shape ``(fan_out, fan_in)`` and
    ``self.b[i]`` has shape ``(fan_out,)``; there are ``n_hidden + 1`` of each.

    Every method accepts either a single sample (1-D array) or a batch
    (2-D array with one sample per row).  The ``*_mini`` variants are kept for
    backward compatibility and behave the same as their plain counterparts.
    """

    def __init__(self, hidden_dims=(1024,2048), n_hidden=2, mode='train', datapath=None, model_path=None,
                batchsize=64, lr=0.0001, epsilon=1e-9, activation="ReLU", initialization="normal"):
        """Build the network and initialise its parameters.

        Args:
            hidden_dims: sizes of the hidden layers, in order.
            n_hidden: number of hidden layers; must equal ``len(hidden_dims)``.
            mode, datapath, model_path: stored on the instance, otherwise unused here.
            batchsize: number of samples per step in ``train_mini``.
            lr: gradient-descent step size.
            epsilon: constant added inside the log of the loss to avoid log(0).
            activation: hidden activation; only ``"ReLU"`` is supported.
            initialization: ``"zero"``, ``"normal"`` or ``"glorot"``.

        Raises:
            ValueError: on an unknown activation / initialization, or when
                ``n_hidden`` disagrees with ``hidden_dims``.
        """
        hidden_dims = tuple(hidden_dims)  # also accept a list
        if n_hidden != len(hidden_dims):
            # A mismatch silently produced a network whose last layer is not
            # the 10-way output (or an IndexError), so reject it up front.
            raise ValueError('n_hidden (' + str(n_hidden) + ') does not match len(hidden_dims) ('
                             + str(len(hidden_dims)) + ')')

        self.dims = (784,) + hidden_dims + (10,)
        self.n_hidden = n_hidden
        self.mode = mode
        self.datapath = datapath
        self.model_path = model_path
        self.batchsize = batchsize
        self.lr = lr
        self.epsilon = epsilon

        # set activation function for hidden layers
        if activation == "ReLU":
            self.activation = self.ReLU
            self.activation_prime = self.ReLU_prime
        else:
            raise ValueError('Invalid activation function specified: ' + str(activation))

        # network parameters (biases start at zero for every initialization)
        self.W = []
        self.b = [np.zeros(dim) for dim in self.dims[1:]]

        # weight initialization
        self.initialize_weights(n_hidden, self.dims, initialization)

        # keep count of completed training epochs
        self.epoch_cnt = 0

    def initialize_weights(self, n_hidden, dims, option):
        """(Re)create ``self.W`` with ``n_hidden + 1`` matrices of shape ``(dims[i+1], dims[i])``.

        ``option`` selects all-zero weights, N(0, 1) weights, or Glorot-uniform
        weights drawn from ``[-d, d]`` with ``d = sqrt(6 / (fan_in + fan_out))``.
        ``self.W`` is replaced rather than appended to, so calling this again
        re-initialises the network instead of stacking stale matrices.

        Raises:
            ValueError: if ``option`` is not one of the three names above.
        """
        shapes = [(dims[i+1], dims[i]) for i in range(n_hidden + 1)]

        if option == "zero":
            weights = [np.zeros(shape) for shape in shapes]
        elif option == "normal":
            weights = [np.random.normal(0, 1, shape) for shape in shapes]
        elif option == "glorot":
            weights = []
            for fan_out, fan_in in shapes:
                di = (6 / (fan_in + fan_out)) ** .5
                weights.append(np.random.uniform(-di, di, (fan_out, fan_in)))
        else:
            raise ValueError('Invalid weight initialization specified: ' + str(option))

        self.W = weights

    def _propagate(self, input):
        """Run the forward pass; shared by ``forward`` and ``forward_mini``.

        Returns ``(probabilities, cache)`` where ``cache[0]`` is the input and
        ``cache[i]`` (i >= 1) is the pre-activation ``z = W a + b`` of layer i.
        """
        cache = [input]
        out = input
        for i in range(self.n_hidden):
            # x @ W.T handles both a single vector and a batch of row vectors
            out = out @ self.W[i].T + self.b[i]
            cache.append(out)
            out = self.activation(out)

        out = out @ self.W[-1].T + self.b[-1]
        cache.append(out)
        return self.softmax(out), cache

    def forward(self, input):
        """Forward pass for one sample (1-D input); returns ``(probabilities, cache)``."""
        return self._propagate(input)

    def forward_mini(self, input):
        """Forward pass for a batch (one sample per row); returns ``(probabilities, cache)``."""
        return self._propagate(input)

    def ReLU(self, input):
        """Element-wise max(0, x)."""
        return np.maximum(0, input)

    def ReLU_prime(self, input):
        """Element-wise derivative of ReLU: 1 where x > 0, else 0."""
        return np.where(input > 0, 1, 0)

    def loss(self, prediction, labels):
        """Mean cross-entropy between ``prediction`` and one-hot ``labels``.

        The sum is divided by the number of samples actually passed in (1 for
        a single 1-D prediction) instead of the configured ``batchsize``, so
        the value matches the gradients returned by ``backward``.
        """
        n_samples = np.atleast_2d(prediction).shape[0]
        return -np.sum(labels * np.log(prediction + self.epsilon)) / n_samples

    def softmax_archive(self, input):
        """Legacy softmax that normalises over *all* elements of ``input``.

        Only correct for a single 1-D sample; kept for reference.
        """
        rescaled = input - np.amax(input) # for numerical stability
        input_exp = np.exp(rescaled)
        return input_exp / np.sum(input_exp)

    def softmax(self, input):
        """Softmax over the last axis, for a single sample or a batch of rows."""
        # subtract the row maximum for numerical stability
        rescaled = input - np.amax(input, axis=-1, keepdims=True)
        input_exp = np.exp(rescaled)
        return input_exp / np.sum(input_exp, axis=-1, keepdims=True)

    def _gradients(self, cache, labels):
        """Backpropagate through the cached pre-activations.

        Returns ``(grads_W, grads_b)`` averaged over the samples in ``cache``.
        """
        # Treat a single sample as a batch of one so one code path serves both.
        zs = [np.atleast_2d(z) for z in cache]
        targets = np.atleast_2d(labels)
        n_samples = zs[0].shape[0]

        # deltas[i] is dLoss/dz for layer i (softmax + cross-entropy at the top)
        deltas = [None] * (self.n_hidden + 1)
        deltas[-1] = self.softmax(zs[-1]) - targets
        for i in range(self.n_hidden - 1, -1, -1):
            deltas[i] = (deltas[i+1] @ self.W[i+1]) * self.activation_prime(zs[i+1])

        grads_W = []
        grads_b = []
        for i in range(self.n_hidden + 1):
            # Layer 0 is fed the raw input; only deeper layers see activations.
            layer_input = zs[0] if i == 0 else self.activation(zs[i])
            grads_W.append(deltas[i].T @ layer_input / n_samples)
            grads_b.append(np.mean(deltas[i], axis=0))

        return (grads_W, grads_b)

    def backward(self, cache, labels):
        """Gradients for one sample; ``cache`` comes from ``forward``.

        Returns ``(grads_W, grads_b)``, lists aligned with ``self.W`` / ``self.b``.
        """
        return self._gradients(cache, labels)

    def backward_mini(self, cache, labels):
        """Batch-averaged gradients; ``cache`` comes from ``forward_mini``.

        Returns ``(grads_W, grads_b)``, lists aligned with ``self.W`` / ``self.b``.
        """
        return self._gradients(cache, labels)

    def update(self, grads):
        """Apply one gradient-descent step in place; ``grads`` is ``(grads_W, grads_b)``."""
        grads_W, grads_b = grads[0], grads[1]
        for i in range(self.n_hidden + 1):
            self.W[i] -= self.lr * grads_W[i]
            self.b[i] -= self.lr * grads_b[i]

    def train(self, train_data, train_labels, n_epochs):
        """Train for ``n_epochs`` epochs, updating after every single sample.

        Prints a timestamp and the summed per-sample loss for each epoch.
        """
        for epoch in range(n_epochs):

            print(datetime.now(), "-", "Epoch", self.epoch_cnt+1, end=": ")

            loss = 0
            for sample, target in zip(train_data, train_labels):
                predictions, cache = self.forward(sample)
                loss += self.loss(predictions, target)

                grads = self.backward(cache, target)
                self.update(grads)

            print("loss =", loss)
            self.epoch_cnt += 1

    def train_mini(self, train_data, train_labels, n_epochs):
        """Train for ``n_epochs`` epochs with mini-batches of ``self.batchsize`` rows.

        Batches are taken in order without shuffling, and a trailing partial
        batch is skipped.  Prints a timestamp and the summed per-batch loss
        for each epoch.
        """
        n_batches = train_data.shape[0] // self.batchsize

        for epoch in range(n_epochs):

            print(datetime.now(), "-", "Epoch", self.epoch_cnt+1, end=": ")

            loss = 0
            for i in range(n_batches):
                begin = i * self.batchsize
                end = begin + self.batchsize

                predictions, cache = self.forward_mini(train_data[begin:end])
                loss += self.loss(predictions, train_labels[begin:end])

                grads = self.backward_mini(cache, train_labels[begin:end])
                self.update(grads)

            print("loss =", loss)
            self.epoch_cnt += 1

    def test(self, test_data, test_labels):
        """Print the accuracy on ``test_data``, predicting one sample at a time.

        ``test_labels`` holds integer class ids (not one-hot vectors).
        """
        successes = 0

        for i in range(test_data.shape[0]):
            predictions, cache = self.forward(test_data[i])
            if np.argmax(predictions) == test_labels[i]:
                successes += 1

        print(str(100 * successes / test_data.shape[0]) + "% success")

    def test_mini(self, test_data, test_labels):
        """Print the accuracy on ``test_data`` using a single batched forward pass.

        ``test_labels`` holds integer class ids (not one-hot vectors).
        """
        predictions, cache = self.forward_mini(test_data)
        successes = (np.argmax(predictions, axis=1) == test_labels)

        print(str(100 * np.sum(successes) / test_data.shape[0]) + "% success")

    def finite_diff(self, sample, label):
        """Check the backprop gradient of ``self.W[1]`` against finite differences.

        For each pair ``(i, k)`` in ``(0, 1), (1, 2), ..., (4, 5)`` the step is
        ``epsilon = 1 / N`` with ``N = k * 10**i``.  The first (up to) 10
        entries of ``self.W[1]``, in row-major order, are perturbed by
        ``+/- epsilon`` and the central-difference slope of the loss is
        compared with the analytic gradient from ``backward``.

        NOTE(review): the original draft iterated ``np.arange(5)`` and
        ``np.arange(5) + 1`` side by side; lockstep pairing is assumed here —
        confirm whether the full (i, k) grid was intended.

        Args:
            sample: one input vector.
            label: its one-hot target.

        Returns:
            A list of ``(N, max_abs_difference)`` tuples, one per step size.
            The weights are restored to their original values before returning.
        """
        weights = self.W[1]
        n_checked = min(10, weights.size)

        _, cache = self.forward(sample)
        analytic = self.backward(cache, label)[0][1].ravel()[:n_checked]

        report = []
        for i, k in zip(range(5), range(1, 6)):
            N = k * 10 ** i  # ``^`` is XOR in Python; a power needs ``**``
            epsilon = 1 / N
            numeric = np.zeros(n_checked)
            for p in range(n_checked):
                saved = weights.flat[p]

                weights.flat[p] = saved + epsilon
                loss_plus = self.loss(self.forward(sample)[0], label)

                weights.flat[p] = saved - epsilon
                loss_minus = self.loss(self.forward(sample)[0], label)

                weights.flat[p] = saved  # restore before touching the next entry
                numeric[p] = (loss_plus - loss_minus) / (2 * epsilon)

            report.append((N, float(np.max(np.abs(numeric - analytic)))))

        return report

## Trying with Glorot initialization

In [None]:
# Architecture and bookkeeping settings for the Glorot-initialised run.
hidden_dims = (256, 512, 1024)
n_hidden = len(hidden_dims)
mode = 'train'
datapath = None
model_path = None

# Seed NumPy first so the random weight draw is reproducible.
np.random.seed(6135)
net_glorot = NN(
    hidden_dims=hidden_dims,
    n_hidden=n_hidden,
    mode=mode,
    datapath=datapath,
    model_path=model_path,
    batchsize=64,
    lr=1e-4,
    initialization="glorot",
)

In [None]:
# 25 rounds of one epoch each, scoring the validation split after every epoch.
for epoch in range(25):
    net_glorot.train_mini(train_data=train_data, train_labels=train_labels, n_epochs=1)
    net_glorot.test_mini(test_data=valid_data, test_labels=valid_labels)

In [None]:
# Accuracy of the Glorot-initialised network on the training split.
# data[0][1] holds the integer training labels (train_labels is one-hot).
# This cell previously called net_normal, which is only created further down.
net_glorot.test_mini(train_data, data[0][1])
# net_glorot.test_mini(test_data, test_labels)

## Trying with normal initialization

In [None]:
# Same architecture as the Glorot run, but with N(0, 1) weight initialisation.
hidden_dims = (256, 512, 1024)
n_hidden = len(hidden_dims)
mode = 'train'
datapath = None
model_path = None

# Seed NumPy first so the random weight draw is reproducible.
np.random.seed(6135)
net_normal = NN(
    hidden_dims=hidden_dims,
    n_hidden=n_hidden,
    mode=mode,
    datapath=datapath,
    model_path=model_path,
    batchsize=64,
    lr=1e-4,
    initialization="normal",
)

In [None]:
# 25 rounds of one epoch each, scoring the validation split after every epoch.
for epoch in range(25):
    net_normal.train_mini(train_data=train_data, train_labels=train_labels, n_epochs=1)
    net_normal.test_mini(test_data=valid_data, test_labels=valid_labels)

In [None]:
# Accuracy of the normal-initialised network on the training split.
# data[0][1] holds the integer training labels (train_labels is one-hot).
net_normal.test_mini(test_data=train_data, test_labels=data[0][1])
# net_normal.test_mini(test_data, test_labels)