## Network implementation

In [None]:
from datetime import datetime
import gzip, pickle
import numpy as np

# Load the pickled MNIST archive; the context manager closes the gzip handle
# even if unpickling fails.
# NOTE(review): pickle can execute arbitrary code while loading -- only use an
# archive obtained from a trusted source.
with gzip.open('mnist.pkl.gz') as f:
    data = pickle.load(f, encoding='latin1')

# data is indexed as data[split] = (inputs, integer labels) for the
# train / validation / test splits, in that order.
train_data = data[0][0]
valid_data, valid_labels = data[1]
test_data, test_labels = data[2]

# one-hot encoding for the training labels only (10 classes); validation and
# test labels are kept as integer class ids
train_labels = np.zeros((train_data.shape[0], 10))
train_labels[np.arange(train_labels.shape[0]), data[0][1]] = 1

In [None]:
class NN(object):
    """Fully connected feed-forward classifier trained with mini-batch SGD.

    The network maps 784 inputs to 10 softmax outputs through the hidden
    layers listed in ``hidden_dims``.  ``W[i]`` has shape
    ``(dims[i+1], dims[i])`` and ``b[i]`` has shape ``(dims[i+1],)``, so a
    batch laid out as ``(batch, features)`` is propagated with
    ``x @ W[i].T + b[i]``.
    """

    def __init__(self, hidden_dims=(1024,2048), n_hidden=2, mode='train', datapath=None, model_path=None,
                batchsize=64, lr=0.0001, delta=.5, activation="ReLU", initialization="normal", epsilon=1e-9):
        """Build the network, initialise its parameters and print their sizes.

        Args:
            hidden_dims: width of each hidden layer (any sequence of ints).
            n_hidden: number of hidden layers; must equal ``len(hidden_dims)``.
            mode, datapath, model_path: stored on the instance; not used by
                any method of this class.
            batchsize: number of samples per mini-batch in ``train``.
            lr: base learning rate.
            delta: decay exponent; epoch ``t`` uses ``lr / t**delta``.
            activation: ``"ReLU"``, ``"sigmoid"`` or ``"tanh"``.
            initialization: ``"zero"``, ``"normal"`` or ``"glorot"``.
            epsilon: constant added inside the log of the loss.

        Raises:
            ValueError: on an unknown activation or initialization, or when
                ``n_hidden`` does not match ``len(hidden_dims)``.
        """
        hidden_dims = tuple(hidden_dims)
        if len(hidden_dims) != n_hidden:
            # Fail early: a mismatch otherwise surfaces later as an IndexError
            # or a broadcasting error far from its cause.
            raise ValueError('n_hidden (' + str(n_hidden) + ') does not match the number of hidden_dims ('
                             + str(len(hidden_dims)) + ')')

        self.dims = (784,) + hidden_dims + (10,)
        self.n_hidden = n_hidden
        self.mode = mode
        self.datapath = datapath
        self.model_path = model_path
        self.batchsize = batchsize
        self.lr = lr
        self.delta = delta
        self.epsilon = epsilon

        # set activation function (and its derivative) for hidden layers
        activations = {
            "ReLU": (self.ReLU, self.ReLU_prime),
            "sigmoid": (self.sigmoid, self.sigmoid_prime),
            "tanh": (self.tanh, self.tanh_prime),
        }
        if activation not in activations:
            raise ValueError('Invalid activation function specified: ' + str(activation))
        self.activation, self.activation_prime = activations[activation]

        # network parameters
        self.W = []
        self.b = [np.zeros(self.dims[i]) for i in range(1, len(self.dims))]

        # weight initialization
        self.initialize_weights(n_hidden, self.dims, initialization)
        self.parameter_count()

        # 1-based index of the next training epoch (drives the lr decay)
        self.epoch_cnt = 1

    ################################################
    #  Weights initialization and parameter count  #
    ################################################

    def initialize_weights(self, n_hidden, dims, option):
        """Append one weight matrix per layer to ``self.W``.

        Args:
            n_hidden: number of hidden layers (``n_hidden + 1`` matrices are made).
            dims: layer widths, input first and output last.
            option: ``"zero"``, ``"normal"`` (standard normal) or ``"glorot"``
                (uniform in ``[-d, d]`` with ``d = sqrt(6 / (fan_in + fan_out))``).

        Raises:
            ValueError: if ``option`` is not one of the names above.
        """
        if option == "zero":
            def make(shape):
                return np.zeros(shape)
        elif option == "normal":
            def make(shape):
                return np.random.normal(0, 1, shape)
        elif option == "glorot":
            def make(shape):
                bound = np.sqrt(6 / (shape[0] + shape[1]))
                return np.random.uniform(-bound, bound, shape)
        else:
            raise ValueError('Invalid weight initialization specified: ' + str(option))

        for i in range(n_hidden + 1):
            self.W.append(make((dims[i+1], dims[i])))

    def parameter_count(self):
        """Print the shape of every W and b and the total number of parameters."""
        for i in range(self.n_hidden + 1):
            print("Layer", i+1, end=" - ")
            print("W:" + str(self.W[i].shape), end="\t")
            print("b:" + str(self.b[i].shape))

        weights_cnt = sum(map(np.size, self.W))
        bias_cnt = sum(map(np.size, self.b))
        print("\nTotal number of parameters:", weights_cnt + bias_cnt)

    #################################################
    #  Activation functions  and their derivatives  #
    #################################################

    def ReLU(self, input):
        """Element-wise max(0, x)."""
        return np.maximum(0, input)

    def ReLU_prime(self, input):
        """Derivative of ReLU: 1 where x > 0, else 0."""
        return np.where(input > 0, 1, 0)

    def sigmoid(self, input):
        """Logistic function 1 / (1 + exp(-x)).

        Written as exp(-log(1 + exp(-x))) so that large negative inputs do not
        overflow inside exp.
        """
        return np.exp(-np.logaddexp(0, -input))

    def sigmoid_prime(self, input):
        """Derivative of the logistic function, s(x) * (1 - s(x))."""
        s = self.sigmoid(input)
        return s * (1 - s)

    def tanh(self, input):
        """Hyperbolic tangent (NumPy's implementation, which does not overflow)."""
        return np.tanh(input)

    def tanh_prime(self, input):
        """Derivative of tanh, 1 - tanh(x)**2."""
        return 1 - self.tanh(input)**2

    def softmax(self, input):
        """Row-wise softmax of a (batch, classes) array."""
        # subtracting the row maximum leaves the result unchanged but keeps exp bounded
        shifted = input - np.max(input, axis=1, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=1, keepdims=True)

    ########################################
    #  Forward, backward, loss and update  #
    ########################################

    def forward(self, input):
        """Propagate a (batch, 784) array through the network.

        Returns:
            (probabilities, cache): the softmax output and a list holding the
            input followed by the pre-activation z = Wx + b of every layer.
        """
        # cache[0] is the raw input; cache[i] (i >= 1) is z of layer i
        cache = [input]

        out = input
        for i in range(self.n_hidden):
            out = out @ self.W[i].T + self.b[i]  # z = Wx + b
            cache.append(out)
            out = self.activation(out)           # a = activation(z)

        out = out @ self.W[-1].T + self.b[-1]
        cache.append(out)

        return self.softmax(out), cache

    def loss(self, prediction, labels):
        """Cross-entropy summed (not averaged) over the batch; labels are one-hot."""
        return -np.sum(labels * np.log(prediction + self.epsilon))

    def backward(self, cache, labels):
        """Back-propagate through the network.

        Args:
            cache: list returned by ``forward`` (input, then every layer's z).
            labels: one-hot labels, shape (batch, 10).

        Returns:
            (grads_W, grads_b): per-layer gradients of the batch-mean loss.
        """
        n_layers = self.n_hidden + 1
        batch = labels.shape[0]

        # deltas[i] = d(loss)/d(z) of layer i, one row per sample
        deltas = [0] * n_layers
        deltas[-1] = self.softmax(cache[-1]) - labels
        for i in range(self.n_hidden - 1, -1, -1):
            deltas[i] = deltas[i+1] @ self.W[i+1] * self.activation_prime(cache[i+1])

        # What each layer actually received in the forward pass: the raw input
        # for the first layer (it must NOT be passed through the activation),
        # and activation(z) of the previous layer for all the others.
        layer_inputs = [cache[0]] + [self.activation(z) for z in cache[1:-1]]

        grads_W = [deltas[i].T @ layer_inputs[i] / batch for i in range(n_layers)]
        grads_b = [np.mean(deltas[i], axis=0) for i in range(n_layers)]

        return (grads_W, grads_b)

    def update(self, grads, lr):
        """Apply one in-place gradient-descent step with learning rate ``lr``."""
        grads_W, grads_b = grads[0], grads[1]
        for i in range(self.n_hidden + 1):
            self.W[i] -= lr * grads_W[i]
            self.b[i] -= lr * grads_b[i]

    ##############################
    #  Train and test functions  #
    ##############################

    def train(self, train_data, train_labels, n_epochs):
        """Run ``n_epochs`` passes of mini-batch SGD and print each epoch's mean loss.

        Samples are visited in their stored order.  ``epoch_cnt`` persists
        across calls, so the learning-rate decay continues where the previous
        call stopped.
        """
        n_samples = train_data.shape[0]

        for _ in range(n_epochs):

            print(datetime.now(), "-", "Epoch", self.epoch_cnt, end=": ")

            # learning rate for this epoch
            lr = self.lr / max(1, self.epoch_cnt)**self.delta

            epoch_loss = 0
            for start in range(0, n_samples, self.batchsize):
                end = start + self.batchsize  # slicing clips the final, shorter batch
                batch_x = train_data[start:end]
                batch_y = train_labels[start:end]

                # forward pass and mini-batch loss
                predictions, cache = self.forward(batch_x)
                epoch_loss += self.loss(predictions, batch_y)

                # backward pass and update
                self.update(self.backward(cache, batch_y), lr)

            mean_loss = epoch_loss / n_samples
            print("loss =", mean_loss)

            self.epoch_cnt += 1

    def test(self, test_data, test_labels):
        """Print the percentage of samples whose arg-max prediction equals its label.

        ``test_labels`` holds integer class ids (not one-hot vectors).
        """
        predictions, cache = self.forward(test_data)
        successes = (np.argmax(predictions, axis=1) == test_labels)

        print(str(100 * np.sum(successes) / test_data.shape[0]) + "% success")

    ############################################
    #  Finite difference computation function  #
    ############################################

    def finite_diff(self, sample, label, layer=1, n_checked=10):
        """Compare back-propagated gradients with central finite differences.

        For every N = k * 10**i (i in 0..4, k in 1..5) the first ``n_checked``
        entries of ``W[layer]`` are perturbed by +/- 1/N and the resulting
        loss difference is compared with the gradient from ``backward``.

        Args:
            sample: one input or a (batch, 784) array.
            label: matching one-hot label(s).
            layer: index of the weight matrix to check.
            n_checked: how many of its entries (row-major order) to check.

        Returns:
            List of ``(N, max_abs_difference)`` tuples, in increasing i then k.
            The weights are left unchanged.
        """
        sample = np.atleast_2d(sample)
        label = np.atleast_2d(label)
        # loss() sums over the batch whereas backward() averages, so the
        # numerical estimate is divided by the batch size to match.
        n_samples = sample.shape[0]

        weights = self.W[layer]
        count = min(n_checked, weights.size)

        _, cache = self.forward(sample)
        grads_W, _ = self.backward(cache, label)
        analytic = grads_W[layer].ravel()[:count]

        results = []
        for i in range(5):
            for k in range(1, 6):
                N = k * 10**i
                epsilon = 1 / N
                numeric = np.empty(count)
                for p in range(count):
                    idx = np.unravel_index(p, weights.shape)
                    saved = weights[idx]
                    try:
                        weights[idx] = saved + epsilon
                        loss_plus = self.loss(self.forward(sample)[0], label)
                        weights[idx] = saved - epsilon
                        loss_minus = self.loss(self.forward(sample)[0], label)
                    finally:
                        weights[idx] = saved  # always restore the perturbed weight
                    numeric[p] = (loss_plus - loss_minus) / (2 * epsilon * n_samples)
                results.append((N, float(np.max(np.abs(numeric - analytic)))))

        return results

## Network parameters

In [None]:
# Hyper-parameters shared by all three initialization experiments below.
hidden_dims = (320, 2048)    # width of each hidden layer
n_hidden = len(hidden_dims)  # derived, so it cannot drift out of sync with hidden_dims
mode = 'train'
datapath = None
model_path = None
batchsize = 4
lr = .001                    # base learning rate
delta = .5                   # decay exponent: epoch t trains with lr / t**delta
activation = "ReLU"

## Trying with glorot initialization

In [None]:
# Fixed seed so the Glorot-uniform draw is reproducible between runs.
np.random.seed(6135)
net_glorot = NN(
    hidden_dims=hidden_dims,
    n_hidden=n_hidden,
    mode=mode,
    datapath=datapath,
    model_path=model_path,
    batchsize=batchsize,
    lr=lr,
    delta=delta,
    activation=activation,
    initialization="glorot",
)

In [None]:
# Ten rounds of one epoch each, printing validation accuracy after every round.
for i in range(10):
    net_glorot.train(train_data, train_labels, n_epochs=1)
    net_glorot.test(test_data=valid_data, test_labels=valid_labels)

In [None]:
# Accuracy on the training split; test() expects the integer labels in
# data[0][1], not the one-hot train_labels matrix.
net_glorot.test(test_data=train_data, test_labels=data[0][1])
# held-out test split left disabled: net_glorot.test(test_data, test_labels)

## Trying with normal initialization

In [None]:
# Same seed as the other experiments; weights drawn from a standard normal.
np.random.seed(6135)
net_normal = NN(
    hidden_dims=hidden_dims,
    n_hidden=n_hidden,
    mode=mode,
    datapath=datapath,
    model_path=model_path,
    batchsize=batchsize,
    lr=lr,
    delta=delta,
    activation=activation,
    initialization="normal",
)

In [None]:
# Ten rounds of one epoch each, printing validation accuracy after every round.
for i in range(10):
    net_normal.train(train_data, train_labels, n_epochs=1)
    net_normal.test(test_data=valid_data, test_labels=valid_labels)

In [None]:
# Accuracy on the training split; test() expects the integer labels in
# data[0][1], not the one-hot train_labels matrix.
net_normal.test(test_data=train_data, test_labels=data[0][1])
# held-out test split left disabled: net_normal.test(test_data, test_labels)

## Trying with zero initialization

In [None]:
# Seeded like the other experiments (an all-zero start draws no random numbers).
np.random.seed(6135)
net_zero = NN(
    hidden_dims=hidden_dims,
    n_hidden=n_hidden,
    mode=mode,
    datapath=datapath,
    model_path=model_path,
    batchsize=batchsize,
    lr=lr,
    delta=delta,
    activation=activation,
    initialization="zero",
)

In [None]:
# Ten rounds of one epoch each, printing validation accuracy after every round.
for i in range(10):
    net_zero.train(train_data, train_labels, n_epochs=1)
    net_zero.test(test_data=valid_data, test_labels=valid_labels)

In [None]:
# Accuracy on the training split; test() expects the integer labels in
# data[0][1], not the one-hot train_labels matrix.
net_zero.test(test_data=train_data, test_labels=data[0][1])
# held-out test split left disabled: net_zero.test(test_data, test_labels)