## Network implementation

In [326]:
from datetime import datetime
import gzip, pickle
import numpy as np

# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# 'mnist.pkl.gz' when it comes from a trusted source.
# The context manager closes the gzip handle as soon as the data is in memory.
with gzip.open('mnist.pkl.gz') as f:
    data = pickle.load(f, encoding='latin1')

# data is indexed as three splits; each split unpacks into (inputs, labels).
train_data = data[0][0]
valid_data, valid_labels = data[1]
test_data, test_labels = data[2]

# One-hot encode the training labels only (10 columns, one per class);
# validation and test labels are kept exactly as loaded.
train_labels = np.zeros((train_data.shape[0], 10))
train_labels[np.arange(train_labels.shape[0]), data[0][1]] = 1

In [338]:
class NN(object):
    """Fully connected classifier with ReLU hidden layers and a softmax output.

    The input width is fixed at 784 and the output width at 10; the hidden
    layer widths come from ``hidden_dims``. ``forward``/``backward`` operate on
    one sample at a time (a 1-D input vector) and ``train`` updates the
    parameters after every sample.
    """

    def __init__(self, hidden_dims=(1024,2048), n_hidden=2, mode='train', datapath=None, model_path=None,
                batchsize=64, lr=0.001, epsilon=1e-8, activation="ReLU", initialization="normal"):
        """Build the layer sizes and allocate/initialise weights and biases.

        Args:
            hidden_dims: sequence with the width of each hidden layer.
            n_hidden: number of hidden layers; must equal ``len(hidden_dims)``.
            mode, datapath, model_path: stored on the instance; none of the
                methods of this class read them.
            batchsize: only used as the divisor in ``loss``.
            lr: step size used by ``update``.
            epsilon: constant added inside the log in ``loss`` to avoid log(0).
            activation: hidden-layer activation; only "ReLU" is accepted.
            initialization: "zero", "normal" or "glorot" (see
                ``initialize_weights``).

        Raises:
            ValueError: on an unknown activation or initialization, or when
                ``n_hidden`` does not match ``len(hidden_dims)``.
        """
        hidden_dims = tuple(hidden_dims)  # also accept lists
        if n_hidden != len(hidden_dims):
            # A mismatch would otherwise surface later as a shape/index error
            # deep inside forward() or initialize_weights().
            raise ValueError('n_hidden (' + str(n_hidden) + ') does not match the number of '
                             'hidden_dims (' + str(len(hidden_dims)) + ')')

        self.dims = (784,) + hidden_dims + (10,)
        self.n_hidden = n_hidden
        self.mode = mode
        self.datapath = datapath
        self.model_path = model_path
        self.batchsize = batchsize
        self.lr = lr
        self.epsilon = epsilon

        # set activation function for hidden layers
        if activation == "ReLU":
            self.activation = self.ReLU
            self.activation_prime = self.ReLU_prime
        else:
            raise ValueError('Invalid activation function specified: ' + str(activation))

        # network parameters: W[i] has shape (dims[i+1], dims[i]), b[i] has shape (dims[i+1],)
        self.W = []
        self.b = [np.zeros(self.dims[i]) for i in range(1, len(self.dims))]

        # weight initialization
        self.initialize_weights(n_hidden, self.dims, initialization)

        # keep count of completed training epochs
        self.epoch_cnt = 0


    def initialize_weights(self, n_hidden, dims, option):
        """Append ``n_hidden + 1`` weight matrices to ``self.W``.

        Matrix ``i`` has shape ``(dims[i+1], dims[i])``. ``option`` selects how
        it is filled: "zero" (all zeros), "normal" (samples from N(0, 1)) or
        "glorot" (uniform in ``[-d, d]`` with ``d = sqrt(6 / (dims[i] + dims[i+1]))``).
        The matrices are appended, so ``self.W`` is expected to be empty on entry.

        Raises:
            ValueError: if ``option`` is not one of the three names above.
        """
        if option == "zero":
            for i in range(n_hidden + 1):
                self.W.append( np.zeros((dims[i+1], dims[i])) )

        elif (option == "normal"):
            for i in range(n_hidden + 1):
                self.W.append( np.random.normal(0, 1, (dims[i+1], dims[i])) )

        elif (option == "glorot"):
            for i in range(n_hidden + 1):
                di = (6/(dims[i]+dims[i+1]))**.5
                self.W.append( np.random.uniform(-di, di, (dims[i+1], dims[i])) )

        else:
            raise ValueError('Invalid weight initialization specified: ' + str(option))


    def forward(self, input):
        """Run one sample through the network.

        Args:
            input: 1-D input vector of length ``dims[0]``.

        Returns:
            ``(out, cache)`` where ``out`` is the softmax output and ``cache``
            is ``[input, z_1, ..., z_last]`` with ``z_k = W x + b`` the
            pre-activation of layer ``k`` (kept for ``backward``).
        """
        cache = [input] # cache[0] is the raw input, then z = Wx + b of every layer

        out = input
        for i in range(self.n_hidden):
            out = self.W[i] @ out + self.b[i] # compute z = Wx + b
            cache.append(out) # store z for backprop
            out = self.activation(out) # compute a = activation(z)

        # output layer: affine map followed by softmax instead of the hidden activation
        out = self.W[-1] @ out + self.b[-1]
        cache.append(out)
        out = self.softmax(out)

        return out, cache


    def ReLU(self, input):
        """Element-wise max(0, x)."""
        return np.maximum(0, input)

    def ReLU_prime(self, input):
        """Element-wise derivative of ReLU: 1 where input > 0, else 0."""
        return np.where(input > 0, 1, 0)

    def loss(self, prediction, labels):
        """Cross-entropy ``-sum(labels * log(prediction + epsilon))`` divided by ``self.batchsize``."""
        return -np.sum(labels * np.log(prediction + self.epsilon)) / self.batchsize

    def softmax(self, input):
        """Softmax over the last axis.

        For a 1-D vector this is the usual softmax; for higher-rank input each
        slice along the last axis is normalised independently.
        """
        # subtracting the maximum leaves the result unchanged but avoids overflow in exp
        rescaled = input - np.amax(input, axis=-1, keepdims=True)
        input_exp = np.exp(rescaled)
        return input_exp / np.sum(input_exp, axis=-1, keepdims=True)

    def backward(self, cache, labels):
        """Backpropagate one sample.

        Args:
            cache: list returned by ``forward``: ``[input, z_1, ..., z_last]``.
            labels: target vector with the same length as the network output
                (one-hot in the training code of this class).

        Returns:
            ``(grads_W, grads_b)``: two lists with one entry per layer, shaped
            like the corresponding ``self.W[i]`` / ``self.b[i]``.
        """
        n_layers = self.n_hidden + 1

        # deltas[i] is the gradient with respect to the pre-activation cache[i+1]
        deltas = [0] * n_layers

        # softmax combined with cross-entropy gives (probabilities - labels) at the output
        deltas[-1] = self.softmax(cache[-1]) - labels
        for i in range(self.n_hidden-1, -1, -1):
            deltas[i] = deltas[i+1] @ self.W[i+1] * self.activation_prime(cache[i+1])

        grads_W = [0] * n_layers
        grads_b = [0] * n_layers

        for i in range(self.n_hidden, -1, -1):
            # The first layer is fed the raw input (cache[0]); only deeper
            # layers receive an activated value. Applying the activation to
            # cache[0] would zero the gradient for negative input entries.
            layer_input = cache[0] if i == 0 else self.activation(cache[i])
            grads_W[i] = np.outer(deltas[i], layer_input)
            grads_b[i] = deltas[i]

        return (grads_W, grads_b)


    def update(self, grads):
        """Apply one gradient-descent step in place.

        Args:
            grads: ``(grads_W, grads_b)`` as returned by ``backward``.
        """
        for i in range(self.n_hidden + 1):
            self.W[i] -= self.lr * grads[0][i]
            self.b[i] -= self.lr * grads[1][i]


    def train(self, train_data, train_labels, n_epochs):
        """Train for ``n_epochs`` passes, updating the parameters after every sample.

        Prints a timestamp and the summed per-sample loss for each epoch and
        increments ``self.epoch_cnt`` once per completed epoch.

        Args:
            train_data: 2-D array, one sample per row.
            train_labels: 2-D array with one target row per sample.
            n_epochs: number of passes over ``train_data``.
        """
        for epoch in range(n_epochs):

            print(datetime.now(), "-", "Epoch", self.epoch_cnt+1, end=": ")

            loss = 0
            for i in range(train_data.shape[0]):
                predictions, cache = self.forward(train_data[i])
                loss += self.loss(predictions, train_labels[i])

                grads = self.backward(cache, train_labels[i])
                self.update(grads)

            print("loss =", loss)
            self.epoch_cnt += 1


    def test(self, test_data, test_labels):
        """Print the percentage of samples whose arg-max prediction equals the label.

        Args:
            test_data: 2-D array, one sample per row.
            test_labels: one entry per sample, compared directly with the
                predicted class index (so class indices, not one-hot rows).
        """
        successes = 0

        for i in range(test_data.shape[0]):
            predictions, _ = self.forward(test_data[i])
            if np.argmax(predictions) == test_labels[i]:
                successes += 1

        print(str(100 * successes / test_data.shape[0]) + "% success")

## Trying with Glorot initialization

In [339]:
# Seed NumPy's global generator so the random weight draw below, and therefore
# the whole training run, is reproducible after a kernel restart.
np.random.seed(0)

# Two hidden layers of 1024 and 2048 units.
hidden_dims = (1024, 2048)
n_hidden = 2
mode = 'train'
datapath = None
model_path = None

# batchsize=1 because the loss is divided by batchsize while training is per sample.
net = NN(hidden_dims, n_hidden, mode, datapath, model_path, batchsize=1, initialization="glorot")

In [340]:
# Short run: 5 passes over the first 1000 training samples only.
net.train(train_data[:1000], train_labels[:1000], n_epochs=5)

2019-02-07 22:53:39.107752 - Epoch 1: loss = 1915.7651581240034
2019-02-07 22:53:55.054655 - Epoch 2: loss = 1171.0766697432662
2019-02-07 22:54:12.910620 - Epoch 3: loss = 770.7325424963356
2019-02-07 22:54:28.707283 - Epoch 4: loss = 585.710522215171
2019-02-07 22:54:42.326545 - Epoch 5: loss = 482.77374841147065


In [341]:
# Report accuracy on the first 1000 samples of the test split.
net.test(test_data=test_data[:1000], test_labels=test_labels[:1000])

80.2% success


## Trying with normal initialization

In [342]:
# Seed NumPy's global generator so the random weight draw below, and therefore
# the whole training run, is reproducible after a kernel restart.
np.random.seed(0)

# Same architecture as before: two hidden layers of 1024 and 2048 units.
hidden_dims = (1024, 2048)
n_hidden = 2
mode = 'train'
datapath = None
model_path = None

# Rebinds `net`: from here on it is the network whose weights are drawn from N(0, 1).
net = NN(hidden_dims, n_hidden, mode, datapath, model_path, batchsize=1, initialization="normal")

In [343]:
# Short run: 5 passes over the first 1000 training samples only.
net.train(train_data[:1000], train_labels[:1000], n_epochs=5)

2019-02-07 22:54:58.866517 - Epoch 1: loss = 5839.356130496439
2019-02-07 22:55:11.313612 - Epoch 2: loss = 2118.3681307209254
2019-02-07 22:55:24.182870 - Epoch 3: loss = 828.9305198418704
2019-02-07 22:55:37.891680 - Epoch 4: loss = 725.0176876910268
2019-02-07 22:55:50.996748 - Epoch 5: loss = 782.8087994230723


In [344]:
# Report accuracy on the first 1000 samples of the test split.
net.test(test_data=test_data[:1000], test_labels=test_labels[:1000])

83.1% success
