In [None]:
import numpy as np
import random
import math

class NN(object):
    """Feed-forward ReLU classifier with a softmax output, trained by mini-batch SGD.

    Every entry of ``self.network`` is a ``(fan_in + 1, fan_out)`` matrix whose
    last row holds the biases; ``activation`` appends a column of ones to its
    input so weights and biases are applied by one matrix product.
    """

    def __init__(self, hidden_dims=(1024, 2048), n_hidden=2, mode='train',
                 datapath=None, model_path=None):
        """Configure the network, then load data, initialise weights and train.

        Args:
            hidden_dims: sizes of the two hidden layers (first two entries used).
            n_hidden: stored on the instance; the layer count is fixed at two.
            mode: stored on the instance; not otherwise used by this class.
            datapath: file passed to ``np.load`` in ``dataprep``; ``None``
                falls back to ``'mnist.pkl.npy'``.
            model_path: stored on the instance; not otherwise used by this class.
        """
        self.init_method = ''

        self.n_hidden = n_hidden
        self.mode = mode
        self.datapath = datapath
        self.model_path = model_path

        # Layer widths exclude the bias; the bias row is added when the
        # weight matrices are allocated in initialize_weights().
        self.h0 = 784
        self.h1 = hidden_dims[0]
        self.h2 = hidden_dims[1]
        self.h3 = 10

        self.layer_sizes = [self.h0, self.h1, self.h2, self.h3]
        self.network = []  # list of weight matrices, input layer first
        self.num_hlayers = 2
        self.num_classes = 10

        self.epochs = 10
        # NOTE: no ``self.loss`` attribute is set here -- an instance attribute
        # of that name would shadow the ``loss`` method and make it uncallable.
        self.lr = 0.001
        self.batch_size = 20

        self.dataprep()
        self.initialize_weights()
        self.train()

    def dataprep(self):
        """Load the train/validation/test splits and shuffle the training split.

        The loaded object is indexed as ``data[split][0]`` for inputs and
        ``data[split][1]`` for labels, with splits 0, 1, 2 being train,
        validation and test.
        """
        path = getattr(self, 'datapath', None) or 'mnist.pkl.npy'
        # ``encoding`` only matters for pickled payloads, so the file is
        # presumably an object array; NumPy >= 1.16.3 refuses to load those
        # unless allow_pickle=True.
        # SECURITY: unpickling can execute arbitrary code -- only load trusted files.
        data = np.load(path, encoding='latin1', allow_pickle=True)

        train_set = data[0][0]
        val_set = data[1][0]
        test_set = data[2][0]

        # train() and test() read these, so they must be stored on the instance.
        self.val_set = val_set
        self.test_set = test_set
        self.val_labels = np.asarray(data[1][1])
        self.test_labels = np.asarray(data[2][1])

        self.train_size = train_set.shape[0]
        self.val_size = val_set.shape[0]
        self.test_size = test_set.shape[0]

        p = np.random.permutation(self.train_size)
        self.train_set = train_set[p]
        self.train_labels = np.asarray(data[0][1])[p]

        print(train_set.shape)
        print(val_set.shape)
        print(test_set.shape)

    def train(self):
        """Run ``self.epochs`` epochs of mini-batch SGD over the training split.

        The training data is reshuffled every epoch. After each epoch the mean
        validation loss is stored in ``self.val_loss_history``.

        Returns:
            ndarray of length ``self.epochs`` holding the mean per-sample
            training loss of each epoch.
        """
        m = self.train_size
        batch_size = self.batch_size

        loss_history = np.zeros(self.epochs)
        self.val_loss_history = np.zeros(self.epochs)

        for it in range(self.epochs):
            indices = np.random.permutation(m)
            train_set = self.train_set[indices]
            train_labels = self.train_labels[indices]

            epoch_loss = 0.0
            for i in range(0, m, batch_size):
                X_i = train_set[i:i + batch_size]
                y_i = train_labels[i:i + batch_size]
                outputs = self.forward(X_i)
                # backward() returns a batch mean; weight it by the batch size
                # so a short final batch does not skew the epoch average.
                epoch_loss += self.backward(outputs, y_i) * X_i.shape[0]

            if m > 0:
                loss_history[it] = epoch_loss / float(m)
            val_loss, _ = self._evaluate(self.val_set, self.val_labels)
            self.val_loss_history[it] = val_loss

        return loss_history

    def _evaluate(self, inputs, labels):
        """Return ``(mean loss, accuracy)`` over ``inputs`` without updating weights."""
        n = inputs.shape[0]
        if n == 0:
            return 0.0, 0.0

        labels = np.asarray(labels)
        total_loss = 0.0
        correct = 0
        for i in range(0, n, self.batch_size):
            X_i = inputs[i:i + self.batch_size]
            y_i = labels[i:i + self.batch_size]
            probs = self.forward(X_i)[-1]
            total_loss += self.loss(probs, y_i) * X_i.shape[0]
            correct += int(np.sum(np.argmax(probs, axis=1) == y_i))
        return total_loss / float(n), correct / float(n)

    def initialize_weights(self):
        """(Re)create ``self.network`` according to ``self.init_method``.

        ``'zeros'`` gives all-zero weights, ``'glorot'`` draws from
        ``U(-d, d)`` with ``d = sqrt(6 / (fan_in + fan_out))``, and any other
        value draws from a standard normal. Biases (last row) start at zero.
        """
        # Start from an empty list so a second call does not stack extra layers.
        self.network = []
        for fan_in, fan_out in zip(self.layer_sizes[:-1], self.layer_sizes[1:]):
            shape = (fan_in + 1, fan_out)  # +1 row for the bias
            if self.init_method == 'zeros':
                layer_weights = np.zeros(shape)
            elif self.init_method == 'glorot':
                d = math.sqrt(6.0 / (fan_in + fan_out))
                layer_weights = np.random.uniform(-d, d, shape)
            else:
                layer_weights = np.random.normal(0, 1, shape)
            layer_weights[-1, :] = 0  # biases
            self.network.append(layer_weights)

        print('shapes of layers')
        for net_wt in self.network:
            print(net_wt.shape)

    def activation(self, inputs, layer_no):
        """Return the affine pre-activation of layer ``layer_no`` for ``inputs``.

        A column of ones is appended to ``inputs`` so that the last row of the
        weight matrix acts as the bias.
        """
        inputs = np.hstack((inputs, np.ones((inputs.shape[0], 1))))
        return np.dot(inputs, self.network[layer_no])

    def softmax(self, inputs):
        """Return the row-wise softmax of ``inputs``."""
        # Softmax is invariant to adding a constant per row; subtracting the
        # row maximum makes the largest exponent zero and avoids overflow.
        ex = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        return ex / ex.sum(axis=1, keepdims=True)

    def forward(self, inputs):
        """Run a forward pass and return the cache consumed by ``backward``.

        Returns:
            list ``[x0, a1, ..., a_k, probs]``: the network input, the
            post-ReLU output of every hidden layer, and the softmax output.
        """
        outputs = [inputs]
        for layer_no in range(len(self.network) - 1):
            inputs = self.relu(self.activation(inputs, layer_no))
            outputs.append(inputs)

        inputs = self.activation(inputs, len(self.network) - 1)
        outputs.append(self.softmax(inputs))
        return outputs

    @staticmethod
    def relu(inputs):
        """Return ``max(inputs, 0)`` element-wise as a new array."""
        return np.maximum(inputs, 0)

    @staticmethod
    def _one_hot(labels, num_classes):
        """Return ``labels`` one-hot encoded; 2-D input is returned unchanged."""
        labels = np.asarray(labels)
        if labels.ndim > 1:
            return labels
        encoded = np.zeros((labels.shape[0], num_classes))
        encoded[np.arange(labels.shape[0]), labels.astype(int)] = 1
        return encoded

    def backward(self, cache, labels):
        """Back-propagate one batch, apply the SGD update and return the loss.

        Args:
            cache: list returned by ``forward`` for this batch.
            labels: integer class labels (1-D) or one-hot targets (2-D).

        Returns:
            Mean cross-entropy of the batch, computed before the update.
        """
        predictions = cache[-1]
        # Use the actual batch size: the final batch of an epoch can be short.
        n = predictions.shape[0]
        one_hot = self._one_hot(labels, predictions.shape[1])
        batch_loss = self.loss(predictions, one_hot)

        # Gradient of the mean cross-entropy w.r.t. the softmax logits.
        grad = (predictions - one_hot) / float(n)

        gradients = []  # ordered last layer first, as update() expects
        for layer_no in range(len(self.network) - 1, -1, -1):
            layer_input = cache[layer_no]
            temp = np.hstack((layer_input, np.ones((n, 1))))
            gradients.append(np.dot(np.transpose(temp), grad))
            if layer_no > 0:
                # Propagate through the weights (bias row excluded), then
                # through the ReLU that produced layer_input. The post-ReLU
                # value is positive exactly where the pre-activation was.
                grad = np.dot(grad, np.transpose(self.network[layer_no][:-1]))
                grad = grad * self.reluDerivative(layer_input)

        self.update(gradients)
        return batch_loss

    @staticmethod
    def reluDerivative(x):
        """Return 1 where ``x > 0`` and 0 elsewhere, without modifying ``x``."""
        x = np.asarray(x)
        return (x > 0).astype(x.dtype)

    def loss(self, prediction, labels):
        """Return the mean cross-entropy between ``prediction`` and ``labels``.

        Args:
            prediction: array of class probabilities, one row per sample.
            labels: integer class labels (1-D) or one-hot targets (2-D).
        """
        prediction = np.asarray(prediction, dtype=float)
        targets = self._one_hot(labels, prediction.shape[1])
        # Clip so that log() never sees an exact zero.
        epsilon = 1e-12
        clipped = np.clip(prediction, epsilon, 1. - epsilon)
        return -np.sum(targets * np.log(clipped)) / float(targets.shape[0])

    def update(self, grads):
        """Apply one gradient-descent step with learning rate ``self.lr``.

        Args:
            grads: one gradient per layer, ordered last layer first.

        Raises:
            ValueError: if the number of gradients differs from the layer count.
        """
        length_net = len(self.network)
        if len(grads) != length_net:
            raise ValueError('expected %d gradients, got %d'
                             % (length_net, len(grads)))

        for layer_no in range(length_net):
            # grads is reversed relative to self.network.
            self.network[layer_no] -= self.lr * grads[length_net - 1 - layer_no]

    def test(self):
        """Evaluate the network once on the test split.

        The accuracy is stored in ``self.test_accuracy``.

        Returns:
            ndarray of length ``self.epochs`` (the shape this method has always
            declared) with every entry set to the mean test loss.
        """
        mean_loss, accuracy = self._evaluate(self.test_set, self.test_labels)
        self.test_accuracy = accuracy

        loss_history = np.zeros(self.epochs)
        loss_history[:] = mean_loss
        return loss_history
    
# Constructing NN loads the data, initialises the weights and runs training
# from inside __init__, so this single statement executes the whole pipeline.
nn = NN()