In [31]:
import math
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import mnist

In [32]:
(trainX, trainY), (testX, testY) = mnist.load_data()
trainX = trainX.reshape(-1, 28 * 28).astype("float32") / 255.0
testX = testX.reshape(-1, 28 * 28).astype("float32") / 255.0

In [33]:
"""
Here we implement a fully very simple fully connected neural network right from scratch

There are two hidden layers each with 16 neurons
One input layer and one output layer
Note that we used this neural network only as a small example
Hence necessarily our accuracy will be less. In order to show that we don't really
perform worse than a pre-defined library like tensor flow we implemented the same net in keras and
tested the neural net and as it turns out at the end, our accuracy of the algorithm and neural network we
wrote from scratch gives better accuracy than given by tensorflow on the same network

The function followed is the sigmoid function but can be changed easily in func definition
Biases and weights are maintained in vectors and matrices respectively
"""


def func(x):
    return 1/(1+math.exp(-x))

def derivative(x):
    delta = 0.000000000001
    return ((func(x+delta) - func(x-delta))/(2*delta))

In [34]:
#biases
b_l1 = np.zeros(16) # bias of 1st hidden layer
b_l2 = np.zeros(16) # bias of 2nd hidden layer
b_o = np.zeros(10) # bias of output layer

#weights
w_i_l1 = np.zeros((784,16)) # weights of edges between input layer and 1st hidden layer
w_l1_l2 = np.zeros((16,16)) # weights of edges between 1st hidden layer and 2nd hidden layer
w_l2_o = np.zeros((16,10)) # weights of edges between 1st hidden layer and output layer

#inputs to each layer
i_l1 = np.zeros(16) # biased input to 1st hidden layer
i_l2 = np.zeros(16) # biased input to 2nd hidden layer
i_o = np.zeros(10) # biased input to output layer

#outputs to each layer
o_l1 = np.zeros(16) # output of 1st hidden layer after biases applied
o_l2 = np.zeros(16) # output of 2nd hidden layer after biases applied
o_o = np.zeros(10) # output of output layer after biases applied

#error function
er = np.zeros(10)

In [35]:
def eval_i_l1(idx,data):
    for i in range(16):
        i_l1[i] = b_l1[i]
        for j in range(784):
            i_l1[i] += w_i_l1[j][i]*data[idx][j]

def eval_o_l1(idx,data):
    eval_i_l1(idx,data)
    for i in range(16):
        o_l1[i] = func(i_l1[i])

def eval_i_l2(idx,data):
    eval_o_l1(idx,data)
    for i in range(16):
        i_l2[i] = b_l2[i]
        for j in range(16):
            i_l2[i] += w_l1_l2[j][i]*o_l1[j]

def eval_o_l2(idx,data):
    eval_i_l2(idx,data)
    for i in range(16):
        o_l2[i] = func(i_l2[i])

def eval_i_o(idx,data):
    eval_o_l2(idx,data)
    for i in range(10):
        i_o[i] = b_o[i]
        for j in range(16):
            i_o[i] += w_l2_o[j][i]*o_l2[j]


def eval_o_o(idx,data):
    eval_i_o(idx,data)
    for i in range(10):
        o_o[i] = func(i_o[i])

def eval_model(idx,data):
    eval_o_o(idx,data)

In [36]:
# applying backpropagation algorithm

deriv_bo = np.zeros(10)
deriv_w_2o = np.zeros((16,10))
deriv_b2 = np.zeros(16)
deriv_w_12 = np.zeros((16,16))
deriv_b1 = np.zeros(16)
deriv_w_i1 = np.zeros((784,16))


# apply dynamic programming to evaluate derivative
# each successive derivative arrays depends on the previous thereby
# minimizing calculation by calculating reduntant terms only once

def eval_bo(idx):
    for i in range(10):
        deriv_bo[i] = derivative(i_o[i])*(o_o[i])
    #print("$ "+str(derivative(i_o[i])) + " # " + str(i_o[i]))
    deriv_bo[trainY[idx]] -= derivative(i_o[trainY[idx]])

def eval_w_2o():
    for j in range(16):
        for i in range(10):
            deriv_w_2o[j][i] = o_l2[j]*deriv_bo[i]

def eval_b2():
    for i in range(16):
        deriv_b2[i] = 0
        for j in range(10):
            deriv_b2[i] += derivative(i_l2[i])*w_l2_o[i][j]*deriv_bo[j]

def eval_w_12():
    for j in range(16):
        for i in range(16):
            deriv_w_12[j][i] = o_l1[j]*deriv_b2[i]

def eval_b1():
    for i in range(16):
        deriv_b1[i] = 0
        for k in range(16):
            deriv_b1[i] += deriv_b2[k]*w_l1_l2[i][k]*derivative(i_l1[i])

def eval_w_i1(idx):
    for j in range(784):
        for i in range(16):
            deriv_w_i1[j][i] = deriv_b1[i]*trainX[idx][j]


def eval_grads(idx):
    eval_bo(idx)
    eval_w_2o()
    eval_b2()
    eval_w_12()
    eval_b1()
    eval_w_i1(idx)

def mini_batch_vanilla_descent(lr,batch_size): #lr = learning rate
    indices = np.random.randint(0,trainX.shape[0],batch_size)
    
    # average gradient vectors or matrixes
    avgd_bo = np.zeros(10)
    avgd_w_2o = np.zeros((16,10))
    avgd_b2 = np.zeros(16)
    avgd_w_12 = np.zeros((16,16))
    avgd_b1 = np.zeros(16)
    avgd_w_i1 = np.zeros((784,16))
    
    for idx in indices:
        eval_model(idx,trainX)
        eval_grads(idx)
        
        # compute average gradient from batch
        for i in range(10):
            avgd_bo[i] += deriv_bo[i]/batch_size
        
        for i in range(16):
            for j in range(10):
                avgd_w_2o[i][j] += deriv_w_2o[i][j]/batch_size
                
        for i in range(16):
            avgd_b2[i] += deriv_b2[i]/batch_size

        for i in range(16):
            for j in range(16):
                avgd_w_12[i][j] += deriv_w_12[i][j]/batch_size
        
        for i in range(16):
            avgd_b1[i] += deriv_b1[i]/batch_size
        
        for i in range(784):
            for j in range(16):
                avgd_w_i1[i][j] += deriv_w_i1[i][j]/batch_size
        
        # now apply gradient descent
        for i in range(10):
            b_o[i] -= avgd_bo[i]*lr
        
        for i in range(16):
            for j in range(10):
                w_l2_o[i][j] -= avgd_w_2o[i][j]*lr
                
        for i in range(16):
            b_l2[i] -= avgd_b2[i]*lr

        for i in range(16):
            for j in range(16):
                w_l1_l2[i][j] -= avgd_w_12[i][j]*lr
        
        for i in range(16):
            b_l1[i] -= avgd_b1[i]*lr
        
        for i in range(784):
            for j in range(16):
                w_i_l1[i][j] -= avgd_w_i1[i][j]*lr
        
    #print(deriv_bo[1:6])

In [37]:
def mini_batch_vanilla_gradient_descent_iterator(lr,batch_size,iterations):
    for i in range(iterations):
        mini_batch_vanilla_descent(lr,batch_size)
        if((i)%(iterations//50)==0): 
            print(str(i+1)+" iterations done!")

In [38]:
# applying mini-batch gradient descent using backpropagation algorithm
iterations = 1000
mini_batch_vanilla_gradient_descent_iterator(10,10,iterations)
print(str(iterations) +  " iterations done! finally")

1 iterations done!
21 iterations done!
41 iterations done!
61 iterations done!
81 iterations done!
101 iterations done!
121 iterations done!
141 iterations done!
161 iterations done!
181 iterations done!
201 iterations done!
221 iterations done!
241 iterations done!
261 iterations done!
281 iterations done!
301 iterations done!
321 iterations done!
341 iterations done!
361 iterations done!
381 iterations done!
401 iterations done!
421 iterations done!
441 iterations done!
461 iterations done!
481 iterations done!
501 iterations done!
521 iterations done!
541 iterations done!
561 iterations done!
581 iterations done!
601 iterations done!
621 iterations done!
641 iterations done!
661 iterations done!
681 iterations done!
701 iterations done!
721 iterations done!
741 iterations done!
761 iterations done!
781 iterations done!
801 iterations done!
821 iterations done!
841 iterations done!
861 iterations done!
881 iterations done!
901 iterations done!
921 iterations done!
941 iterations done

In [39]:
def eval_acc():
    cnt = 0
    indices = np.random.randint(0,testX.shape[0],1000) #choosing 1000 test samples randomly for testing
    #print(testX.shape[0])
    for idx in indices:
        #print("testing for "+str(idx+1)+"th sample")
        eval_model(idx,testX)
        probablility = 0
        prediction = -1
        for i in range(10):
            #print(o_o[i])
            if(probablility < o_o[i]):
                probablility = o_o[i]
                prediction = i
        #print("------")
        #print(testY[idx])
        #print(prediction)
        if(testY[idx]==prediction):
            cnt += 1
    return cnt/1000

In [40]:
acc1 = eval_acc()
f = open("output_2.txt","w")
f.write("Accuracy of MyNet from code written from scratch : "+str(acc1)+'\n')
print("Accuracy of My_Neural_Net from code written from scratch : "+str(acc1)+'\n')

Accuracy of My_Neural_Net from code written from scratch : 0.197



In [41]:
MyNet = tf.keras.Sequential()
MyNet.add(keras.layers.InputLayer(784))
MyNet.add(layers.Dense(16, activation='sigmoid', name='LAYER_1'))
MyNet.add(layers.Dense(16, activation='sigmoid', name='LAYER_2'))
MyNet.add(layers.Dense(10, activation='sigmoid', name='OUTPUT'))

MyNet.summary()

MyNet.compile(
    loss=keras.losses.MeanSquaredError(),
    metrics=["accuracy"],
)

MyNet.fit(trainX, trainY, batch_size=10)
acc = MyNet.evaluate(testX, testY)[1]
f.write("Accuracy of MyNet using standard library : "+str(acc)+'\n')
print("Accuracy of My_Neural_Net using standard library : "+str(acc)+'\n')

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 LAYER_1 (Dense)             (None, 16)                12560     
                                                                 
 LAYER_2 (Dense)             (None, 16)                272       
                                                                 
 OUTPUT (Dense)              (None, 10)                170       
                                                                 
Total params: 13,002
Trainable params: 13,002
Non-trainable params: 0
_________________________________________________________________
Accuracy of My_Neural_Net using standard library : 0.09799999743700027

