# Neural Network 

Implementing a neural network with a single hidden layer for handwritten-digit classification

###### Importing Libraries

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import minimize # for minimizing gradient
from scipy.io import loadmat # for loading .mat file
from sklearn.preprocessing import OneHotEncoder
%matplotlib inline

One-hot encoding turns a class label n (out of k classes) into a vector of length k in which the entry for class n is "hot" (1) while the rest are zero.<br>
E.g. the labels in this dataset run from 1 to 10, so label 6 (the 6th class) may be represented as [0,0,0,0,0,1,0,0,0,0].

###### Loading data

In [8]:
# Read the MATLAB-format file and pull out the feature matrix and the label column
data = loadmat('ex3data1.mat')
X, y = data['X'], data['y']
X.shape, y.shape

((5000, 400), (5000, 1))

###### One-hot encoding the y dataset 

In [20]:
# Build the encoder with its defaults and densify the result ourselves.
# The `sparse=False` constructor keyword is not accepted by every scikit-learn
# release, whereas calling `.toarray()` on the default sparse output is.
encoder = OneHotEncoder()

# One row per example, one column per distinct label found in y
Yonehot = encoder.fit_transform(y).toarray()
Yonehot.shape, y[3100], Yonehot[3100,:]

((5000, 10),
 array([6], dtype=uint8),
 array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]))

###### Defining the functions to be used 

In [35]:
def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)), applied element-wise.

    Squashes any real input into the open interval (0, 1); accepts a scalar,
    an ndarray or a matrix and returns a value of the matching kind.
    """
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

def forwardProp(X, theta1, theta2):
    """Forward-propagate every example through the input -> hidden -> output network.

    Parameters
    ----------
    X : 2-D array or matrix, one example per row (no bias column).
    theta1 : weights from the input layer to the hidden layer; its first
        column multiplies the bias unit.
    theta2 : weights from the hidden layer to the output layer; its first
        column multiplies the bias unit.

    Returns
    -------
    (a1, z2, a2, z3, h) : the input activations with bias, the hidden
        pre-activations, the hidden activations with bias, the output
        pre-activations and the final hypothesis sigmoid(z3).

    The products use np.dot rather than `*`, so the function gives the matrix
    product for plain ndarrays as well as for np.matrix inputs (`*` is only a
    matrix product for np.matrix and is element-wise for ndarrays).
    """
    # prepend a column of ones: the bias unit of the input layer
    a1 = np.insert(X, obj=0, values=1, axis=1)
    z2 = np.dot(a1, theta1.T)

    # hidden layer activations, again with a leading bias column
    a2 = np.insert(sigmoid(z2), obj=0, values=1, axis=1)
    z3 = np.dot(a2, theta2.T)

    # output layer: one sigmoid unit per label
    h = sigmoid(z3)

    return a1, z2, a2, z3, h


def cost(params, inputSize, hiddenlayerSize, numOfLabels, X, y, rate):
    """Regularised cross-entropy cost of the network for the given weights.

    Parameters
    ----------
    params : flat 1-D array holding all weights of theta1 followed by theta2.
    inputSize, hiddenlayerSize, numOfLabels : number of units in the input,
        hidden and output layers (bias units not counted).
    X : one example per row.
    y : one-hot encoded targets, one row per example.
    rate : regularisation strength; it scales the sum of squared non-bias
        weights added to the cost.

    Returns
    -------
    The scalar cost J.
    """
    m = X.shape[0]
    # float targets: negating an unsigned integer one-hot array would wrap around
    y = np.asarray(y, dtype=float)

    # Reshape the parameter array into the theta matrices of each layer.
    # If a network has s(j) units in layer j and s(j+1) in layer j+1,
    # then theta(j) has dimension s(j+1) x (s(j) + 1).
    split = hiddenlayerSize * (inputSize + 1)
    theta1 = np.reshape(params[:split], (hiddenlayerSize, inputSize + 1))
    theta2 = np.reshape(params[split:], (numOfLabels, hiddenlayerSize + 1))

    # forwardProp is handed matrices (as before); only the hypothesis is needed here
    h = np.asarray(forwardProp(np.matrix(X), np.matrix(theta1), np.matrix(theta2))[-1])

    # Cross-entropy over every example and every label in one vectorised
    # expression instead of a Python loop over the m rows
    J = np.sum(-y * np.log(h) - (1 - y) * np.log(1 - h)) / m

    # Regularisation: squared weights of both layers, bias columns excluded
    J += (float(rate) / (2 * m)) * (np.sum(np.square(theta1[:, 1:])) + np.sum(np.square(theta2[:, 1:])))
    return J

###### Trying out the Cost function 

In [36]:
inputSize = 400 # 400 values for each digit image
hiddenlayerSize = 25
numOfLabels = 10
rate = 1 # regularisation strength handed to cost / backProp

# Seeded generator: the initial weights, and therefore every result below,
# are the same on each Restart & Run All
rng = np.random.RandomState(0)

# one flat array holding every weight of the network, drawn uniformly from [-0.125, 0.125)
numParams = hiddenlayerSize * (inputSize + 1) + numOfLabels * (hiddenlayerSize + 1)
params = (rng.random_sample(size=numParams) - 0.5) * 0.25

m = X.shape[0]
X = np.matrix(X)
y = np.matrix(y)

# the first block of params is theta1, the remainder is theta2
theta1 = np.matrix(np.reshape(params[:hiddenlayerSize * (inputSize + 1)], (hiddenlayerSize, (inputSize + 1))))
theta2 = np.matrix(np.reshape(params[hiddenlayerSize * (inputSize + 1):],(numOfLabels, (hiddenlayerSize + 1))))

theta1.shape, theta2.shape



((25, 401), (10, 26))

Checking the shapes to confirm that every array has the expected dimensions, then calculating the cost using the random initial weights

In [37]:
# Forward pass with the random initial weights; display the shape of each returned array
a1, z2, a2, z3, h = forwardProp(X, theta1, theta2)
tuple(layer.shape for layer in (a1, z2, a2, z3, h))

((5000, 401), (5000, 25), (5000, 26), (5000, 10), (5000, 10))

In [38]:
# Cost at the random starting weights, using the one-hot labels as targets
cost(params, inputSize, hiddenlayerSize, numOfLabels, X=X, y=Yonehot, rate=rate)

7.095865505430175

###### Functions defining the sigmoid gradient and the backpropagation algorithm

In [39]:
def sigGradient(z):
    """Derivative of the sigmoid at z, element-wise: sigmoid(z) * (1 - sigmoid(z))."""
    activation = sigmoid(z)
    return np.multiply(activation, 1 - activation)


def backProp(params, inputSize, hiddenlayerSize, numOfLabels, X, y, rate):
    """Regularised cost and its gradient, computed by backpropagation.

    Parameters
    ----------
    params : flat 1-D array holding all weights of theta1 followed by theta2.
    inputSize, hiddenlayerSize, numOfLabels : number of units in the input,
        hidden and output layers (bias units not counted).
    X : one example per row.
    y : one-hot encoded targets, one row per example.
    rate : regularisation strength applied to the non-bias weights.

    Returns
    -------
    (J, grad) : the scalar cost and the gradient flattened in the same layout
        as `params` (theta1 entries first, then theta2), which is the form
        an optimiser called with jac=True consumes.
    """
    m = X.shape[0]
    # float targets: negating an unsigned integer one-hot array would wrap around
    y = np.asarray(y, dtype=float)

    # Reshape the parameter array into the theta matrices of each layer
    split = hiddenlayerSize * (inputSize + 1)
    theta1 = np.reshape(params[:split], (hiddenlayerSize, inputSize + 1))
    theta2 = np.reshape(params[split:], (numOfLabels, hiddenlayerSize + 1))

    # forwardProp is handed matrices (as before); the results are converted to
    # plain ndarrays so that `*` below is unambiguously element-wise
    a1, z2, a2, z3, h = forwardProp(np.matrix(X), np.matrix(theta1), np.matrix(theta2))
    a1, z2, a2, h = (np.asarray(v) for v in (a1, z2, a2, h))

    # Cost: cross-entropy over all examples and labels, plus regularisation
    # of the non-bias weights
    J = np.sum(-y * np.log(h) - (1 - y) * np.log(1 - h)) / m
    J += (float(rate) / (2 * m)) * (np.sum(np.square(theta1[:, 1:])) + np.sum(np.square(theta2[:, 1:])))

    # Backpropagation for all examples at once (one row per example)
    d3 = h - y  # output-layer error: predicted minus actual
    # hidden-layer error; the bias column of theta2 is dropped because the
    # bias unit has no incoming weights to update
    d2 = np.dot(d3, theta2[:, 1:]) * sigGradient(z2)

    # Summing the per-example outer products is a single matrix product
    delta1 = np.dot(d2.T, a1) / m  # same shape as theta1
    delta2 = np.dot(d3.T, a2) / m  # same shape as theta2

    # Regularise the gradient, leaving the bias column untouched
    delta1[:, 1:] += theta1[:, 1:] * rate / m
    delta2[:, 1:] += theta2[:, 1:] * rate / m

    # flatten both matrices into one vector for the minimize function
    grad = np.concatenate((np.ravel(delta1), np.ravel(delta2)))

    return J, grad

# IT'S OVER..IT'S FINALLY OVER

###### Calling the backprop algo

In [40]:
# Evaluate the cost and the flattened gradient once at the initial parameters
J, grad = backProp(params, inputSize, hiddenlayerSize, numOfLabels, X=X, y=Yonehot, rate=rate)
J, grad.shape

(7.095865505430175, (10285,))

###### Now, training our model using minimize function from scipy library 

In [41]:
# Extra arguments that minimize passes to backProp after the parameter vector
trainArgs = (inputSize, hiddenlayerSize, numOfLabels, X, Yonehot, rate)

# jac=True: backProp returns the cost and its gradient together.
# NOTE(review): the recorded result shows 'maxiter' acting as a cap on function
# evaluations (nfev: 250); other SciPy releases may name this TNC option
# 'maxfun' -- confirm against the installed version.
fmin = minimize(backProp, params, args=trainArgs, method='TNC', jac=True,
                options={'maxiter':250})
fmin

     fun: 0.3392923647518514
     jac: array([-1.29326227e-03,  7.94553313e-07,  1.00721862e-06, ...,
        2.64325504e-04,  4.95264211e-05,  6.26773995e-05])
 message: 'Max. number of function evaluations reached'
    nfev: 250
     nit: 24
  status: 3
 success: False
       x: array([ 0.32947015,  0.00397277,  0.00503609, ...,  0.98715763,
       -2.01915746, -1.8178498 ])

###### Predicting the outcome using the optimized value of parameters got from last step 

In [42]:
X = np.matrix(X)

# Cut the optimised flat vector back into the weight matrices of the two layers
split = hiddenlayerSize * (inputSize + 1)
flatTheta1, flatTheta2 = fmin.x[:split], fmin.x[split:]
theta1 = np.matrix(flatTheta1.reshape(hiddenlayerSize, inputSize + 1))
theta2 = np.matrix(flatTheta2.reshape(numOfLabels, hiddenlayerSize + 1))

a1, z2, a2, z3, h = forwardProp(X, theta1, theta2)
# argmax gives the 0-based column with the largest output; adding 1 turns it into a label
Ypredict = np.array(np.argmax(h, axis=1) + 1)
Ypredict

array([[10],
       [10],
       [10],
       ...,
       [ 9],
       [ 9],
       [ 9]])

###### Calculating the accuracy 

In [44]:
# 1 where the predicted label equals the true label, 0 otherwise.
# Note: the predictions were made on the same examples used for training.
correct = [1 if predicted == actual else 0 for predicted, actual in zip(Ypredict, y)]

# fraction of matching predictions
accuracy = sum(correct) / float(len(correct))
print('accuracy = {0}%'.format(accuracy * 100))

accuracy = 99.11999999999999%
