In [26]:
import numpy as np
from scipy import optimize as sp
import pandas as pd
import math

class Neural_Network():
    
    def __init__(self):
        self.num = 0
    
    def sigmoid(self,z):
        return 1.0/(1.0 + np.exp(-z))

    def sigmoidGradient(self,z):
        return self.sigmoid(z)*(1-self.sigmoid(z))
    
    def regularization(self, lamda, m):
        lamda_val = lamda/(2.0*m)
        theta1_sum = 0 
        theta2_sum = 0
        for j in range(len(self.Theta1)-1):
            for k in range(self.Theta1[0].size-1):
                theta1_sum += self.Theta1[j+1][k+1]*self.Theta1[j+1][k+1]
        for j in range(len(self.Theta2)-1):
            for k in range(self.Theta2[0].size-1):
                theta2_sum += self.Theta2[j+1][k+1]*self.Theta2[j+1][k+1]
        return lamda_val*(theta1_sum+theta2_sum)
    
    def calc_cost(self, y_vals, hyp, lamda, m): #hyp and y are both 10x1 vectors 
        cost = 0
        for k in range(y_vals.size):
            cost += -y_vals[k] * math.log(hyp[k]) - (1-y_vals[k])*math.log(1-hyp[k])
        return cost
    
    def predict(self, weights, x_vals):
            x_vals = np.hstack(([1],x_vals))
            weights1 = weights[0]
            weights2 = weights[1]
            z2 = np.matmul(x_vals,weights1.T)
            a2 = self.sigmoid(z2)
            a2 = np.concatenate((np.array([1]), a2))
            z3 = np.matmul(a2,weights2.T)
            a3 = self.sigmoid(z3)
            print("Hyp: " + str(a3))
            max_val = a3[0]
            max_index = 0
            for i in range(len(a3)):
                print("Curr Val: " + str(a3[i]) + " at index " + str(i))
                if (a3[i] > max_val):
                    max_val = a3[i]
                    print("New Max Val: " + str(max_val))
                    max_index = i
            prediction = max_index+1
            if prediction == 10:
                prediction = 0
            return prediction
    
    def nnCostFunction(self, nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lambda_reg):
        self.Theta1 = np.reshape(nn_params[:hidden_layer_size*(input_layer_size+1)],(hidden_layer_size, input_layer_size+1))
        self.Theta2 = np.reshape(nn_params[hidden_layer_size*(input_layer_size+1):], (num_labels, hidden_layer_size+1))
        
        m = len(X)
        labels = y.flatten()
        # set y to be matrix of size m x k
        y = np.zeros((m,num_labels))
        # for every label, convert it into vector of 0s and a 1 in the appropriate position
        for i in range(m): #each row is new training sample
            index = int(labels[i]-1)
            y[i][index] = 1
        J = 0;
        Theta1_grad = np.zeros_like(self.Theta1)
        Theta2_grad = np.zeros_like(self.Theta2)

        # add column of ones as bias unit from input layer to second layer
        X = np.hstack((np.ones((m,1)), X)) # = a1
        
        #Forward and Back prop: 

        bigDelta1 = 0
        bigDelta2 = 0
        cost_temp = 0

        # for each training example
        for t in range(m):

            ## step 1: perform forward pass
            # set lowercase x to the t-th row of X
            x = X[t]
            # note that uppercase X already included column of ones 
            # as bias unit from input layer to second layer, so no need to add it

            # calculate second layer as sigmoid( z2 ) where z2 = Theta1 * a1
            z2 = np.matmul(x,self.Theta1.T)
            a2 = self.sigmoid(z2)

            # add column of ones as bias unit from second layer to third layer
            a2 = np.concatenate((np.array([1]), a2))
            # calculate third layer as sigmoid (z3) where z3 = Theta2 * a2
           # print("Theta: " + str(self.Theta2))
            z3 = np.matmul(a2,self.Theta2.T)
            a3 = self.sigmoid(z3)

            ## step 2: 
            delta3 = np.zeros((num_labels))

            #subtract actual val in y from each hypothesized val in a3
            y_vals = np.zeros((num_labels))
            for k in range(num_labels): #for each of the 10 labels subtract
                y_k = y[t][k]
                y_vals[k] = y_k
                delta3[k] = a3[k] - y_k\

            ## step 3: for the hidden layer l=2, set delta2 = Theta2' * delta3 .* sigmoidGradient(z2)
            # note that we're skipping delta2_0 (=gradients of bias units, which we don't use here)
            delta2 = np.matmul(self.Theta2[:,1:].T, delta3) * self.sigmoidGradient(z2)

            ## step 4: accumulate gradient from this example
            # accumulation
            bigDelta1 += np.outer(delta2, x)
            bigDelta2 += np.outer(delta3, a2)

            cost_temp += self.calc_cost(y_vals, a3, lambda_reg, m)
        term1 = (1/m)*cost_temp
        term2 = self.regularization(lambda_reg, m)
        J = term1 + term2
        #print("Cost is: " + str(J))
        # step 5: obtain gradient for neural net cost function by dividing the accumulated gradients by m
        Theta1_grad = bigDelta1 / m
        Theta2_grad = bigDelta2 / m
        

        #% REGULARIZATION FOR GRADIENT
        # only regularize for j >= 1, so skip the first column
        Theta1_grad_unregularized = np.copy(Theta1_grad)
        Theta2_grad_unregularized = np.copy(Theta2_grad)
        Theta1_grad += (float(lambda_reg)/m)*self.Theta1
        Theta2_grad += (float(lambda_reg)/m)*self.Theta2
        Theta1_grad[:,0] = Theta1_grad_unregularized[:,0]
        Theta2_grad[:,0] = Theta2_grad_unregularized[:,0]
        flattened_grads = np.hstack((Theta1_grad.flatten(),Theta2_grad.flatten()))
        
        return J, flattened_grads
        

df = pd.read_csv(r'/Users/elliesuit/emnist/Emnist Data/Theta1.csv', header = None)
df2 = pd.read_csv(r'/Users/elliesuit/emnist/Emnist Data/Theta2.csv', header = None)
df3 = pd.read_csv(r'/Users/elliesuit/emnist/Emnist Data/X.csv', header = None)
df4 = pd.read_csv(r'/Users/elliesuit/emnist/Emnist Data/Y.csv', header = None)
theta1 = np.zeros([25,401])
theta2 = np.zeros([10,26])
x = np.zeros([5000,400])
y = np.zeros([5000,1])
index = 0 
while (index < 25):
    theta1[index] = df.iloc[index]
    index+=1

index = 0
while(index<10):
    theta2[index] = df2.iloc[index]
    index+=1

index = 0
while(index<5000):
    x[index] = df3.iloc[index]
    index+=1
ones = np.ones((5000,1))
#x = np.hstack((ones, Xtemp)) #this is a1 (67) 

index = 0
while (index<5000):
    y[index] = df4.iloc[index]
    index+=1

nn_params = [theta1, theta2] 
flattened_params = np.hstack((theta1.flatten(),theta2.flatten()))
input_layer_size = 400
hidden_layer_size = 25
num_labels = 10
lambda_val = 1 
n = Neural_Network()
n.nnCostFunction(flattened_params, input_layer_size, hidden_layer_size, num_labels, x, y, lambda_val)
#Minimize:
flat_theta1 = nn_params[0].flatten()
flat_theta2 = nn_params[1].flatten()
initial_weights = [flat_theta1, flat_theta2]
nn_params = np.hstack((flat_theta1, flat_theta2)) 
func_args = (input_layer_size, hidden_layer_size, num_labels, x, y, lambda_val)
result = sp.minimize(n.nnCostFunction, x0 = nn_params, args = func_args, method = 'cg', jac = True, options = {'disp': True, 'maxiter': 5})
adjusted_weights = result.x
theta1 = np.reshape(adjusted_weights[:hidden_layer_size*(input_layer_size+1)],(hidden_layer_size, input_layer_size+1))
theta2 = np.reshape(adjusted_weights[hidden_layer_size*(input_layer_size+1):], (num_labels, hidden_layer_size+1))

#Prediction: 
print(n.predict([theta1,theta2],x[4702]))


1
Entered Function
Cost is: 0.3783864266414258
2
Entered Function
Cost is: 0.3783864266414258
3
Entered Function
Cost is: 0.37821785972037214
4
Entered Function
Cost is: 0.3775923803891706
5
Entered Function
Cost is: 0.375859478555941
6
Entered Function
Cost is: 0.3743547054720781
7
Entered Function
Cost is: 0.3701720088103295
8
Entered Function
Cost is: 0.36879024050251674
9
Entered Function
Cost is: 0.3651056856385941
10
Entered Function
Cost is: 0.35735867536770427
11
Entered Function
Cost is: 0.34791461768179266
12
Entered Function
Cost is: 0.3429849130027497
13
Entered Function
Cost is: 0.34261157241100154
14
Entered Function
Cost is: 0.3414860714948612
15
Entered Function
Cost is: 0.3408922591861223
         Current function value: 0.340892
         Iterations: 5
         Function evaluations: 14
         Gradient evaluations: 14
Hyp: [3.80585860e-04 1.32439793e-04 3.16413403e-03 1.65867597e-02
 5.59915430e-04 6.10307357e-05 1.59472107e-03 2.38501218e-03
 9.92725989e-01 1.7381037