In [1]:
import numpy as np
from scipy import optimize as sp
import pandas as pd
import math

class Neural_Network():
    #calculates sigmoid function, params: z (variable), returns: sigmoid calculation
    def sigmoid(self,z):
        return 1.0/(1.0 + np.exp(-z))
    
    #calculates the sigmoidGradient (derivative), params: z (variable), returns: sigmoid gradient calculation 
    def sigmoidGradient(self,z):
        return self.sigmoid(z)*(1-self.sigmoid(z))
    
    #calculates the regularization term for the neural network, params: lambda (regularization constant),
    #m (number of training samples), returns regularization value
    def regularization(self, lamda, m):
        lamda_val = lamda/(2.0*m)
        theta1_sum = 0 
        theta2_sum = 0
        for j in range(len(self.Theta1)-1):
            for k in range(self.Theta1[0].size-1):
                theta1_sum += self.Theta1[j+1][k+1]*self.Theta1[j+1][k+1]
        for j in range(len(self.Theta2)-1):
            for k in range(self.Theta2[0].size-1):
                theta2_sum += self.Theta2[j+1][k+1]*self.Theta2[j+1][k+1]
        return lamda_val*(theta1_sum+theta2_sum)
    
    #calculates the cost for the neural network, params: y_vals (expected output values), hyp (calculated output values),
    #m (number of training samples), returns cost between given sample and expected value  
    def calc_cost(self, y_vals, hyp, lamda, m): #hyp and y are both 10x1 vectors 
        cost = 0
        for k in range(y_vals.size):
            cost += -y_vals[k] * math.log(hyp[k]) - (1-y_vals[k])*math.log(1-hyp[k])
        return cost
    
    #predicts the number that correlates to the input data, params: weights(an array that consists of 2 weight matricies),
    #x_vals (array that consists of input values), returns prediction number (0-9) 
    def predict(self, weights, x_vals):
            #x_vals = np.hstack(([1],x_vals))
            weights1 = weights[0]
            weights2 = weights[1]
            z2 = np.matmul(x_vals,weights1.T)
            a2 = self.sigmoid(z2)
            a2 = np.hstack(([1], a2))
            z3 = np.matmul(a2,weights2.T)
            a3 = self.sigmoid(z3)
            max_val = a3[0]
            max_index = 0
            print(a3)
            for i in range(len(a3)):
                if (a3[i] > max_val):
                    max_val = a3[i]
                    max_index = i
            prediction = max_index+1
            if prediction == 10:
                prediction = 0
            return prediction
        
    #performs forward and backward prop to get a final cost value, J, and 2 gradient weight matricies
    #params: nn_params(array that consists of 2 weight matricies for layer 1 and 2 respectively), input_layer_size (number of input units),
    #hidden_layer_size (number of hidden units), num_labels (number of output units), x (training samples), y (expected output values), lambda_reg (regularization constant)
    #returns cost and an array of weight gradient vectors 
    def nnCostFunction(self, nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lambda_reg):
        self.Theta1 = np.reshape(nn_params[:hidden_layer_size*(input_layer_size+1)],(hidden_layer_size, input_layer_size+1))
        self.Theta2 = np.reshape(nn_params[hidden_layer_size*(input_layer_size+1):], (num_labels, hidden_layer_size+1))
        
        J = 0;
        Theta1_grad = np.zeros_like(self.Theta1)
        Theta2_grad = np.zeros_like(self.Theta2)
       
        #Forward and Back prop: 

        bigDelta1 = 0
        bigDelta2 = 0
        cost_temp = 0

        # for each training example
        for t in range(m):

            ## step 1: perform forward pass
            x = X[t]

            #calculate z2 (linear combination) and a2 (activation for layer 2)
            z2 = np.matmul(x,self.Theta1.T)
            a2 = self.sigmoid(z2)

            # add column of ones as bias unit to the second layer
            a2 = np.hstack(([1], a2))
            # calculate z3 (linear combination) and a3 (activation for layer 3 aka final hypothesis)
            z3 = np.matmul(a2,self.Theta2.T)
            a3 = self.sigmoid(z3)
            
            #Backpropogation: 

            #step 2: set delta 3
            delta3 = np.zeros((num_labels))

            #Get Error: subtract actual val in y from each hypothesized val in a3  
            y_vals = np.zeros((num_labels))
            for k in range(num_labels): #for each of the 10 labels subtract
                y_k = y[t][k]
                y_vals[k] = y_k
                delta3[k] = a3[k] - y_k

            #step 3: for layer 2 set delta2 = Theta2 Transpose * delta3 .* sigmoidGradient(z2) (= Chain Rule)
            #Skip over the bias unit in layer 2: no gradient calculated for this value 
            delta2 = np.matmul(self.Theta2[:,1:].T, delta3) * self.sigmoidGradient(z2)

            #step 4: accumulate gradient from this sample
            bigDelta1 += np.outer(delta2, x)
            bigDelta2 += np.outer(delta3, a2)
            #Update the total cost given the cost from this sample
            cost_temp += self.calc_cost(y_vals, a3, lambda_reg, m)
            
        #Accumulate cost values and regularize to get Cost(J) 
        term1 = (1/m)*cost_temp
        term2 = self.regularization(lambda_reg, m)
        J = term1 + term2
        print("Cost: " + str(J)) 
        
        # step 5: obtain gradient for neural net cost function by dividing the accumulated gradients by m
        Theta1_grad = bigDelta1 / m
        Theta2_grad = bigDelta2 / m
        

        #Regularization
        #only regularize for j >= 1, so skip the first column
        Theta1_grad_unregularized = np.copy(Theta1_grad)
        Theta2_grad_unregularized = np.copy(Theta2_grad)
        Theta1_grad += (float(lambda_reg)/m)*self.Theta1
        Theta2_grad += (float(lambda_reg)/m)*self.Theta2
        Theta1_grad[:,0] = Theta1_grad_unregularized[:,0]
        Theta2_grad[:,0] = Theta2_grad_unregularized[:,0]
        flattened_grads = np.hstack((Theta1_grad.flatten(),Theta2_grad.flatten()))
        
        return J, flattened_grads
        
#Read in data files
df = pd.read_csv(r'/Users/elliesuit/emnist/Emnist Data/Theta1.csv', header = None)
df2 = pd.read_csv(r'/Users/elliesuit/emnist/Emnist Data/Theta2.csv', header = None)
df3 = pd.read_csv(r'/Users/elliesuit/emnist/Emnist Data/X.csv', header = None)
df4 = pd.read_csv(r'/Users/elliesuit/emnist/Emnist Data/Y.csv', header = None)
#Initialize layer sizes
input_layer_size = 400
hidden_layer_size = 25
num_labels = 10
#Set sizes for weight and data matricies
theta1 = np.zeros([hidden_layer_size,input_layer_size+1])
theta2 = np.zeros([num_labels,hidden_layer_size+1])
x = np.zeros((len(df3),input_layer_size))
y_vec = np.zeros((len(df4),))

#create data and weight arrays
index = 0 
while (index < hidden_layer_size):
    theta1[index] = df.iloc[index]
    index+=1

index = 0
while(index<num_labels):
    theta2[index] = df2.iloc[index]
    index+=1

index = 0
while(index<len(x)):
    x[index] = df3.iloc[index]
    index+=1
ones = np.ones((len(df3),1))
x = np.hstack((ones, x)) 

index = 0
while (index<len(y_vec)):
    y_vec[index] = df4.iloc[index]
    index+=1
    
m = len(x)
# set y to be a 2-D matrix with each column being a different sample and each row corresponding to a value 0-9
y = np.zeros((m,num_labels))
# for every label, convert it into vector of 0s and a 1 in the appropriate position
for i in range(m): #each row is new training sample
    index = int(y_vec[i]-1)
    y[i][index] = 1

nn_params = [theta1, theta2] 
flattened_params = np.hstack((theta1.flatten(),theta2.flatten()))
lambda_val = 2.3 
n = Neural_Network()
#calculate the gradients and cost
n.nnCostFunction(flattened_params, input_layer_size, hidden_layer_size, num_labels, x, y, lambda_val)
#Minimize:
#flatten and merge theta1 and theta2 values into a single vector 
nn_params = flattened_params
func_args = (input_layer_size, hidden_layer_size, num_labels, x, y, lambda_val)
#minimize using the conjugate-gradient (cg) algorithm 
result = sp.minimize(n.nnCostFunction, x0 = nn_params, args = func_args, method = 'tnc', jac = True, options = {'disp': True, 'maxiter': 1000})
print(result)
adjusted_weights = result.x
theta1 = np.reshape(adjusted_weights[:hidden_layer_size*(input_layer_size+1)],(hidden_layer_size, input_layer_size+1))
theta2 = np.reshape(adjusted_weights[hidden_layer_size*(input_layer_size+1):], (num_labels, hidden_layer_size+1))
print("Theta1 new: " + str(theta1))
print("Theta2 new: " + str(theta2))
#Prediction: 
print("Predicted Value is: " + str(n.predict([theta1,theta2],x[2231])))


Cost: 0.49637086631235133
Cost: 0.49637086631235133
Cost: 0.4963708660912275
Cost: 0.4963708653560094
Cost: 0.4963708650956953
Cost: 0.4963708661062709
Cost: 0.4925217886717824
Cost: 0.4925217883596334
Cost: 0.49252178853131756
Cost: 0.492521788293834
Cost: 0.49252178841788885


KeyboardInterrupt: 