In [1]:
import numpy as np
from scipy import optimize as sp
import pandas as pd
import math
from random import sample, uniform

class Neural_Network():
    #calculates sigmoid function, params: z (variable), returns: sigmoid calculation
    def sigmoid(self,z):
        return 1.0/(1.0 + np.exp(-z))
    
    #calculates the sigmoidGradient (derivative), params: z (variable), returns: sigmoid gradient calculation 
    def sigmoidGradient(self,z):
        return self.sigmoid(z)*(1-self.sigmoid(z))
    
    def randomly_initialize(self, num_in, num_out):
        epsilon = 0.12
        size = num_out*(num_in+1)
        all_weights = np.zeros((size,))
        for i in range(size):   
            all_weights[i] = uniform(-epsilon, epsilon) 
        weights = np.reshape(all_weights,(num_out,num_in+1))
        return weights
    
    #calculates the regularization term for the neural network, params: lambda (regularization constant),
    #m (number of training samples), returns regularization value
    def regularization(self, lamda, m):
        lamda_val = lamda/(2.0*m)
        theta1_sum = 0 
        theta2_sum = 0
        for j in range(len(self.Theta1)-1):
            for k in range(self.Theta1[0].size-1):
                theta1_sum += self.Theta1[j+1][k+1]*self.Theta1[j+1][k+1]
        for j in range(len(self.Theta2)-1):
            for k in range(self.Theta2[0].size-1):
                theta2_sum += self.Theta2[j+1][k+1]*self.Theta2[j+1][k+1]
        return lamda_val*(theta1_sum+theta2_sum)
    
    #calculates the cost for the neural network, params: y_vals (expected output values), hyp (calculated output values),
    #m (number of training samples), returns cost between given sample and expected value  
    def calc_cost(self, y_vals, hyp, lamda, m): #hyp and y are both 10x1 vectors 
        cost = 0
        for k in range(len(y_vals)):
            cost += -y_vals[k] * math.log(hyp[k]) - (1-y_vals[k])*math.log(1-hyp[k])
        return cost
    
    #predicts the number that correlates to the input data, params: weights(an array that consists of 2 weight matricies),
    #x_vals (array that consists of input values), returns prediction number (0-9) 
    def predict(self, weights, x_vals):
            #x_vals = np.hstack(([1],x_vals))
            weights1 = weights[0]
            weights2 = weights[1]
            z2 = np.matmul(x_vals,weights1.T)
            a2 = self.sigmoid(z2)
            a2 = np.hstack(([1], a2))
            z3 = np.matmul(a2,weights2.T)
            a3 = self.sigmoid(z3)
            max_val = a3[0]
            max_index = 0
            print(a3)
            for i in range(len(a3)):
                if (a3[i] > max_val):
                    max_val = a3[i]
                    max_index = i
            prediction = max_index+1
            if prediction == 10:
                prediction = 0
            return prediction
        
    #performs forward and backward prop to get a final cost value, J, and 2 gradient weight matricies
    #params: nn_params(array that consists of 2 weight matricies for layer 1 and 2 respectively), input_layer_size (number of input units),
    #hidden_layer_size (number of hidden units), num_labels (number of output units), x (training samples), y (expected output values), lambda_reg (regularization constant)
    #returns cost and an array of weight gradient vectors 
    def nnCostFunction(self, nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lambda_reg):
        self.Theta1 = np.reshape(nn_params[:hidden_layer_size*(input_layer_size+1)],(hidden_layer_size, input_layer_size+1))
        self.Theta2 = np.reshape(nn_params[hidden_layer_size*(input_layer_size+1):], (num_labels, hidden_layer_size+1))
        
        J = 0;
        Theta1_grad = np.zeros_like(self.Theta1)
        Theta2_grad = np.zeros_like(self.Theta2)
       
        #Forward and Back prop: 

        bigDelta1 = 0
        bigDelta2 = 0
        cost_temp = 0

        # for each training example
        for t in range(m):

            ## step 1: perform forward pass
            x = X[t]

            #calculate z2 (linear combination) and a2 (activation for layer 2)
            z2 = np.matmul(x,self.Theta1.T)
            a2 = self.sigmoid(z2)

            # add column of ones as bias unit to the second layer
            a2 = np.hstack(([1], a2))
            # calculate z3 (linear combination) and a3 (activation for layer 3 aka final hypothesis)
            z3 = np.matmul(a2,self.Theta2.T)
            a3 = self.sigmoid(z3)
            
            #Backpropogation: 

            #step 2: set delta 3
            delta3 = np.zeros((num_labels))

            #Get Error: subtract actual val in y from each hypothesized val in a3  
            y_vals = np.zeros((num_labels))
            for k in range(num_labels): #for each of the 10 labels subtract
                y_k = y[t][k]
                y_vals[k] = y_k
                delta3[k] = a3[k] - y_k

            #step 3: for layer 2 set delta2 = Theta2 Transpose * delta3 .* sigmoidGradient(z2) (= Chain Rule)
            #Skip over the bias unit in layer 2: no gradient calculated for this value 
            delta2 = np.matmul(self.Theta2[:,1:].T, delta3) * self.sigmoidGradient(z2)

            #step 4: accumulate gradient from this sample
            bigDelta1 += np.outer(delta2, x)
            bigDelta2 += np.outer(delta3, a2)
            #Update the total cost given the cost from this sample
            cost_temp += self.calc_cost(y_vals, a3, lambda_reg, m)
            
        #Accumulate cost values and regularize to get Cost(J) 
        term1 = (1/m)*cost_temp
        term2 = self.regularization(lambda_reg, m)
        J = term1 + term2
        print("Cost: " + str(J)) 
        
        # step 5: obtain gradient for neural net cost function by dividing the accumulated gradients by m
        Theta1_grad = bigDelta1 / m
        Theta2_grad = bigDelta2 / m
        

        #Regularization
        #only regularize for j >= 1, so skip the first column
        Theta1_grad_unregularized = np.copy(Theta1_grad)
        Theta2_grad_unregularized = np.copy(Theta2_grad)
        Theta1_grad += (float(lambda_reg)/m)*self.Theta1
        Theta2_grad += (float(lambda_reg)/m)*self.Theta2
        Theta1_grad[:,0] = Theta1_grad_unregularized[:,0]
        Theta2_grad[:,0] = Theta2_grad_unregularized[:,0]
        flattened_grads = np.hstack((Theta1_grad.flatten(),Theta2_grad.flatten()))
        
        return J, flattened_grads
        
        
#Read in data files
#df = pd.read_csv(r'/Users/elliesuit/emnist/Emnist Data/Theta1.csv', header = None)
#df2 = pd.read_csv(r'/Users/elliesuit/emnist/Emnist Data/Theta2.csv', header = None)
df3 = pd.read_csv(r'/Users/elliesuit/emnist/Emnist Data/X.csv', header = None)
df4 = pd.read_csv(r'/Users/elliesuit/emnist/Emnist Data/Y.csv', header = None)
#Initialize layer sizes
input_layer_size = 400
hidden_layer_size = 100
num_labels = 10
#Set sizes for weight and data matricies
theta1 = np.zeros([hidden_layer_size,input_layer_size+1])
theta2 = np.zeros([num_labels,hidden_layer_size+1])
x = np.zeros((len(df3),input_layer_size))
x_sample = np.zeros((int(len(df3)*(0.7)), input_layer_size)) #take only 70% for training to leave 30% for testing
y_vec = np.zeros((len(df4),))
y_sample = np.zeros((int(len(df4)*0.7)),)
random_indicies = sample(range(0,int(len(df3))),int(len(df3)*0.7)) 

n = Neural_Network()

#create data and weight arrays
theta1 = n.randomly_initialize(input_layer_size, hidden_layer_size)
theta2 = n.randomly_initialize(hidden_layer_size, num_labels)

index = 0
while(index<len(x)):
    x[index] = df3.iloc[index]
    index+=1

index = 0
while (index<len(y_vec)):
    y_vec[index] = df4.iloc[index]
    index+=1
    
for index in range(len(random_indicies)):
    sample_index = random_indicies[index] 
    x_sample[index] = x[sample_index] 
    y_sample[index] = y_vec[sample_index]
x_test = np.zeros((int(len(df3)*0.3),input_layer_size))
y_test = np.zeros((int(len(df4)*0.3),))
#set test data
test_indicies = np.zeros((int(len(df3)*0.3),))
count = 0
for ind in range(len(df3)):
    if ind not in random_indicies:
        test_indicies[count] = ind
        count+=1
for ii in range(len(test_indicies)):
    test_index = int(test_indicies[ii])
    x_test[ii] = x[test_index]
    y_test[ii] = y_vec[test_index]
    
x = x_sample
ones = np.ones((len(x_sample),1))
test_ones = np.ones((len(x_test),1))
x = np.hstack((ones, x)) 
x_test = np.hstack((test_ones,x_test))
y_vec = y_sample
    
m = len(x)
# set y to be a 2-D matrix with each column being a different sample and each row corresponding to a value 0-9
y = np.zeros((m,num_labels))
y_test_matrix = np.zeros((len(y_test),num_labels))
# for every label, convert it into vector of 0s and a 1 in the appropriate position
for i in range(m): #each row is new training sample
    index = int(y_vec[i]-1)
    y[i][index] = 1
y_temp = y_test
for j in range(int(len(y_test))):
    index2 = int(y_temp[j]-1)
    y_test_matrix[j][index2] = 1
y_test = y_test_matrix
nn_params = [theta1, theta2] 




In [4]:
flattened_params = np.hstack((theta1.flatten(),theta2.flatten()))
lambda_val = 2
#calculate the gradients and cost
final_res = n.nnCostFunction(flattened_params, input_layer_size, hidden_layer_size, num_labels, x, y, lambda_val)
#Minimize:
#flatten and merge theta1 and theta2 values into a single vector 
nn_params = flattened_params
func_args = (input_layer_size, hidden_layer_size, num_labels, x, y, lambda_val)
#minimize using the conjugate-gradient (cg) algorithm 
result = sp.minimize(n.nnCostFunction, x0 = nn_params, args = func_args, method = 'cg', jac = True, options = {'gtol': 0.0000001,'disp': True, 'maxiter': 1000})
print("Training Result: " + str(result.fun))
adjusted_weights = result.x
theta1 = np.reshape(adjusted_weights[:hidden_layer_size*(input_layer_size+1)],(hidden_layer_size, input_layer_size+1))
theta2 = np.reshape(adjusted_weights[hidden_layer_size*(input_layer_size+1):], (num_labels, hidden_layer_size+1))
print ("Theta1 new: " + str(theta1))
print("Theta2 new: " + str(theta2))
#Prediction: Training Data
J = 0
cost_temp = 0
#Cost: Test Data
for samp in range(len(x_test)):
    x_curr = x_test[samp]
    z2 = np.matmul(x_curr,theta1.T)
    a2 = n.sigmoid(z2)
    a2 = np.hstack(([1], a2))
    z3 = np.matmul(a2,theta2.T)
    a3 = n.sigmoid(z3)

    cost_temp += n.calc_cost(y_test[samp], a3, lambda_val, len(x_test))

#Accumulate cost values and regularize to get Cost(J) 
term1 = (1/len(x_test))*cost_temp
term2 = n.regularization(1, len(x_test))
J = term1 + term2
print("Test Cost: " + str(J))

print("Predicted Value is: " + str(n.predict([theta1,theta2],x[2235])))

Cost: 0.4648209964709733
Cost: 0.4648209964709733
Cost: 0.4648204812950939
Cost: 0.46482038135777265
Cost: 0.46482026187998027
Cost: 0.4648200726694377
Cost: 0.46481974674103554
Cost: 0.464819534970033
Cost: 0.46481927483838786
Cost: 0.4648189959980751
Cost: 0.46481896315309307
Cost: 0.4648185956712864
Cost: 0.4648177765426701
Cost: 0.46482491794417957
Cost: 0.4648177687640359
Cost: 0.46481597507675254
Cost: 0.46481310144240584
Cost: 0.4648119807726956
Cost: 0.4648131862988843
Cost: 0.46481137982281245
Cost: 0.4648107975612348
Cost: 0.4648104958202931
Cost: 0.46481019855748784
Cost: 0.464813895467985
Cost: 0.46481021425484126
Cost: 0.4648097039615189
Cost: 0.46480836542920406
Cost: 0.4648080420804877
Cost: 0.4648078476174239
Cost: 0.46480770447202263
Cost: 0.46480733572454
Cost: 0.46480715089186464
Cost: 0.4648065222294584
Cost: 0.46480575842949295
Cost: 0.46480427683834596
Cost: 0.464800998827696
Cost: 0.46479957473001815
Cost: 0.4647983921853468
Cost: 0.46479725133361494
Cost: 0.4647

Cost: 0.46413173901783256
Cost: 0.46413012527933917
Cost: 0.4641292022601331
Cost: 0.46412783080079106
Cost: 0.464124823890409
Cost: 0.4641234880584252
Cost: 0.4641221481094201
Cost: 0.4641213738678418
Cost: 0.46412069560613306
Cost: 0.46411955119667125
Cost: 0.4641161885169263
Cost: 0.4641147241620037
Cost: 0.4641117840704537
Cost: 0.4641092847620303
Cost: 0.46410619430829814
Cost: 0.46410253218661296
Cost: 0.46409982826362056
Cost: 0.4640960030596712
Cost: 0.4640928971686169
Cost: 0.46409017323484564
Cost: 0.4640871694755712
Cost: 0.46408318167984497
Cost: 0.46408058944084074
Cost: 0.46407732042471705
Cost: 0.4640738167059641
Cost: 0.4640705208471281
Cost: 0.46406744930004945
Cost: 0.4640615927868462
Cost: 0.4640573425779497
Cost: 0.4640540670415261
Cost: 0.4640514179982146
Cost: 0.46404909792166954
Cost: 0.4640465420972032
Cost: 0.4640428165483459
Cost: 0.46403901158904015
Cost: 0.4640326823723799
Cost: 0.4640281799418833
Cost: 0.46402553885681896
Cost: 0.4640221242355773
Cost: 0.46

Cost: 0.4633209929418103
Cost: 0.46332047788630937
Cost: 0.4633197357218736
Cost: 0.46331905927106476
Cost: 0.46331940747520534
Cost: 0.46331861091461946
Cost: 0.46331816372614315
Cost: 0.46331751925896925
Cost: 0.4633171440636828
Cost: 0.46331670977479467
Cost: 0.4633161131377316
Cost: 0.4633151675799694
Cost: 0.4633131076206277
Cost: 0.46330903632322973
Cost: 0.46330730428256833
Cost: 0.4633084364621321
Cost: 0.4633062116249965
Cost: 0.46330486204761034
Cost: 0.46330251887221774
Cost: 0.46329981857232294
Cost: 0.463293942145903
Cost: 0.46328941891933473
Cost: 0.4632968808083803
Cost: 0.46328736129048653
Cost: 0.46328593843611837
Cost: 0.4632834767764721
Cost: 0.4632808523696458
Cost: 0.463278069648016
Cost: 0.46327278940327843
Cost: 0.4632730304211346
Cost: 0.46326866505725206
Cost: 0.4632617767137316
Cost: 0.4632607554590283
Cost: 0.46326040139930924
Cost: 0.46326000286353763
Cost: 0.46325898194482523
Cost: 0.46325852354358144
Cost: 0.46325742214539023
Cost: 0.4632569457717053
Cost:

Cost: 0.4629590796020626
Cost: 0.46295856824479004
Cost: 0.46295764114789406
Cost: 0.4629563596637556
Cost: 0.4629556022242224
Cost: 0.46295469764774966
Cost: 0.4629528679037538
Cost: 0.4629506337147238
Cost: 0.46294885679119757
Cost: 0.4629483301530831
Cost: 0.4629472680729425
Cost: 0.46294599824115373
Cost: 0.4629428719430484
Cost: 0.46294171003457274
Cost: 0.46293977831622896
Cost: 0.46293884244029404
Cost: 0.46293807466749215
Cost: 0.4629372336281236
Cost: 0.46293674086801595
Cost: 0.4629363875094262
Cost: 0.4629352364190664
Cost: 0.46293419439674227
Cost: 0.4629331301395262
Cost: 0.46293232043903493
Cost: 0.4629320193733234
Cost: 0.46293179342254653
Cost: 0.46293147681601654
Cost: 0.4629306474488206
Cost: 0.46293032075649254
Cost: 0.46292985087238914
Cost: 0.46292961209351957
Cost: 0.462929264154217
Cost: 0.4629290718311637
Cost: 0.46292883072863655
Cost: 0.46292867403647864
Cost: 0.4629284361916707
Cost: 0.46292821639570725
Cost: 0.46292781278843587
Cost: 0.4629275041198555
Cost: