In [394]:
## Cole Lewis, Assignment 3, GitHub: @colelewis

In [395]:
# starting import and provided methods
import numpy as np

def linear(x, weights, bias): # our means of forward propagation, pushes data from one layer to the next
    """Return the affine transform of `x`: the dot product of `x` and `weights`, shifted by `bias`."""
    weighted_inputs = np.dot(x, weights) # project the incoming values through this layer's weights
    return weighted_inputs + bias # the bias is broadcast onto the weighted sum

def sigmoid(x): # the activation function, returns a value between 0 and 1 from a weighted set of inputs
    """Numerically stable logistic function 1 / (1 + e^-x).

    Accepts a scalar, a (nested) list or an ndarray. A scalar input yields a scalar,
    anything else yields an ndarray of float64 with the same shape as the input.
    """
    x = np.asarray(x, dtype=float)
    # e^-|x| always lies in (0, 1], so unlike e^-x it cannot overflow for large negative x
    decay = np.exp(-np.abs(x))
    # for x >= 0 this is the textbook form; for x < 0 the algebraically equal form e^x / (1 + e^x) is used
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    return result[()] # unwraps a 0-d result back into a scalar, leaves real arrays untouched

def d_sigmoid(x): # during back propagation, we need the derivative of the activation function in calculating gradients
    """Derivative of the logistic function at `x`: sigmoid(x) * (1 - sigmoid(x))."""
    activated = sigmoid(x) # evaluate the activation once and reuse it for both factors
    return activated * (1 - activated)

def error(actual, experimental): # calculates mean absolute error
    """Return the mean absolute difference between the true values and the predicted values."""
    residuals = actual - experimental # signed miss for every element
    magnitudes = np.abs(residuals) # drop the sign so positive and negative misses cannot cancel
    return np.mean(magnitudes)

def sort_output(raw_inference): # converts the raw network output into a binary class, either 0 or 1
    """Threshold a raw prediction: 1 when it is strictly greater than 0.5, otherwise 0."""
    # a value of exactly 0.5 is not "greater than" the threshold, so it is classified as 0
    return 1 if raw_inference > 0.5 else 0

This neural network will be designed specifically for the problem at hand: approximating logic gates from the given input. 
Knowing the purpose of the neural network informs its design in the following ways:
* Our inputs will always be two binary values, named x1 and x2, meaning we will design the neural network to accept two input values ("features") per sample. 
* Furthermore, our data will always follow the same truth table format, so it can be pre-processed within the class without worrying about deviation from that format; we only need to pass a file path to the `train()` method. 
* For a problem of this nature, a single hidden layer will suffice, although we could easily implement more; with more than one hidden layer, our network would be considered a deep neural network. 

In [396]:
class logic_nn():
    """Small feed-forward neural network for approximating two-input logic gates.

    Architecture: 2 input features -> 2 sigmoid hidden units -> 1 sigmoid output unit.

    Parameter shapes:
        hidden_weights: (2, 2), one row per hidden unit and one column per input feature
        hidden_bias:    (1, 2), one term per hidden unit
        output_weights: (2, 1), one row per hidden unit
        output_bias:    (1, 1), a single term for the single output unit

    The class relies on the module-level helpers linear(), sigmoid(), d_sigmoid() and error().
    """

    def __init__(self): # default constructor for our logic_nn class

        # weights are randomly initialized for "symmetry breaking": when all weights start out equal,
        # the hidden units receive the same updates and remain copies of each other during training.
        # np.random.uniform draws from the half-open interval [0, 0.1), so 0.1 itself is never produced.
        self.hidden_weights = np.random.uniform(0, 0.1, size=(2, 2))

        # initialized randomly for the same reason as the hidden weights
        self.output_weights = np.random.uniform(0, 0.1, size=(2, 1))

        # bias terms shift the weighted result, like the constant b in linear regression (y = mx + b)
        # the random weights already break symmetry, so the biases can safely start at 0
        self.hidden_bias = np.zeros((1, 2))

        # there is exactly one output unit, so there is exactly one output bias term
        self.output_bias = np.zeros((1, 1))


    def preprocess_data(self, file_path):
        """Load a truth-table .csv and split it into inputs and outputs.

        The file is read as comma-separated values. Its first row is treated as a header and discarded,
        the first two columns are taken as the inputs (x1, x2) and the last column as the output (y).

        Returns:
            (input_vector, output_vector): integer arrays of shape (rows, 2) and (rows, 1).
        """
        root_data = np.genfromtxt(file_path, delimiter=',') # this loads the entire .csv file to be preprocessed

        # the header row holds text labels, which genfromtxt cannot parse as numbers;
        # it is dropped BEFORE the integer cast so those unparseable entries are never converted
        data_rows = root_data[1:]

        # first two columns are x1 and x2
        input_vector = data_rows[:, :2]

        # negative indexes count from the end: -1 selects the last column, which is reshaped
        # into a single column so every row of inputs has a corresponding output row
        output_vector = data_rows[:, -1].reshape(-1, 1)

        # the values represent binary digits, so both arrays are stored as integers
        return input_vector.astype(int), output_vector.astype(int)


    def forward_propagate(self, input_vector):
        """Feed inputs through input layer -> hidden layer -> output layer.

        Args:
            input_vector: a single sample such as [x1, x2], or a batch of shape (samples, 2).

        Returns:
            (z1, a1, z2, a2): raw and activated hidden-layer results of shape (samples, 2), then raw and
            activated output-layer results of shape (samples, 1). a2 is the interpretable prediction;
            back propagation needs all four values.
        """
        x = np.atleast_2d(np.asarray(input_vector, dtype=float)) # a lone sample becomes a batch of one row

        # raw output from the hidden layer, layer 1; hidden_weights is stored (hidden units x inputs), hence the transpose
        z1 = linear(x, self.hidden_weights.T, self.hidden_bias)
        a1 = sigmoid(z1) # layer 1 output activated with sigmoid()

        # pushing the hidden activations forward from layer 1 -> layer 2
        z2 = linear(a1, self.output_weights, self.output_bias)
        a2 = sigmoid(z2) # activate the output layer with sigmoid()

        return z1, a1, z2, a2 # results from each layer of the network, raw and activated


    def backward_propagate(self, input_vector, output_vector):
        """Compute the gradient of the loss with respect to every weight and bias.

        The output-layer term (a2 - y) is the derivative of binary cross-entropy loss with respect to the
        raw output z2 when the output activation is a sigmoid. All gradients are averaged over the samples given.

        Args:
            input_vector: one sample [x1, x2] or a batch of shape (samples, 2).
            output_vector: the matching true outputs, one value per sample.

        Returns:
            (hwg, owg, hbg, obg, sample_error): gradients shaped exactly like hidden_weights, output_weights,
            hidden_bias and output_bias, followed by the mean absolute error of the predictions.
            Note the reported error is mean absolute error, not the cross-entropy being differentiated.

        Raises:
            ValueError: if the number of outputs does not match the number of input samples.
        """
        x = np.atleast_2d(np.asarray(input_vector, dtype=float))
        y = np.asarray(output_vector, dtype=float).reshape(-1, 1) # one true output per row
        l = x.shape[0] # number of samples given to back propagate, used to average the gradients
        if y.shape[0] != l:
            raise ValueError(f'expected {l} output value(s) to match the inputs, got {y.shape[0]}')

        z1, a1, z2, a2 = self.forward_propagate(x) # we need the layer 1 and 2 outputs and activations

        # partial derivatives of the loss with respect to the raw output of each layer
        d_z2 = a2 - y # predicted result - true result, shape (samples, 1)
        # push the output error back through the output weights, then through the hidden activation
        d_z1 = np.dot(d_z2, self.output_weights.T) * d_sigmoid(z1) # shape (samples, 2)

        # calculate hidden weights gradient, dLoss/dW1, shape (hidden units, inputs)
        hwg = np.dot(d_z1.T, x) / l

        # calculate output weights gradient, dLoss/dW2, shape (hidden units, 1)
        owg = np.dot(a1.T, d_z2) / l

        # calculate hidden bias gradient, dLoss/db1, summed per hidden unit rather than over everything
        hbg = np.sum(d_z1, axis=0, keepdims=True) / l

        # calculate output bias gradient, dLoss/db2
        obg = np.sum(d_z2, axis=0, keepdims=True) / l

        # calculate sample error
        sample_error = error(y, a2)

        return hwg, owg, hbg, obg, sample_error


    # gradient descent step: each parameter is reduced by the product of its gradient and the learning rate
    # the in-place updates also guard the parameter shapes, since a mismatched gradient cannot silently reshape them
    def update_parameters(self, learning_rate, hidden_weight_gradient, output_weight_gradient, hidden_bias_gradient, output_bias_gradient):
        self.hidden_weights -= learning_rate * hidden_weight_gradient
        self.output_weights -= learning_rate * output_weight_gradient
        self.hidden_bias -= learning_rate * hidden_bias_gradient
        self.output_bias -= learning_rate * output_bias_gradient

    def make_batch(self, input_vector, output_vector, batch_size):
        """Draw `batch_size` random rows (with replacement) from the data.

        Returns a list of [input_row, output_row] pairs. The global NumPy random state is used as-is and
        is never re-seeded here, so seeding it beforehand makes the batches reproducible.
        """
        row_count = len(input_vector) # works for a table of any length, not only 4 rows
        picks = np.random.randint(0, row_count, size=batch_size) # indexes in [0, row_count)
        return [[input_vector[key_index], output_vector[key_index]] for key_index in picks]


    def train(self, epochs, learning_rate, data_file_path, batch_size, verbose=True):
        """Train with stochastic gradient descent: one parameter update per randomly drawn sample.

        Runs exactly `epochs` epochs of exactly `batch_size` updates each.
        NOTE(review): weights start small and positive, so learning can be slow; if predictions stay
        near 0.5 the epoch count or learning rate probably needs to be raised - not verified here.

        Args:
            epochs: number of batches to draw and train on.
            learning_rate: step size applied to every gradient.
            data_file_path: truth-table .csv, handed to preprocess_data().
            batch_size: number of random samples per epoch.
            verbose: when True (the default) a progress line is printed after every update.
        """
        root_x, root_y = self.preprocess_data(data_file_path)
        for i in range(1, epochs + 1):
            batch = self.make_batch(root_x, root_y, batch_size)
            for j, (sample_x, sample_y) in enumerate(batch, start=1):
                # back propagate over the sample and generate gradients
                hidden_weight_gradient, output_weight_gradient, hidden_bias_gradient, output_bias_gradient, sample_error = self.backward_propagate(sample_x, sample_y)
                # update weights and biases with calculated gradients
                self.update_parameters(learning_rate, hidden_weight_gradient, output_weight_gradient, hidden_bias_gradient, output_bias_gradient)
                if verbose:
                    print(f'epoch: {i}/{epochs}, step {j}/{batch_size}, error: {sample_error}')




In [397]:
def _report(model, sample):
    """Run one forward pass on `sample`, print the raw and the classified output, and return the activated output."""
    _, _, _, prediction = model.forward_propagate(sample)
    print(f'Raw prediction from network: {prediction[0][0]}\nClassified prediction from network: {sort_output(prediction[0][0])}')
    return prediction

# AND gate demonstration
network = logic_nn()
network.train(50, 0.05, "and.csv", 45)
prediction = _report(network, [1, 1]) # 1 AND 1, expected result is 1
prediction = _report(network, [1, 0]) # 1 AND 0, expected result is 0

# XOR gate demonstration; trained for more epochs and with a higher learning rate than the other gates
xor_network = logic_nn()
xor_network.train(100, 0.1, "xor.csv", 45)
prediction = _report(xor_network, [0, 1]) # 0 XOR 1, expected result is 1
prediction = _report(xor_network, [0, 0]) # 0 XOR 0, expected result is 0

# OR gate demonstration
or_network = logic_nn()
or_network.train(50, 0.05, "or.csv", 45)
prediction = _report(or_network, [0, 0]) # 0 OR 0, expected result is 0

# and so on and so forth


Raw prediction from network: 0.8917043793334058
Classified prediction from network: 1
Raw prediction from network: 0.0410528531403871
Classified prediction from network: 0
Raw prediction from network: 0.4879670149105459
Classified prediction from network: 0
Raw prediction from network: 0.46321276963517355
Classified prediction from network: 0
Raw prediction from network: 0.08232567008522498
Classified prediction from network: 0
