<a href="https://colab.research.google.com/github/danielhampikian/GIMM-400/blob/master/NeuralNetworkSimpleToComplex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

NumPy is the main package for scientific computing in Python and has been a major backbone of Python applications in computational, engineering, scientific, statistical, image-processing, and other fields. NumPy was built from two earlier libraries: Numeric and Numarray.
Most deep learning algorithms make use of several NumPy operations and functions, because NumPy computations are faster than the equivalent pure-Python code. For instance, NumPy uses vectorization, which eliminates unnecessary explicit loops from the code and so reduces execution time. The following is an example of a vectorized NumPy dot operation on 1-d arrays, followed by the same computation written as an explicit loop:

In [5]:
# Part 1: dot product with vectorization (a single np.dot call).
import numpy as np #always remember to import numpy
import time
# time.process_time() reports CPU time of this process; the difference of two
# readings is the CPU time spent between them.
t0 = time.process_time()
# first operand: a plain Python list (np.dot accepts it directly)
tmatrix1 = [1,2,3,4,5]
# second operand, same length as the first
tmatrix2 = [6,7,8,9,10]
# dot product of the two 1-d sequences: sum of the element-wise products
dt = np.dot(tmatrix1,tmatrix2)
t1 = time.process_time()
# the factor 1000 converts the elapsed seconds to milliseconds
print ("dot operation = " + str(dt) + "\n Computation time = " + str(1000*(t1 - t0)) + "ms")

# Part 2: the same dot product without vectorization (explicit Python loop).
t0 = time.process_time()
dot = 0
for i in range(len(tmatrix1)):
  # %time is an IPython magic, so this cell only runs under IPython/Jupyter/Colab.
  # It prints one timing report per iteration, and that reporting happens inside
  # the t0..t1 interval, so the loop total below includes the %time overhead.
  %time dot+= tmatrix1[i]*tmatrix2[i]
t1 = time.process_time()
print ("dot operation = " + str(dot) + "\n Computation time = " + str(1000*(t1 - t0)) + "ms")

dot operation = 130
 Computation time = 0.5991880000000283ms
CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 7.87 µs
CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.39 µs
CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.63 µs
CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 6.44 µs
CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 6.68 µs
dot operation = 130
 Computation time = 6.979469000000016ms


In [6]:
import numpy as np
# Build a 1-d array (a vector) holding 1, 2, 3 and display it.
mvector = np.array([1, 2, 3])
print(mvector)

# Build a 2x3 matrix (a 2-d array) and display it.
mmatrix = np.array([[1, 2, 3],
                    [4, 5, 6]])
print(mmatrix)

# Report, one per line and in this order: the array's Python type, its shape,
# its total element count, its smallest entry and its largest entry.
print(
    type(mmatrix),
    mmatrix.shape,
    mmatrix.size,
    mmatrix.min(),
    mmatrix.max(),
    sep="\n",
)


[1 2 3]
[[1 2 3]
 [4 5 6]]
<class 'numpy.ndarray'>
(2, 3)
6
1
6


In [8]:
import random
import numpy as np
# Element-wise addition, subtraction and multiplication of two 2x3 matrices.
# first operand
mmatrix1 = np.array([[1,2,3],[4,5,6]])
# second operand (same shape, so the operations pair entries position by position)
mmatrix2 = np.array([[6,7,8],[8,9,10]])
# element-wise sum
addmatrix = np.add(mmatrix1,mmatrix2)
print("addmatrix \n", addmatrix)
# element-wise difference
submatrix = np.subtract(mmatrix1,mmatrix2)
print("submatrix \n", submatrix)
# element-wise product (this is NOT the matrix product; see np.dot below)
mulmatrix = np.multiply(mmatrix1,mmatrix2)
print("mulmatrix \n", mulmatrix)

# Matrix product: a (2x3) matrix times a (3x2) matrix gives a (2x2) matrix.
# left operand
dmatrix1 = np.array([[1,2,3],[4,5,6]])
# right operand
dmatrix2 = np.array([[6,7],[8,9],[9,10]])
print("array1 \n", dmatrix1)
print("array2 \n", dmatrix2)
# for 2-d inputs np.dot performs matrix multiplication
dotmatrix = np.dot(dmatrix1,dmatrix2)
print("dotmatrix \n", dotmatrix)

# Element-wise exponential (e**value for every entry), not the matrix exponential.
# mmatrix1 is used so that this cell does not depend on a variable created in
# another cell; it holds the same values [[1,2,3],[4,5,6]].
mexp = np.exp(mmatrix1)
print(mexp)



addmatrix 
 [[ 7  9 11]
 [12 14 16]]
submatrix 
 [[-5 -5 -5]
 [-4 -4 -4]]
mulmatrix 
 [[ 6 14 24]
 [32 45 60]]
array1 
 [[1 2 3]
 [4 5 6]]
array2 
 [[ 6  7]
 [ 8  9]
 [ 9 10]]
dotmatrix 
 [[ 49  55]
 [118 133]]
[[  2.71828183   7.3890561   20.08553692]
 [ 54.59815003 148.4131591  403.42879349]]


In [10]:

# Minimal feedforward artificial neural network, written without external
# libraries such as tensorflow, pandas, or keras.
# There is no backpropagation and no optimizer in this version.

epoch = 500  # number of passes made over the entire training set
lr = 1  # learning rate
bias = 1  # constant input value that feeds the bias weight
# One weight per input neuron plus one for the bias: three weights in total,
# each drawn from random.random().
weights = [random.random() for _ in range(3)]
print(f"Weights before training: {weights}")

def NeuralNet(weights, inputNeuron1, inputNeuron2, expectedOutput):
  """Run one training step of the single neuron and update ``weights`` in place.

  The prediction is the sigmoid of the weighted sum of the two inputs and the
  module-level ``bias``. Each weight is then moved by
  ``error * its input * lr``, where ``lr`` is the module-level learning rate
  and ``error`` is ``expectedOutput - prediction``.

  Args:
    weights: mutable sequence of three numbers (input 1, input 2, bias).
    inputNeuron1: value of the first input.
    inputNeuron2: value of the second input.
    expectedOutput: target value for this input pair.

  Returns:
    None; the result of the step is the modified ``weights``.
  """
  weightedSum = inputNeuron1*weights[0]+inputNeuron2*weights[1]+bias*weights[2]
  prediction = ApplySigmoidActivation(weightedSum)
  miss = expectedOutput - prediction  # simple signed error
  # Nudge every weight in proportion to the error and to the signal it carried.
  for slot, signal in enumerate((inputNeuron1, inputNeuron2, bias)):
    weights[slot] += miss * signal * lr
  
def ApplySigmoidActivation(x):
  """Return the logistic sigmoid 1 / (1 + e**-x) of ``x`` (scalar or array)."""
  decay = np.exp(-x)
  return 1/(1+decay)
  
def ApplySigmoidDerivative(x):
  """Return ``x * (1 - x)``.

  This is the sigmoid's slope when ``x`` is already a sigmoid output.
  """
  complement = 1-x
  return x*complement

def ApplyReluActivation(x):
  """Return the rectified linear unit of ``x``: the element-wise max of 0 and ``x``."""
  floor = 0
  rectified = np.maximum(floor, x)
  return rectified
  
def ApplyReluDerivative(x):
  """Return the slope of the ReLU at ``x``: 0 where ``x <= 0``, otherwise 1.

  Args:
    x: a scalar, or an array-like of any shape.

  Returns:
    The int 0 or 1 for a scalar input; for an array-like input, an integer
    array of the same shape holding 0 or 1 per element.
  """
  # Scalars keep the plain int result; only real arrays take the vectorized path.
  if np.ndim(x) == 0:
    return 0 if x <= 0 else 1
  # A bare `if x <= 0` is ambiguous for arrays, so select per element instead.
  return np.where(np.asarray(x) <= 0, 0, 1)

def softmax(x):
  """Return the softmax of ``x``, normalized over ALL of its elements.

  Every entry becomes ``e**entry / sum(e**all entries)``, so the result has the
  shape of ``x`` and its entries sum to 1 across the whole array (not per row).

  Args:
    x: array-like of numbers.

  Returns:
    A float array with the same shape as ``x``; an empty input is returned as
    an empty float array.
  """
  x = np.asarray(x, dtype=float)
  if x.size == 0:
    return x  # nothing to normalize; also avoids np.max failing on empty input
  # Subtracting the maximum leaves the ratios unchanged but keeps np.exp from
  # overflowing to inf (which would turn the result into NaN) for large inputs.
  expo = np.exp(x - np.max(x))
  return expo/np.sum(expo)

# Show softmax applied to mmatrix, the 2x3 demo matrix built in an earlier cell.
print(softmax(mmatrix))


# Teach the neuron logical OR. Every epoch presents the four rows of the truth
# table as (input 1, input 2, expected output), in this fixed order.
for i in range(epoch):
  for in1, in2, expected in ((1, 1, 1), (1, 0, 1), (0, 1, 1), (0, 0, 0)):
    NeuralNet(weights, in1, in2, expected)
  # progress report every 200 epochs
  if not i % 200:
    print(f"weights currently at: {weights}")

print(f"Weights after training: {weights}")
# Interactive check of the trained neuron: read two integers (one per line) and
# print the raw weighted sum followed by the 0/1 decision.
# The loop ends cleanly on end-of-input or when an entry is not an integer,
# instead of requiring an interrupt or dying with a traceback.
while True:
  try:
    x = int(input())
    y = int(input())
  except (ValueError, EOFError):
    break
  output = x*weights[0] + y*weights[1] + bias*weights[2]
  print(output)
  # Step threshold on the raw weighted sum (this is not softmax): a sum above 0
  # is the same as a sigmoid output above 0.5, which suffices for a binary answer.
  output = 1 if output > 0 else 0
  print(x, "or", y, "is : ", output)

Weights before training: [0.6395260357636279, 0.7952832665343739, 0.1847190774741274]
[[0.00426978 0.01160646 0.03154963]
 [0.08576079 0.23312201 0.63369132]]
weights currently at: [1.0444121348448816, 1.1356165855960612, 0.08230210074235478]
weights currently at: [8.30671219146593, 8.28807536868092, -3.695777540762057]
weights currently at: [9.68204753525835, 9.67239026392353, -4.383165309024845]
Weights after training: [10.122788219066265, 10.114992519428133, -4.603478599528888]


KeyboardInterrupt: ignored

In [0]:
# Slightly more complex single layer Neural Network: still no external libraries like tensorflow, pandas, or keras:
import numpy as np #import numpy the linear algebra library

# Sigmoid helper: brings non-linearity into the network by squashing any number
# into the range 0..1, which lets outputs be read as probabilities.
def nonlin(x,deriv=False):
    """Return sigmoid(x), or ``x*(1-x)`` when ``deriv`` equals True.

    With ``deriv`` set, ``x`` is expected to already be a sigmoid output, so
    ``x*(1-x)`` is the slope of the sigmoid at that point.
    """
    # Kept as an equality test (not truthiness) so only True/1 select the slope.
    if deriv != True:
        return 1/(1+np.exp(-x))
    return x*(1-x)
    
# Training inputs: one sample per row, three input values per sample.
X = np.array([[0, 0, 1], [0, 1, 1], [1, 0, 1], [1, 1, 1]])

# Training targets as a 4x1 column, one target per row of X.
y = np.array([[0], [0], [1], [1]])

# Seed NumPy's global random generator so every run draws the same numbers
# (simply good practice: it makes the results reproducible).
np.random.seed(1)
epochs = 50000
# A 4x1 column of zeros.
weights = np.array([[0], [0], [0], [0]])
# With only an input and an output layer, a single weight matrix connects them;
# for 3 inputs and 1 output its shape is (3,1). Changing the number of inputs
# means changing that first dimension as well. Random initial weights with
# mean 0 in that shape can be drawn with: 2*np.random.random((3,1)) - 1

def neuralNet(l0, epochs, predict, userTest=[0,0,0], testWeights=[0,0,0,0]):
  """Train a single-layer sigmoid network, or predict with trained weights.

  Training mode (``predict`` falsy): starting from a copy of ``testWeights``,
  repeat ``epochs`` times: forward pass, error against the module-level
  targets ``y``, error scaled by the sigmoid slope, weight update. Progress is
  printed every 100 iterations.

  Prediction mode (``predict`` truthy): return the sigmoid of the weighted sum
  of ``userTest`` and ``testWeights``, rounded to 0 or 1.

  Args:
    l0: input matrix, one sample per row (used in training mode only).
    epochs: number of training iterations (used in training mode only).
    predict: selects prediction mode when truthy, training mode otherwise.
    userTest: the input values to classify in prediction mode.
    testWeights: initial weights (training) or trained weights (prediction).
      The default has four entries; it is kept only for backward
      compatibility, and training needs one weight row per column of ``l0``.

  Returns:
    Training mode: ``(weights, l1)`` where ``l1`` is the forward-pass output
    computed in the last iteration, before that iteration's weight update
    (or the untrained output when ``epochs`` is 0).
    Prediction mode: the rounded sigmoid of the weighted sum.

  Neither default argument nor the caller's ``testWeights`` is modified.
  """
  if predict:
    # Weighted sum accumulated term by term; indexing (not zip) so that more
    # inputs than weights still fails loudly instead of being ignored.
    total = 0
    for i, value in enumerate(userTest):
      total += value * testWeights[i]
    # squash to a probability, then round to the nearest class label
    return np.around(nonlin(total),decimals=0)

  # Work on a float copy: the in-place update below must not alter the caller's
  # array, and it would fail outright on an integer-typed weight array.
  syn0 = np.array(testWeights, dtype=float)
  # Defined up front so that epochs == 0 returns the untrained prediction
  # instead of raising because l1 was never assigned.
  l1 = nonlin(np.dot(l0,syn0))

  for step in range(epochs):
    report = step % 100 == 0

    # Forward propagation: weighted sum of the inputs, squashed by the sigmoid.
    # Shapes for the demo data: (4x3) dot (3x1) -> (4x1).
    l1 = nonlin(np.dot(l0,syn0))

    # how far each guess is from the target
    l1_error = y - l1
    if report:
      print('error before slope of sigmoid of values in l1')
      print(l1_error)
      print("l1 itself (dot product of l0 and weight (weighted sum))")
      print(l1)

    # Scale the error element-wise by the sigmoid slope at l1: confident
    # guesses (slope near 0) barely move, uncertain ones (near 0.5) move most.
    l1_delta = l1_error * nonlin(l1,True)
    if report:
      print('error after slope of sigmoid of values in l1 is multiplied times l1 error')
      print(l1_delta)
      # mean absolute scaled error, used here as a rough accuracy measure
      print("Mean Error:" + str(np.mean(np.abs(l1_delta))))

    # weight update: every weight collects input * delta over all samples
    syn0 += np.dot(l0.T,l1_delta)
  return syn0,l1
      
# Train for 1000 iterations, starting from random weights of shape (3,1)
# drawn uniformly from [-1, 1).
w,l1 = neuralNet(X,1000, False, testWeights = 2*np.random.random((3,1)) - 1)
print("Output After Training:")
print(str(l1))
print("Weights: " + str(w))
# User test: read whitespace-separated integers and print the prediction.
# The loop ends cleanly on end-of-input or when an entry is not an integer,
# instead of requiring an interrupt or dying with a traceback.
# NOTE: entering more values than there are weights still raises IndexError
# inside neuralNet.
while True:
  try:
    userInput = input("Enter int array: ")
    userArray = list(map(int, userInput.split()))
  except (ValueError, EOFError):
    break
  print(userArray)
  print(str(neuralNet(X,1,True, userTest=userArray, testWeights=w)))
# Note: adding a "0 0 0" or a "0 1 0" sample to the training set does not work
# yet; that needs more hyperparameters (layers, number of neurons) and an alpha
# (learning rate).
# Try "1 1 0", which is not in the data set: if the rule being inferred is
# "output equals the first column", the network should return 1.

error before slope of sigmoid of values in l1
[[-0.2689864 ]
 [-0.36375058]
 [ 0.76237183]
 [ 0.6737243 ]]
l1 itself (dot product of l0 and weight (weighted sum))
[[0.2689864 ]
 [0.36375058]
 [0.23762817]
 [0.3262757 ]]
error after slope of sigmoid of values in l1 is multiplied times l1 error
[[-0.05289153]
 [-0.08418501]
 [ 0.13811206]
 [ 0.14809799]]
Mean Error:0.10582164657032124
error before slope of sigmoid of values in l1
[[-0.10975431]
 [-0.0914018 ]
 [ 0.07391609]
 [ 0.08910196]]
l1 itself (dot product of l0 and weight (weighted sum))
[[0.10975431]
 [0.0914018 ]
 [0.92608391]
 [0.91089804]]
error after slope of sigmoid of values in l1 is multiplied times l1 error
[[-0.01072391]
 [-0.00759069]
 [ 0.00505974]
 [ 0.00723177]]
Mean Error:0.00765152685655693
error before slope of sigmoid of values in l1
[[-0.07532702]
 [-0.06151182]
 [ 0.04977973]
 [ 0.06113161]]
l1 itself (dot product of l0 and weight (weighted sum))
[[0.07532702]
 [0.06151182]
 [0.95022027]
 [0.93886839]]
error af

In [16]:
#Two layer neural net with output of the first layer used as input for the second layer, capable of dealing with non-correlational data
import numpy as np

def nonlin(x,deriv=False):
    """Sigmoid of ``x``; with ``deriv`` equal to True, ``x*(1-x)`` instead.

    In derivative mode ``x`` is expected to be a sigmoid output already, which
    makes ``x*(1-x)`` the slope of the sigmoid at that point.
    """
    # equality test (not truthiness) decides the mode, as only True/1 select it
    return x*(1-x) if deriv == True else 1/(1+np.exp(-x))
    
# Inputs: one sample per row, three input values per sample.
X = np.array([[0, 0, 1], [0, 1, 1], [1, 0, 1], [1, 1, 1]])

# Targets as a 4x1 column, one target per row of X.
y = np.array([[0], [1], [1], [0]])

# Fix the seed so the random initial weights are the same on every run.
np.random.seed(1)

# Initial weights drawn uniformly from [-1, 1), i.e. with mean 0:
# syn0 links the 3 inputs to 4 hidden units, syn1 links those 4 to 1 output.
syn0 = 2*np.random.random((3, 4)) - 1
syn1 = 2*np.random.random((4, 1)) - 1

# Layer 0 is simply the input batch; X is not changed by the loop, so it is
# bound once here.
l0 = X

for j in range(60000):

    # Feed forward through layers 0, 1 and 2. This is the single-layer network
    # stacked twice: the output of the first layer (l1) is the input of the
    # second.
    l1 = nonlin(np.dot(l0, syn0))
    l2 = nonlin(np.dot(l1, syn1))

    # distance between the targets and the final output
    l2_error = y - l2

    # every 10000 iterations, report the mean absolute error as a rough
    # accuracy measure
    if not j % 10000:
        print("Error:" + str(np.abs(l2_error).mean()))

    # Direction and size of the correction for l2: the error scaled by the
    # sigmoid slope, so confident outputs are changed only a little.
    l2_delta = l2_error * nonlin(l2, deriv=True)

    # Backpropagation, the step the algorithm is named after: send the
    # confidence-weighted l2 error back across syn1 to get the share of that
    # error each l1 node contributed. This must use syn1 BEFORE it is updated.
    l1_error = np.dot(l2_delta, syn1.T)

    # Same slope scaling for the hidden layer.
    l1_delta = l1_error * nonlin(l1, deriv=True)

    # Weight updates, output-side weights first.
    syn1 += np.dot(l1.T, l2_delta)
    syn0 += np.dot(l0.T, l1_delta)


Error:0.4964100319027255
Error:0.008584525653247157
Error:0.0057894598625078085
Error:0.004629176776769985
Error:0.0039587652802736475
Error:0.003510122567861678


In [21]:
import numpy as np
# This cell introduces two hyperparameters: alpha (the learning rate) and the size of the hidden layer.
# Challenge: find the best values of alpha and hidden layer size for this two-layer neural net trained with backpropagation through gradient descent.
# Bonus: extend this to an arbitrary, user-entered number of hidden layers with user-entered sizes.
# Reflect: why might it be good to start with a large alpha and decrease it over the epochs? This is one of the things an optimizer does in tensorflow and keras!
# candidate learning rates, spanning several orders of magnitude
alphas = [0.001,0.01,0.1,1,10,100,1000]
# candidate numbers of units in the hidden layer
hiddenSizes = [4,8,16,32]

# logistic sigmoid nonlinearity
def sigmoid(x):
    """Return 1 / (1 + e**-x) for a scalar or array ``x``."""
    return 1/(1+np.exp(-x))

# slope of the sigmoid, expressed in terms of the sigmoid's own output
def sigmoid_output_to_derivative(output):
    """Return ``output * (1 - output)`` for a sigmoid output (scalar or array)."""
    complement = 1-output
    return output*complement
    
# Inputs: one sample per row, three input values per sample.
X = np.array([[0, 0, 1], [0, 1, 1], [1, 0, 1], [1, 1, 1]])

# Targets as a 4x1 column, one target per row of X.
y = np.array([[0], [1], [1], [0]])

# Train one fresh network for every (alpha, hidden layer size) combination.
for alpha in alphas:
  for hiddenSize in hiddenSizes:
    print(f"\nTraining With Alpha:{alpha} and hidden layer size: {hiddenSize}")
    # Re-seed so that every combination starts from the same random stream.
    np.random.seed(1)

    # Initial weights drawn uniformly from [-1, 1), i.e. with mean 0.
    synapse_0 = 2*np.random.random((3, hiddenSize)) - 1
    synapse_1 = 2*np.random.random((hiddenSize, 1)) - 1

    for j in range(60000):

        # Feed forward through layers 0, 1 and 2.
        layer_0 = X
        layer_1 = sigmoid(np.dot(layer_0, synapse_0))
        layer_2 = sigmoid(np.dot(layer_1, synapse_1))

        # Signed miss, output minus target; the updates below therefore
        # subtract the gradient step.
        layer_2_error = layer_2 - y

        # progress report every 20000 iterations
        if not j % 20000:
            print("Error after "+str(j)+" iterations:" + str(np.abs(layer_2_error).mean()))

        # Scale the miss by the sigmoid slope: confident outputs change little.
        layer_2_delta = layer_2_error*sigmoid_output_to_derivative(layer_2)

        # Share of the layer 2 error attributable to each layer 1 value,
        # according to the weights (uses synapse_1 BEFORE it is updated).
        layer_1_error = np.dot(layer_2_delta, synapse_1.T)

        # Same slope scaling for the hidden layer.
        layer_1_delta = layer_1_error * sigmoid_output_to_derivative(layer_1)

        # Gradient-descent step, sized by the learning rate alpha.
        synapse_1 -= alpha * np.dot(layer_1.T, layer_2_delta)
        synapse_0 -= alpha * np.dot(layer_0.T, layer_1_delta)




Training With Alpha:0.001 and hidden layer size: 4
Error after 0 iterations:0.4964100319027255
Error after 20000 iterations:0.4935960431880486
Error after 40000 iterations:0.48910016654420474

Training With Alpha:0.001 and hidden layer size: 8
Error after 0 iterations:0.49885891282661
Error after 20000 iterations:0.4960339829421634
Error after 40000 iterations:0.49230609024302613

Training With Alpha:0.001 and hidden layer size: 16
Error after 0 iterations:0.4968197940374576
Error after 20000 iterations:0.4915701638369947
Error after 40000 iterations:0.4835511920051555

Training With Alpha:0.001 and hidden layer size: 32
Error after 0 iterations:0.49643992250078794
Error after 20000 iterations:0.48497630702745953
Error after 40000 iterations:0.46903846539028254

Training With Alpha:0.01 and hidden layer size: 4
Error after 0 iterations:0.4964100319027255
Error after 20000 iterations:0.359097202563399
Error after 40000 iterations:0.14307065901337032

Training With Alpha:0.01 and hidden