## A hacker's guide to Neural Networks

This notebook takes a theoretical approach to breaking down the inner workings of neural networks into digestible parts, to help build a better intuition for how they work. Training a neural network has two parts: **forward propagation** and **backpropagation**. We'll break down forward propagation and backpropagation, exploring each concept in depth, including its biological inspiration and the mathematics behind it, and slowly build up to the bigger idea. I've found that exploring each idea in depth helps to better understand how these systems work. In essence, neural networks are often treated as a black box, and the goal is to demystify these systems so you can better understand how they work.

In [1]:
import numpy as np
import matplotlib.pyplot as plt

![Perceptron](./images/Perceptron.png)

In [7]:
# A single neuron with three inputs: one weight per input plus one bias.
inputs = [1.2, 5.1, 2.1]
weights = [3.1, 2.1, 8.7]
bias = 3

# Weight every input first, then add the products up (left to right)
# and finish with the bias.
weighted = [value * weight for value, weight in zip(inputs, weights)]
output = weighted[0] + weighted[1] + weighted[2] + bias
print(output)

35.7


In [25]:
# Same neuron as above, written with a dot product: np.dot multiplies the
# weights and inputs pairwise and adds the products into a single scalar.
weighted_sum = np.dot(weights, inputs)
output = weighted_sum + bias
print(output)

35.7


![Multi-Linear_Perceptron](./images/Multi-Linear_Perceptron.png)

In [2]:
# Fix the legacy global NumPy random stream so the draws below are repeatable.
np.random.seed(42)

# One sample with 5 features feeding a layer of 3 neurons:
# each row of `weights` holds the 5 weights of one neuron.
inputs = np.random.standard_normal(5)
weights = np.random.standard_normal((3, 5))
biases = np.random.standard_normal(3)

# (3, 5) . (5,) -> (3,): one weighted sum per neuron, then add its bias.
output = weights.dot(inputs) + biases
print(output)

[ 0.78599378 -2.74494004 -0.92044729]


In [32]:
# Display the current contents of `inputs` as the cell's result.
inputs

array([ 1.17115625, -1.16803526, -0.82160782,  0.92183037, -0.33256601])

In [38]:
# Display `inputs` together with its shape (shown as a 2-tuple).
inputs, inputs.shape

(array([ 1.17115625, -1.16803526, -0.82160782,  0.92183037, -0.33256601]),
 (5,))

In [39]:
# Display `weights` together with its shape (shown as a 2-tuple).
weights, weights.shape

(array([[-0.1495665 , -0.14756567, -1.19343778,  0.52912856,  0.84744265],
        [-0.33508691,  1.40474206,  1.35984153, -0.29074424,  0.1959536 ],
        [ 0.87985141, -0.13661696, -1.098662  , -0.99735146, -0.06985282]]),
 (3, 5))

In [21]:
# ---------------------------------------------------------------------
# Forward pass of a single neuron: y = ReLU(x0*w0 + x1*w1 + x2*w2 + b)
# ---------------------------------------------------------------------
x = [1.0, -2.0, 3.0]   # input values
w = [-3.0, -1.0, 2.0]  # weights
b = 1.0                # bias

# Weight every input (one product per input/weight pair)
xw0, xw1, xw2 = (x_i * w_i for x_i, w_i in zip(x, w))
print(xw0, xw1, xw2)

# Sum the weighted inputs and the bias
z = xw0 + xw1 + xw2 + b
print(z)

# ReLU activation
y = max(z, 0)
print(y)

# ---------------------------------------------------------------------
# Backward pass: walk the same graph in reverse, multiplying each local
# derivative by the gradient that arrives from the step after it.
# ---------------------------------------------------------------------

# Gradient handed back by the next layer (1 for this example)
dvalue = 1

# ReLU: the derivative is 1 where z > 0 and 0 otherwise; chain with dvalue
drelu_dz = dvalue * (1. if z > 0 else 0)
print(drelu_dz)

# Sum: the partial derivative of a sum with respect to any of its
# operands is 1, so the gradient passes through unchanged.
dsum_dxw0 = dsum_dxw1 = dsum_dxw2 = dsum_db = 1
drelu_dxw0, drelu_dxw1, drelu_dxw2, drelu_db = (
    drelu_dz * local_grad
    for local_grad in (dsum_dxw0, dsum_dxw1, dsum_dxw2, dsum_db)
)
print(drelu_dxw0, drelu_dxw1, drelu_dxw2, drelu_db)

# Multiplication: d(x*w)/dx = w and d(x*w)/dw = x
dmul_dx0, dmul_dx1, dmul_dx2 = w
dmul_dw0, dmul_dw1, dmul_dw2 = x

# Chain rule once more, for each input and each weight
drelu_dx0, drelu_dw0 = drelu_dxw0 * dmul_dx0, drelu_dxw0 * dmul_dw0
drelu_dx1, drelu_dw1 = drelu_dxw1 * dmul_dx1, drelu_dxw1 * dmul_dw1
drelu_dx2, drelu_dw2 = drelu_dxw2 * dmul_dx2, drelu_dxw2 * dmul_dw2

print(drelu_dx0, drelu_dw0, drelu_dx1, drelu_dw1, drelu_dx2, drelu_dw2)

-3.0 2.0 6.0
6.0
6.0
1.0
1.0 1.0 1.0 1.0
-3.0 1.0 -1.0 -2.0 2.0 3.0


In [22]:
# Parameters before the update
print(w, b)

# Gather the gradients produced by the backward pass
dx = [drelu_dx0, drelu_dx1, drelu_dx2]
dw = [drelu_dw0, drelu_dw1, drelu_dw2]
db = drelu_db

# Take a small step against each gradient. `w` is modified in place, so
# running this cell again applies another step on top of this one.
for idx, grad in enumerate(dw):
    w[idx] += -0.001 * grad
b += -0.001 * db

# Parameters after the update
print(w, b)

[-3.0, -1.0, 2.0] 1.0
[-3.001, -0.998, 1.997] 0.999


In [23]:
# Re-run the forward pass with the updated weights and bias.

# Weighted inputs
xw0, xw1, xw2 = (x_i * w_i for x_i, w_i in zip(x, w))

# Weighted sum plus bias
z = xw0 + xw1 + xw2 + b

# ReLU activation
y = max(z, 0)
print(y)

5.985


In [28]:
# Gradient handed back by the next layer; a single sample whose three
# per-neuron gradients are all 1 keeps the arithmetic easy to follow.
dvalues = np.array([[1., 1., 1.]])

# Three neurons with four inputs each -> four weights per neuron.
# The literal lists one neuron per row; the transpose stores the weights
# as (inputs, neurons), so row i holds every neuron's weight for input i.
weights = np.array([[.2, .8, -.5, 1],
                    [.5, -.91, .26, -.5],
                    [-.26, -.27, .17, .87]]).transpose()

# Gradient with respect to each input: scale that input's weights by the
# matching neuron gradients and add them up.
upstream = dvalues[0]
dx0, dx1, dx2, dx3 = (sum(input_weights * upstream) for input_weights in weights)

dinputs = np.array([dx0, dx1, dx2, dx3])
print(dinputs)

[ 0.44 -0.38 -0.07  1.37]


In [39]:
# Shapes of the two operands of the dot product below
print(dvalues[0].shape, weights.T.shape)

# The per-input sums from the previous cell collapse into one dot product:
# the sample's gradient vector times the weights laid out as
# (neurons, inputs).
dinputs = dvalues[0].dot(weights.T)
print(dinputs)

(3,) (3, 4)
[ 0.44 -0.38 -0.07  1.37]


In [43]:
# Gradient handed back by the next layer - now a batch of three samples
# (one row each) with increasing gradient values.
dvalues = np.array([[1., 1., 1.],
                    [2., 2., 2.],
                    [3., 3., 3.]])

# Three neurons with four inputs each -> four weights per neuron.
# The literal lists one neuron per row; the transpose stores the weights
# as (inputs, neurons).
weights = np.array([[.2, .8, -.5, 1],
                    [.5, -.91, .26, -.5],
                    [-.26, -.27, .17, .87]]).transpose()

# Shapes of the two operands: (samples, neurons) and (neurons, inputs)
print(dvalues.shape, weights.T.shape)

# Gradient with respect to the inputs, for every sample at once:
# each row is that sample's neuron gradients combined with the weights.
dinputs = dvalues.dot(weights.T)
print(dinputs)

(3, 3) (3, 4)
[[ 0.44 -0.38 -0.07  1.37]
 [ 0.88 -0.76 -0.14  2.74]
 [ 1.32 -1.14 -0.21  4.11]]


In [47]:
# Gradient handed back by the next layer: three samples (rows) by three
# neurons (columns), with increasing gradient values.
dvalues = np.array([[1., 1., 1.],
                    [2., 2., 2.],
                    [3., 3., 3.]])

# Three samples (rows) with four input features each
inputs = np.array([[1, 2, 3, 2.5],
                   [2., 5., -1., 2.],
                   [-1.5, 2.7, 3.3, -0.8]])

# Gradient with respect to the weights: for every (input, neuron) pair,
# multiply the input value by the neuron's gradient and sum over samples.
# (inputs, samples) . (samples, neurons) -> (inputs, neurons)
dweights = inputs.T.dot(dvalues)
print(dweights)

[[ 0.5  0.5  0.5]
 [20.1 20.1 20.1]
 [10.9 10.9 10.9]
 [ 4.1  4.1  4.1]]


In [52]:
# Gradient handed back by the next layer: three samples (rows) by three
# neurons (columns), with increasing gradient values.
dvalues = np.array([[1., 1., 1.],
                    [2., 2., 2.],
                    [3., 3., 3.]])

# One bias per neuron, stored as a row vector of shape (1, neurons)
biases = np.array([[2, 3, .5]])

print(dvalues.shape)

# Gradient with respect to the biases: add the gradients up over the
# samples (axis 0). Without keepdims the result would be 1-D; keepdims=True
# keeps it a (1, neurons) row vector like `biases`.
dbiases = dvalues.sum(axis=0, keepdims=True)
print(dbiases)


(3, 3)
[[6. 6. 6.]]


In [59]:
# Example layer output (the values that were fed into ReLU)
z = np.array([[1, 2, -3, -4],
              [2, -7, -1, 3],
              [-1, 2, 5, -1]])

# Gradient arriving from the next layer, one value per entry of z
dvalues = np.array([[1, 2, 3, 4],
                    [5, 6, 7, 8],
                    [9, 10, 11, 12]])

# Two-step version.
# Step 1 - the ReLU derivative on its own: 1 where z > 0, otherwise 0
drelu = (z > 0).astype(z.dtype)
print(drelu)

# Step 2 - the chain rule: scale by the incoming gradient
drelu = drelu * dvalues
print(drelu)

# One-step version: keep the incoming gradient where z > 0 and
# zero it everywhere else.
drelu = np.where(z > 0, dvalues, 0)
print(drelu)

[[1 1 0 0]
 [1 0 0 1]
 [0 1 1 0]]
[[ 1  2  0  0]
 [ 5  0  0  8]
 [ 0 10 11  0]]
[[ 1  2  0  0]
 [ 5  0  0  8]
 [ 0 10 11  0]]


In [60]:
# Example gradient from the next layer (three samples by three neurons).
# NOTE: this array is defined for consistency with the earlier cells but
# is not used below - the ReLU outputs stand in for the upstream gradient.
dvalues = np.array([[1., 1., 1.],
                    [2., 2., 2.],
                    [3., 3., 3.]])

# Three samples (rows) with four input features each
inputs = np.array([[1, 2, 3, 2.5],
                   [2., 5., -1., 2.],
                   [-1.5, 2.7, 3.3, -0.8]])

# Three neurons with four inputs each -> four weights per neuron.
# The literal lists one neuron per row; the transpose stores the weights
# as (inputs, neurons).
weights = np.array([[.2, .8, -.5, 1],
                    [.5, -.91, .26, -.5],
                    [-.26, -.27, .17, .87]]).transpose()

# One bias per neuron, stored as a row vector of shape (1, neurons)
biases = np.array([[2, 3, .5]])

# --- Forward pass ---
layer_outputs = inputs.dot(weights) + biases   # dense layer
relu_outputs = np.maximum(0, layer_outputs)    # ReLU activation

# --- Backward pass ---
# ReLU: the activation outputs play the role of the gradient coming from
# the next layer; it is kept where the layer output was positive and
# zeroed elsewhere.
drelu = np.where(layer_outputs > 0, relu_outputs, 0)

# Dense layer
dinputs = drelu.dot(weights.T)               # gradient w.r.t. the inputs
dweights = inputs.T.dot(drelu)               # gradient w.r.t. the weights
dbiases = drelu.sum(axis=0, keepdims=True)   # summed over samples, kept as a row vector

# --- Parameter update: a small step against the gradient (in place) ---
learning_rate = 0.001
weights -= learning_rate * dweights
biases -= learning_rate * dbiases

print(weights)
print(biases)

[[ 0.179515   0.5003665 -0.262746 ]
 [ 0.742093  -0.9152577 -0.2758402]
 [-0.510153   0.2529017  0.1629592]
 [ 0.971328  -0.5021842  0.8636583]]
[[1.98489  2.997739 0.497389]]
