# Motivation
The computations involved in deep learning are mostly linear algebra, and NumPy is highly optimized for operations on multidimensional arrays.

**Why should we spend time learning a deep learning framework?**

## A fully connected NN with NumPy
https://chih-ling-hsu.github.io/2017/08/30/NN-XOR

In [1]:
import numpy as np
import matplotlib.pyplot as plt

     
def tanh_derivative(x):
    """Return the derivative of tanh at *x*, computed as ``1 / cosh(x)**2``.

    Works element-wise on NumPy arrays as well as on scalars.
    """
    cosh_x = np.cosh(x)
    return 1 / (cosh_x ** 2)


class FullyConnectedNN_np:
    """Fully connected feed-forward network with tanh units, trained by SGD.

    Each weight matrix carries one extra leading row holding the bias
    weights; a constant ``1`` is prepended to every layer's input to drive it.

    Parameters
    ----------
    net_arch : list of int
        Number of neurons in each layer, i.e. the network architecture
        (``[2, 2, 1]`` = 2 inputs, one hidden layer of 2 units, 1 output).
    """

    def __init__(self, net_arch):
        self.activity = np.tanh
        self.activity_derivative = tanh_derivative
        self.layers = len(net_arch)
        self.steps_per_epoch = 1
        self.arch = net_arch
        self.weights = []

        # Random initialization with weight values in [-1, 1);
        # the "+ 1" row holds the bias weights of the layer.
        for layer in range(self.layers - 1):
            w = 2*np.random.rand(net_arch[layer] + 1, net_arch[layer+1]) - 1
            self.weights.append(w)

    def _forward_prop(self, x):
        """Propagate one sample forward, keeping every layer's output.

        Parameters
        ----------
        x : list
            One-element list holding the input vector with the bias ``1``
            already prepended.

        Returns
        -------
        list
            A new list: the input, then each hidden layer's activity with
            a bias ``1`` prepended, then the output layer's activity
            (without bias).
        """
        # Work on a copy so the caller's list is not modified.
        y = list(x)

        for i in range(len(self.weights) - 1):
            activation = np.dot(y[i], self.weights[i])
            activity = self.activity(activation)

            # add the bias for the next layer
            activity = np.concatenate((np.ones(1), np.array(activity)))
            y.append(activity)

        # last layer: no bias unit is appended to the network output
        activation = np.dot(y[-1], self.weights[-1])
        activity = self.activity(activation)
        y.append(activity)

        return y

    def _back_prop(self, y, target, learning_rate):
        """Update the weights in place from one sample's forward pass.

        Parameters
        ----------
        y : list
            Layer outputs as returned by ``_forward_prop``.
        target : scalar or array
            Desired network output for the sample.
        learning_rate : float
            Step size applied to every weight update.
        """
        # NOTE(review): the derivative is evaluated at the layer *outputs*
        # (tanh already applied), not at the pre-activations; the exact tanh
        # gradient would be 1 - y**2. Left as is so that training results
        # stay reproducible.
        error = target - y[-1]
        delta_vec = [error * self.activity_derivative(y[-1])]

        # we need to begin from the back, from the next to last layer
        for i in range(self.layers - 2, 0, -1):
            # [1:] drops the bias row/unit: no error flows back through it
            error = delta_vec[-1].dot(self.weights[i][1:].T)
            error = error * self.activity_derivative(y[i][1:])
            delta_vec.append(error)

        # Deltas were collected back to front; put them in layer order
        delta_vec.reverse()

        # Finally, we adjust the weights, using the backpropagation rules
        for i in range(len(self.weights)):
            layer = y[i].reshape(1, self.arch[i] + 1)
            delta = delta_vec[i].reshape(1, self.arch[i + 1])
            self.weights[i] += learning_rate * layer.T.dot(delta)

    def fit(self, data, labels, learning_rate=0.1, epochs=100):
        """Train with stochastic gradient descent, one random sample per step.

        Parameters
        ----------
        data : array-like, one row per sample
            Input samples (e.g. all pairs of booleans encoded as 0/1).
        labels : sequence
            Target for each row of ``data`` (e.g. the 'xor' of each pair).
        learning_rate : float
            Step size of each weight update.
        epochs : int
            Number of single-sample update steps to perform.
        """
        data = np.asarray(data)
        n_samples = data.shape[0]

        # Add bias units to the input layer -
        # add a "1" to the input data (the always-on bias neuron)
        ones = np.ones((1, n_samples))
        Z = np.concatenate((ones.T, data), axis=1)

        for k in range(epochs):
            if (k+1) % 10000 == 0:
                print('epochs: {}'.format(k+1))

            # Draw the sample index from the data passed in, not from a
            # global variable.
            sample = np.random.randint(n_samples)

            # We will now go ahead and set up our feed-forward propagation:
            x = [Z[sample]]
            y = self._forward_prop(x)

            # Now we do our back-propagation of the error to adjust the weights:
            target = labels[sample]
            self._back_prop(y, target, learning_rate)

    def _predict_vector(self, x):
        """Return the full output-layer activity for a single input *x*."""
        val = np.asarray(x, dtype=float)
        for w in self.weights:
            # prepend the bias unit, then apply the layer
            val = np.concatenate((np.ones(1), val))
            val = self.activity(np.dot(val, w))
        return val

    def predict_single_data(self, x):
        """Return the first output unit's value for a single input.

        Parameters
        ----------
        x : array-like
            Single input sample (without bias unit).
        """
        return self._predict_vector(x)[0]

    def predict(self, X):
        """Return the network output for every row of *X*.

        Parameters
        ----------
        X : iterable of array-like
            The input samples, one per row.

        Returns
        -------
        numpy.ndarray
            Array with one row per sample and ``arch[-1]`` columns.
        """
        rows = [self._predict_vector(x) for x in X]
        if not rows:
            return np.array([]).reshape(0, self.arch[-1])
        return np.vstack(rows)

    def __call__(self, X):
        """Shorthand for ``predict(X)``."""
        return self.predict(X)


In [2]:
# Inputs: every combination of two binary values, one pair per row
X = np.array([
    [0, 0],
    [0, 1],
    [1, 0],
    [1, 1],
])

# Labels: the xor of each input pair, in the same row order
y = np.array([0, 1, 1, 0])

In [3]:
# Seed NumPy's global RNG so that weight initialization and the order of
# the training samples are repeatable
np.random.seed(41)

# 2 inputs -> 2 hidden units -> 1 output
net = FullyConnectedNN_np([2, 2, 1])
net.fit(X, y, epochs=4000)

print("Final prediction")
for sample in X:
    prediction = net.predict_single_data(sample)
    print(sample, prediction)

Final prediction
[0 0] 0.0035321878676498617
[0 1] 0.9799502402444462
[1 0] 0.9784029892880758
[1 1] 0.020612293862843356


## A fully connected NN with PyTorch

In [4]:
import torch
from torch import nn, optim

# Convert the inputs and responses to float tensors for PyTorch
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)
# Turn the responses into a column: one row per sample
y = y.unsqueeze(1)

In [5]:
def FullyConnectedNN_pt(net_arch):
    """Build a fully connected network as an ``nn.Sequential``.

    Parameters
    ----------
    net_arch : list of int
        Number of neurons in each layer, input layer first.

    Returns
    -------
    nn.Sequential
        ``Linear`` + ``Tanh`` for every hidden layer, followed by a final
        ``Linear`` + ``Sigmoid`` output layer.
    """
    modules = []
    # Walk over consecutive (fan_in, fan_out) pairs of the hidden layers
    for fan_in, fan_out in zip(net_arch[:-2], net_arch[1:-1]):
        modules += [nn.Linear(fan_in, fan_out), nn.Tanh()]

    # Output layer: a sigmoid instead of tanh
    modules += [nn.Linear(net_arch[-2], net_arch[-1]), nn.Sigmoid()]
    return nn.Sequential(*modules)

In [6]:
from IPython import display
# Seed PyTorch's RNG before the network is built so the run is repeatable
torch.manual_seed(0)

net = FullyConnectedNN_pt([2, 2, 1])

lr = 0.1
criterion = nn.BCELoss()
optimizer = optim.SGD(net.parameters(), lr=lr)

# Every step uses all rows of X at once
for epoch in range(4000):
    # Clear the gradients left over from the previous step
    optimizer.zero_grad()

    y_pred = net(X)
    loss = criterion(y_pred, y)
    # Optional live progress output:
    #display.clear_output(wait=True)
    #print("EPOCH: {} | BCE: {}".format(epoch, loss.item()))

    loss.backward()
    optimizer.step()

print("Final prediction")
# No gradients are needed for the final predictions
with torch.no_grad():
    for sample in X:
        print(sample.numpy(), net(sample).numpy())
plt.show()

Final prediction
[0. 0.] [0.01678943]
[0. 1.] [0.99052423]
[1. 0.] [0.9905206]
[1. 1.] [0.01770484]


## Answers:
- Deep learning models are quite standardized in many respects
- PyTorch provides a flexible and optimized way to automatically compute gradients
- There is a large, active community behind it
- Some nice people have written a lot of flexible, well-optimized code for us — and I assume a significant portion of us are not elves

## Resources:
https://github.com/pytorch/examples

https://pytorch.org/tutorials/

https://pytorch.org/resources/

https://pytorch.org/docs/stable/index.html