# APS1070 Week 11 Lecture Code
## Part 1a - 2-layer neural network

In this example we will implement a 2-layer neural network from scratch. The network uses a squared-error loss for a multiclass classification problem; ideally, we would use a softmax (cross-entropy) loss function instead.

In [None]:
# Load "Iris_3class.csv" into the Google Colab runtime.
# NOTE(review): google.colab is presumably only importable inside Colab; when
# running elsewhere, place the CSV next to the notebook and skip this cell.
from google.colab import files
# The return value is kept in `uploaded`, but the later cells shown here read
# the CSV by file name instead of using it.
uploaded = files.upload()

Saving Iris_3class.csv to Iris_3class.csv


In [None]:
import pandas as pd

# header=None: the first line of the file is data, not column names.
raw_data = pd.read_csv("Iris_3class.csv", header=None)

# Show (rows, columns) as the cell output.
raw_data.shape

(150, 5)

In [None]:
import numpy as np

# Work on a plain ndarray. np.asarray accepts both the DataFrame loaded by
# read_csv and an ndarray, so re-running this cell no longer fails the way
# `raw_data.values` did once raw_data had already been rebound to an array.
raw_data = np.asarray(raw_data)

# The first four columns are the features; the fifth column is the class label.
X_train = raw_data[:, :4]
# Slicing with 4:5 (rather than indexing with 4) keeps the labels as a column.
y_train = raw_data[:, 4:5].astype(int)
print(X_train.shape, y_train.shape)
print(X_train.dtype, y_train.dtype)

(150, 4) (150, 1)
float64 int64


convert labels (ground truths) into one-hot vectors

In [None]:
#Convert array to one-hot encoding
def to_one_hot(Y):
    """Encode integer class labels as one-hot rows.

    Args:
        Y: Integer class labels, either flat (n,) or as a column (n, 1).
            A row holding several labels gets a 1 in each listed column.

    Returns:
        A float array of shape (n, max(Y) + 1) with 1. in the column of each
        row's label and 0. everywhere else. An input without any labels
        yields an array with zero columns instead of raising.
    """
    labels = np.asarray(Y)
    n_rows = len(labels)
    if labels.size == 0:
        # np.amax cannot reduce an empty array, so answer directly.
        return np.zeros((n_rows, 0))

    n_col = np.amax(labels) + 1
    binarized = np.zeros((n_rows, n_col))

    # A single indexed assignment replaces the per-row Python loop: each row
    # index is broadcast against every label stored in that row.
    columns = labels.reshape(n_rows, -1)
    binarized[np.arange(n_rows)[:, None], columns] = 1.
    return binarized

In [None]:
# Replace the integer label column with its one-hot encoding.
# NOTE(review): y_train is rebound in place, so re-running this cell would pass
# the already-encoded float array back into to_one_hot; re-run the cell that
# builds y_train from raw_data first.
y_train = to_one_hot(y_train)
# Report the shapes and dtypes after encoding.
print(X_train.shape, y_train.shape)
print(X_train.dtype, y_train.dtype)

(150, 4) (150, 3)
float64 float64


In [None]:
#sanity check: show the first five one-hot rows
y_train[:5]

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

At its core a 2-layer neural network is just a few lines of code. 

Note that we have excluded the bias terms to keep things simple.

In [None]:
#define sigmoid
def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)), applied element-wise to arrays."""
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)

def ann(W, X_train, y_train):
  """Forward pass of the bias-free 4-5-3 network.

  W is the flat weight vector: its first 20 entries form the 4x5
  input-to-hidden matrix and the rest form the 5x3 hidden-to-output matrix.
  y_train is accepted but not used by this forward-only version.
  Returns the output-layer activations (the predictions).
  """
  #unpack the flat weight vector into the two layer matrices
  hidden_weights = W[:20].reshape((4, 5))
  output_weights = W[20:].reshape((5, 3))

  #each layer is a matrix product followed by the sigmoid
  hidden = sigmoid(np.dot(X_train, hidden_weights))
  return sigmoid(np.dot(hidden, output_weights))

Most of the complexity is introduced in order to train the network.

In [None]:
#define sigmoid
def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)), applied element-wise to arrays."""
    return 1/(1+np.exp(-x))

def ann(W, X_train, y_train, layer_sizes=(4, 5, 3)):
  """Run one forward and one backward pass of a bias-free 2-layer sigmoid network.

  Args:
    W: Flat weight vector holding the input-to-hidden matrix followed by the
      hidden-to-output matrix, each flattened row by row.
    X_train: Inputs, one sample per row, with layer_sizes[0] columns.
    y_train: Targets, one sample per row, with layer_sizes[2] columns.
    layer_sizes: (inputs, hidden units, outputs). The default is the 4-5-3
      layout that used to be hard-coded, so existing calls are unaffected.

  Returns:
    (error, dW, layer2): the summed squared error 0.5*sum((layer2-y)^2), its
    gradient with respect to W in the same flat layout as W, and the
    predictions of the output layer.

  Raises:
    ValueError: if W does not hold exactly the number of weights that
      layer_sizes calls for.
  """
  n_in, n_hidden, n_out = layer_sizes
  W = np.asarray(W)
  X_train = np.asarray(X_train)
  y_train = np.asarray(y_train)

  #Weights
  split = n_in*n_hidden
  expected = split + n_hidden*n_out
  if W.size != expected:
    raise ValueError(
        "W holds {} weights but layer_sizes={} needs {}".format(
            W.size, tuple(layer_sizes), expected))
  w0 = W[:split].reshape(n_in, n_hidden)
  w1 = W[split:].reshape(n_hidden, n_out)

  #Feed forward
  layer0 = X_train
  layer1 = sigmoid(np.dot(layer0, w0))
  layer2 = sigmoid(np.dot(layer1, w1)) #predictions

  #Back propagation (only the gradient is computed here; the gradient-descent
  #update of W is left to the caller)
  residual = layer2 - y_train                 #dL/d(layer2)
  #error signal at the output pre-activations: residual * sigmoid'
  delta2 = residual*(layer2*(1-layer2))
  #pushed back through w1, then through the hidden sigmoid
  delta1 = delta2.dot(w1.T)*(layer1*(1-layer1))

  #gradients
  dw1 = layer1.T.dot(delta2)
  dw0 = layer0.T.dot(delta1)

  #combine gradients in the same order the weights are packed in W
  dW = np.concatenate((dw0.ravel(), dw1.ravel()))

  #squared error
  error = 0.5*np.sum(residual**2)

  return (error, dW, layer2)

Before training the network, let us verify the gradients were calculated correctly

In [None]:
#initialize weights
w0 = 2*np.random.random((4, 5)) - 1
w1 = 2*np.random.random((5, 3)) - 1

#combine weights
W = np.concatenate((w0.ravel(), w1.ravel()))

#compute gradients analytically
(error, dW, y_hat) = ann(W, X_train, y_train)

#compute gradients numerically with central differences
eps = 0.000001  #perturbation applied to one weight at a time
dW_num = np.zeros(len(W))

for ind in range(len(W)):
  #start both perturbed vectors from the unmodified weights
  We1 = W.copy()
  We2 = W.copy()

  #nudge a single weight up and down
  We1[ind] = We1[ind] + eps
  We2[ind] = We2[ind] - eps

  #compute errors; only the error term is taken, so y_hat from the
  #analytic pass is not overwritten by the perturbed predictions
  error_e1 = ann(We1, X_train, y_train)[0]
  error_e2 = ann(We2, X_train, y_train)[0]

  #calculate each gradient and store it (dW_num was previously never filled)
  grad_num = (error_e1-error_e2)/(2*eps)
  dW_num[ind] = grad_num

  #display difference
  print(round(abs(grad_num-dW[ind]),4), grad_num, dW[ind])

0.0 -0.39696112708043074 -0.3969611315255835
0.0 9.403257656970254 9.40325765691772
0.0 12.15712492808052 12.157124930724983
0.0 0.6168242379089861 0.6168242404558764
0.0 -4.446974493532707 -4.446974497290471
0.0 -0.14841754847338962 -0.148417551415719
0.0 6.121654791968467 6.121654791233746
0.0 8.225682563534065 8.225682552333968
0.0 0.26866472779829564 0.26866472405056696
0.0 -3.1055338425289847 -3.105533845770081
0.0 -0.39701716758600014 -0.39701717160108363
0.0 3.2271908096959123 3.2271908104776563
0.0 3.6861417029854238 3.6861417067057656
0.0 0.5398442866066944 0.5398442873110401
0.0 -1.0426430065990644 -1.0426430110362779
0.0 -0.14719170593480158 -0.1471917110667333
0.0 0.6313751086395314 0.6313751035999894
0.0 0.650397424806215 0.6503974211739173
0.0 0.19697923647754578 0.19697924224597488
0.0 -0.08685636032623734 -0.08685635914200414
0.0 0.10910099490502034 0.10910098363690418
0.0 0.12217915923429246 0.12217915657541749
0.0 0.15286622101484681 0.15286622511636389
0.0 10.7431500

Train neural network

In [None]:
#start from fresh random weights, uniform in [-1, 1)
w0 = 2*np.random.random((4, 5)) - 1
w1 = 2*np.random.random((5, 3)) - 1

#pack both matrices into the single flat vector ann() expects
W = np.concatenate((w0.flatten(), w1.flatten()))

#gradient descent over the full training set
n = 0.001 #learning rate
errors = []
for i in range(100000):
  error, dW, y_hat = ann(W, X_train, y_train)
  errors.append(error)
  #step against the gradient
  W -= n * dW

In [None]:
#predictions on the training data, rounded to one decimal place
y_hat.round(1)

array([[1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. 

The above is just a demonstration of building a neural network from scratch. To properly assess this network you should create a validation data set.

## Part 1b - PyTorch

The following are examples of a 1-layer and a 2-layer neural network using PyTorch.

PyTorch - 1-layer neural network

In [None]:
import torch

# Toy data: a single 5-feature input and a 3-element target
x = torch.ones(5)
y = torch.zeros(3)

# Parameters of one linear layer, tracked by autograd
w = torch.randn(5, 3, requires_grad=True)
b = torch.randn(3, requires_grad=True)

# Linear layer output (logits)
z = x @ w + b

# Each of the 3 outputs is scored as an independent binary target
loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)

obtain gradients for 1-layer neural network

In [None]:
# Backpropagate once, then show the gradients stored on the weight and bias
loss.backward()
print(w.grad, b.grad, sep="\n")

tensor([[0.2804, 0.3105, 0.1160],
        [0.2804, 0.3105, 0.1160],
        [0.2804, 0.3105, 0.1160],
        [0.2804, 0.3105, 0.1160],
        [0.2804, 0.3105, 0.1160]])
tensor([0.2804, 0.3105, 0.1160])


PyTorch - 2-layer neural network

In [None]:
#2-layer neural network
import torch

num_hidden = 3

# Toy data: a single 5-feature input and a 3-element target
x = torch.ones(5)
y = torch.zeros(3)

# Layer 1: linear map into the hidden units, followed by a sigmoid
w = torch.randn(5, num_hidden, requires_grad=True)
b = torch.randn(num_hidden, requires_grad=True)
z = torch.sigmoid(x @ w + b)

# Layer 2: linear map from the hidden units to 3 output logits
w2 = torch.randn(num_hidden, 3, requires_grad=True)
b2 = torch.randn(3, requires_grad=True)
z2 = z @ w2 + b2

# Each of the 3 outputs is scored as an independent binary target
loss = torch.nn.functional.binary_cross_entropy_with_logits(z2, y)


obtain gradients for 2-layer neural network

In [None]:
# Backpropagate once, then show the gradient of every parameter, layer by layer
loss.backward()
print(w.grad, b.grad, w2.grad, b2.grad, sep="\n")

tensor([[0.1788, 0.0107, 0.0060],
        [0.1788, 0.0107, 0.0060],
        [0.1788, 0.0107, 0.0060],
        [0.1788, 0.0107, 0.0060],
        [0.1788, 0.0107, 0.0060]])
tensor([0.1788, 0.0107, 0.0060])
tensor([[0.2061, 0.2360, 0.0221],
        [0.1278, 0.1463, 0.0137],
        [0.0088, 0.0101, 0.0009]])
tensor([0.2729, 0.3125, 0.0293])


PyTorch computational graphs make gradient calculations straightforward. Much of this is hidden away, allowing you to focus more on developing and testing your model architectures.

## Part 1c - PyTorch Full Example

see file "PyTorch_Example_MNIST" for a full example using PyTorch.