# Backpropagation
Backpropagation (backprop, BP) is a widely used algorithm for training feedforward neural networks and is at the center of most Deep Learning advances. During neural network training, backpropagation computes the gradient of the loss function with respect to the weights of the network for a single input through the chain rule. The name backpropagation strictly refers only to computing the gradient, not to how the gradient is used.

To learn the underlying concepts of Backpropagation, an image classification exercise is proposed using the well-known [MNIST database](http://yann.lecun.com/exdb/mnist/) and an artificial neural network as the classifier.

In [None]:
import numpy as np
from keras.datasets import mnist
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

#Loading the MNIST dataset

In [None]:
# Fetch the train and test splits, then flatten every sample into a
# 784-element row so each array is (num_samples, 784).
train_split, test_split = mnist.load_data()
x_train, y_train = train_split
x_test, y_test = test_split
x_train = x_train.reshape(-1, 784)
x_test = x_test.reshape(-1, 784)


#Dataset pre-processing

In [None]:

num_classes = 10

# Dataset normalization: cast to float32 and divide by 255
# (presumably maps 8-bit pixel intensities into [0, 1] -- confirm against the loader).
x_train = x_train.astype('float32') / 255.
x_test = x_test.astype('float32') / 255.

# One-hot encoding: row i holds a single 1 in the column given by label i.
# Integer-array indexing sets all rows in one assignment instead of a
# per-sample Python loop; the resulting arrays are identical.
y_train_h = np.zeros((np.size(y_train, 0), num_classes))
y_train_h[np.arange(np.size(y_train, 0)), y_train] = 1

y_test_h = np.zeros((np.size(y_test, 0), num_classes))
y_test_h[np.arange(np.size(y_test, 0)), y_test] = 1


#Parameters initialization functions

In [None]:

def Weights_Init(size_1, size_2):
  """Return a randomly initialised weight matrix of shape (size_2, size_1).

  Entries are drawn from a uniform distribution on [-r, r) with
  r = 4 * sqrt(6 / (size_1 + size_2)), using NumPy's global random state.

  size_1: number of columns of the result (used as the layer's input width
          by the callers in this file).
  size_2: number of rows of the result (used as the layer's output width).
  """
  fan_sum = size_1 + size_2
  limit = 4 * np.sqrt(6. / fan_sum)
  return np.random.uniform(low=-limit, high=limit, size=(size_2, size_1))


def Bias_Init(size):
  """Return an all-zero bias column vector of shape (size, 1)."""
  return np.zeros((size, 1))

#Network hyper-parameters configuration and trainable parameters initialization

In [None]:
# Layer widths: one input per flattened feature, a single hidden layer,
# and one output unit per class.
input_size = 784
hidden_size = 256
output_size = num_classes

# Weights are drawn in the order W1, W2 so the sequence of random draws
# matches the layer order; biases start at zero.
W1 = Weights_Init(input_size, hidden_size)
W2 = Weights_Init(hidden_size, output_size)
b1, b2 = Bias_Init(hidden_size), Bias_Init(output_size)

# Sanity check of the first layer's parameter shapes.
print(W1.shape)
print(b1.shape)

(256, 784)
(256, 1)


#Forward propagation function

In [None]:

def ForwardPropagation(X, W1, W2, b1, b2):
  """Run a batch through the two-layer sigmoid network.

  X is given with one sample per row and is transposed internally, so all
  returned activations hold one sample per column. Biases are handled by
  appending a row of ones to each layer's input and the bias column to
  each weight matrix.

  Returns (a3, a2, a1):
    a3: output-layer sigmoid activations.
    a2: hidden-layer sigmoid activations with a trailing row of ones.
    a1: transposed input with a trailing row of ones.
  """
  inputs = np.transpose(X)
  bias_row = np.ones((1, inputs.shape[1]))

  # Fold each bias vector into its weight matrix as an extra last column.
  hidden_params = np.hstack((W1, b1))
  output_params = np.hstack((W2, b2))

  a1 = np.vstack((inputs, bias_row))

  # Logistic sigmoid written as 1 / (1 + exp(-z)), with -z formed as (-W) @ a.
  hidden_act = 1 / (1 + np.exp((-hidden_params) @ a1))
  a2 = np.vstack((hidden_act, bias_row))

  a3 = 1 / (1 + np.exp((-output_params) @ a2))
  return a3, a2, a1

# Loss function

In [None]:
def CrossEntropy(y, y_pred, eps=1e-12):
  """Mean per-sample binary cross-entropy summed over the output units.

  y:      targets, one sample per row (samples x classes).
  y_pred: predicted probabilities, one sample per column
          (classes x samples); transposed here to line up with y.
  eps:    predictions are clipped to [eps, 1 - eps] before taking logs.

  Returns the average over samples of
  sum_k -(y_k * log(p_k) + (1 - y_k) * log(1 - p_k)).
  """
  y = np.asarray(y)
  # float64 so that 1 - eps stays distinguishable from 1 after clipping.
  y_pred = np.transpose(np.asarray(y_pred, dtype=np.float64))
  # A saturated sigmoid can output exactly 0 or 1; without clipping the
  # product 0 * log(0) evaluates to nan and poisons the whole loss.
  y_pred = np.clip(y_pred, eps, 1 - eps)
  per_sample = np.sum(-(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred)), 1)
  return np.mean(per_sample)
  

#Accuracy score function

In [None]:
def ComputeAccuracy(y_h, y_pred):
  """Score predictions against one-hot targets with accuracy_score.

  y_h:    one-hot targets, one sample per row.
  y_pred: network outputs, one sample per column; transposed here so that
          the arg-max over axis 1 picks the predicted class per sample.
  """
  predicted_labels = np.transpose(y_pred).argmax(axis=1)
  true_labels = np.argmax(y_h, axis=1)
  return accuracy_score(true_labels, predicted_labels)
  

# BackPropagation 

In [None]:
def ComputeGrads(x_train_b, y_train_b, W1, b1, W2, b2):
  """Back-propagate one mini-batch and return the parameter gradients.

  x_train_b: batch inputs, one sample per row.
  y_train_b: one-hot batch targets, one sample per row.
  W1, b1, W2, b2: current layer weights and bias column vectors.

  Returns (dWc1, dWc2). Each gradient has the layout of the weight matrix
  with the bias appended as its last column: dWc[:, :-1] is the weight
  gradient and dWc[:, -1] is the bias gradient.
  """
  N = np.size(x_train_b, 0)

  y_pred, a2, a1 = ForwardPropagation(x_train_b, W1, W2, b1, b2)

  # Output-layer error, averaged over the batch.
  delta3 = (y_pred - y_train_b.T) / N

  # Propagate through W2 only: the last row of a2 is the constant bias
  # input, which has no upstream error, so it is dropped before the
  # product instead of being computed and discarded afterwards.
  hidden = a2[:-1, :]
  delta2 = (W2.T @ delta3) * (hidden * (1 - hidden))

  # Each layer's inputs already carry a trailing row of ones, so these
  # products yield the weight and bias gradients together.
  dWc2 = delta3 @ a2.T
  dWc1 = delta2 @ a1.T

  return dWc1, dWc2
  
  

#Training loop

In [None]:
num_epochs = 50
batch_size = 100
learning_rate = 0.001

# Only whole batches are processed; any samples left over after the integer
# division are skipped.
num_of_batchs_tr = np.size(x_train , 0)//batch_size
num_of_batchs_vl = np.size(x_test  , 0)//batch_size
for e in range(num_epochs):
  # Inputs and one-hot targets are shuffled together so rows stay aligned.
  # NOTE(review): the same random_state is passed every epoch -- confirm this
  # is the intended shuffling behaviour.
  x_train, y_train_h = shuffle(x_train, y_train_h, random_state = 0)

  # Training loss/accuracy, averaged over batches. These are measured with the
  # parameters as they stand BEFORE this epoch's updates.
  loss_b = 0
  acc_b = 0
  for b in range(num_of_batchs_tr):
    rows = slice(b*batch_size, (b + 1)*batch_size)
    x_train_b = x_train[rows, :]
    y_train_b = y_train_h[rows, :]
    y_pred, _, _ = ForwardPropagation(x_train_b, W1, W2, b1, b2)
    loss_b += CrossEntropy(y_train_b , y_pred)
    acc_b += ComputeAccuracy(y_train_b , y_pred)

  loss_tr = loss_b/num_of_batchs_tr
  acc_tr = acc_b/num_of_batchs_tr

  # Validation loss/accuracy on the test split, averaged over batches.
  loss_b = 0
  acc_b = 0
  for b in range(num_of_batchs_vl):
    rows = slice(b*batch_size, (b + 1)*batch_size)
    x_test_b = x_test[rows, :]
    y_test_b = y_test_h[rows, :]
    y_pred, _, _ = ForwardPropagation(x_test_b, W1, W2, b1, b2)
    loss_b += CrossEntropy(y_test_b , y_pred)
    acc_b += ComputeAccuracy(y_test_b , y_pred)

  loss_vl = loss_b/num_of_batchs_vl
  acc_vl = acc_b/num_of_batchs_vl
  print("Epoch: [%2d/%2d] Train loss: %.8f, Train accuracy: %.8f, Validation loss: %.8f, Validation accuracy: %.8f" % (e+1, num_epochs, loss_tr, acc_tr*100, loss_vl, acc_vl*100))

  # Performing backprop: one gradient-descent step per mini-batch.
  for b in range(num_of_batchs_tr):
    rows = slice(b*batch_size, (b + 1)*batch_size)
    x_train_b = x_train[rows, :]
    y_train_b = y_train_h[rows, :]
    dWc1, dWc2 = ComputeGrads(x_train_b, y_train_b, W1, b1, W2, b2)

    # Update the parameters. The last column of each gradient belongs to the
    # bias; slicing with [:, -1:] keeps it as a column vector for any layer
    # size, instead of reshaping to hard-coded (256, 1) / (10, 1).
    W1 = W1 - learning_rate*dWc1[:, :-1]
    W2 = W2 - learning_rate*dWc2[:, :-1]
    b1 = b1 - learning_rate*dWc1[:, -1:]
    b2 = b2 - learning_rate*dWc2[:, -1:]