##### **Importing Libraries.**

In [1]:
import numpy as np 

##### **Loading Parameters.**

In [2]:
with np.load('parameters.npz') as params:
    W1, W2, W3 = params['W1'], params['W2'], params['W3']
    b1, b2, b3 = params['b1'], params['b2'], params['b3']

In [3]:
X = np.array([1, 0, 1]).reshape(-1, 1)
y = np.array([0, 0, 1]).reshape(-1, 1)

##### **Defining Functions.**

In [4]:
# Sigmoid function 

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [5]:
# Softmax function.

def softmax(x):
    y = np.exp(x)
    return y / np.sum(y)

##### Q1 : **How many (learnable) parameters are there in the network?**

In [6]:
sum([x.size for x in [W1, W2, W3, b1, b2, b3]])

36

##### Q2 : **What is the sum of the elments of output a<sub>1</sub>? (Choose the nearest option to your answer)**

In [7]:
a1 = (W1 @ X) + b1
print(a1)

[[1.5350184 ]
 [1.98250233]
 [1.93014489]]


In [8]:
a1.sum()

5.44766562448759

##### Q3 : **What is the sum of the elments of output h<sub>1</sub>? (Choose the nearest option to your answer)**

In [9]:
h1 = sigmoid(a1)
print(h1)

[[0.82273938]
 [0.87894766]
 [0.87326546]]


In [10]:
h1.sum()

2.5749524957231924

##### Q4 : **The sum of the elements of [a<sub>2</sub>, h<sub>2</sub>, a<sub>3</sub>]. What is the loss value?**

Now, calculate the remaining entities i.e. a<sub>2</sub>, h<sub>2</sub> & a<sub>3</sub> & also y_hat.

In [11]:
a2 = W2 @ h1 + b2
h2 = sigmoid(a2)
a3 = W3 @ h2 + b3

y_hat = softmax(a3)

In [12]:
[a2.sum(), h2.sum(), a3.sum()]

[6.460166773282592, 2.63139309587371, 4.874920988265703]

In [13]:
y_hat

array([[0.23691422],
       [0.33838847],
       [0.42469732]])

In [14]:
loss_value = -1 * np.sum(y * np.log(y_hat))
print(loss_value)

0.8563785622753882


#### **Backpropagation Formulas for reference** : 

1. ∇<sub>a<sub>L</sub></sub>L(θ) = - (e(y) - ŷ)

2. ∇<sub>W<sub>k</sub></sub>L(θ) = ∇<sub>a<sub>k</sub></sub>L(θ) . h<sub>k-1</sub><sup>T</sup>

3. ∇<sub>b<sub>k</sub></sub>L(θ) = ∇<sub>a<sub>k</sub></sub>L(θ)

4. ∇<sub>h<sub>k-1</sub></sub>L(θ) = W<sub>k</sub><sup>T</sup> . ∇<sub>a<sub>k</sub></sub>L(θ)

5. ∇<sub>a<sub>k-1</sub></sub>L(θ) = ∇<sub>h<sub>k-1</sub></sub>L(θ) ⊙ [..., g'(a<sub>k-1 ,j</sub>), ...]

##### Q5 : **Choose the vector that corresponds to : ∇<sub>a<sub>3</sub></sub>L(θ).**

this is aka "gradient of loss function wrt a<sub>3</sub>".

In [15]:
gradient_a3 = -(y - y_hat)
print(gradient_a3)

[[ 0.23691422]
 [ 0.33838847]
 [-0.57530268]]


##### Q6 : **We know that after computing gradients, we update the values of b<sub>2</sub> by subtracting its gradient, as shown below :**

b<sub>2</sub> - η∇<sub>b<sub>2</sub></sub>L(θ). 

**Which of the following is the gradient vector of : b<sub>2</sub> ??**


Perform Backpropagation.

In [16]:
gradient_w3 = gradient_a3 @ h2.T
print(gradient_w3)

[[ 0.21202113  0.18529411  0.2260992 ]
 [ 0.30283325  0.26465862  0.3229412 ]
 [-0.51485438 -0.44995274 -0.5490404 ]]


In [17]:
gradient_b3 = gradient_w3
print(gradient_b3)

[[ 0.21202113  0.18529411  0.2260992 ]
 [ 0.30283325  0.26465862  0.3229412 ]
 [-0.51485438 -0.44995274 -0.5490404 ]]


In [18]:
gradient_h2 = W3.T @ gradient_a3
print(gradient_h2)

[[ 0.1954864 ]
 [-0.11722488]
 [-0.08814526]]


In [19]:
gradient_a2 = gradient_b2 = gradient_h2 * sigmoid(a2) * (1 - sigmoid(a2))

print(gradient_a2)

[[ 0.01838198]
 [-0.01997644]
 [-0.0038401 ]]


##### Q7 : **Update all the parameters with the calculated gradients. Forward propagate the input through the network. What is the new loss value ?** (Take η=1)

Continue calculating / updating parameters.

In [20]:
gradient_w2 = gradient_a2 @ h1.T
print(gradient_w2)

[[ 0.01512358  0.0161568   0.01605235]
 [-0.0164354  -0.01755824 -0.01744473]
 [-0.0031594  -0.00337525 -0.00335343]]


In [21]:
gradient_h1 = W2.T @ gradient_a2
print(gradient_h1)

[[ 0.00571305]
 [ 0.01326947]
 [-0.01908499]]


In [22]:
gradient_a1 = gradient_b1 = gradient_h1 * (sigmoid(a1) * (1 - sigmoid(a1)))

print(gradient_a1)

[[ 0.00083319]
 [ 0.00141185]
 [-0.00211219]]


In [23]:
gradient_w1 = gradient_a1 @ X.T
print(gradient_w1)

[[ 0.00083319  0.          0.00083319]
 [ 0.00141185  0.          0.00141185]
 [-0.00211219  0.         -0.00211219]]


Now, update all the parameters with the calculated gradients. 

Also, eta = 1.

In [24]:
eta = 1

In [25]:
w1_new = W1 - eta * gradient_w1
b1_new = b1 - eta * gradient_b1

In [26]:
print(f"W1 :\n {W1}\n")
print(f"W1_new : \n {w1_new}\n")

W1 :
 [[0.5488135  0.71518937 0.60276338]
 [0.54488318 0.4236548  0.64589411]
 [0.43758721 0.891773   0.96366276]]

W1_new : 
 [[0.54798032 0.71518937 0.60193019]
 [0.54347133 0.4236548  0.64448226]
 [0.4396994  0.891773   0.96577495]]



In [27]:
w2_new = W2 - eta * gradient_w2
b2_new = b2 - eta * gradient_b2

In [28]:
w3_new = W3 - eta * gradient_w3
b3_new = b3 - eta * gradient_b3

Now, forward propagating the input through the network

In [29]:
a1 = w1_new @ X + b1_new
h1 = sigmoid(a1)

a2 = w2_new @ h1 + b2_new
h2 = sigmoid(a2)

a3 = w3_new @ h2 + b3_new
y_hat = softmax(a3)

In [30]:
y_hat = np.sum(y_hat, axis=1)
print(y_hat)

[0.03807969 0.03906752 0.92285279]


New loss value.

In [31]:
loss_value_new = -1 * np.sum(y * np.log(y_hat))
print(loss_value_new)

6.590823679375906
