In [None]:
import numpy as np

#1) Softmax function

The Softmax forward and backward propagation implementations from the previous lab seem to be incorrect. Therefore, in this first part, let's rewrite these functions.

In [None]:
# Helper "loss" used when exercising backward propagation.
# It one-hot encodes the position of the largest entry: 1 there, 0 everywhere else.
def loss_fn(array):
  """Return a one-hot float vector marking the largest element of `array`.

  Args:
    array: 1-D sequence or array of numbers.

  Returns:
    Float array of length `len(array)` holding 1 at the index reported by
    `np.argmax` (the first maximum when there are ties) and 0 elsewhere.
  """
  hot_index = np.argmax(array)
  # Compare every position with the winning index, then turn True/False into 1.0/0.0.
  return (np.arange(len(array)) == hot_index).astype(float)

In [None]:
def SoftMax_forward_prop(input, normalization=False):
  """Softmax forward pass: exp(x) / sum(exp(x)) over all entries of `input`.

  Args:
    input: sequence or array of scores. It is never modified.
    normalization: when True, the maximum score is subtracted from every
      entry before exponentiating. This leaves the result mathematically
      unchanged but keeps `exp` from overflowing on large scores.

  Returns:
    Array with the same shape as `input` whose entries sum to 1.
  """
  scores = np.asarray(input)
  if normalization:
    # Subtract out-of-place: an in-place `-=` would write through to the
    # caller's array whenever `input` is already an ndarray.
    scores = scores - np.max(scores)
  exponentials = np.exp(scores)
  return exponentials / np.sum(exponentials)

Jacobian is calculated according to this source:

https://aimatters.wordpress.com/2019/06/17/the-softmax-function-derivative/

In [None]:
def SoftMax_jacobian(input, normalization=False):
  """Build the Jacobian matrix of Softmax evaluated at `input`.

  With s = SoftMax_forward_prop(input, normalization), entry [i][j] is
  s_i * (1 - s_i) on the diagonal and -s_i * s_j everywhere else.

  Args:
    input: 1-D sequence or array of scores.
    normalization: forwarded unchanged to SoftMax_forward_prop.

  Returns:
    Float array of shape (len(input), len(input)).
  """
  probs = SoftMax_forward_prop(input, normalization)
  size = len(input)
  jacobian = np.zeros((size, size))
  # Start from the off-diagonal formula -s_i * s_j for every pair (i, j) ...
  jacobian[:, :] = -np.outer(probs, probs)
  # ... then overwrite the diagonal with s_i * (1 - s_i).
  diagonal = np.arange(size)
  jacobian[diagonal, diagonal] = probs * (1 - probs)
  return jacobian

In [None]:
def SoftMax_backward_prop(input, loss, normalization=False):
  """Softmax backward pass: multiply the Softmax Jacobian by `loss`.

  Args:
    input: 1-D scores that were fed to the forward pass.
    loss: values to propagate back (the notebook's tests pass dL/dz here),
      one per entry of `input`.
    normalization: forwarded unchanged to SoftMax_jacobian.

  Returns:
    The matrix product of SoftMax_jacobian(input, normalization) and `loss`.
  """
  jacobian = SoftMax_jacobian(input, normalization)
  upstream = np.array(loss)
  return np.matmul(jacobian, upstream)

#2) RelU function

Finding the Jacobian of ReLU:
$$\frac{\partial z}{\partial x}=
\begin{bmatrix}
\frac{\partial z(x_1)}{\partial x_1} & \frac{\partial z(x_1)}{\partial x_2} & \cdots & \frac{\partial z(x_1)}{\partial x_m} \\
\frac{\partial z(x_2)}{\partial x_1} & \frac{\partial z(x_2)}{\partial x_2} & \cdots & \frac{\partial z(x_2)}{\partial x_m} \\
\vdots & \vdots & \ddots & \vdots \\
\frac{\partial z(x_m)}{\partial x_1} & \frac{\partial z(x_m)}{\partial x_2} & \cdots & \frac{\partial z(x_m)}{\partial x_m}
\end{bmatrix}$$

Let's consider $\frac{\partial z(x_i)}{\partial x_j}$.
* If $i\neq j$,  $\frac{\partial z(x_i)}{\partial x_j}=0$.
* If $i=j$ and $x_i > 0$: $\frac{\partial z(x_i)}{\partial x_j} = \frac{\partial \max(0, x_i)}{\partial x_i} = \frac{\partial x_i}{\partial x_i} = 1$
* If $i=j$ and $x_i \le 0$: $\frac{\partial z(x_i)}{\partial x_j} = \frac{\partial \max(0, x_i)}{\partial x_i} = \frac{\partial (0)}{\partial x_i} = 0$ (strictly speaking, ReLU is not differentiable at $x_i = 0$; the value $0$ is used there by convention)

In [None]:
def RelU_jacobian(input):
  """Build the Jacobian matrix of ReLU evaluated at `input`.

  The matrix is diagonal: entry [i][i] is 1 where input[i] > 0 and 0
  otherwise; every off-diagonal entry is 0.

  Args:
    input: 1-D sequence or array of numbers.

  Returns:
    Float array of shape (len(input), len(input)).
  """
  is_active = np.asarray(input) > 0
  # Turn the boolean mask into 1.0/0.0 and place it on the diagonal.
  return np.diag(is_active.astype(float))

In [None]:
def RelU_forward_prop(input):
  """ReLU forward pass: keep positive entries, replace everything else with 0.

  Args:
    input: sequence or array of numbers.

  Returns:
    Array with the same dtype as `np.asarray(input)`. Entries that are not
    strictly greater than 0 (including NaN) become 0.
  """
  values = np.asarray(input)
  # Take the zeros from an array of the same dtype so the result dtype follows
  # the input's dtype and never depends on which entries happen to be positive.
  return np.where(values > 0, values, np.zeros_like(values))

In [None]:
def RelU_backward_prop(input, loss):
  """ReLU backward pass: multiply the ReLU Jacobian of `input` by `loss`.

  Args:
    input: 1-D values that were fed to the forward pass.
    loss: values to propagate back (the notebook's tests pass dL/dz here),
      one per entry of `input`.

  Returns:
    The matrix product of RelU_jacobian(input) and `loss`.
  """
  jacobian = RelU_jacobian(input)
  upstream = np.array(loss)
  return np.matmul(jacobian, upstream)

#3) Matrix multiplication

In [None]:
def MatMul_forward_prop(matrix, input):
  """Matrix-multiplication forward pass: return `matrix @ input`.

  Args:
    matrix: left operand (array-like).
    input: right operand (array-like) whose leading dimension matches the
      number of columns of `matrix`.

  Returns:
    The matrix product as a numpy array.
  """
  left = np.asarray(matrix)
  right = np.asarray(input)
  return np.matmul(left, right)

In [None]:
def MatMul_backward_prop(matrix, loss):
  """Matrix-multiplication backward pass: return `matrix.T @ loss`.

  Args:
    matrix: the matrix used as the left operand in the forward pass.
    loss: values to propagate back; its leading dimension must match the
      number of rows of `matrix`.

  Returns:
    The product of the transposed `matrix` with `loss`, as a numpy array.
  """
  transposed = np.transpose(np.asarray(matrix))
  upstream = np.asarray(loss)
  return np.matmul(transposed, upstream)

#Let's test these functions

##Forward propagation

In [None]:
import torch
import torch.nn as nn
a = [1.0, 9.0, 3.0]  # input vector for the hand-written functions
# Two separate gradient-tracking tensors built from the same values:
# a_t1 is the variable for torch Softmax, a_t2 the variable for torch ReLU.
a_t1, a_t2 = (torch.tensor(a, requires_grad=True) for _ in range(2))

In [None]:
# Hand-written Softmax forward pass on `a` (normalization left at its default, False).
# NOTE: the name `f` is reused by the later ReLU and matrix-multiplication forward tests.
f = SoftMax_forward_prop(a) # applying softmax forward propagation

In [None]:
# Show the hand-written Softmax result next to PyTorch's for a visual comparison.
print('Mine softmax forward output:', f)
print('Torch softmax forward output:', nn.functional.softmax(a_t1, dim=0).tolist())

Mine softmax forward output: [0.1141952  0.04201007 0.84379473]
Torch softmax forward output: [0.11419519037008286, 0.04201006516814232, 0.8437947034835815]


In [None]:
# Hand-written ReLU forward pass on `a`; overwrites the Softmax result previously held in `f`.
f = RelU_forward_prop(a) # applying relu forward propagation

In [None]:
# comparing results
# `f` holds the ReLU forward output at this point, so the labels name ReLU.
print('Mine RelU forward output:', f)
print('Torch RelU forward output:', nn.functional.relu(a_t2).tolist())

Mine softmax forward output: [1. 0. 3.]
Torch softmax forward output: [1.0, 0.0, 3.0]


In [None]:
# Matrix W for matrix multiplication. It is drawn from a seeded generator so
# that every Restart & Run All produces the same numbers.
W = np.random.default_rng(0).random((4, 3))
X = [[1, 2], # initialize matrix X for matrix multiplication
     [2, -3],
     [-7, 9]]

W_t = torch.tensor(W, dtype=torch.float64) # converting W to tensor
# X_t tracks gradients because the backward test differentiates with respect to X.
X_t = torch.tensor(X, dtype=torch.float64, requires_grad=True) # converting X

In [None]:
# Hand-written forward pass computing W @ X; overwrites the ReLU result previously held in `f`.
f = MatMul_forward_prop(W, X) # applying matrix multiplication forward propagation

In [None]:
# comparing results
# `f` holds the matrix-multiplication output at this point, so the labels name that operation.
print('Mine matrix multiplication forward output:\n', f)
print('Torch matrix multiplication forward output:\n', torch.matmul(W_t, X_t).detach().numpy())

Mine softmax forward output:
 [[-4.45251977  6.65426244]
 [-0.70628076  3.28919234]
 [ 0.3575458   1.74498934]
 [ 0.28000093 -0.58458918]]
Torch softmax forward output:
 [[-4.45251977  6.65426244]
 [-0.70628076  3.28919234]
 [ 0.3575458   1.74498934]
 [ 0.28000093 -0.58458918]]


##Backward propagation

In [None]:
loss = [1, -4, 9] # specify dL/dz
# Build the tensor in torch's default floating dtype, the same dtype torch picks
# for tensors created from Python floats, instead of passing an integer tensor
# as a gradient.
loss_t = torch.tensor(loss, dtype=torch.get_default_dtype())
# dL/dz for matrix multiplication, drawn from a seeded generator so the
# comparison is reproducible across runs.
LOSS = torch.tensor(np.random.default_rng(1).random((4, 2)))

In [None]:
b = SoftMax_backward_prop(a, loss) # applying SoftMax back propagation using my function
# backward() accumulates into .grad, so clear it first: re-running this cell
# then shows the same gradient instead of a multiple of it.
a_t1.grad = None
# backward() returns None; its result is read from a_t1.grad below.
nn.functional.softmax(a_t1, dim=0).backward(loss_t) # applying SoftMax back propagation using pytorch
# comparing results
print('Mine softmax backward output:', b)
print('Torch softmax backward output:', a_t1.grad.tolist())

Mine softmax backward output: [-0.74687172 -0.48480908  1.23168081]
Torch softmax backward output: [-0.7468716502189636, -0.48480910062789917, 1.2316807508468628]


In [None]:
b = RelU_backward_prop(a, loss) # applying RelU back propagation using my function
# backward() accumulates into .grad, so clear it first: re-running this cell
# then shows the same gradient instead of a multiple of it.
a_t2.grad = None
# backward() returns None; its result is read from a_t2.grad below.
nn.functional.relu(a_t2).backward(loss_t) # applying RelU back propagation using pytorch
# comparing results
print('Mine RelU backward output:', b)
print('Torch RelU backward output:', a_t2.grad.tolist())

Mine RelU backward output: [ 1. -4.  9.]
Torch RelU backward output: [1.0, -4.0, 9.0]


In [None]:
# Hand-written backward pass. Arguments follow the function's (matrix, loss)
# signature: W is the matrix and LOSS is dL/dz, so the result W.T @ LOSS has
# the same shape as X.
b = MatMul_backward_prop(W, LOSS.numpy()) # applying matrix multiplication back propagation using my function
# PyTorch reference: differentiate the product W @ X, the same product the
# forward test computed, with LOSS as dL/dz. backward() accumulates into .grad,
# so clear it first to keep this cell re-runnable.
X_t.grad = None
torch.matmul(W_t, X_t).backward(LOSS) # applying matrix multiplication back propagation using pytorch

# comparing results
print('Mine matrix multiplication backward output:\n', b)
print('Torch matrix multiplication backward output:\n', X_t.grad.numpy())

Mine matrix multiplication backward output:
 [[0.81823771 1.59993299 0.73531026]
 [0.81280535 0.99103996 0.53722786]]
Torch matrix multiplication backward output:
 [[0.81823771 1.59993299 0.73531026]
 [0.81280535 0.99103996 0.53722786]]
