# Conv2d and ConvTranspose2d

## Import necessary libraries

In [None]:
import torch
from torch import nn
import numpy as np
import random

from sklearn.metrics import mean_squared_error

## 1. Conv2d

### Forward Operation

In [None]:
def apply_filter(a_slice_prev, W, b):
    """
    Apply a single filter to a single window of the input.

    Arguments:
    a_slice_prev -- window cut out of the input, shape (n_C_prev, f, f)
    W -- weights of one filter, same shape as the window (n_C_prev, f, f)
    b -- bias of that filter, shape (1, 1, 1)

    Returns:
    Z -- the sum of the element-wise product of the window and W, plus the
         bias with its size-1 dimensions squeezed away
    """

    # Weight every entry of the window and collapse the volume to one number
    weighted_sum = (a_slice_prev * W).sum()

    # Drop the singleton dimensions of the bias before adding it
    return weighted_sum + b.squeeze()


def conv_forward(A_prev, W, b):
    """
    Forward pass of a 2D convolution with stride 1 and no padding
    (the default parameters of nn.Conv2d).

    Arguments:
    A_prev -- output activations of the previous layer,
        torch tensor of shape (m, n_C_prev, n_H_prev, n_W_prev)
    W -- Weights, torch tensor of shape (n_C, n_C_prev, f_h, f_w)
         (the kernel may be rectangular)
    b -- Biases, torch tensor of shape (n_C, 1, 1, 1)

    Returns:
    Z -- conv output, torch tensor of shape (m, n_C, n_H, n_W)
    cache -- tuple (A_prev, W, b) needed for the conv_backward() function

    Raises:
    ValueError -- if the channel counts of A_prev and W disagree, or if the
                  kernel is larger than the input
    """

    # Retrieve dimensions from A_prev's shape
    (m, n_C_prev, n_H_prev, n_W_prev) = A_prev.shape

    # Retrieve dimensions from W's shape (height and width kept separate)
    (n_C, n_C_filter, f_h, f_w) = W.shape

    # Without this check a 1-channel input would silently broadcast against
    # a multi-channel filter and produce a wrong result instead of an error
    if n_C_filter != n_C_prev:
        raise ValueError(
            f"A_prev has {n_C_prev} channels but W expects {n_C_filter}"
        )

    # default conv2d parameters, keep them just for visibility
    stride = 1
    pad = 0

    # Compute the dimensions of the CONV output volume using the formula from pytorch documentation
    n_H = (n_H_prev + 2 * pad - f_h) // stride + 1
    n_W = (n_W_prev + 2 * pad - f_w) // stride + 1
    if n_H <= 0 or n_W <= 0:
        raise ValueError(
            f"kernel size ({f_h}, {f_w}) is larger than the input size "
            f"({n_H_prev}, {n_W_prev})"
        )

    # Initialize the output volume Z with zeros; follow the dtype/device of
    # the inputs instead of forcing float32 on the CPU
    Z = torch.zeros(
        (m, n_C, n_H, n_W),
        dtype=torch.result_type(A_prev, W),
        device=A_prev.device,
    )

    # A_prev is only read below, so no defensive copy is needed
    for i in range(m):                   # loop over the batch of training examples
        a_prev = A_prev[i]               # Select ith training example
        for h in range(n_H):             # loop over vertical axis of the output
            # Find the vertical start and end of the current slice
            vert_start = stride * h
            vert_end = vert_start + f_h

            for w in range(n_W):         # loop over horizontal axis of the output
                # Find the horizontal start and end of the current slice
                horiz_start = stride * w
                horiz_end = horiz_start + f_w

                # The 3D window is the same for every output channel,
                # so cut it out once per position
                a_slice_prev = a_prev[:, vert_start:vert_end, horiz_start:horiz_end]

                for c in range(n_C):     # loop over channels of the output
                    # Apply filter c and its bias to the window to get one output neuron
                    Z[i, c, h, w] = apply_filter(a_slice_prev, W[c], b[c])

    # Save information for the backprop
    cache = (A_prev, W, b)

    return Z, cache

Next, I'll define the standard PyTorch Conv2d layer with default parameters to check its output and compare it with my custom one (the two must be very similar).

In [None]:
# Repeat the comparison on 10 random configurations so that one lucky draw
# cannot hide a mismatch
for i in range(10):

    # Random layer configuration and batch size
    in_channels = random.randint(1, 10)
    out_channels = random.randint(1, 10)
    kernel_size = random.randint(1, 5)
    m = random.randint(1, 10)
    # Spatial sizes start at 5 so they are never smaller than kernel_size
    n_H_prev = random.randint(5, 10)
    n_W_prev = random.randint(5, 10)

    # Reference implementation: the standard PyTorch Conv2d layer
    model = nn.Conv2d(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=kernel_size,
    )

    # Overwrite the layer's weights and biases with random values
    torch.nn.init.xavier_uniform_(model.weight)
    torch.nn.init.uniform_(model.bias)

    # Random input batch
    A_prev = torch.rand(m, in_channels, n_H_prev, n_W_prev)

    # Feed the very same parameters into the custom function
    W = model.weight
    b = model.bias[:, None, None, None]  # add three trailing axes of size 1

    # Output of the custom function
    Z, cache_conv = conv_forward(A_prev, W, b)

    # Output of the standard PyTorch layer
    model_out = model(A_prev)

    # Mean squared error between the two outputs, one row per example
    error = mean_squared_error(
        model_out.detach().numpy().reshape(m, -1),
        Z.detach().numpy().reshape(m, -1),
    )

    # The closer to zero, the better
    print(error)

2.7658432e-15
1.1876611e-15
6.642834e-16
3.3968198e-15
7.764281e-14
4.0212227e-16
6.776145e-15
1.1964136e-14
2.39714e-15
5.901334e-15


The error varies only slightly between runs, from roughly 1e-16 to 1e-14, which is negligibly small.

### Backward Operation

In [None]:
def conv_backward(dZ, cache):
    """
    Backward pass of a 2D convolution with stride 1 and no padding.

    The gradients are computed by hand, so the inputs are detached first:
    the returned tensors never carry an autograd graph.

    Arguments:
    dZ -- gradient of the cost with respect to the output of the conv layer (Z), torch tensor of shape (m, n_C, n_H, n_W)
    cache -- cache of values needed for the conv_backward(), output of conv_forward()

    Returns:
    dA_prev -- gradient of the cost with respect to the input of the conv layer (A_prev),
               torch tensor of shape (m, n_C_prev, n_H_prev, n_W_prev)
    dW -- gradient of the cost with respect to the weights of the conv layer (W)
          torch tensor of shape (n_C, n_C_prev, f_h, f_w)
    db -- gradient of the cost with respect to the biases of the conv layer (b)
          torch tensor with the same shape as b, i.e. (n_C, 1, 1, 1)
    """

    # Retrieve information from "cache"
    (A_prev, W, b) = cache

    # Work on detached tensors: otherwise every in-place update below would
    # be recorded by autograd whenever A_prev or W require gradients
    A_prev = A_prev.detach()
    W = W.detach()
    dZ = dZ.detach()

    # Retrieve dimensions from W's shape (the kernel may be rectangular)
    (n_C, _, f_h, f_w) = W.shape

    # default parameter
    stride = 1

    # Retrieve dimensions from dZ's shape
    (m, _, n_H, n_W) = dZ.shape

    # Initialize dA_prev and dW with the correct shapes
    dtype = torch.result_type(A_prev, W)
    dA_prev = torch.zeros(A_prev.shape, dtype=dtype, device=A_prev.device)
    dW = torch.zeros(W.shape, dtype=dtype, device=W.device)

    for i in range(m):                     # loop over the training examples
        for h in range(n_H):               # loop over vertical axis of the output volume
            vert_start = stride * h
            vert_end = vert_start + f_h

            for w in range(n_W):           # loop over horizontal axis of the output volume
                horiz_start = stride * w
                horiz_end = horiz_start + f_w

                # Window of the input that produced Z[i, :, h, w]
                a_slice = A_prev[i, :, vert_start:vert_end, horiz_start:horiz_end]

                # Upstream gradient of all output channels at this position,
                # shaped (n_C, 1, 1, 1) so it broadcasts against W
                grad = dZ[i, :, h, w].reshape(n_C, 1, 1, 1)

                # Each output channel c adds W[c] * dZ[i, c, h, w] to the window
                dA_prev[i, :, vert_start:vert_end, horiz_start:horiz_end] += (W * grad).sum(dim=0)

                # Each filter c receives a_slice * dZ[i, c, h, w]
                dW += a_slice * grad

    # The bias of channel c is added to every output position of that channel,
    # so its gradient is the sum of dZ over batch, height and width
    db = dZ.sum(dim=(0, 2, 3)).to(dtype).reshape(b.shape)

    return dA_prev, dW, db

In [None]:
# Repeat the comparison on 10 random configurations so that one lucky draw
# cannot hide a mismatch
for i in range(10):

    # Random layer configuration and batch size
    in_channels = random.randint(1, 10)
    out_channels = random.randint(1, 10)
    kernel_size = random.randint(1, 5)
    m = random.randint(1, 10)
    # Spatial sizes start at 5 so they are never smaller than kernel_size
    n_H_prev = random.randint(5, 10)
    n_W_prev = random.randint(5, 10)

    # Reference implementation: the standard PyTorch Conv2d layer
    model = nn.Conv2d(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=kernel_size,
    )

    # Overwrite the layer's weights and biases with random values
    torch.nn.init.xavier_uniform_(model.weight)
    torch.nn.init.uniform_(model.bias)

    # Random input batch; autograd must track it to obtain A_prev.grad
    A_prev = torch.rand(m, in_channels, n_H_prev, n_W_prev, requires_grad=True)

    # Feed the very same parameters into the custom functions
    W = model.weight.requires_grad_()
    b = model.bias[:, None, None, None].requires_grad_()

    # Reference gradients: reduce the layer output to a scalar, because
    # .backward() without arguments needs one, then backpropagate
    model_out = model(A_prev)
    T = torch.sum(model_out)
    T.backward()

    # Custom forward pass
    Z, cache_conv = conv_forward(A_prev, W, b)

    # Custom backward pass; the gradient of a plain sum with respect to
    # every output entry is 1, hence ones_like
    dA, dW, db = conv_backward(torch.ones_like(Z), cache_conv)

    # Mean squared error between reference and custom gradients
    dA_error = mean_squared_error(
        A_prev.grad.detach().numpy().reshape(m, -1),
        dA.detach().numpy().reshape(m, -1),
    )
    dW_error = mean_squared_error(
        model.weight.grad.detach().numpy().reshape(out_channels, -1),
        dW.detach().numpy().reshape(out_channels, -1),
    )
    db_error = mean_squared_error(
        model.bias.grad.detach().numpy().reshape(out_channels, -1),
        db.detach().numpy().reshape(out_channels, -1),
    )

    # Report the three errors
    print("Gradient of inputs error =", dA_error)
    print("Gradient of weights error =", dW_error)
    print("Gradient of biases error =", db_error, "\n")

Gradient of inputs error = 7.438428e-15
Gradient of weights error = 4.5474735e-13
Gradient of biases error = 0.0 

Gradient of inputs error = 4.744746e-15
Gradient of weights error = 0.0
Gradient of biases error = 0.0 

Gradient of inputs error = 3.7579816e-14
Gradient of weights error = 0.0
Gradient of biases error = 0.0 

Gradient of inputs error = 1.1142872e-15
Gradient of weights error = 0.0
Gradient of biases error = 0.0 

Gradient of inputs error = 5.629224e-15
Gradient of weights error = 0.0
Gradient of biases error = 0.0 

Gradient of inputs error = 2.3840164e-14
Gradient of weights error = 0.0
Gradient of biases error = 0.0 

Gradient of inputs error = 4.8839744e-17
Gradient of weights error = 4.760093e-09
Gradient of biases error = 0.0 

Gradient of inputs error = 8.5664534e-16
Gradient of weights error = 0.0
Gradient of biases error = 0.0 

Gradient of inputs error = 2.6048804e-14
Gradient of weights error = 0.0
Gradient of biases error = 0.0 

Gradient of inputs error = 1.7

## 2. ConvTranspose2d

### Forward Operation

In [None]:
def add_matrix(a_prev_copy, filter, index, h, w, c, res):
  """
  Scatter the contribution of one input pixel into the transposed-conv output.

  The pixel a_prev_copy[c, h, w] scales every 2D kernel filter[c, k] and the
  scaled kernels are added to res[index, k] with their top-left corner at (h, w).

  Arguments:
  a_prev_copy -- ith output activation of the previous layer, torch tensor of shape (n_C_prev, n_H_prev, n_W_prev)
  filter -- weights, torch tensor of shape (n_C_prev, n_C, f_h, f_w)
  index -- no. of an example in a batch
  h -- vertical axis coordinate of the input pixel
  w -- horizontal axis coordinate of the input pixel
  c -- no. of the input channel
  res -- current state of the output tensor of shape (m, n_C, n_H, n_W); updated in place

  Returns:
  res -- the same tensor object that was passed in, after the update
  """

  # get current pixel that we're gonna process
  curr = a_prev_copy[c, h, w]

  # all kernels that belong to input channel c, shape (n_C, f_h, f_w)
  kernels = filter[c]
  n_filters, f_h, f_w = kernels.shape

  # one slice addition replaces the loops over filters, kernel rows and
  # kernel columns; every output entry still receives exactly one update
  res[index, :n_filters, h:h + f_h, w:w + f_w] += curr * kernels

  return res


def conv_transpose_forward(A_prev, W, b):
    """
    Forward propagation for a transpose conv function with stride 1 and no
    padding (the default parameters of nn.ConvTranspose2d).

    Arguments:
    A_prev -- output activations of the previous layer,
        torch tensor of shape (m, n_C_prev, n_H_prev, n_W_prev)
    W -- Weights, torch tensor of shape (n_C_prev, n_C, f_h, f_w)
         (the kernel may be rectangular)
    b -- Biases, torch tensor of shape (n_C, 1, 1, 1)

    Returns:
    Z -- conv transpose output, torch tensor of shape (m, n_C, n_H, n_W)
    cache -- tuple (A_prev, W, b) needed for the conv_transpose_backward() function

    Raises:
    ValueError -- if the channel counts of A_prev and W disagree
    """

    # Retrieve dimensions from A_prev's shape
    (m, n_C_prev, n_H_prev, n_W_prev) = A_prev.shape

    # Retrieve dimensions from W's shape (height and width kept separate)
    (n_C_filter, n_C, f_h, f_w) = W.shape

    # Without this check, extra input channels would be silently ignored
    if n_C_filter != n_C_prev:
        raise ValueError(
            f"A_prev has {n_C_prev} channels but W expects {n_C_filter}"
        )

    # Compute the dimensions of the output
    n_H = n_H_prev + f_h - 1
    n_W = n_W_prev + f_w - 1

    # Initialize the output Z with zeros; follow the dtype/device of the
    # inputs instead of forcing float32 on the CPU
    Z = torch.zeros(
        (m, n_C, n_H, n_W),
        dtype=torch.result_type(A_prev, W),
        device=A_prev.device,
    )

    # A_prev is only read below, so no defensive copy is needed
    for i in range(m):                      # loop over the batch of training examples
        a_prev = A_prev[i]                  # Select ith training example
        for h in range(n_H_prev):           # loop over vertical axis of the input
            for w in range(n_W_prev):       # loop over horizontal axis of the input
                for c in range(n_C_prev):   # loop over channels of the input
                    # scatter the contribution of pixel (c, h, w) into Z
                    Z = add_matrix(a_prev, W, i, h, w, c, Z)

        # add the bias of each output channel to every position of that channel
        Z[i] += b.reshape(n_C, 1, 1)

    # Save information in cache for the backprop
    cache = (A_prev, W, b)

    return Z, cache

In [None]:
# Random layer configuration, batch size and input size
in_channels = random.randint(1, 10)
out_channels = random.randint(1, 10)
kernel_size = random.randint(1, 5)
m = random.randint(1, 10)
n_H_prev = random.randint(1, 10)
n_W_prev = random.randint(1, 10)

# Reference implementation: the standard PyTorch ConvTranspose2d layer
model = nn.ConvTranspose2d(
    in_channels=in_channels,
    out_channels=out_channels,
    kernel_size=kernel_size,
)

# Overwrite the layer's weights and biases with random values
torch.nn.init.xavier_uniform_(model.weight)
torch.nn.init.uniform_(model.bias)

# Random input batch
A_prev = torch.rand(m, in_channels, n_H_prev, n_W_prev)

# Feed the very same parameters into the custom function
W = model.weight
b = model.bias[:, None, None, None]  # add three trailing axes of size 1

# Output of the custom function
Z, cache_conv = conv_transpose_forward(A_prev, W, b)

# Output of the standard PyTorch layer
model_out = model(A_prev)

# Mean squared error between the two outputs, one row per example
error = mean_squared_error(
    model_out.detach().numpy().reshape(m, -1),
    Z.detach().numpy().reshape(m, -1),
)

# The closer to zero, the better
print(error)

4.6761246e-15


The error is very small, so the custom function and the PyTorch layer produce practically identical outputs.

### Backward Operation

In [None]:
def conv_transpose_backward(dZ, cache):
    """
    Backward pass of a transpose convolution with stride 1 and no padding.

    The forward pass computes
        Z[i, c, h + p, w + q] += A_prev[i, k, h, w] * W[k, c, p, q]   (+ b[c]),
    so each input pixel (h, w) only interacts with the f_h x f_w window of dZ
    whose top-left corner is (h, w).

    The gradients are computed by hand, so the inputs are detached first:
    the returned tensors never carry an autograd graph.

    Arguments:
    dZ -- gradient of the cost with respect to the output of the conv_transpose layer (Z), torch tensor of shape (m, n_C, n_H, n_W)
    cache -- cache of values, output of conv_transpose_forward()

    Returns:
    dA_prev -- gradient of the cost with respect to the input of the conv_transpose layer (A_prev),
               torch tensor of shape (m, n_C_prev, n_H_prev, n_W_prev)
    dW -- gradient of the cost with respect to the weights of the conv_transpose layer (W)
          torch tensor of shape (n_C_prev, n_C, f_h, f_w)
    db -- gradient of the cost with respect to the biases of the conv_transpose layer (b)
          torch tensor with the same shape as b, i.e. (n_C, 1, 1, 1)
    """

    # Retrieve information from cache
    (A_prev, W, b) = cache

    # Work on detached tensors: otherwise every in-place update below would
    # be recorded by autograd whenever A_prev or W require gradients
    A_prev = A_prev.detach()
    W = W.detach()
    dZ = dZ.detach()

    # Retrieve dimensions from A_prev's shape
    (m, n_C_prev, n_H_prev, n_W_prev) = A_prev.shape
    # Retrieve dimensions from W's shape (the kernel may be rectangular)
    (_, n_C, f_h, f_w) = W.shape

    # Parameter by default
    stride = 1

    # Initialize dA_prev and dW with the correct shapes
    dtype = torch.result_type(A_prev, W)
    dA_prev = torch.zeros(A_prev.shape, dtype=dtype, device=A_prev.device)
    dW = torch.zeros(W.shape, dtype=dtype, device=W.device)

    for i in range(m):                       # loop over the training examples
        for h in range(n_H_prev):            # loop over vertical axis of the input
            vert_start = stride * h
            vert_end = vert_start + f_h

            for w in range(n_W_prev):        # loop over horizontal axis of the input
                horiz_start = stride * w
                horiz_end = horiz_start + f_w

                # Part of dZ that pixel (h, w) contributed to, shape (n_C, f_h, f_w)
                dz_window = dZ[i, :, vert_start:vert_end, horiz_start:horiz_end]

                # Values of pixel (h, w) in every input channel, shaped
                # (n_C_prev, 1, 1, 1) so they broadcast against the window
                a_pixel = A_prev[i, :, h, w].reshape(n_C_prev, 1, 1, 1)

                # Input channel k reaches the output through W[k], so its
                # gradient is the window weighted by W[k] and summed up
                dA_prev[i, :, h, w] = (W * dz_window).sum(dim=(1, 2, 3))

                # Kernel W[k, c] was scaled by the pixel value of channel k
                dW += a_pixel * dz_window

    # The bias of channel c is added to every output position of that channel,
    # so its gradient is the sum of dZ over batch, height and width
    # (this equals m * n_H * n_W when dZ is all ones)
    db = dZ.sum(dim=(0, 2, 3)).to(dtype).reshape(b.shape)

    return dA_prev, dW, db

In [None]:
# Random layer configuration and batch size
in_channels = random.randint(1, 10)
out_channels = random.randint(1, 10)
kernel_size = random.randint(1, 5)
m = random.randint(1, 10)
# Spatial sizes start at 5 so they are never smaller than kernel_size
n_H_prev = random.randint(5, 10)
n_W_prev = random.randint(5, 10)

# Reference implementation: the standard PyTorch ConvTranspose2d layer
model = nn.ConvTranspose2d(
    in_channels=in_channels,
    out_channels=out_channels,
    kernel_size=kernel_size,
)

# Overwrite the layer's weights and biases with random values
torch.nn.init.xavier_uniform_(model.weight)
torch.nn.init.uniform_(model.bias)

# Random input batch; autograd must track it to obtain A_prev.grad
A_prev = torch.rand(m, in_channels, n_H_prev, n_W_prev, requires_grad=True)

# Feed the very same parameters into the custom functions
W = model.weight.requires_grad_()
b = model.bias[:, None, None, None].requires_grad_()

# Custom forward pass
Z, cache_conv = conv_transpose_forward(A_prev, W, b)
# Custom backward pass; the gradient of a plain sum with respect to every
# output entry is 1, hence ones_like
dA, dW, db = conv_transpose_backward(torch.ones_like(Z), cache_conv)

# Reference gradients: reduce the layer output to a scalar, because
# .backward() without arguments needs one, then backpropagate
model_out = model(A_prev)
T = torch.sum(model_out)
T.backward()

# Mean squared error between reference and custom gradients
dA_error = mean_squared_error(
    A_prev.grad.detach().numpy().reshape(m, -1),
    dA.detach().numpy().reshape(m, -1),
)
dW_error = mean_squared_error(
    model.weight.grad.detach().numpy().reshape(out_channels, -1),
    dW.detach().numpy().reshape(out_channels, -1),
)
db_error = mean_squared_error(
    model.bias.grad.detach().numpy().reshape(out_channels, -1),
    db.detach().numpy().reshape(out_channels, -1),
)

# Report the errors (the input-gradient error is computed but not printed)
#print("Gradient of inputs error =", dA_error)
print("Gradient of weights error =", dW_error)
print("Gradient of biases error =", db_error, "\n")