# Lab 10. Padding layer
by Domrachev Ivan, B20-Ro-01

In [1]:
from keras.datasets import mnist
from nn_from_scratch.optimizers import GradientDescent
from nn_from_scratch.examples.simple_nn import NeuralNetwork
from nn_from_scratch.nodes import ReLU, SoftMaxLoss, Vectorization
from nn_from_scratch.neurons import Linear, Convolution
from nn_from_scratch.interfaces import Neuron
import tensorflow as tf
from tensorflow.keras import layers as t_layers


import numpy as np
from matplotlib import pyplot as plt

2023-11-10 17:36:09.583197: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-10 17:36:09.644114: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-10 17:36:09.645285: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.




## Part 1. Padding implementation

The implementation of padding is quite straightforward: during forward propagation, zeros are added around the input image, and during backpropagation, the partial derivative with respect to the padded input is simply trimmed back to the original input size.

In [2]:
# Fix the legacy global NumPy seed so the random inputs below are identical on
# every "Restart & Run All".
# NOTE(review): this also makes the layer's initial weights reproducible only
# if Convolution draws them from np.random's global state -- confirm.
np.random.seed(42)

# Layer configuration; input_dim is laid out as (batch, channels, height, width),
# judging by the axis move applied before handing the data to Keras below.
input_dim = (2, 3, 4, 5)
kernel_size = 3
output_layers = 20
padding = 1
conv = Convolution(input_dim, kernel_size, output_layers=output_layers, padding=padding)
output_dim = conv._output_dim

# Random input values (x itself, and assumed partial derivative dL/dy)
x_input = np.random.random(input_dim)
dL_dy = np.random.random(output_dim)

# One forward and one backward pass; backward() returns dL/dx, while the
# parameter gradients are read from the layer's private attributes.
output = conv.forward(x_input)
dL_dx = conv.backward(dL_dy)
dL_dw = conv._W_pd
bias = conv._B
dL_db = conv._B_pd

In [3]:
# The gradient w.r.t. the input must have the shape of the (unpadded) input:
# fail loudly instead of relying on reading the printed shape.
assert dL_dx.shape == input_dim, f"expected {input_dim}, got {dL_dx.shape}"
print(dL_dx.shape)

(2, 3, 4, 5)


As before, let's compare its results with the TensorFlow implementation. Since `keras` does not allow setting the padding size explicitly, let's use `padding='same'`, which pads the input so that the output keeps the same spatial shape. For a $3 \times 3$ kernel this is one pixel on each side, i.e. the same padding as in our convolutional layer:

In [4]:
# Keras' Conv2D works on channels-last tensors by default, so axis 1 of the
# input is moved to the last position before converting to a float32 tensor.
x_input_batched = tf.constant(
    np.moveaxis(x_input, source=1, destination=-1), dtype=tf.float32
)

# Same idea for the kernel: axis 0 of conv.W goes last, since Keras stores the
# output-filter axis at the end of the kernel.
# NOTE(review): here axis 0 is moved, while the weight-gradient check further
# down moves axis 1 of dL_dw. With every kernel axis of size 3 a layout mix-up
# cannot show up as a shape error -- confirm the layout of conv.W.
weights_reshaped = tf.constant(
    np.moveaxis(conv.W, source=0, destination=-1), dtype=tf.float32
)

2023-11-10 17:36:13.667262: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-10 17:36:13.668741: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [5]:
# Sanity check of the layout after the axis move: the size-3 axis that was at
# position 1 of input_dim should now be the last one.
x_input_batched.shape

TensorShape([2, 4, 5, 3])

In [6]:
# With stride 1, padding='same' pads (kernel_size - 1) pixels in total per
# axis, so it reproduces our explicit per-side padding only in this case.
assert 2 * padding == kernel_size - 1, "padding='same' would differ from the scratch layer's padding"

# Reference layer: configured from the same variables as the scratch layer,
# with no bias and with the kernel initialised to the scratch layer's weights.
conv_keras = t_layers.Conv2D(
    filters=output_layers,
    kernel_size=kernel_size,
    input_shape=x_input_batched.shape[1:],
    use_bias=False,
    padding='same',
    kernel_initializer=tf.keras.initializers.Constant(weights_reshaped)
)

In [7]:
# The tape is persistent because gradient() is requested twice later on
# (once for the input, once for the kernel).
with tf.GradientTape(persistent=True) as tape:
    # x_input_batched is a constant, not a variable, so it has to be watched explicitly
    tape.watch(x_input_batched)
    conv_output = conv_keras(x_input_batched)

# Bring the Keras output back to channels-first: last axis -> position 1.
conv_output_np = np.moveaxis(conv_output.numpy(), -1, 1).squeeze()

1. Check that layer output is correct

In [8]:
# Symmetric, element-wise comparison of the whole batch. The tolerance only
# has to absorb the float32 (Keras) vs float64 (NumPy) rounding difference.
np.allclose(conv_output_np, output, rtol=1e-4, atol=1e-4)

True

2. Check that backpropagation for both input and weights is correct

In [9]:
# Upstream gradient dL/dy, moved to the channels-last layout of the Keras output.
dL_dy_keras = tf.constant(
    np.moveaxis(dL_dy, source=1, destination=-1), dtype=tf.float32
)

# Back-propagate the same upstream gradient to the input and to the layer's
# trainable variables (two calls, hence the persistent tape).
dL_dx_keras, dL_dw_keras = (
    tape.gradient(conv_output, sources, output_gradients=dL_dy_keras)
    for sources in (x_input_batched, conv_keras.trainable_variables)
)

# dL_dx_keras is a single tensor, so [0] picks the first sample of the batch;
# dL_dw_keras is a list with one entry per trainable variable, so [0] picks the
# kernel gradient. In both cases the last axis is then moved to the front.
dL_dx_keras_np = np.moveaxis(np.squeeze(dL_dx_keras[0].numpy()), -1, 0)
dL_dw_keras_np = np.moveaxis(np.squeeze(dL_dw_keras[0].numpy()), -1, 0)

In [10]:
# dL_dx_keras_np holds the first sample only, hence dL_dx[0]. np.allclose
# compares absolute differences, so large negative errors are caught as well.
np.allclose(dL_dx_keras_np, dL_dx[0], rtol=1e-4, atol=1e-4)

True

In [11]:
# Symmetric comparison of the kernel gradients (absolute difference, not signed).
# NOTE(review): axis 1 of dL_dw is moved last here, whereas the kernel handed
# to Keras was built by moving axis 0 of conv.W -- confirm both agree with the
# actual layout of the scratch layer's weights.
np.allclose(dL_dw_keras_np, np.moveaxis(dL_dw, 1, -1), rtol=1e-4, atol=1e-4)

True

## Part 2. Incorporating padding in CNN

Now, let's integrate it into a complete neural network. It probably will not work well, since I still haven't fixed the CNN, but it will at least run.

In [12]:
(train_X, train_y), (test_X, test_y) = mnist.load_data()

# CNNs are slow, so train on a small subset. Images and labels are truncated
# together so that the one-hot label batches line up one-to-one with the
# image batches.
n_train = 1000
train_X, train_y = train_X[:n_train], train_y[:n_train]

n_input, n_output, batch_size, n_channels = 28**2, 10, 50, 1
assert train_X.shape[0] % batch_size == 0
n_batches = train_X.shape[0] // batch_size

# Images: (n_batches, batch_size, n_channels, 28, 28)
train_X = train_X.reshape(n_batches, batch_size, n_channels, 28, 28)

# Labels: one-hot encode, then group into (n_batches, batch_size, n_output)
train_y_ohe = np.zeros((len(train_y), n_output))
train_y_ohe[np.arange(len(train_y)), train_y] = 1
train_y_ohe = train_y_ohe.reshape((n_batches, batch_size, n_output))

The values should preferably lie in $[0, 1]$:

In [13]:
# Rescale both splits by the same factor of 255.
# Caveat: the names are rebound to their own scaled values, so running this
# cell twice divides by 255 twice.
train_X, test_X = train_X / 255, test_X / 255

Now, let's train!

In [14]:
# GradientDescent optimizer from the scratch library, constructed with lr=0.5.
gds = GradientDescent(lr=0.5)
# The shape handed to every node is hard-coded and has to chain by hand:
#   28x28 --(3x3 kernel, padding 1)--> 28x28 --(7x7 kernel)--> 22x22 --> 2*22*22 = 968 features
layers = [
    Convolution(
        (batch_size, n_channels, 28, 28), 
        kernel_size=3, 
        output_layers=4,
        padding=1,          # one pixel of padding: 28 + 2*1 - 3 + 1 = 28, size is preserved
        use_bias=False
    ),
    ReLU((batch_size, 4, 28, 28)),
    # No padding argument here: presumably it defaults to 0, which is consistent
    # with the 28 - 7 + 1 = 22 output size used by the next nodes -- TODO confirm.
    Convolution(
        (batch_size, 4, 28, 28), 
        kernel_size=7,
        output_layers=2,
        use_bias=False
    ),
    ReLU((batch_size, 2, 22, 22)),
    # Vectorization receives (batch_size, 2, 22, 22); the Linear node after it
    # takes 968 = 2 * 22 * 22 features per sample.
    Vectorization((batch_size, 2, 22, 22)),
    Linear((batch_size, 968), (batch_size, n_output))
]
loss_fn = SoftMaxLoss((batch_size, n_output))
network = NeuralNetwork(
    n_input=n_input, 
    n_output=n_output, 
    batch_size=batch_size, 
    optimizer=gds,
    layers=layers,  
    loss_fn=loss_fn
)
# Kept as the last expression so that the value returned by fit() is shown as
# the cell output.
network.fit(train_X, train_y_ohe, n_epochs=2)

100%|██████████| 20/20 [01:33<00:00,  4.68s/it, loss=2.27]


Epoch 1, Loss: 2.268403443126829


100%|██████████| 20/20 [01:21<00:00,  4.07s/it, loss=2.25]

Epoch 2, Loss: 2.247522820650666





array([2.24752282])

Great, the loss is decreasing, but very slowly. Strange — let's check the predictions:

In [15]:
def compute_accuracy(test_X: np.ndarray, test_y: np.ndarray, model) -> float:
    """Return the fraction of correctly classified samples over all batches.

    Args:
        test_X: Batched inputs; iterating over the first axis yields one batch
            at a time, which is passed unchanged to ``model.predict``.
        test_y: Batched integer class labels, aligned with ``test_X`` batch by
            batch (one label per sample).
        model: Any object with a ``predict(batch)`` method returning per-class
            scores of shape ``(batch_size, n_classes)``.

    Returns:
        Accuracy in ``[0, 1]`` as a built-in ``float``.

    Raises:
        ZeroDivisionError: If there are no samples to evaluate.
    """
    correct_predictions = 0
    total = 0

    for test_inputs, test_labels in zip(test_X, test_y):
        # The predicted class is the index of the highest score for each sample
        pred_class = np.argmax(model.predict(test_inputs), axis=1)

        # int() keeps the counters as Python ints, so the result is a plain float
        correct_predictions += int((pred_class == test_labels).sum())
        total += len(test_labels)

    return correct_predictions / total

In [16]:
# Batch the test set the same way as the training set, including the explicit
# channel axis, so the network sees the layout it was fitted on.
assert test_X.shape[0] % batch_size == 0
n_test_batches = test_X.shape[0] // batch_size
test_X_batches = test_X.reshape((n_test_batches, batch_size, n_channels, 28, 28))
test_y_batches = test_y.reshape((n_test_batches, batch_size))

acc = compute_accuracy(test_X_batches, test_y_batches, network)
print(f"Accuracy: {acc}")

Accuracy: 0.1046


Well, the predictions are essentially random (about 10% accuracy on 10 classes)... Maybe the model itself is invalid; let's train a TensorFlow analogue.

## Part 3. Designing convolution for given output shape

> Create a function that takes as inputs the size of the input image and the desired size of the output image size at the k-th CONV-layer, and produces as output the appropriate padding and filter sizes, such that the amount of padding is minimized

The task is straightforward. For a single stride-1 convolution with kernel size $K$ and padding $P$, the output size is $D_{out} = D_{in} + 2P - K + 1$. Then:
1. To decrease the image size by $N$, a kernel of size $N+1$ with no padding achieves the desired result, so it is always possible.
2. To increase the image size by $N$, there are two cases:
   1. If $N$ is even, a kernel of size $1$ with padding $\frac{N}{2}$ achieves the desired result, so it is always possible.
   2. If $N$ is odd, a kernel of size $2$ with padding $\frac{N+1}{2}$ achieves the desired result, so it is always possible.


As such, it's always possible to reach the desired size in a single layer; the remaining layers can use $1 \times 1$ kernels with no padding, which leave the size unchanged.

In [17]:
from collections.abc import Iterable

def find_conv(dim_init: int, dim_final: int, k: int) -> list[tuple[int, int]]:
    """Find kernel/padding sizes that turn ``dim_init`` into ``dim_final`` in ``k`` layers.

    The whole size change is done by the first layer, using the stride-1
    relation ``out = in + 2 * padding - kernel + 1``; the remaining ``k - 1``
    layers are 1x1 kernels without padding, which keep the size unchanged.
    Each layer is also printed in a human-readable form.

    Args:
        dim_init: Side length of the input image (must be positive).
        dim_final: Desired side length after the k-th layer (must be positive).
        k: Number of convolutional layers (must be at least 1).

    Returns:
        A list of ``k`` tuples ``(kernel_size, padding)``, one per layer.

    Raises:
        ValueError: If a dimension is not positive or ``k`` is less than 1.
    """
    if dim_init < 1 or dim_final < 1:
        raise ValueError(f"image sizes must be positive, got {dim_init} and {dim_final}")
    if k < 1:
        raise ValueError(f"at least one layer is required, got k={k}")

    size_change = dim_final - dim_init
    if size_change < 0:
        # Shrinking needs no padding: a kernel of size |change| + 1 does it alone
        first_layer = (1 - size_change, 0)
    else:
        # Growing: each unit of padding adds 2 pixels, so an odd change needs a
        # 2x2 kernel to take one pixel back; an even change uses a 1x1 kernel.
        kernel = 1 + size_change % 2
        first_layer = (kernel, (size_change + kernel - 1) // 2)

    kernels = [first_layer] + [(1, 0)] * (k - 1)

    for size, pad in kernels:
        print(f"({size}x{size}) kernel, {pad} padding")

    return kernels

In [18]:
# Growing 21 -> 25: an even difference of 4
find_conv(21, 25, 3)

(1x1) kernel, 2 padding
(1x1) kernel, 0 padding
(1x1) kernel, 0 padding


[(1, 2), (1, 0), (1, 0)]

In [19]:
# Growing 20 -> 25: an odd difference of 5
find_conv(20, 25, 3)

(2x2) kernel, 3 padding
(1x1) kernel, 0 padding
(1x1) kernel, 0 padding


[(2, 3), (1, 0), (1, 0)]

In [20]:
# Shrinking 25 -> 20: a difference of -5
find_conv(25, 20, 3)

(6x6) kernel, 0 padding
(1x1) kernel, 0 padding
(1x1) kernel, 0 padding


[(6, 0), (1, 0), (1, 0)]