# Lab 10.1. Linear layer validation

by Domrachev Ivan, B20-Ro-01


This notebook examines the problems that the previous implementation of the layers might have had.

It demonstrates that all errors are *negligible* compared to the model weights.

In [32]:
from nn_from_scratch.neurons import Linear, Convolution
import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

## Part 1. Linear layer

In [33]:
# Fix the global NumPy seed so the random inputs below are identical on every
# Restart & Run All.
# NOTE(review): this also pins Linear's initial weights only if the layer
# draws them from NumPy's global RNG -- confirm in nn_from_scratch.
np.random.seed(0)

# (batch, features) shapes of the layer input and output.
input_dim = (10, 2)
output_dim = (10, 5)
linear = Linear(input_dim, output_dim)

# Random input values (x itself, and an assumed upstream partial derivative dL/dy)
x_input = np.random.rand(*input_dim)
dL_dy = np.random.rand(*output_dim)

# Forward call
y_value = linear.forward(x_input)

# Backpropagation: backward() returns dL/dx; the weight gradient is then read
# from the layer. Row 0 of it is later compared with the Keras bias gradient,
# rows 1: with the Keras kernel gradient.
dL_dx = linear.backward(dL_dy)
dL_dw = linear._W_pd

Let's validate the results against TensorFlow's implementation.

In [34]:
# Re-express the NumPy input and the custom layer's parameters as float32
# TensorFlow constants so the Keras layer can start from the same state.
x_input_batched = tf.constant(
    x_input,
    dtype=tf.float32
)

# linear.W stacks the bias on top of the kernel: row 0 is used as the bias,
# the remaining rows as the (in_features, out_features) kernel.
weights_reshaped = tf.constant(
    linear.W[1:, :],
    dtype=tf.float32
)
# Explicit float32, consistent with the input and kernel tensors above;
# without it the tensor silently inherits whatever dtype linear.W has.
bias_reshaped = tf.constant(
    linear.W[0, :],
    dtype=tf.float32
)


In [35]:
# Shapes of the input, kernel and bias tensors handed to the Keras layer.
tuple(
    tensor.shape
    for tensor in (x_input_batched, weights_reshaped, bias_reshaped)
)

(TensorShape([10, 2]), TensorShape([2, 5]), TensorShape([5]))

In [36]:
# Keras reference layer. Its size is derived from the prepared tensors instead
# of being hard-coded, and it is initialised with the custom layer's kernel
# and bias so that both layers compute the same function.
# NOTE: the name `conv_keras` is kept because later cells refer to it,
# although this is a Dense layer.
conv_keras = layers.Dense(
    units=weights_reshaped.shape[-1],
    input_shape=x_input_batched.shape[1:],
    use_bias=True,
    kernel_initializer=tf.keras.initializers.Constant(weights_reshaped),
    bias_initializer=tf.keras.initializers.Constant(bias_reshaped)
)

In [37]:
# Record the Keras forward pass. The tape is persistent because gradients are
# requested from it twice later on (w.r.t. the input and w.r.t. the variables).
with tf.GradientTape(persistent=True) as tape:
    tape.watch(x_input_batched)  # Watch the input tensor for gradient computation
    conv_output = conv_keras(x_input_batched)

# NumPy copy of the Keras output, compared against the custom layer's y_value.
conv_output_np = conv_output.numpy()

In [38]:
# Sanity check: the Keras output shape is expected to equal output_dim, i.e. (10, 5).
conv_output_np.shape

(10, 5)

1. Check that layer output is correct

In [39]:
# Every element of the Keras output must match the custom layer within 1e-7.
(np.abs(conv_output_np - y_value) < 1e-7).all()

True

In [40]:
# Typical output magnitude, as a reference scale for the tolerance above.
print(np.mean(np.abs(conv_output_np)))

0.05123331


2. Check that backpropagation for both input and weights is correct

In [41]:
# Upstream gradient dL/dy as a float32 tensor; it seeds the backward pass.
dL_dy_keras = tf.constant(dL_dy, dtype=tf.float32)

# Gradient w.r.t. the layer input.
dL_dx_keras = tape.gradient(
    target=conv_output,
    sources=x_input_batched,
    output_gradients=dL_dy_keras,
)
# Gradients w.r.t. the layer variables, used below in the order [kernel, bias].
dL_dw_keras = tape.gradient(
    target=conv_output,
    sources=conv_keras.trainable_variables,
    output_gradients=dL_dy_keras,
)

# NumPy copies for the element-wise comparison with the custom layer.
dL_dx_keras_np = dL_dx_keras.numpy()
dL_dw_keras_np, dL_dbias_keras_np = (grad.numpy() for grad in dL_dw_keras)

In [42]:
# Input gradient computed by the Keras reference layer.
dL_dx_keras_np

array([[-0.03389214,  0.00468405],
       [-0.06382886,  0.03250728],
       [-0.06522863,  0.03782374],
       [-0.13631496, -0.02891219],
       [-0.14101057,  0.05950642],
       [-0.10216872, -0.00614368],
       [-0.0583231 ,  0.01871667],
       [-0.052155  ,  0.0306218 ],
       [-0.14564934, -0.01086044],
       [-0.17198128,  0.03301783]], dtype=float32)

In [43]:
# Input gradient computed by the custom Linear layer, for visual comparison.
dL_dx

array([[-0.03389214,  0.00468405],
       [-0.06382886,  0.03250728],
       [-0.06522863,  0.03782374],
       [-0.13631496, -0.02891219],
       [-0.14101057,  0.05950642],
       [-0.10216872, -0.00614368],
       [-0.0583231 ,  0.01871667],
       [-0.052155  ,  0.0306218 ],
       [-0.14564934, -0.01086043],
       [-0.17198129,  0.03301783]], dtype=float32)

In [44]:
# Every element of the input gradient must agree within 1e-7.
(np.abs(dL_dx_keras_np - dL_dx) < 1e-7).all()

True

In [45]:
# Mean magnitude of the input gradient, as a reference scale for the tolerance.
np.abs(dL_dx_keras_np).mean()

0.061667334

In [46]:
# Kernel gradient computed by the Keras reference layer.
dL_dw_keras_np

array([[1.411161 , 1.7951473, 2.291652 , 1.8384316, 1.2848475],
       [1.6441565, 1.8421172, 2.079435 , 1.7833571, 1.8817294]],
      dtype=float32)

In [47]:
# Rows 1: of the custom layer's weight gradient, i.e. the part compared with
# the Keras kernel gradient (row 0 is compared with the bias gradient).
dL_dw[1:]

array([[1.41116096, 1.79514724, 2.29165201, 1.83843166, 1.28484743],
       [1.64415653, 1.84211717, 2.07943507, 1.78335713, 1.8817294 ]])

In [48]:
# Every element of the kernel gradient must agree within 1e-6.
(np.abs(dL_dw_keras_np - dL_dw[1:]) < 1e-6).all()

True

In [49]:
# Mean magnitude of the kernel gradient, as a reference scale for the tolerance.
np.abs(dL_dw[1:]).mean()

1.785203460316635

In [50]:
# Every element of the bias gradient (row 0 of dL_dw) must agree within 1e-6.
(np.abs(dL_dbias_keras_np - dL_dw[0]) < 1e-6).all()

True

In [51]:
# Mean magnitude of the bias gradient, as a reference scale for the tolerance.
np.abs(dL_dw[0]).mean()

3.892108001734596

## Part 2. Convolution

The convolutional layer was extended to support batched inputs and an optional bias.
> Note: the solution is far from general; e.g., it lacks padding and stride settings, as well as support for batches of pictures.

In [52]:
# Fix the global NumPy seed so the random inputs below are identical on every
# Restart & Run All.
# NOTE(review): this also pins the layer's initial weights only if Convolution
# draws them from NumPy's global RNG -- confirm in nn_from_scratch.
np.random.seed(0)

# Input layout is (batch, channels, height, width).
input_dim = (10, 3, 7, 5)
kernel_size = 2
output_layers = 20
conv = Convolution(input_dim, kernel_size, output_layers=output_layers, use_bias=False)
output_dim = conv._output_dim

# Random input values (x itself, and an assumed upstream partial derivative dL/dy)
x_input = np.random.random(input_dim).astype(dtype=np.float32)
dL_dy = np.random.random(output_dim).astype(dtype=np.float32)

# Forward pass, then backpropagation: backward() returns dL/dx and the weight
# gradient is read from the layer afterwards.
output = conv.forward(x_input)
dL_dx = conv.backward(dL_dy)
dL_dw = conv._W_pd
# The layer is built with use_bias=False, so no bias or bias gradient is
# validated here (with a bias they would presumably be conv._B / conv._B_pd).

In [53]:
# Brute-force reference for the forward pass: each output element must equal
# the sum of the element-wise product between the matching input window
# (all channels) and the kernel that produced it.
b, m, n = input_dim[0], input_dim[2], input_dim[3]
p = q = kernel_size
assert all(
    np.allclose(
        output[i, kern, j, k],
        np.sum(x_input[i, :, j:j + p, k:k + q] * conv._W[kern]),
    )
    for i in range(b)
    for j in range(m - p + 1)
    for k in range(n - q + 1)
    for kern in range(output_layers)
)

As before, let's compare it with the TensorFlow implementation:

In [54]:
# Channels-first (batch, channels, height, width) -> channels-last
# (batch, height, width, channels), the layout the Keras layer is given.
x_input_channels_last = x_input.transpose(0, 2, 3, 1)
x_input_batched = tf.constant(x_input_channels_last, dtype=tf.float32)

# Reorder the kernel axes (0, 1, 2, 3) -> (2, 3, 1, 0) to match the Keras
# Conv2D kernel layout.
weights_reshaped = tf.constant(conv.W.transpose(2, 3, 1, 0), dtype=tf.float32)

In [55]:
# Keras reference layer, configured from the same hyper-parameters as the
# custom Convolution (instead of repeating the literals 20 and 2) and
# initialised with its weights so both layers compute the same function.
conv_keras = layers.Conv2D(
    filters=output_layers,
    kernel_size=kernel_size,
    input_shape=x_input_batched.shape[1:],
    use_bias=False,
    kernel_initializer=tf.keras.initializers.Constant(weights_reshaped),
    data_format='channels_last'
)

In [56]:
# Record the Keras forward pass. The tape is persistent because gradients are
# requested from it twice later on (w.r.t. the input and w.r.t. the kernel).
with tf.GradientTape(persistent=True) as tape:
    tape.watch(x_input_batched)  # Watch the input tensor for gradient computation
    conv_output = conv_keras(x_input_batched)

conv_output_np = conv_output.numpy()
# Move the channel axis from last back to position 1 so the layout matches
# the custom layer's `output`.
conv_output_np = np.moveaxis(conv_output_np, -1, 1)

1. Check that layer output is correct

In [57]:
# Every element of the Keras output must match the custom layer within 1e-7.
(np.abs(conv_output_np - output) < 1e-7).all()

True

In [58]:
# Largest output magnitude, as a reference scale for the tolerance above.
np.abs(output).max()

0.33199567

2. Check that backpropagation for both input and weights is correct

In [59]:
# Upstream gradient dL/dy, moved to channels-last to match the Keras output.
dL_dy_keras = tf.constant(np.moveaxis(dL_dy, 1, -1), dtype=tf.float32)

# Gradient w.r.t. the layer input.
dL_dx_keras = tape.gradient(
    conv_output, x_input_batched, output_gradients=dL_dy_keras
)
# Gradients w.r.t. the layer variables (only the kernel, since use_bias=False).
dL_dw_keras = tape.gradient(
    conv_output, conv_keras.trainable_variables, output_gradients=dL_dy_keras
)

# Bring the gradients back to the custom layer's axis order. No squeeze() is
# applied first: it would drop any size-1 dimension (e.g. a batch of one or a
# single channel) and make the axis moves below target the wrong axes.
# Input gradient: channels-last -> channels-first.
dL_dx_keras_np = np.moveaxis(dL_dx_keras.numpy(), -1, 1)
# Kernel gradient: output-channel axis moved from last to first.
dL_dw_keras_np = np.moveaxis(dL_dw_keras[0].numpy(), -1, 0)

In [60]:
# Shape of the Keras input gradient after moving the channel axis back to position 1.
dL_dx_keras_np.shape

(10, 3, 7, 5)

In [61]:
# Shape of the custom layer's input gradient; it should equal the Keras one.
dL_dx.shape

(10, 3, 7, 5)

In [64]:
# Every element of the input gradient must agree within 1e-6.
# np.all is required here: np.max over the boolean mask would be True as soon
# as a single element matched, hiding mismatches everywhere else.
np.all(np.abs(dL_dx_keras_np - dL_dx) < 1e-6)

True

In [65]:
# Mean magnitude of the input gradient, as a reference scale for the tolerance.
np.abs(dL_dx).mean()

0.23194458

In [116]:
# Every element of the kernel gradient must agree within 1e-4; the custom
# gradient's axis 1 is moved last to match the axis order of dL_dw_keras_np.
(np.abs(dL_dw_keras_np - np.moveaxis(dL_dw, 1, -1)) < 1e-4).all()

True

In [115]:
# Mean magnitude of the kernel gradient, as a reference scale for the 1e-4 tolerance.
np.abs(dL_dw).mean()

58.58165