# Lab 8, part 2: Convolutional Neural Network sample
by Domrachev Ivan, B20-Ro-01

In [2]:
from keras.datasets import mnist
from nn_from_scratch.optimizers import GradientDescent
from nn_from_scratch.examples.simple_nn import NeuralNetwork
from nn_from_scratch.nodes import ReLU, SoftMaxLoss, Vectorization
from nn_from_scratch.neurons import Linear, Convolution
from nn_from_scratch.interfaces import Neuron
import tensorflow as tf
from tensorflow.keras import layers as t_layers


import numpy as np
from matplotlib import pyplot as plt

2023-11-10 17:18:34.833098: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-10 17:18:34.901871: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-10 17:18:34.902703: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Part 1. One layer NN

Finally, everything is ready to create a Convolutional Neural Network!

Conveniently, the framework built for the simple Neural Network is suitable for a CNN as well, so let's reuse it:

In [2]:
(train_X, train_y), (test_X, test_y) = mnist.load_data()

# CNNs are slow, so train on a small subset only. The labels are subsampled
# together with the images so that both arrays describe the same samples
# (previously only the images were cut, leaving 60000 labels for 1000 images).
n_train = 1000
train_X, train_y = train_X[:n_train], train_y[:n_train]

n_input, n_output, batch_size, n_channels = 28**2, 10, 50, 1
assert train_X.shape[0] % batch_size == 0
n_batches = train_X.shape[0] // batch_size

# Batched, channel-first images: (n_batches, batch_size, n_channels, 28, 28).
train_X = train_X.reshape(n_batches, batch_size, n_channels, 28, 28)

# One-hot encode the labels and batch them: (n_batches, batch_size, n_output).
train_y_ohe = np.zeros((len(train_y), n_output))
train_y_ohe[np.arange(len(train_y)), train_y] = 1
train_y_ohe = train_y_ohe.reshape((n_batches, batch_size, n_output))

The values preferably should belong to $[0; 1]$:

In [3]:
# Scale raw pixel values into [0, 1]. The guards make the cell idempotent:
# re-running it no longer divides already-normalised data by 255 again.
if train_X.max() > 1:
    train_X = train_X / 255
if test_X.max() > 1:
    test_X = test_X / 255

Now, let's train!

In [6]:
gds = GradientDescent(lr=0.5)

# Activation shapes between the stages; the spatial side shrinks from 28 to 26
# after the 3x3 convolution and from 26 to 20 after the 7x7 one.
input_shape = (batch_size, n_channels, 28, 28)
conv1_shape = (batch_size, 4, 26, 26)
conv2_shape = (batch_size, 2, 20, 20)

layers = [
    Convolution(input_shape, kernel_size=3, output_layers=4, use_bias=False),
    ReLU(conv1_shape),
    Convolution(conv1_shape, kernel_size=7, output_layers=2, use_bias=False),
    ReLU(conv2_shape),
    Vectorization(conv2_shape),
    # 2 * 20 * 20 = 800 features enter the final linear layer.
    Linear((batch_size, 800), (batch_size, n_output)),
]
loss_fn = SoftMaxLoss((batch_size, n_output))

network = NeuralNetwork(
    n_input=n_input,
    n_output=n_output,
    batch_size=batch_size,
    optimizer=gds,
    layers=layers,
    loss_fn=loss_fn,
)
# Last expression of the cell, so the value returned by fit() is displayed.
network.fit(train_X, train_y_ohe, n_epochs=2)

100%|██████████| 20/20 [01:43<00:00,  5.18s/it, loss=2.34]


Epoch 1, Loss: 2.3388391875914034


100%|██████████| 20/20 [01:51<00:00,  5.58s/it, loss=2.33]

Epoch 2, Loss: 2.3270086079741086





array([2.32700861])

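The loss is decreasing, but only very slowly: it stays close to $\ln 10 \approx 2.30$, the loss of a uniform guess over 10 classes. That is strange, so let's check the predictions: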

In [7]:
def compute_accuracy(test_X: np.ndarray, test_y: np.ndarray, model) -> float:
    """Return the fraction of correctly classified samples over all batches.

    Args:
        test_X: Batched inputs. It is iterated along its first axis and every
            item is passed unchanged to ``model.predict``.
        test_y: Batched integer class labels, one entry per batch of ``test_X``.
        model: Any object exposing ``predict(batch)`` that returns per-class
            scores with the classes along axis 1.

    Returns:
        The accuracy in ``[0, 1]`` as a built-in ``float``.

    Raises:
        ValueError: If the number of input and label batches differs, or if
            there are no samples to evaluate.
    """
    if len(test_X) != len(test_y):
        # zip() would silently drop the surplus batches otherwise.
        raise ValueError(
            f"Got {len(test_X)} input batches but {len(test_y)} label batches"
        )

    correct_predictions = 0
    total = 0

    for test_inputs, test_labels in zip(test_X, test_y):
        scores = np.asarray(model.predict(test_inputs))
        pred_class = np.argmax(scores, axis=1)

        correct_predictions += int(np.sum(pred_class == np.asarray(test_labels)))
        total += len(test_labels)

    if total == 0:
        raise ValueError("Cannot compute accuracy on an empty dataset")

    return correct_predictions / total

In [8]:
# Batch the test set exactly like the training set, including the explicit
# channel axis that the first Convolution layer was declared with
# (batch_size, n_channels, 28, 28). The channel axis was missing before.
assert test_X.shape[0] % batch_size == 0
n_test_batches = test_X.shape[0] // batch_size

test_X_batches = test_X.reshape((n_test_batches, batch_size, n_channels, 28, 28))
test_y_batches = test_y.reshape((n_test_batches, batch_size))

acc = compute_accuracy(test_X_batches, test_y_batches, network)
print(f"Accuracy: {acc}")

Accuracy: 0.1135


Well, the predictions are no better than random. Maybe the architecture itself is at fault, so let's train an equivalent TensorFlow model.

## Part 2. Learning TensorFlow network

This part is straightforward, so I will not explain much:

In [29]:
# Reload the full MNIST split, scale pixels to [0, 1] and append a trailing
# channel axis. NOTE: this rebinds train_X / train_y / test_X / test_y.
(train_X, train_y), (test_X, test_y) = mnist.load_data()
train_X = (train_X / 255)[..., np.newaxis]
test_X = (test_X / 255)[..., np.newaxis]

In [30]:
# TensorFlow counterpart of the scratch network: two bias-free convolutions
# with ReLU activations, followed by a dense layer on the flattened features.
tf_layers = [
    t_layers.Conv2D(
        filters=4,
        kernel_size=3,
        input_shape=train_X.shape[1:],
        use_bias=False,
    ),
    t_layers.ReLU(),
    t_layers.Conv2D(filters=2, kernel_size=7, use_bias=False),
    t_layers.ReLU(),
    t_layers.Flatten(),
    t_layers.Dense(units=10, use_bias=True),
]
model = tf.keras.models.Sequential(tf_layers)

model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_8 (Conv2D)           (None, 26, 26, 4)         36        
                                                                 
 re_lu_8 (ReLU)              (None, 26, 26, 4)         0         
                                                                 
 conv2d_9 (Conv2D)           (None, 20, 20, 2)         392       
                                                                 
 re_lu_9 (ReLU)              (None, 20, 20, 2)         0         
                                                                 
 flatten_4 (Flatten)         (None, 800)               0         
                                                                 
 dense_4 (Dense)             (None, 10)                8010      
                                                                 
Total params: 8438 (32.96 KB)
Trainable params: 8438 (

In [32]:
# The Dense layer outputs raw logits, hence from_logits=True; the labels are
# passed as integer class ids (sparse), not one-hot vectors.
sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='sgd', loss=sparse_ce, metrics=['accuracy'])

history = model.fit(
    train_X,
    train_y,
    epochs=5,
    validation_data=(test_X, test_y),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


The accuracy here is not random, and the network performs well. Maybe the weights are different?

## Part 3. Comparison with TensorFlow

We should compare:
1. Outputs of the model (given the same initial weights and the same input)
2. Partial derivative of loss w.r.t. input. If they are close enough, then all the inner states are similar as well:

In [9]:
# Take the first training batch and its one-hot labels as the comparison example.
example_pic, example_label = train_X[0], train_y_ohe[0]

# Fail fast on stale kernel state: the TensorFlow data cell rebinds train_X to
# a different layout, in which case train_X[0] is one image, not a whole batch.
assert example_pic.shape == (batch_size, n_channels, 28, 28), (
    f"Unexpected example batch shape {example_pic.shape}; "
    "re-run the Part 1 data preparation cells before this one."
)

In [10]:
# Forward pass: push a copy of the example batch through every layer in order.
state = example_pic.copy()
for layer in layers:
    state = layer.forward(state)

loss = loss_fn.forward(state, example_label)
dL_dy = loss_fn.backward()

# Backward pass: walk the layers from last to first, recording the gradient
# returned by each layer and, for Neuron layers, the stored weight gradient.
dL_dx, dL_dw = [], []
partial_derivative = dL_dy
for layer in reversed(layers):
    partial_derivative = layer.backward(partial_derivative)
    dL_dx.append(partial_derivative)

    if isinstance(layer, Neuron):
        dL_dw.append(layer._W_pd)

In [11]:
# Sanity check: display the shape of the example batch fed to the scratch network.
example_pic.shape

(50, 1, 28, 28)

In [12]:
# The scratch network is fed channel-first batches (batch, C, H, W), while the
# Keras model works channel-last (batch, H, W, C).
example_pic_tf = tf.constant(np.moveaxis(example_pic, 1, -1), dtype=tf.float32)

# Keras Conv2D kernels are laid out as (kernel_h, kernel_w, in_channels, out_channels).
# The scratch kernels are taken to be (out_channels, in_channels, kernel_h, kernel_w)
# -- presumably, judging by how their gradients are transposed further below;
# verify against the Convolution class. The previous moveaxis(W, 0, -1) produced
# (in, kh, kw, out), which the Constant initializer silently *reshaped* into the
# Keras shape, scrambling the weights. The resulting shapes are asserted so a
# wrong layout assumption fails loudly instead.
conv_kernels = [
    (layers[0].W, (3, 3, 1, 4)),
    (layers[2].W, (7, 7, 4, 2)),
]

weights_tf = []
for scratch_kernel, keras_shape in conv_kernels:
    keras_kernel = np.transpose(scratch_kernel, (2, 3, 1, 0)).astype(np.float32)
    assert keras_kernel.shape == keras_shape, (
        f"Converted kernel has shape {keras_kernel.shape}, expected {keras_shape}"
    )
    weights_tf.append(tf.constant(keras_kernel))

# The linear layer keeps its bias in row 0 and the kernel in the remaining rows.
# The kernel rows already run over input features, which is the (in, out) layout
# Keras Dense expects, so no axis move is needed (the old one transposed it).
dense_W = layers[5].W
dense_kernel = dense_W[1:, :].astype(np.float32)
dense_bias = dense_W[0, :].astype(np.float32)
assert dense_kernel.shape == (800, 10), dense_kernel.shape
assert dense_bias.shape == (10,), dense_bias.shape

# NOTE(review): Keras Flatten runs over (H, W, C); if Vectorization flattens
# (C, H, W) instead, the kernel rows must also be permuted -- TODO confirm.
weights_tf.extend([
    tf.constant(dense_kernel),
    tf.constant(dense_bias),
])

2023-11-07 14:16:17.564457: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-07 14:16:17.565690: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [13]:
# Display the shape of the same batch after moving the channel axis last for TensorFlow.
example_pic_tf.shape

TensorShape([50, 28, 28, 1])

In [14]:
# Same architecture as the scratch network, with every trainable tensor
# initialised from the converted scratch weights.
# weights_tf order: [conv1 kernel, conv2 kernel, dense kernel, dense bias].
conv1_init, conv2_init, dense_kernel_init, dense_bias_init = (
    tf.keras.initializers.Constant(w) for w in weights_tf
)

tf_layers = [
    t_layers.Conv2D(
        filters=4,
        kernel_size=3,
        input_shape=example_pic_tf.shape[1:],
        use_bias=False,
        kernel_initializer=conv1_init,
    ),
    t_layers.ReLU(),
    t_layers.Conv2D(
        filters=2,
        kernel_size=7,
        use_bias=False,
        kernel_initializer=conv2_init,
    ),
    t_layers.ReLU(),
    t_layers.Flatten(),
    t_layers.Dense(
        units=10,
        use_bias=True,
        kernel_initializer=dense_kernel_init,
        bias_initializer=dense_bias_init,
    ),
]
model = tf.keras.models.Sequential(tf_layers)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 26, 26, 4)         36        
                                                                 
 re_lu (ReLU)                (None, 26, 26, 4)         0         
                                                                 
 conv2d_1 (Conv2D)           (None, 20, 20, 2)         392       
                                                                 
 re_lu_1 (ReLU)              (None, 20, 20, 2)         0         
                                                                 
 flatten (Flatten)           (None, 800)               0         
                                                                 
 dense (Dense)               (None, 10)                8010      
                                                                 
Total params: 8438 (32.96 KB)
Trainable params: 8438 (32

In [15]:
# A persistent tape is needed because gradients are requested from it several
# times afterwards. The input is a constant, so it must be watched explicitly.
tape = tf.GradientTape(persistent=True)
with tape:
    tape.watch(example_pic_tf)
    conv_output = model(example_pic_tf)

conv_output_np = conv_output.numpy()

1. The outputs are similar:

In [16]:
# Every element must agree within the tolerance. The previous check used
# np.max on a boolean array (i.e. "any element") and a signed difference, so it
# passed as soon as a single TensorFlow value was not much larger.
# NOTE(review): state[0] is kept from the original -- confirm it selects the
# whole batch of outputs rather than a single sample.
np.all(np.abs(conv_output_np - state[0]) < 2e-1)

True

In [17]:
# Upstream gradient dL/dy from the scratch softmax loss, fed to TensorFlow so
# both frameworks back-propagate the same signal.
dL_dy_keras = tf.constant(np.moveaxis(dL_dy, 1, -1), dtype=tf.float32)

dL_dx_keras = tape.gradient(conv_output, example_pic_tf, output_gradients=dL_dy_keras)

# The weight gradients need the same upstream gradient; without it
# tape.gradient differentiates the plain sum of the logits, not the loss.
dL_dw_keras = [
    tape.gradient(
        conv_output,
        layer_i.trainable_variables,
        output_gradients=dL_dy_keras,
    )
    for layer_i in [tf_layers[0], tf_layers[2], tf_layers[5]]
]

# Back to channel-first for the input gradient. For the weights, keep only the
# first variable of each layer (the kernel) and reverse the order to match the
# last-to-first order in which the scratch gradients were collected.
dL_dx_keras_np = np.moveaxis(dL_dx_keras.numpy(), -1, 1)
dL_dw_keras_np = [dL_dw_keras_i[0].numpy() for dL_dw_keras_i in dL_dw_keras][::-1]

In [18]:
# Bring the scratch weight gradients to the Keras layouts: drop row 0 of the
# linear-layer gradient (presumably the bias row, cf. W[0, :] being used as the
# bias) and reverse the axis order of both convolution gradients.
# NOTE: this mutates dL_dw in place, so running the cell twice undoes/breaks it.
dL_dw[0], dL_dw[1], dL_dw[2] = dL_dw[0][1:, :], dL_dw[1].T, dL_dw[2].T

In [19]:
# One shape per line for the Keras kernel gradients (dense, conv2, conv1).
print(*(grad.shape for grad in dL_dw_keras_np), sep="\n")

(800, 10)
(7, 7, 4, 2)
(3, 3, 1, 4)


2. The output of the backpropagation is similar (hence, all the inner states of the backpropagation are similar as well):

In [20]:
# Compare absolute deviations: with a signed difference, arbitrarily large
# negative errors would still satisfy "< 1e-1" and pass the check.
np.all(np.abs(dL_dx_keras_np - dL_dx[-1]) < 1e-1)

True

I have several ideas why results diverge and will try to fix them in future works.