In [6]:
import keras
from keras import ops
import tensorflow as tf
# Load Fashion-MNIST through Keras; the loader returns a (train, test) pair of
# (images, labels) tuples, unpacked here into the four arrays used below.
(x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()

# Convolutional Neural Networks

In this simple example, we'll use Fashion-MNIST, first with just dense MLP layers, then with a custom convolutional layer. If it's interesting, I will bake in some optimizations for the convolution.

### Simple MLP Model
Accuracy is around 0.86, and training runs in less than 30 seconds.

In [2]:
# Baseline classifier: flatten each 28x28 image and pass it through two hidden
# dense layers. The final layer has no activation, so it emits raw logits.
simple_mlp_model = keras.Sequential(
    [
        keras.Input(shape=(28, 28)),
        keras.layers.Rescaling(1.0 / 255.0),  # multiply every pixel by 1/255
        keras.layers.Flatten(),
        keras.layers.Dense(256, activation='relu'),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(10),
    ]
)

# from_logits=True because the model's last layer does not apply softmax.
simple_mlp_model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=['accuracy'],
)

# Train for 10 epochs, holding out 15% of the training data for validation.
history = simple_mlp_model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_split=0.15,
)

# evaluate() returns [loss, accuracy], matching the metrics passed to compile().
test_scores = simple_mlp_model.evaluate(x_test, y_test, verbose=1)

print(f'Test Loss: {test_scores[0]}')
print(f'Test Accuracy: {test_scores[1]}')

Epoch 1/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7184 - loss: 0.7936 - val_accuracy: 0.8527 - val_loss: 0.4077
Epoch 2/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8413 - loss: 0.4388 - val_accuracy: 0.8629 - val_loss: 0.3868
Epoch 3/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8586 - loss: 0.3984 - val_accuracy: 0.8596 - val_loss: 0.3958
Epoch 4/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8683 - loss: 0.3729 - val_accuracy: 0.8769 - val_loss: 0.3603
Epoch 5/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8730 - loss: 0.3695 - val_accuracy: 0.8763 - val_loss: 0.3717
Epoch 6/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8759 - loss: 0.3629 - val_accuracy: 0.8759 - val_loss: 0.3881
Epoch 7/10
[1m797/797[0m 

### Keras Convolution and MaxPool2D
Accuracy is around 0.90, and training takes a few minutes.

In [4]:
# Reference CNN built only from stock Keras layers: two Conv -> BatchNorm ->
# ReLU -> MaxPool stages followed by the same dense head as the MLP baseline.
keras_cnn_model = keras.Sequential(
    [
        keras.layers.Input(shape=(28, 28, 1)),
        keras.layers.Rescaling(1.0 / 255.0),  # multiply every pixel by 1/255

        # Stage 1: 16 filters, 'same' padding keeps the spatial size before pooling.
        keras.layers.Conv2D(filters=16, kernel_size=(3, 3), padding='same'),
        keras.layers.BatchNormalization(),
        keras.layers.Activation(keras.activations.relu),
        keras.layers.MaxPool2D(pool_size=(2, 2)),

        # Stage 2: 32 filters.
        keras.layers.Conv2D(filters=32, kernel_size=(3, 3), padding='same'),
        keras.layers.BatchNormalization(),
        keras.layers.Activation(keras.activations.relu),
        keras.layers.MaxPool2D(pool_size=(2, 2)),

        # Classification head; the last layer emits raw logits.
        keras.layers.Flatten(),
        keras.layers.Dense(256, activation='relu'),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(10),
    ]
)

# from_logits=True because the model's last layer does not apply softmax.
keras_cnn_model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=['accuracy'],
)

# Train for 10 epochs, holding out 15% of the training data for validation.
history = keras_cnn_model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_split=0.15,
)

# evaluate() returns [loss, accuracy], matching the metrics passed to compile().
test_scores = keras_cnn_model.evaluate(x_test, y_test, verbose=1)

print(f'Test Loss: {test_scores[0]}')
print(f'Test Accuracy: {test_scores[1]}')

Epoch 1/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 16ms/step - accuracy: 0.7165 - loss: 0.8453 - val_accuracy: 0.8679 - val_loss: 0.3759
Epoch 2/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 18ms/step - accuracy: 0.8709 - loss: 0.3680 - val_accuracy: 0.8859 - val_loss: 0.3135
Epoch 3/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 20ms/step - accuracy: 0.8931 - loss: 0.3079 - val_accuracy: 0.9038 - val_loss: 0.2855
Epoch 4/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 21ms/step - accuracy: 0.9021 - loss: 0.2773 - val_accuracy: 0.8891 - val_loss: 0.3351
Epoch 5/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 22ms/step - accuracy: 0.9139 - loss: 0.2571 - val_accuracy: 0.8976 - val_loss: 0.2974
Epoch 6/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 20ms/step - accuracy: 0.9187 - loss: 0.2364 - val_accuracy: 0.9024 - val_loss: 0.2862
Epoch 7/10
[1m7

### My implementation

In [None]:
#I'm fixing stride and padding. Bite me
class MyConv2D(keras.layers.Layer):
    """2D convolution implemented as im2col followed by one matrix multiply.

    Stride is fixed at 1 and padding at 'SAME', so the output keeps the
    spatial size of the input and only the channel count changes to
    ``filters``. Inputs are expected in channels-last layout
    ``(batch, height, width, channels)``.

    Patch extraction uses ``tf.image.extract_patches``, so the layer needs
    TensorFlow tensors as input.

    Args:
        filters: Number of output channels.
        kernel_size: Either a ``(height, width)`` pair or a single int used
            for both dimensions.
        **kwargs: Forwarded to ``keras.layers.Layer``.

    Raises:
        ValueError: If ``kernel_size`` is a sequence whose length is not 2.
    """

    def __init__(self, filters, kernel_size, **kwargs):
        super().__init__(**kwargs)
        self.filters = filters
        # Accept a bare int the way keras.layers.Conv2D does.
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)
        kernel_size = tuple(kernel_size)
        if len(kernel_size) != 2:
            raise ValueError(
                f"kernel_size must be an int or a pair of ints, got {kernel_size!r}"
            )
        self.kernel_size = kernel_size
        self.k_h, self.k_w = kernel_size

    def build(self, input_shape):
        """Create the kernel and bias once the input channel count is known."""
        self.input_channels = input_shape[-1]
        self.kernel_weights = self.add_weight(
            name='kernel',
            # channels-last convention in keras
            shape=(self.k_h, self.k_w, self.input_channels, self.filters),
            initializer='random_normal',
            trainable=True
        )
        self.bias = self.add_weight(
            name='bias',
            shape=(self.filters,),
            initializer='zeros',
            trainable=True
        )

    # nested for loops were a disaster so I'm doing the extra credit im2col method
    def call(self, inputs):
        """Convolve ``inputs`` with the kernel and add the bias.

        Returns a tensor of shape ``(batch, height, width, filters)``.
        """
        input_h, input_w = ops.shape(inputs)[1], ops.shape(inputs)[2]

        # im2col: every output pixel gets the flattened k_h x k_w x C window
        # centred on it (zero-padded at the borders because of 'SAME').
        patches = tf.image.extract_patches(
            images=inputs,
            sizes=[1, self.k_h, self.k_w, 1],
            strides=[1, 1, 1, 1],
            rates=[1, 1, 1, 1],
            padding='SAME'
        )

        # One row per output pixel, one column per kernel weight. Patches are
        # flattened in (row, col, channel) order, which is the same order the
        # kernel's first three axes collapse in below.
        patch_dim = self.k_h * self.k_w * self.input_channels
        patch_matrix = ops.reshape(patches, (-1, patch_dim))
        kernel_matrix = ops.reshape(self.kernel_weights, (patch_dim, self.filters))

        output = ops.matmul(patch_matrix, kernel_matrix) + self.bias

        # Stride 1 + 'SAME' padding: spatial size is unchanged.
        return ops.reshape(output, (-1, input_h, input_w, self.filters))

    def compute_output_shape(self, input_shape):
        """Same shape as the input, with the last axis replaced by ``filters``."""
        return tuple(input_shape[:-1]) + (self.filters,)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "filters": self.filters,
            "kernel_size": self.kernel_size,
        })
        return config

class MyBatchNormalization(keras.layers.Layer):
    """Batch normalization over every axis except the last (channel) axis.

    In training mode the layer normalizes with the statistics of the current
    batch and updates exponential moving averages of the mean and variance.
    Otherwise it normalizes with those moving averages. The result is then
    scaled by the learned ``gamma`` and shifted by the learned ``beta``.

    Args:
        momentum: Weight given to the previous moving statistic on each
            update; the current batch statistic gets ``1 - momentum``.
        epsilon: Small constant added to the variance before the square root.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, momentum = 0.99, epsilon = 1e-3, **kwargs):
        super().__init__(**kwargs)
        self.momentum = momentum
        self.epsilon = epsilon

    def build(self, input_shape):
        """Create per-channel scale/shift weights and the moving statistics."""
        num_channels = input_shape[-1]
        self.gamma = self.add_weight(
            name='gamma',
            shape=(num_channels,),
            initializer='ones',
            trainable=True
        )
        self.beta = self.add_weight(
            name='beta',
            shape=(num_channels,),
            initializer='zeros',
            trainable=True
        )
        # Moving statistics are updated by assignment in call(), not by the
        # optimizer, hence trainable=False.
        self.moving_mean = self.add_weight(
            name='moving_mean',
            shape=(num_channels,),
            initializer='zeros',
            trainable=False
        )
        self.moving_variance = self.add_weight(
            name='moving_variance',
            shape=(num_channels,),
            initializer='ones',
            trainable=False
        )
        super().build(input_shape)

    def call(self, inputs, training=None):
        """Normalize ``inputs`` and apply the learned scale and shift.

        Args:
            inputs: Tensor whose last axis is the channel axis.
            training: Truthy to use batch statistics and update the moving
                averages; falsy (including ``None``) to use the moving averages.

        Returns:
            A tensor with the same shape as ``inputs``.
        """
        if training:
            # Reduce over everything but the channel axis so the statistics
            # have shape (channels,) for inputs of any rank: [0] for rank 2,
            # [0, 1, 2] for rank 4, and likewise for other ranks.
            axes = list(range(len(inputs.shape) - 1))
            batch_mean = ops.mean(inputs, axis=axes)
            batch_variance = ops.var(inputs, axis=axes)

            self.moving_mean.assign(
                self.moving_mean * self.momentum + batch_mean * (1 - self.momentum)
            )
            self.moving_variance.assign(
                self.moving_variance * self.momentum + batch_variance * (1 - self.momentum)
            )

            normalized_inputs = (inputs - batch_mean) / ops.sqrt(batch_variance + self.epsilon)

        else:
            normalized_inputs = (inputs - self.moving_mean) / ops.sqrt(self.moving_variance + self.epsilon)

        return self.gamma * normalized_inputs + self.beta

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "momentum": self.momentum,
            "epsilon": self.epsilon,
        })
        return config
    

In [12]:
# Same architecture as the stock-Keras CNN, with MyConv2D and
# MyBatchNormalization swapped in for Conv2D and BatchNormalization.
# NOTE: this rebinds `keras_cnn_model` (and `history`), replacing the objects
# created by the earlier cell that used the built-in layers.
keras_cnn_model = keras.Sequential(
    [
        keras.layers.Input(shape=(28, 28, 1)),
        keras.layers.Rescaling(1.0 / 255.0),  # multiply every pixel by 1/255

        # Stage 1: 16 filters from the custom convolution.
        MyConv2D(filters=16, kernel_size=(3, 3)),
        MyBatchNormalization(),
        keras.layers.Activation(keras.activations.relu),
        keras.layers.MaxPool2D(pool_size=(2, 2)),

        # Stage 2: 32 filters.
        MyConv2D(filters=32, kernel_size=(3, 3)),
        MyBatchNormalization(),
        keras.layers.Activation(keras.activations.relu),
        keras.layers.MaxPool2D(pool_size=(2, 2)),

        # Classification head; the last layer emits raw logits.
        keras.layers.Flatten(),
        keras.layers.Dense(256, activation='relu'),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(10),
    ]
)

# from_logits=True because the model's last layer does not apply softmax.
keras_cnn_model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=['accuracy'],
)

# Train for 10 epochs, holding out 15% of the training data for validation.
history = keras_cnn_model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_split=0.15,
)

# evaluate() returns [loss, accuracy], matching the metrics passed to compile().
test_scores = keras_cnn_model.evaluate(x_test, y_test, verbose=1)

print(f'Test Loss: {test_scores[0]}')
print(f'Test Accuracy: {test_scores[1]}')

Epoch 1/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 24ms/step - accuracy: 0.7244 - loss: 0.8281 - val_accuracy: 0.8780 - val_loss: 0.3421
Epoch 2/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 23ms/step - accuracy: 0.8773 - loss: 0.3484 - val_accuracy: 0.8964 - val_loss: 0.3068
Epoch 3/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 26ms/step - accuracy: 0.8929 - loss: 0.2995 - val_accuracy: 0.9074 - val_loss: 0.2614
Epoch 4/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 25ms/step - accuracy: 0.9062 - loss: 0.2645 - val_accuracy: 0.8849 - val_loss: 0.3418
Epoch 5/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 24ms/step - accuracy: 0.9126 - loss: 0.2503 - val_accuracy: 0.9061 - val_loss: 0.2677
Epoch 6/10
[1m797/797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 25ms/step - accuracy: 0.9154 - loss: 0.2433 - val_accuracy: 0.9102 - val_loss: 0.2797
Epoch 7/10
[1m7

## Takeaways

The overall takeaway is that what the architecture does is relatively simple, but the implementation is surprisingly complex, especially if you want the model to not suck (using im2col for faster training, plus batch normalization). To be honest, my implementation is more of a hodgepodge of snippets and formulas I found online and pasted in, and it worked. For the first time, I don't think I need to fully understand the mathematical details of the im2col trick.

I find batch normalization to be a cool trick, though. It essentially hands the network a per-channel scale (gamma) and shift (beta) that it can optimize by gradient descent just like its other weights, instead of leaving them as hand-tuned hyperparameters. This is only possible because the output is differentiable with respect to gamma and beta, which we can't say about something like dropout (a binary switch).