# Convolutional Neural Network: MNIST

Based on https://towardsdatascience.com/going-beyond-99-mnist-handwritten-digits-recognition-cfff96337392

In [None]:
import random
import compyute as cp

In [None]:
# Use the GPU backend when compyute reports one as available, else the CPU.
if cp.backend.gpu_available():
    device = cp.cuda
else:
    device = cp.cpu
device

## Prepare the Data

In [None]:
# ! pip install pandas

In [None]:
import pandas as pd

# download the datasets
# NOTE: the pjreddie CSV files have no header row (every line is
# "label, pix-11, pix-12, ..."). Without header=None pandas would use the
# first sample as column names and silently drop it from the data.
train_url = "https://pjreddie.com/media/files/mnist_train.csv"
train_data = pd.read_csv(train_url, header=None)
train_tensor = cp.tensor(train_data.to_numpy())

test_url = "https://pjreddie.com/media/files/mnist_test.csv"
test_data = pd.read_csv(test_url, header=None)
test = cp.tensor(test_data.to_numpy())

# split the data into train and val (ratio_test=0.0, so the third part is
# discarded; the separately downloaded test file is used for testing)
train, val, _ = cp.preprocessing.split_train_val_test(train_tensor, ratio_val=0.2, ratio_test=0.0)

# split features from targets (column 0 holds the label, the rest the pixels)
X_train, y_train = train[:, 1:], train[:, 0].to_int()
X_val, y_val = val[:, 1:], val[:, 0].to_int()
X_test, y_test = test[:, 1:], test[:, 0].to_int()

# reshape the data into an image format (B, 784) -> (B, 1, 28, 28)
X_train = cp.reshape(X_train, shape=(X_train.shape[0], 1, 28, -1)).to_float()
X_val = cp.reshape(X_val, shape=(X_val.shape[0], 1, 28, -1)).to_float()
X_test = cp.reshape(X_test, shape=(X_test.shape[0], 1, 28, -1)).to_float()

# scaling
def scale(x: cp.Tensor, mean_px: "cp.Tensor | None" = None, std_px: "cp.Tensor | None" = None) -> cp.Tensor:
    """Standardize ``x`` by subtracting a mean and dividing by a standard deviation.

    Args:
        x: Tensor to standardize.
        mean_px: Mean to subtract. When omitted, the mean over all elements
            of ``x`` is used.
        std_px: Standard deviation to divide by. When omitted, the standard
            deviation over all elements of ``x`` is used.

    Returns:
        The standardized tensor ``(x - mean_px) / std_px``.
    """
    if mean_px is None:
        mean_px = x.mean().to_type(cp.float32)
    if std_px is None:
        std_px = x.std().to_type(cp.float32)
    return (x - mean_px) / std_px


# Compute the statistics on the training split only and reuse them for the
# validation and test splits, so that all inputs are transformed identically
# and no information from held-out data enters the preprocessing.
train_mean_px = X_train.mean().to_type(cp.float32)
train_std_px = X_train.std().to_type(cp.float32)

X_train = scale(X_train, train_mean_px, train_std_px)
X_val = scale(X_val, train_mean_px, train_std_px)
X_test = scale(X_test, train_mean_px, train_std_px)

# Report the shape of every split; the output format matches f'{name.shape=}'.
for name, tensor in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_val", X_val),
    ("y_val", y_val),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f'{name}.shape={tensor.shape!r}')

## Build the Neural Network

In [None]:
from compyute import nn

# The layers are grouped by stage for readability and then unpacked into one
# flat Sequential, so the layer indices (e.g. model.layers[0]) stay the same.

# first convolutional stage
conv_stage_1 = [
    nn.Conv2D(1, 32, 5),
    nn.ReLU(),
    nn.Conv2D(32, 32, 5, bias=False),
    nn.BatchNorm2D(32),
    nn.ReLU(),
    nn.MaxPooling2D(2),
    nn.Dropout(0.25),
]

# second convolutional stage
conv_stage_2 = [
    nn.Conv2D(32, 64, 3),
    nn.ReLU(),
    nn.Conv2D(64, 64, 3, bias=False),
    nn.BatchNorm2D(64),
    nn.ReLU(),
    nn.MaxPooling2D(2),
    nn.Dropout(0.25),
]

# fully connected classifier ending in 10 outputs (one per digit class)
classifier = [
    nn.Flatten(),
    nn.Linear(576, 256, bias=False),
    nn.BatchNorm1D(256),
    nn.ReLU(),
    nn.Linear(256, 128, bias=False),
    nn.BatchNorm1D(128),
    nn.ReLU(),
    nn.Linear(128, 84, bias=False),
    nn.BatchNorm1D(84),
    nn.ReLU(),
    nn.Dropout(0.25),
    nn.Linear(84, 10),
]

model = nn.Sequential(*conv_stage_1, *conv_stage_2, *classifier)

model.to_device(device)

In [None]:
# Summarize the model for a single input of shape (1, 28, 28).
sample_shape = (1, 28, 28)
summary = cp.nn.utils.get_module_summary(model, input_shape=sample_shape)
print(summary)

## Training

In [None]:
from compyute.nn.trainer import Trainer
from compyute.nn.trainer.callbacks import History, ProgressBar
from compyute.nn.trainer.callbacks.lr_schedulers import AdaptiveLrScheduler

# callbacks that do not depend on the optimizer
history = History()
progress_bar = ProgressBar(mode="step")

optim = nn.optimizers.Adam(model.get_parameters())

# reduces the learning rate when the target metric is not improving
scheduler = AdaptiveLrScheduler(
    optim,
    target="val_loss",
    patience=2,
    lr_downscale_factor=0.2,
)

trainer = Trainer(
    model=model,
    optimizer=optim,
    loss="cross_entropy",
    metric="accuracy",
    callbacks=[history, progress_bar, scheduler],
)

In [None]:
# NOTE(review): retain_values is switched on again right before the single
# forward pass used for the visualizations further down, so enabling it
# during training looks redundant - confirm before removing.
model.retain_values = True  # keep activations for visualization

batch_size = 128
epochs = 10

trainer.train(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    val_data=(X_val, y_val),
)

Plot the training history

In [None]:
# ! pip install matplotlib

In [None]:
import matplotlib.pyplot as plt

def plot_history(t1, t2):
    """Plot two recorded traces from the global ``history`` in one figure.

    Each trace is drawn against its 1-based position in the recording and is
    labelled with its key, so the two curves can be told apart in the legend.

    Args:
        t1: Key of the first trace in ``history``.
        t2: Key of the second trace in ``history``.
    """
    # look both traces up first so a bad key fails before a figure is created
    traces = [(key, history[key]) for key in (t1, t2)]
    plt.figure(figsize=(10, 3))
    for key, trace in traces:
        plt.plot(cp.arange(start=1, stop=len(trace) + 1), trace, linewidth=1, label=key)
    plt.legend()

plot_history("loss", "accuracy_score")

## Evaluate the Model

In [None]:
# Evaluate on the test split, reusing the batch size chosen for training.
test_metrics = trainer.evaluate_model(X_test, y_test, batch_size)
loss, accuracy = test_metrics

print(f'loss {loss:.4f}')
print(f'accuracy {accuracy*100:.2f}')

In [None]:
# ! pip install scikit-learn

Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np
from compyute.nn.utils import batched

# Get model outputs for the whole test set through the `batched` wrapper
# (presumably evaluated in chunks of batch_size - confirm against compyute).
batched_model = batched(model, batch_size, device, False)
y_pred = batched_model(X_test)
probs = nn.functional.softmax(y_pred)

# predicted class = index of the largest probability along the last dim
predicted_labels = cp.argmax(probs, dim=-1).to_numpy()
true_labels = y_test.to_numpy()
class_labels = cp.unique(y_test).to_numpy()

cm = confusion_matrix(
    y_true=true_labels,
    y_pred=predicted_labels,
    labels=class_labels,
)

# tick positions/labels for the ten digit classes
r = cp.arange(10, dtype=cp.int32).to_numpy()

plt.imshow(cm, cmap="Blues")
plt.xlabel("prediction")
plt.ylabel("target")
plt.xticks(ticks=r, labels=r)
plt.yticks(ticks=r, labels=r)

# write each count into its cell; rows are targets (y), columns predictions (x)
for (row, col), count in np.ndenumerate(cm):
    plt.text(col, row, str(int(count)), ha="center", va="center")

## Explore what the Model has learned
Pick a random image from the testing dataset.

In [None]:
# index of a uniformly chosen test sample
i = random.randrange(len(X_test))

# matplotlib needs the color channel to be the last dim
sample = X_test[i]
image = cp.movedim(sample, from_dim=0, to_dim=-1)

plt.figure(figsize=(3, 3))
plt.imshow(image.data, cmap='gray')
# hide ticks and tick labels, they carry no information for an image
plt.tick_params(left=False, bottom=False, labelleft=False, labelbottom=False)

Use it to predict a number and show the probability distribution of the outcome.

In [None]:
print(f"correct label: {y_test[i].item()}")

# index with None to add a leading batch dimension, then move to the device
image_tensor = X_test[None, i].to_device(device)

# retain values so we can look at intermediates
model.retain_values = True
logits = model(image_tensor)
model.retain_values = False

# probabilities of the only sample in the batch
batch_probs = cp.nn.functional.softmax(logits)
probs = cp.squeeze(batch_probs[0])

best_class = cp.argmax(probs, dim=-1)
pred = cp.squeeze(best_class).item()

print(f"predicted label: {pred}")

# bar chart of the class probabilities (r holds the class ticks 0..9)
plt.figure(figsize=(5, 3))
plt.xticks(ticks=r)
plt.bar(r, probs.to_numpy())
plt.xlabel("class")
plt.ylabel("probability");

Every layer of the model can be accessed to explore its output. Here we iterate over all the kernels of a convolutional layer to explore what they have learned to focus on in images.

In [None]:
def plot_channels(array, label):
    """Plot every 2D map along the first dimension of ``array`` in a grid.

    Each map is drawn in gray scale, with the color range set to that map's
    own minimum and maximum, and captioned "<label> <index>" (1-based).

    Args:
        array: Tensor indexed as ``array[i, :, :]``; one map per index ``i``.
        label: Caption prefix for each subplot, e.g. "channel" or "filter".
    """
    n_cols = 8
    n_maps = array.shape[0]
    # keep the 10x8 grid for up to 80 maps; add rows (ceil division) beyond
    # that, since plt.subplot rejects an index larger than rows * cols
    n_rows = max(10, -(-n_maps // n_cols))

    plt.figure(figsize=(20, 20))
    for i in range(n_maps):
        plt.subplot(n_rows, n_cols, i + 1)
        image = array[i, :, :]
        plt.imshow(image, vmin=cp.min(image).item(), vmax=cp.max(image).item(), cmap="gray")
        # use the caller's label instead of a hard-coded "channel"
        plt.xlabel(f"{label} {i + 1}")
        plt.tick_params(left=False, bottom=False, labelleft=False, labelbottom=False)
    plt.show()

In [None]:
# first layer of the model (the first convolution)
conv1 = model.layers[0]

# retained output of that layer for the first sample, moved to the CPU
out = conv1.y[0].to_cpu()

# min-max rescale using the extrema taken along the first dimension
out_min, out_max = cp.min(out, dim=0), cp.max(out, dim=0)
out_range = out_max - out_min
out = (out - out_min) / out_range

plot_channels(out, "channel")

In [None]:
# third layer of the model (the second convolution)
conv2 = model.layers[2]

# retained output of that layer for the first sample, moved to the CPU
out2 = conv2.y[0].to_cpu()

# min-max rescale using the extrema taken along the first dimension
out_min2, out_max2 = cp.min(out2, dim=0), cp.max(out2, dim=0)
out_range2 = out_max2 - out_min2
out2 = (out2 - out_min2) / out_range2

plot_channels(out2, "channel")

Learned filters

In [None]:
# collapse dim 1 of the first convolution's weights by summation, on the CPU
weights1 = cp.sum(conv1.w, dim=1).to_cpu()

# min-max rescale using the extrema taken along the first dimension
weights_min1, weights_max1 = cp.min(weights1, dim=0), cp.max(weights1, dim=0)
weights_range1 = weights_max1 - weights_min1
weights1 = (weights1 - weights_min1) / weights_range1

plot_channels(weights1, "filter")

In [None]:
# collapse dim 1 of the second convolution's weights by summation, on the CPU
weights2 = cp.sum(conv2.w, dim=1).to_cpu()

# min-max rescale using the extrema taken along the first dimension
weights_min2, weights_max2 = cp.min(weights2, dim=0), cp.max(weights2, dim=0)
weights_range2 = weights_max2 - weights_min2
weights2 = (weights2 - weights_min2) / weights_range2

plot_channels(weights2, "filter")