# Project: Example of Training a CNN using the MNIST dataset

Below is a PyTorch implementation of a CNN modeling the MNIST dataset. The network has two convolution layers, each followed by a pooling layer. After the convolution layers there is one fully connected hidden layer followed by an output layer. I added batch normalization layers to help with convergence. The ReLU activation function is used.

After training, the reclassification, test, and 5-fold CV accuracies are calculated.

First, a check is done for GPU availability.

In [26]:
import logging
# Configure the root logger so that only records of level ERROR and above are reported.
logging.basicConfig(level=logging.ERROR)

In [1]:
%%time
import numpy as np
import torch
import torchvision.transforms as tt
from torchvision.datasets import ImageFolder

# Report the installed PyTorch build and whether CUDA can be used.
print(f'PyTorch version= {torch.__version__}')
print(f'CUDA available= {torch.cuda.is_available()}')
# Select the first CUDA device when CUDA is available; otherwise fall back to the CPU.
# The rest of the notebook places the model and the data batches on this device.
device_type = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'Number of GPUs: {torch.cuda.device_count()}')

PyTorch version= 2.2.1+cu121
CUDA available= True
Number of GPUs: 1
CPU times: user 4.56 s, sys: 698 ms, total: 5.26 s
Wall time: 14.7 s


Below is a utility function used to compute the output dimensions of the convolution layers, which are needed to size the layers in the network.

In [3]:
from math import floor
# Calculate output dimensions of convolution layer for 2D matrix.
# in_d - input dimensions, e.g. (28, 28)
# pad_d - padding dimensions for top and bottom and left and right, eg. ((0,0),(0,0))
# k_d - kernel dimensions, e.g. (3,3)
# str_d - stride dimensions, e.g. (2,2)
# pool_d - dimension of the square pooling filter, e.g. 2
def conv_output(in_d, pad_d, k_d, str_d, pool_d=None):
    """Return the (height, width) produced by a 2D convolution, optionally pooled.

    Args:
        in_d: input dimensions as (height, width), e.g. (28, 28).
        pad_d: padding as ((top, bottom), (left, right)), e.g. ((0, 0), (0, 0)).
        k_d: kernel dimensions as (height, width), e.g. (3, 3).
        str_d: stride as (vertical, horizontal), e.g. (2, 2).
        pool_d: side length of a square pooling filter applied after the
            convolution, e.g. 2. ``None`` (the default) means no pooling.

    Returns:
        A tuple ``(height, width)`` of integers.
    """
    height = floor((in_d[0] + pad_d[0][0] + pad_d[0][1] - k_d[0]) / str_d[0] + 1)
    width = floor((in_d[1] + pad_d[1][0] + pad_d[1][1] - k_d[1]) / str_d[1] + 1)
    if pool_d is not None:
        # A pooling window that does not fit completely is dropped, so the size
        # is floored. Plain division would yield fractional sizes (e.g. 13.5)
        # that corrupt the input of any following size computation.
        height, width = floor(height / pool_d), floor(width / pool_d)
    return (height, width)

Next, some configuration for the network is set.

In [4]:
# Input images: 28x28 pixels with a single channel.
IMG_SIZE = (28, 28)
IMG_CHANNELS = 1

# Convolution and pooling settings shared by both convolution stages.
KERNEL = (5, 5)
STRIDE = (1, 1)
POOL = 2
# Feature maps produced by the first convolution (the second one doubles it).
OUT_FEATURE_MAPS = 16

# Fully connected part of the network.
LAST_LINEAR_SIZE = 1024  # 1024 is arbitrary
N_CLASSES = 10  # output layer size

# Mini-batch size used by the data loaders.
BATCH_SIZE = 5000

Using the `conv_output` function defined above (originally created for Problem 3), the output dimensions of the last convolution layer are calculated. These are needed to set the input size of the first fully connected layer.

In [5]:
# Output size of the first convolution + pooling stage. Zero padding is passed,
# matching the nn.Conv2d layers of the model, which are created without a padding argument.
l1_out = conv_output(IMG_SIZE, ((0,0),(0,0)), KERNEL, STRIDE, POOL)
print(f'Conv. 1 output dimension: {l1_out}')
# The second stage takes the first stage's output size as its input size.
# l2_out is read by the model to size its first fully connected layer.
l2_out = conv_output((l1_out[0],l1_out[1]), ((0,0),(0,0)), KERNEL, STRIDE, POOL)
print(f'Conv. 2 output dimension: {l2_out}')

Conv. 1 output dimension: (12.0, 12.0)
Conv. 2 output dimension: (4.0, 4.0)


The CNN model is defined below.

In [17]:
import torch.nn as nn
from tqdm.notebook import tqdm

class MNIST_CNN_v1(torch.nn.Sequential):
    """CNN classifier assembled as an ``nn.Sequential``.

    Architecture (built by ``init_layers``): two stages of
    ``Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d``, a flatten, one hidden
    ``Linear`` layer with ReLU, and a final ``Linear`` layer producing
    ``N_CLASSES`` raw scores. Layer sizes come from the module-level
    configuration constants and from ``l2_out``; every layer is placed on
    ``device_type``.

    Args:
        epochs: number of passes over the training data made by ``fit``.
        eta: learning rate handed to the optimizer in ``fit``.
        seed: seed of the NumPy ``RandomState`` stored in ``self.random``.
            The methods of this class do not use that generator, and it does
            not influence PyTorch's weight initialisation.
    """

    def __init__(self, epochs=100, eta=0.001, seed=0):
        super(MNIST_CNN_v1, self).__init__()

        self.random = np.random.RandomState(seed)  # shuffle mini batches
        self.epochs = epochs
        self.eta = eta
        self.optimizer = None  # created in fit()

        self.init_layers()

    def init_layers(self):
        """Create all layers and append them, in order, to this Sequential."""

        print('Using MNISTConv_v1 init_layers... ')

        # The second convolution doubles the number of feature maps.
        conv2_maps = OUT_FEATURE_MAPS * 2

        self.append(nn.Conv2d(IMG_CHANNELS, OUT_FEATURE_MAPS, KERNEL[0]).to(device_type))
        self.append(nn.BatchNorm2d(OUT_FEATURE_MAPS).to(device_type))
        self.append(nn.ReLU().to(device_type))
        self.append(nn.MaxPool2d(POOL).to(device_type))

        self.append(nn.Conv2d(OUT_FEATURE_MAPS, conv2_maps, KERNEL[0]).to(device_type))
        self.append(nn.BatchNorm2d(conv2_maps).to(device_type))
        self.append(nn.ReLU().to(device_type))
        self.append(nn.MaxPool2d(POOL).to(device_type))

        self.append(nn.Flatten(start_dim=1).to(device_type))

        # Flattened size = channels of the last convolution * its spatial size.
        # The channel count is conv2_maps; it must not be derived from POOL,
        # which only happens to equal the doubling factor when POOL == 2.
        flat_features = conv2_maps * int(l2_out[0]) * int(l2_out[1])
        self.append(nn.Linear(flat_features, LAST_LINEAR_SIZE).to(device_type))
        self.append(nn.ReLU().to(device_type))
        self.append(nn.Linear(LAST_LINEAR_SIZE, N_CLASSES).to(device_type))

    def predict(self, _X):
        """Return the predicted class index for every sample of the batch ``_X``.

        The batch is moved to ``device_type`` and evaluated in eval mode with
        gradients disabled. The train/eval mode the model had before the call
        is restored afterwards.

        Returns:
            A NumPy array holding, per sample, the index of the largest score
            along dimension 1 of the network output.
        """
        was_training = self.training
        _X = _X.to(device_type, non_blocking=True)
        self.eval()
        try:
            with torch.no_grad():
                y_pred = self(_X).argmax(dim=1)
        finally:
            # Restore the caller's mode instead of forcing train mode, so an
            # explicit model.eval() made by the caller survives this call.
            self.train(was_training)
        return y_pred.cpu().numpy()

    def fit(self, _X_train_dl, info=False):
        """Train the network for ``self.epochs`` epochs.

        Args:
            _X_train_dl: iterable yielding ``(inputs, targets)`` batches.
            info: when True, print an epoch header to stdout and write the
                loss of every batch to stderr.

        Returns:
            ``self``, so calls can be chained.
        """
        import sys

        # Make sure layers such as BatchNorm run in training mode even if the
        # model was switched to eval mode before this call.
        self.train()

        self.optimizer = torch.optim.Rprop(self.parameters(), lr=self.eta)
        loss_func = torch.nn.CrossEntropyLoss()

        for i in tqdm(range(self.epochs), desc='epochs', leave=False):
            if info:
                print(f'Epoc {i+1}...',end='\n')

            for _X, _y in tqdm(_X_train_dl, desc='batches', leave=False):
                _X = _X.to(device_type, non_blocking=True)
                _y = _y.to(device_type, non_blocking=True)
                self.optimizer.zero_grad()
                net_out = self(_X)
                loss = loss_func(net_out, _y)
                loss.backward()
                self.optimizer.step()
                if info:
                    # "\r" rewrites the same stderr line for every batch.
                    sys.stderr.write(f"\r{i+1:03d} Loss: {loss.item():6.5f}")
                    sys.stderr.flush()
        return self

Below, data-loaders are defined for the MNIST data. A normalization transformation is applied to the data.

In [18]:
# Get the training and testing datasets from a path with resize and normalization
import torchvision.datasets as dset
import torchvision.transforms as vtransforms
def get_dataloader(_img_size, _bs, _ds, _path):
    """Create training and test datasets plus their data loaders.

    Args:
        _img_size: size handed to ``Resize``.
        _bs: batch size of both loaders.
        _ds: dataset class, called as
            ``_ds(root=..., download=..., train=..., transform=...)``.
        _path: root directory passed to the dataset class.

    Returns:
        The tuple ``(train_dl, train_ds, test_dl, test_ds)``.
    """
    # Both splits use the same pipeline: resize, convert to a tensor, then
    # normalise with mean 0.5 and standard deviation 0.5.
    transform = vtransforms.Compose([
        vtransforms.Resize(_img_size),
        vtransforms.ToTensor(),
        vtransforms.Normalize((0.5,), (0.5,))
    ])

    # Pin host memory unless running on the CPU. Comparing a torch.device
    # with the string 'cpu' is not expected to be True, so the device's
    # ``type`` attribute is inspected instead.
    pin = torch.device(device_type).type != 'cpu'

    # Only the training split is created with download=True.
    train_ds = _ds(root=_path, download=True, train=True, transform=transform)
    train_dl = torch.utils.data.DataLoader(train_ds, batch_size=_bs, shuffle=True,
                                           pin_memory=pin,
                                           num_workers=2)

    test_ds = _ds(root=_path, download=False, train=False, transform=transform)
    test_dl = torch.utils.data.DataLoader(test_ds, batch_size=_bs, shuffle=True,
                                          pin_memory=pin,
                                          num_workers=2)

    return train_dl, train_ds, test_dl, test_ds

In [14]:
# Download the MNIST files and create the training and test datasets.
def get_dl_mnist(_img_size, _bs):
    """Return MNIST loaders and datasets from get_dataloader, rooted at ./mnist."""
    mnist_root = './mnist'
    return get_dataloader(_img_size, _bs, _ds=dset.MNIST, _path=mnist_root)

The training and test data is loaded below.

In [15]:
# Only IMG_SIZE[0] (a single int) is passed as the image size, so the resize
# transform receives an int rather than the (height, width) tuple.
train_dl, train_ds, test_dl, test_ds = get_dl_mnist(IMG_SIZE[0], BATCH_SIZE)

Finally, the model is trained. The output below displays the loss calculated on the final iteration of training for an epoch.

In [19]:
%%time
# Build the CNN (7 epochs, learning rate 0.001), place it on the selected
# device and train it. fit() returns the model itself, so mlp1 keeps
# referring to the same network; info=True prints the per-batch loss.
mlp1 = MNIST_CNN_v1(epochs=7, eta=0.001)
mlp1 = mlp1.to(device_type)
mlp1 = mlp1.fit(train_dl, info=True)

Using MNISTConv_v1 init_layers... 


epochs:   0%|          | 0/7 [00:00<?, ?it/s]

Epoc 1...


batches:   0%|          | 0/12 [00:00<?, ?it/s]

001 Loss: 0.22686

Epoc 2...


batches:   0%|          | 0/12 [00:00<?, ?it/s]

002 Loss: 0.09645

Epoc 3...


batches:   0%|          | 0/12 [00:00<?, ?it/s]

003 Loss: 0.07167

Epoc 4...


batches:   0%|          | 0/12 [00:00<?, ?it/s]

004 Loss: 0.05531

Epoc 5...


batches:   0%|          | 0/12 [00:00<?, ?it/s]

005 Loss: 0.04738

Epoc 6...


batches:   0%|          | 0/12 [00:00<?, ?it/s]

006 Loss: 0.04585

Epoc 7...


batches:   0%|          | 0/12 [00:00<?, ?it/s]

007 Loss: 0.04849

CPU times: user 3.09 s, sys: 1.02 s, total: 4.11 s
Wall time: 1min 43s


The model's reclassification accuracy is calculated below.

In [27]:
%%time
from sklearn.metrics import accuracy_score
# Count correct predictions over the whole training set instead of averaging
# per-batch accuracies: an unweighted mean of batch accuracies is biased
# whenever the last batch is smaller than the others.
n_correct = 0
n_samples = 0
for X, y in train_dl:
    y_pred = mlp1.predict(X)
    # normalize=False returns the number of correctly classified samples.
    n_correct += accuracy_score(y, y_pred, normalize=False)
    n_samples += len(y)
print(f'Reclassification accuracy: {n_correct / n_samples:.6f}')

  self.pid = os.fork()
  self.pid = os.fork()


Reclassification accuracy: 0.986417
CPU times: user 229 ms, sys: 165 ms, total: 394 ms
Wall time: 13.1 s


Next, the test accuracy is calculated.

In [21]:
%%time
from sklearn.metrics import accuracy_score
# Count correct predictions over the whole test set instead of averaging
# per-batch accuracies: an unweighted mean of batch accuracies is biased
# whenever the last batch is smaller than the others.
n_correct = 0
n_samples = 0
for X, y in test_dl:
    y_pred = mlp1.predict(X)
    # normalize=False returns the number of correctly classified samples.
    n_correct += accuracy_score(y, y_pred, normalize=False)
    n_samples += len(y)
print(f'Test accuracy: {n_correct / n_samples:.6f}')

  self.pid = os.fork()
  self.pid = os.fork()


Test accuracy: 0.985200
CPU times: user 53.4 ms, sys: 58.6 ms, total: 112 ms
Wall time: 2.17 s


Ad-hoc experimenting with dropout layers showed no increase in either reclassification or test accuracy, so they were left out.

To get a better idea of generalization of the model, 5-fold CV is done below.

In [28]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Number of training epochs used for each cross-validation model (read by get_mode_v1).
epocs = 5
# Alias of the device selected at the top of the notebook.
device = device_type

def run_k_fold(get_model, k_folds=5, batch_size=5000):
    """Run k-fold cross-validation on ``train_ds`` and print the accuracies.

    For every fold a fresh model is obtained from ``get_model``, trained on
    the fold's training indices and scored on its held-out indices.

    Args:
        get_model: callable taking the zero-based fold index and returning a
            model that offers ``fit(loader, info=...)``, ``eval()`` and
            ``predict(batch)``.
        k_folds: number of folds.
        batch_size: batch size of the per-fold data loaders.

    Prints the mean and standard deviation of the per-batch accuracies of
    each fold, then of all batches of all folds.
    NOTE(review): every batch has equal weight in these means, so a smaller
    final batch counts as much as a full one.
    """
    # Initialize the k-fold cross validation
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=0)

    # Per-batch accuracies of all folds, kept flat so the summary statistics
    # also work when folds yield different numbers of batches.
    accuracy_results = []

    # Loop through each fold
    for fold, (train_idx, test_idx) in enumerate(kf.split(train_ds)):
        print(f"Fold {fold + 1}")
        print("-------")

        # Both loaders draw from train_ds; the samplers restrict them to the
        # fold's training and held-out indices respectively.
        train_loader = torch.utils.data.DataLoader(
            dataset=train_ds,
            batch_size=batch_size,
            sampler=torch.utils.data.SubsetRandomSampler(train_idx),
        )
        test_loader = torch.utils.data.DataLoader(
            dataset=train_ds,
            batch_size=batch_size,
            sampler=torch.utils.data.SubsetRandomSampler(test_idx),
        )

        # A new, untrained model for every fold
        model = get_model(fold)

        # Train the model on the current fold
        model.fit(train_loader, info=False)

        # Evaluate the model on the held-out part of the fold
        model.eval()
        fold_accuracy = []
        with torch.no_grad():
            for data, target in test_loader:
                output = model.predict(data.to(device))
                # target is scored as delivered by the loader; it needs no
                # trip to the device and back.
                fold_accuracy.append(accuracy_score(target, output))
        accuracy_results.extend(fold_accuracy)

        # Print the results for the current fold
        print(f"Average Fold Accuracy: {np.mean(fold_accuracy):0.6f}, {chr(177)} {np.std(fold_accuracy):.6f}")
    # Report the number of folds actually run (previously hard-coded as "10").
    print(f'{k_folds}-fold CV Average Accuracy: {np.mean(accuracy_results):.6f} {chr(177)} {np.std(accuracy_results):.6f}')

In [29]:
%%time
def get_mode_v1(fold):
    """Build an untrained MNIST_CNN_v1 for one fold, seeded with the fold index."""
    net = MNIST_CNN_v1(seed=fold, eta=0.001, epochs=epocs)
    return net.to(device)

run_k_fold(get_mode_v1)

Fold 1
-------
Using MNISTConv_v1 init_layers... 


epochs:   0%|          | 0/5 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

Average Fold Accuracy: 0.981867, ± 0.000822
Fold 2
-------
Using MNISTConv_v1 init_layers... 


epochs:   0%|          | 0/5 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

Average Fold Accuracy: 0.979033, ± 0.001775
Fold 3
-------
Using MNISTConv_v1 init_layers... 


epochs:   0%|          | 0/5 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

Average Fold Accuracy: 0.978833, ± 0.003466
Fold 4
-------
Using MNISTConv_v1 init_layers... 


epochs:   0%|          | 0/5 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

Average Fold Accuracy: 0.980267, ± 0.000772
Fold 5
-------
Using MNISTConv_v1 init_layers... 


epochs:   0%|          | 0/5 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/10 [00:00<?, ?it/s]

Average Fold Accuracy: 0.978833, ± 0.001658
10-fold CV Average Accuracy: 0.979767 ± 0.002286
CPU times: user 5min 17s, sys: 1.15 s, total: 5min 18s
Wall time: 5min 22s


Per the results above, the model is expected to generalize well to unseen data.