# MNIST Convolutional Neural Network (non-OOP approach)

This notebook is an assignment for the CNN module of Duke University's _Introduction to Machine Learning_ course on Coursera (great course, by the way — check it out!).

The assignment consisted of building a CNN with the following sequence of layers:

1. Image (28x28 pixels);
2. Convolution, $C_{out} = 32$;
3. (ReLU);
4. Convolution, $C_{out} = 32$;
5. (ReLU);
6. 2x2 maxpool;
7. Convolution, $C_{out} = 64$;
8. (ReLU);
9. Convolution, $C_{out} = 64$;
10. (ReLU);
11. 2x2 maxpool;
12. fully connected hidden layer $(\mathbb{R}^{256})$;
13. (ReLU);
14. fully connected output layer $(\mathbb{R}^{10})$;
15. softmax.

As a challenge to myself (and because I did not know what I was getting myself into), I decided to do it without the higher-level `nn.Module` API.



### Imports and data loading

In [87]:
import torch
import torch.nn.functional as F
import numpy as np
from torchvision import datasets, transforms

# MNIST train/test splits stored under ./datasets (download=True fetches them
# when needed); ToTensor converts every image to a tensor.
mnist_train = datasets.MNIST(
    root="./datasets",
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
mnist_test = datasets.MNIST(
    root="./datasets",
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

# Mini-batches of 100 samples; only the training split is shuffled.
train_loader = torch.utils.data.DataLoader(
    dataset=mnist_train, batch_size=100, shuffle=True
)
test_loader = torch.utils.data.DataLoader(
    dataset=mnist_test, batch_size=100, shuffle=False
)

### Constants

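Trying to calculate the resulting dimensions after successive convolutions was giving me a headache, but it turns out there’s a closed-form expression for that. For an input of size $H$, padding $P$, dilation $D$, kernel size $K$ and stride $S$, the output size $d$ is: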

$$d = \left\lfloor\frac{H + 2P-D(K-1) - 1}{S} + 1\right\rfloor$$

Credits: https://docs.pytorch.org/docs/stable/generated/torch.nn.Conv2d.html#torch.nn.Conv2d

In [None]:
def _conv_out_size(size, kernel, padding, stride, dilation=1):
    """Return the spatial size produced by a convolution over `size` pixels.

    Integer form of floor((H + 2P - D(K - 1) - 1) / S + 1).
    """
    return (size + 2 * padding - dilation * (kernel - 1) - 1) // stride + 1


# kernel sizes of the four convolutions
K1 = K2 = K3 = K4 = 3

# kernel sizes of the two max-pool steps
MP_K1 = MP_K2 = 2

# paddings of the four convolutions
P1 = P2 = P3 = P4 = 2

# strides of the four convolutions
S1 = S2 = S3 = S4 = 1

BATCH_SIZE = 100

# input: single-channel 28x28 images
IN = 1
IMG_WIDTH = 28
IMG_HEIGHT = 28
IMG_VEC = IMG_WIDTH * IMG_HEIGHT

# output calculations: channel count and spatial size after each convolution
# (only IMG_WIDTH is used, i.e. the feature maps are treated as square)
C1_OUT = 32
SDIM_OUT1 = _conv_out_size(IMG_WIDTH, K1, P1, S1)
C2_OUT = 32
# halved by the 2x2 maxpool that follows the second convolution
SDIM_OUT2 = _conv_out_size(SDIM_OUT1, K2, P2, S2) // 2
C3_OUT = 64
SDIM_OUT3 = _conv_out_size(SDIM_OUT2, K3, P3, S3)
C4_OUT = 64
# halved by the 2x2 maxpool that follows the fourth convolution
SDIM_OUT4 = _conv_out_size(SDIM_OUT3, K4, P4, S4) // 2

# widths of the two fully connected layers
FC1_OUT = 256
FC2_OUT = 10

EPOCHS = 4

### Tensor definitions

In [None]:
def _scaled_randn(*shape, fan):
    """Return a trainable leaf tensor of `shape` drawn from N(0, 1/fan).

    The scaling happens before gradient tracking is switched on, so the
    result is a leaf tensor that can be handed to an optimizer, and no
    `.data` mutation is needed.
    """
    return (torch.randn(*shape) / fan ** 0.5).requires_grad_()


# convolutional layers
# Weights have shape (out_channels, in_channels, K, K) and are scaled by
# 1/sqrt(out_channels * K^2); biases start at zero.

# convolutional layer 1
# input batches: 100 images, 1 channel, 28x28 pixels
w1 = _scaled_randn(C1_OUT, IN, K1, K1, fan=C1_OUT * K1**2)
print(w1.shape)
b1 = torch.zeros(C1_OUT, requires_grad=True)

# convolutional layer 2
w2 = _scaled_randn(C2_OUT, C1_OUT, K2, K2, fan=C2_OUT * K2**2)
b2 = torch.zeros(C2_OUT, requires_grad=True)

# convolutional layer 3
w3 = _scaled_randn(C3_OUT, C2_OUT, K3, K3, fan=C3_OUT * K3**2)
b3 = torch.zeros(C3_OUT, requires_grad=True)

# convolutional layer 4
w4 = _scaled_randn(C4_OUT, C3_OUT, K4, K4, fan=C4_OUT * K4**2)
b4 = torch.zeros(C4_OUT, requires_grad=True)


# fully connected layers
# Weights have shape (in_features, out_features) and are scaled by
# 1/sqrt(in_features).

# fc1: consumes the flattened output of the last conv/maxpool stage.
# The scale uses the full input dimension C4_OUT*SDIM_OUT4**2 (previously only
# sqrt(SDIM_OUT4)), matching how fc2 is scaled by its own input dimension.
FC1_IN = C4_OUT * SDIM_OUT4**2
w_fc1 = _scaled_randn(FC1_IN, FC1_OUT, fan=FC1_IN)
b_fc1 = torch.zeros(FC1_OUT, requires_grad=True)

# fc2: maps the hidden features to the FC2_OUT output scores
w_fc2 = _scaled_randn(FC1_OUT, FC2_OUT, fan=FC1_OUT)
b_fc2 = torch.zeros(FC2_OUT, requires_grad=True)


torch.Size([32, 1, 3, 3])


### Calculations

In [90]:
def foward(x):
    """Run the CNN on a batch of images and return the raw output scores.

    Pipeline: (conv -> ReLU) x2 -> maxpool -> (conv -> ReLU) x2 -> maxpool
    -> flatten -> fully connected -> ReLU -> fully connected.
    No softmax is applied to the returned scores.

    Uses the module-level weights (w1..w4, w_fc1, w_fc2), biases and
    stride/padding/pool-size constants.
    """
    # first pair of convolutions, then max-pooling with kernel MP_K1
    h = F.relu(F.conv2d(x, w1, bias=b1, stride=S1, padding=P1))
    h = F.relu(F.conv2d(h, w2, bias=b2, stride=S2, padding=P2))
    h = F.max_pool2d(h, kernel_size=MP_K1)

    # second pair of convolutions, then max-pooling with kernel MP_K2
    h = F.relu(F.conv2d(h, w3, bias=b3, stride=S3, padding=P3))
    h = F.relu(F.conv2d(h, w4, bias=b4, stride=S4, padding=P4))
    h = F.max_pool2d(h, kernel_size=MP_K2)

    # flatten each sample's feature maps into one row vector
    flat = h.view(-1, C4_OUT * SDIM_OUT4**2)

    # fully connected hidden layer followed by the output layer
    hidden = F.relu(flat @ w_fc1 + b_fc1)
    return hidden @ w_fc2 + b_fc2


### Training

In [None]:
# Adam updates every weight and bias tensor of the network.
parameters = [
    w1, b1,
    w2, b2,
    w3, b3,
    w4, b4,
    w_fc1, b_fc1,
    w_fc2, b_fc2,
]
optimizer = torch.optim.Adam(parameters, lr=0.001)

for e in range(1, EPOCHS + 1):
    # batches are counted from 1 so progress is reported every 100th batch
    for i, (images, labels) in enumerate(train_loader, start=1):
        if i % 100 == 0:
            print("Epoch: {} | Batch number: {}".format(e, i))

        # clear the gradients accumulated by the previous batch
        optimizer.zero_grad()

        logits = foward(images)

        # F.cross_entropy takes the raw scores: it applies log-softmax itself
        loss = F.cross_entropy(logits, labels)

        loss.backward()
        optimizer.step()

### Testing!

In [None]:
n_samples = len(mnist_test)
n_correct = 0

# gradients are not needed for evaluation
with torch.no_grad():
    # go through the mini-batches of the test set
    for images, labels in test_loader:
        # predicted class = index of the largest score in each row
        predictions = foward(images).argmax(dim=1)

        # count the predictions that match the labels
        # (n_correct becomes a 0-dim tensor after the first addition)
        n_correct += (predictions == labels).sum()

print('Test accuracy: {}'.format(n_correct/n_samples))

Test accuracy: 0.9926999807357788
