In [None]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, ConcatDataset, random_split
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [None]:
# Load in the data
mnist_train = datasets.MNIST("./data", train=True, download=True, transform=transforms.ToTensor())
mnist_test = datasets.MNIST("./data", train=False, download=True, transform=transforms.ToTensor())

# Only use 0's and 1's.
# NOTE: the filtering must happen BEFORE the datasets are combined and split.
# ConcatDataset caches the dataset lengths when it is constructed, so shrinking
# the underlying data afterwards would leave the split with out-of-range indices.
train_idx = mnist_train.targets <= 1  # boolean mask keeping labels 0 and 1
mnist_train.data = mnist_train.data[train_idx]
mnist_train.targets = mnist_train.targets[train_idx]

test_idx = mnist_test.targets <= 1  # boolean mask keeping labels 0 and 1
mnist_test.data = mnist_test.data[test_idx]
mnist_test.targets = mnist_test.targets[test_idx]

# Pool the filtered training and test images, then re-split them 80% / 20%
combined_dataset = ConcatDataset([mnist_train, mnist_test])
train_size = int(0.8 * len(combined_dataset))   # 80% of the data
test_size = len(combined_dataset) - train_size  # The remaining 20% for testing
train_dataset, test_dataset = random_split(combined_dataset, [train_size, test_size])

# Make data loaders from the 80/20 split (not from MNIST's original split)
train_loader = DataLoader(train_dataset, batch_size=100, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 16191419.32it/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 488517.35it/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 4435569.64it/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 3826175.69it/s]


Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw



In [None]:
# do a single pass over the data
def epoch(loader, model, opt=None):
    """Run one pass over ``loader`` and return ``(error_rate, mean_loss)``.

    Every batch is flattened to ``(batch, features)`` before it is given to
    ``model``; column 0 of the model output is used as the logit for label 1.
    A logit greater than 0 is read as a prediction of 1, anything else as 0.

    Args:
        loader: Iterable of ``(X, y)`` batches that exposes a sized
            ``dataset`` attribute. ``y`` is expected to hold 0/1 labels.
        model: Callable mapping the flattened batch to a 2-D output.
        opt: Optional optimizer. When given, the model is updated after each
            batch; when ``None`` the pass only evaluates and no gradients
            are tracked.

    Returns:
        Tuple ``(total_err / N, total_loss / N)`` with
        ``N = len(loader.dataset)``.
    """
    criterion = nn.BCEWithLogitsLoss()  # built once instead of once per batch
    training = opt is not None
    total_loss, total_err = 0., 0.
    for X, y in loader:
        # Only build the autograd graph when a weight update will follow.
        with torch.set_grad_enabled(training):
            y_hat = model(X.view(X.shape[0], -1))[:, 0]  # Make prediction
            loss = criterion(y_hat, y.float())           # Calculate loss

        if training:   # Perform gradient descent
            opt.zero_grad()   # Prevent gradient accumulation
            loss.backward()   # Calculate gradient
            opt.step()        # Make a step on the weights

        # A logit of exactly 0 is a prediction of 0, so it is an error for
        # label 1 (hence `<=`, matching epoch2).
        wrong = ((y_hat > 0) & (y == 0)) | ((y_hat <= 0) & (y == 1))
        total_err += wrong.sum().item()
        total_loss += loss.item() * X.shape[0]

    return total_err / len(loader.dataset), total_loss / len(loader.dataset)


**Prob 1:**

Train a 2-layer linear network with ReLU activations to identify 0s and 1s from MNIST. What did you use as the training set (what percentage)?  How 'wide' was your network?



In [None]:
################ Prob 1 ################

class TwoLayerModel(nn.Module):
    """Fully connected network 784 -> 512 -> 256 -> 1.

    ReLU follows each of the first two linear layers; the output of the
    last linear layer is returned without an activation.
    """

    def __init__(self):
        super().__init__()
        # Two hidden layers, then a single output unit.
        self.linear1 = nn.Linear(784, 512)
        self.linear2 = nn.Linear(512, 256)
        self.linear3 = nn.Linear(256, 1)

    def forward(self, x):
        """Map an input whose last dimension is 784 to one output value."""
        for hidden in (self.linear1, self.linear2):
            x = F.relu(hidden(x))
        return self.linear3(x)

# Fresh model plus a plain SGD optimizer over all of its weights.
twoLayerModel = TwoLayerModel()
opt = optim.SGD(twoLayerModel.parameters(), lr=0.1)

# One tab-separated row per epoch: a training pass (with weight updates)
# followed by an evaluation pass on the test loader.
header = ("Train Err", "Train Loss", "Test Err", "Test Loss")
print(*header, sep="\t")
for _ in range(10):
    train_err, train_loss = epoch(train_loader, twoLayerModel, opt)
    test_err, test_loss = epoch(test_loader, twoLayerModel)
    metrics = (train_err, train_loss, test_err, test_loss)
    print(*map("{:.6f}".format, metrics), sep="\t")

Train Err	Train Loss	Test Err	Test Loss
0.010896	0.105598	0.000946	0.006438
0.001895	0.008260	0.000473	0.003616
0.001579	0.005896	0.000473	0.002884
0.001184	0.004803	0.000473	0.002574
0.001184	0.004209	0.000473	0.002594
0.001026	0.003647	0.000946	0.002561
0.001026	0.003399	0.000473	0.002367
0.000790	0.003084	0.000946	0.002162
0.000711	0.002821	0.000946	0.002142
0.000632	0.002677	0.000946	0.002112


In the above cell is the code I used to train a two-layer network on MNIST. For some context, the MNIST dataset is loaded in before with an 80%-20% training-test split. The network applies the ReLU activation function after each hidden layer (but not after the final layer). The hidden layers have widths of 512 and 256 units.

As you can see, the training error steadily decreased. But the test error did not get below 0.000473. So, it was very accurate but not perfect. This is likely due to the simple nature of the dataset. If our data were more complicated, our model would likely not have worked as well.

Note: I assume that "2-layer" means 2 hidden layers and one final layer. If not, the changes I would need to make to the code above to use a single hidden layer instead are pretty trivial anyway (see below).

**Prob 2:**

Now try to train a deeper network (4+ layers) to do the same thing, but this time feel free to use different activation functions.  Can you make the size of each layer smaller, i.e. make it less wide?

In [None]:
################ Prob 2 ################

class FiveLayerModel(nn.Module):
    """Fully connected network 784 -> 1024 -> 512 -> 256 -> 128 -> 1.

    Each hidden layer is followed by a different activation (hardtanh,
    leaky ReLU, sigmoid, hardswish, in that order); the output of the last
    linear layer is returned without an activation.
    """

    def __init__(self):
        super().__init__()
        # Consecutive pairs of widths define linear1 .. linear5, in order.
        widths = (784, 1024, 512, 256, 128, 1)
        for idx, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, "linear{}".format(idx), nn.Linear(fan_in, fan_out))

    def forward(self, x):
        """Map an input whose last dimension is 784 to one output value."""
        stages = (
            (self.linear1, F.hardtanh),
            (self.linear2, F.leaky_relu),
            (self.linear3, F.sigmoid),
            (self.linear4, F.hardswish),
        )
        for layer, activation in stages:
            x = activation(layer(x))
        return self.linear5(x)

# Fresh model plus a plain SGD optimizer over all of its weights.
fiveLayerModel = FiveLayerModel()
opt = optim.SGD(fiveLayerModel.parameters(), lr=0.1)

# One tab-separated row per epoch: a training pass (with weight updates)
# followed by an evaluation pass on the test loader.
header = ("Train Err", "Train Loss", "Test Err", "Test Loss")
print(*header, sep="\t")
for _ in range(10):
    train_err, train_loss = epoch(train_loader, fiveLayerModel, opt)
    test_err, test_loss = epoch(test_loader, fiveLayerModel)
    metrics = (train_err, train_loss, test_err, test_loss)
    print(*map("{:.6f}".format, metrics), sep="\t")

Train Err	Train Loss	Test Err	Test Loss
0.432452	0.675094	0.051064	0.577297
0.008843	0.101627	0.002364	0.008344
0.002764	0.009281	0.000946	0.004433
0.002369	0.006711	0.000473	0.003092
0.001816	0.005374	0.000473	0.002454
0.001579	0.004465	0.000473	0.002182
0.001263	0.003766	0.000473	0.002093
0.001421	0.003387	0.000473	0.001873
0.000869	0.002815	0.000473	0.001870
0.000790	0.002721	0.000473	0.001691


This time I trained a deeper network (with 5 layers) on the same data. For fun, I threw in a bunch of different activation functions (hardtanh, then leaky ReLU, then sigmoid, then hardswish). Also, each hidden layer got less and less wide (with the idea that they will learn more global features later on in the network).

Since the data is easily separable, we have similar testing errors to the previous model. However, one thing to note is that the training error is less than before, meaning that the model is doing better at classifying images that it trains on.

**Prob 3:**

Now train a 2-layer convolutional network to do the same thing as the previous 2 problems.  Does this train more efficiently?

In [None]:
# do a single pass over the data
def epoch2(loader, model, opt=None):   # Variant of epoch() for 2d (image) input
    """Run one pass over ``loader`` and return ``(error_rate, mean_loss)``.

    Batches are given to ``model`` unchanged (no flattening); dimension 1 of
    the model output is squeezed away to obtain one logit per sample. A
    logit greater than 0 is read as a prediction of 1, anything else as 0.

    Args:
        loader: Iterable of ``(X, y)`` batches that exposes a sized
            ``dataset`` attribute. ``y`` is expected to hold 0/1 labels.
        model: Callable mapping a batch ``X`` to an output whose dimension 1
            can be squeezed.
        opt: Optional optimizer. When given, the model is updated after each
            batch; when ``None`` the pass only evaluates and no gradients
            are tracked.

    Returns:
        Tuple ``(total_err / N, total_loss / N)`` with
        ``N = len(loader.dataset)``.
    """
    loss_fn = nn.BCEWithLogitsLoss()  # built once instead of once per batch
    do_update = opt is not None
    total_loss, total_err = 0., 0.
    for X, y in loader:
        # Skip autograd bookkeeping on evaluation-only passes.
        with torch.set_grad_enabled(do_update):
            y_hat = model(X).squeeze(1)       # Make prediction
            loss = loss_fn(y_hat, y.float())  # Calculate loss

        if do_update:    # Perform gradient descent
            opt.zero_grad()   # Prevent gradient accumulation
            loss.backward()   # Calculate gradient
            opt.step()        # Make a step on the weights

        # error for 0 and 1 targets
        wrong = ((y_hat > 0) & (y == 0)) | ((y_hat <= 0) & (y == 1))
        total_err += wrong.sum().item()
        total_loss += loss.item() * X.shape[0]

    return total_err / len(loader.dataset), total_loss / len(loader.dataset)

In [None]:
################ Prob 3 ################

class TwoLayerConvModel(nn.Module):
    """Two conv layers (each followed by ReLU and 2x2 max pooling) plus one linear layer.

    The convolutions keep the spatial size (kernel 5, padding 2) and each
    pooling step halves it, so a 1x28x28 input becomes a 32x7x7 feature map
    that the final linear layer turns into a single output value.
    """

    def __init__(self):   # Initialize the layers
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5, stride=1, padding=2)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=2)
        self.fc1 = nn.Linear(32*7*7, 1)

    def forward(self, x): # Pass input through
        """Return one output value per input image.

        Raises:
            RuntimeError: if the feature map after the second pooling step is
                not 32x7x7 (i.e. the input has the wrong spatial size).
        """
        # Functional pooling avoids constructing a new MaxPool2d module on every call.
        x = F.max_pool2d(F.relu(self.conv1(x)), kernel_size=2, stride=2)  # conv 1 + pooling
        x = F.max_pool2d(F.relu(self.conv2(x)), kernel_size=2, stride=2)  # conv 2 + pooling

        # Without this check, view(-1, ...) would silently fold surplus spatial
        # elements into the batch dimension for wrongly sized inputs.
        if tuple(x.shape[-3:]) != (32, 7, 7):
            raise RuntimeError(
                "TwoLayerConvModel expected a 32x7x7 feature map (e.g. from a "
                "1x28x28 input) but got {}".format(tuple(x.shape[-3:]))
            )
        x = x.view(-1, 32*7*7)     # Flatten the output from the conv layers
        return self.fc1(x)    # Fully connected layer

# Fresh model plus a plain SGD optimizer over all of its weights.
twoLayerConvModel = TwoLayerConvModel()
opt = optim.SGD(twoLayerConvModel.parameters(), lr=0.1)

# One tab-separated row per epoch: a training pass (with weight updates)
# followed by an evaluation pass on the test loader.
header = ("Train Err", "Train Loss", "Test Err", "Test Loss")
print(*header, sep="\t")
for _ in range(10):
    train_err, train_loss = epoch2(train_loader, twoLayerConvModel, opt)
    test_err, test_loss = epoch2(test_loader, twoLayerConvModel)
    metrics = (train_err, train_loss, test_err, test_loss)
    print(*map("{:.6f}".format, metrics), sep="\t")

Train Err	Train Loss	Test Err	Test Loss
0.014133	0.052054	0.001891	0.004624
0.002448	0.008310	0.000946	0.002688
0.001895	0.005422	0.000946	0.002227
0.001500	0.004253	0.000946	0.002132
0.001342	0.003377	0.000946	0.002482
0.000711	0.003031	0.000946	0.002053
0.000790	0.002659	0.000946	0.002180
0.000790	0.002573	0.000946	0.001900
0.000790	0.002440	0.000946	0.002045
0.000711	0.002274	0.000946	0.002086


This time, I trained a two-layer convolutional neural network (with a fully connected layer at the end to classify). The first layer outputs 16 panels and the second 32. I chose the kernel size of the convolutions to be 5. Also, I implemented two pooling layers, with a stride and kernel size of 2. To be honest, these choices were somewhat arbitrary.

In this training, we got the training error to be lower than it was for the previous networks. But, to my surprise, the test error was actually a bit larger at the end. This means that it likely overfit the data (or at least more than the previous models). I assume that this model would work better if we were to use a larger data set.

**Prob 4:**

Now consider a 'deep' convolutional network, i.e. one with 4+ layers.  How does this change things?

In [None]:
################ Prob 4 ################

class FourLayerConvModel(nn.Module):
    """Deep CNN: six conv layers (each followed by ReLU) plus two linear layers.

    Despite the class name there are six convolutions and no pooling. With
    the kernel sizes and paddings below, a 1x28x28 input has spatial sizes
    26, 26, 26, 26, 28 and 30 after the six convolutions, which gives the
    32x30x30 feature map that ``fc1`` is sized for.
    """

    def __init__(self):   # Initialize the layers
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=4, kernel_size=7, stride=1, padding=2)
        self.conv2 = nn.Conv2d(in_channels=4, out_channels=4, kernel_size=5, stride=1, padding=2)
        self.conv3 = nn.Conv2d(in_channels=4, out_channels=8, kernel_size=5, stride=1, padding=2)
        self.conv4 = nn.Conv2d(in_channels=8, out_channels=16, kernel_size=5, stride=1, padding=2)
        self.conv5 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, stride=1, padding=2)
        self.conv6 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=2)
        self.fc1 = nn.Linear(32*30*30, 64)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x): # Pass input through
        """Return one output value per input image.

        Raises:
            RuntimeError: if the feature map after the last convolution is
                not 32x30x30 (i.e. the input has the wrong spatial size).
        """
        convs = (self.conv1, self.conv2, self.conv3, self.conv4, self.conv5, self.conv6)
        for conv in convs:   # Convolutional layers 1-6, ReLU after each
            x = F.relu(conv(x))

        # Without this check, view(-1, ...) would silently fold surplus spatial
        # elements into the batch dimension for wrongly sized inputs.
        if tuple(x.shape[-3:]) != (32, 30, 30):
            raise RuntimeError(
                "FourLayerConvModel expected a 32x30x30 feature map (from a "
                "1x28x28 input) but got {}".format(tuple(x.shape[-3:]))
            )
        x = x.view(-1, 32*30*30)   # Flatten out
        x = F.relu(self.fc1(x))     # Fully connected layer 1
        return self.fc2(x)    # Fully connected layer 2

# Fresh model plus a plain SGD optimizer over all of its weights.
fourLayerConvModel = FourLayerConvModel()
opt = optim.SGD(fourLayerConvModel.parameters(), lr=0.1)

# One tab-separated row per epoch: a training pass (with weight updates)
# followed by an evaluation pass on the test loader.
header = ("Train Err", "Train Loss", "Test Err", "Test Loss")
print(*header, sep="\t")
for _ in range(10):
    train_err, train_loss = epoch2(train_loader, fourLayerConvModel, opt)
    test_err, test_loss = epoch2(test_loader, fourLayerConvModel)
    metrics = (train_err, train_loss, test_err, test_loss)
    print(*map("{:.6f}".format, metrics), sep="\t")

Train Err	Train Loss	Test Err	Test Loss
0.458508	0.688395	0.153664	0.623597
0.089143	0.168663	0.002364	0.006737
0.003079	0.011256	0.001891	0.004425
0.001342	0.005204	0.001891	0.003893
0.001263	0.004473	0.001891	0.003913
0.001184	0.003671	0.001891	0.002525
0.000790	0.002955	0.001418	0.003153
0.000790	0.002442	0.001891	0.003338
0.000790	0.002308	0.001891	0.002443
0.000632	0.001473	0.000946	0.001760


This time I trained a deep convolutional network. It had 6 convolutional layers where the kernels gradually decreased in size and the number of panels gradually increased. Finally, the network finishes with a two-layer neural network. To keep the architecture simple, I chose not to include any pooling layers.

This is a very deep network. So, it took a significant amount of time to train (unlike the previous models). And, for some reason it started with pretty high errors. But, the training error consistently decreased (more so than other models).

To my surprise, although the final training error was the lowest for this model, the final test error was actually higher than (or equal to) the other models. I think that this is because the model was overly complicated for the simple task, encouraging it to overfit the data. This model would likely work more effectively if we had a larger dataset.