### imports

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader,TensorDataset
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

### function that returns a dataset with a specified size

In [None]:
# Load the MNIST subset. Passing the path (instead of an open() handle) lets
# NumPy open and close the file itself, so no file handle is left dangling.
dataFull = np.loadtxt("mnist_train_small.csv", delimiter=",")

# Column 0 is used as the label further down, so only the remaining (pixel)
# columns are scaled, and the scale factor is taken from those columns alone;
# the label values must not be able to influence the normalization.
dataFull[:, 1:] = dataFull[:, 1:] / np.max(dataFull[:, 1:])


def makeTheDataset(N, doubleTheData=False, batchsize=32):
    """Build train/devset loaders from the first N rows of `dataFull`, plus a test set.

    Args:
        N: Number of leading rows of the global `dataFull` array that are split
            into training (90%) and devset (10%) data. All remaining rows
            become the test set.
        doubleTheData: If True, a noisy copy of the *training* split is
            appended to it (labels are duplicated accordingly).
        batchsize: Mini-batch size of the training loader (default 32). It is
            capped at the training-set size, see the note below.

    Returns:
        A tuple `(train_loader, devset_loader, (testdata, testlabels))`.
        The devset loader serves the whole devset as a single batch; the test
        set is returned as plain tensors.
    """
    # column 0 holds the label, the remaining columns hold the image
    dataT = torch.tensor(dataFull[:N, 1:]).float()
    labelsT = torch.tensor(dataFull[:N, 0]).long()

    # split into training and devset
    train_data, devset_data, train_labels, devset_labels = train_test_split(
        dataT, labelsT, train_size=0.9
    )

    # Augment AFTER the split and only on the training part, so that no noisy
    # twin of a training image can end up in the devset.
    if doubleTheData:
        train_dataN = train_data + torch.rand_like(train_data) / 2
        train_data = torch.cat((train_data, train_dataN), dim=0)
        train_labels = torch.cat((train_labels, train_labels), dim=0)

    # wrap in tensor datasets
    train_dataset = TensorDataset(train_data, train_labels)
    devset_dataset = TensorDataset(devset_data, devset_labels)

    # drop_last=True discards an incomplete final batch; if the whole training
    # set were smaller than one batch the loader would yield nothing at all,
    # so the batch size is capped at the number of training samples.
    train_batch = min(batchsize, len(train_dataset))
    train_loader = DataLoader(
        train_dataset, batch_size=train_batch, shuffle=True, drop_last=True
    )
    devset_loader = DataLoader(devset_dataset, batch_size=len(devset_dataset))

    # everything that was not used above becomes the test set (no loader needed)
    testdata = torch.tensor(dataFull[N:, 1:]).float()
    testlabels = torch.tensor(dataFull[N:, 0]).long()

    return train_loader, devset_loader, (testdata, testlabels)

### visualize the images

In [None]:
train_loader, devset_loader, testdataset = makeTheDataset(12, True)

# pull the image matrix (one flattened 28x28 image per row) out of the dataset
img = train_loader.dataset.tensors[0].detach()

# Show the first 12 rows of the training tensor. The axes array gets its own
# name so the loop variable does not overwrite the array it is iterating over.
fig, axs = plt.subplots(3, 4, figsize=(12, 8))
for i, ax in enumerate(axs.flatten()):
    # reshape on the tensor itself, then hand matplotlib a NumPy array
    ax.imshow(img[i].reshape(28, 28).numpy(), cmap="gray")
    ax.axis("off")
plt.show()

### ANN model

In [None]:
class mnistNet(nn.Module):
    """Fully connected network: 784 inputs -> 64 -> 32 -> 32 -> 10 outputs."""

    def __init__(self):
        super().__init__()

        # three ReLU-activated layers followed by a linear read-out layer
        self.input = nn.Linear(784, 64)
        self.fc1 = nn.Linear(64, 32)
        self.fc2 = nn.Linear(32, 32)
        self.output = nn.Linear(32, 10)

    def forward(self, x):
        """Return the raw output-layer activations (no softmax) for batch `x`."""
        hidden = x
        for layer in (self.input, self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.output(hidden)

In [None]:
def create_model():
    """Create a fresh network together with its loss function and optimizer.

    Returns:
        A tuple `(net, lossfun, optimizer)`: a new `mnistNet`, a
        cross-entropy loss, and an SGD optimizer (learning rate 0.01) over
        the network's parameters.
    """
    model = mnistNet()
    sgd = torch.optim.SGD(model.parameters(), lr=0.01)
    return model, nn.CrossEntropyLoss(), sgd

### function that trains the model

In [None]:
def function2trainTheModel(numepochs=50):
    """Train a fresh model on the global `train_loader`, scoring it on `devset_loader`.

    The loaders are read from module scope, so they must have been created
    (e.g. via `makeTheDataset`) before this function is called.

    Args:
        numepochs: Number of passes over the training loader (default 50).

    Returns:
        A tuple `(trainAcc, devsetAcc, losses, net)`:
        trainAcc  - list with the mean batch accuracy (percent) of every epoch
        devsetAcc - list with the devset accuracy (percent) after every epoch
        losses    - tensor of length `numepochs` with the mean batch loss
        net       - the trained model
    """
    net, lossfun, optimizer = create_model()
    losses = torch.zeros(numepochs)
    trainAcc = []
    devsetAcc = []

    for epochi in range(numepochs):
        # make sure the model is in training mode (it is switched to eval below)
        net.train()
        batchAcc = []
        batchLoss = []
        for X, y in train_loader:

            # forward pass and loss
            yHat = net(X)
            loss = lossfun(yHat, y)

            # backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Store plain Python floats rather than 0-d tensors, so the
            # averages below operate on ordinary numbers.
            batchLoss.append(loss.item())
            matches = (torch.argmax(yHat, dim=1) == y).float()
            batchAcc.append(100 * matches.mean().item())

        # average training accuracy and loss across the batches of this epoch
        trainAcc.append(np.mean(batchAcc))
        losses[epochi] = np.mean(batchLoss)

        # devset accuracy: evaluation mode, no gradient tracking
        net.eval()
        X, y = next(iter(devset_loader))
        with torch.no_grad():
            yHat = net(X)

        devsetAcc.append(100 * (torch.argmax(yHat, dim=1) == y).float().mean().item())

    return trainAcc, devsetAcc, losses, net

### Run an experiment 



In [None]:
samplesizes = np.arange(500, 4001, 500)

# one row per sample size; columns: train accuracy, devset accuracy, loss
resultsSingle = np.zeros((len(samplesizes), 3))
resultsDouble = np.zeros((len(samplesizes), 3))

# (doubleTheData flag, destination matrix); the un-augmented run comes first
conditions = ((False, resultsSingle), (True, resultsDouble))

for idx, ssize in enumerate(samplesizes):
    for doubled, results in conditions:

        # generate a dataset and train the model on it
        train_loader, devset_loader, testdataset = makeTheDataset(ssize, doubled)
        trainAcc, devsetAcc, losses, net = function2trainTheModel()

        # summarize each measure by its average over the final 5 epochs
        results[idx, 0] = np.mean(trainAcc[-5:])
        results[idx, 1] = np.mean(devsetAcc[-5:])
        results[idx, 2] = torch.mean(losses[-5:]).item()

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(15, 5))

titles = ['Train', 'Devset', 'Losses']
yaxlabels = ['Accuracy', 'Accuracy', 'Losses']

# one panel per result column (train accuracy, devset accuracy, loss)
for i, (panel, title, ylab) in enumerate(zip(ax, titles, yaxlabels)):

    # plot the lines
    panel.plot(samplesizes, resultsSingle[:, i], 's-', label='Original')
    panel.plot(samplesizes, resultsDouble[:, i], 's-', label='Augmented')

    # make it look nicer
    panel.set_ylabel(ylab)
    panel.set_title(title)
    panel.legend()
    panel.set_xlabel('Unique sample size')
    # grid() expects a boolean; a string such as 'on' only works because any
    # non-empty string is truthy (so would 'off').
    panel.grid(True)

    # the first two panels show accuracies in percent: give them a common range
    if i < 2:
        panel.set_ylim([20, 102])


plt.tight_layout()
plt.show()

### test the model on the test set

In [None]:
# Train both variants once more at a single sample size, keeping the trained
# networks and their accuracy histories under separate names
# (suffix O = original data, suffix A = augmented data).
samplesize = 500

train_loader, devset_loader, testdataset = makeTheDataset(samplesize, doubleTheData=False)
trainAccO, devsetAccO, lossesO, netO = function2trainTheModel()

train_loader, devset_loader, testdataset = makeTheDataset(samplesize, doubleTheData=True)
trainAccA, devsetAccA, lossesA, netA = function2trainTheModel()

In [None]:
# extract the test data
X, y = testdataset

# Pure inference: no_grad() keeps autograd from building a graph over the whole
# test set, the same way the devset is evaluated during training.
with torch.no_grad():
    # run the original model
    yHat = netO(X)
    testO = 100 * torch.mean((torch.argmax(yHat, dim=1) == y).float())

    # and the augmented model
    yHat = netA(X)
    testA = 100 * torch.mean((torch.argmax(yHat, dim=1) == y).float())

# print the results!
print( f'ORIGINAL MODEL (N={samplesize}):\n  Train: {trainAccO[-1]:.2f}%, devset: {devsetAccO[-1]:.2f}%, test: {testO:.2f}%\n\n')
print(f'AUGMENTED MODEL (N={samplesize}):\n  Train: {trainAccA[-1]:.2f}%, devset: {devsetAccA[-1]:.2f}%, test: {testA:.2f}%')
