## [Deep Learning is Robust to Massive Label Noise](https://arxiv.org/pdf/1705.10694)


The paper shows that neural networks can generalize when large numbers of (non-adversarially) incorrectly labeled examples are added to datasets (MNIST, CIFAR, and ImageNet).

We'll focus on uniform label noise (Experiment 1) and the MNIST dataset for computational reasons.

In [58]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from PIL import Image

import numpy as np
import matplotlib.pyplot as plt
import os, itertools, time

os.makedirs('logs', exist_ok = True)
os.makedirs('models', exist_ok = True)

seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

device = torch.device(
    'cuda' if torch.cuda.is_available() else
    ('mps' if torch.backends.mps.is_available() else
    'cpu')
)
# device = 'cpu' # faster for the small models we are using

def eval_model(model, test, criterion = nn.CrossEntropyLoss()):
    # Returns loss and accuracy of the model on the test set
    model.eval()
    correct, loss = 0, 0.0
    with torch.no_grad():
        for images, labels in test:
            images, labels = images.to(device), labels.to(device)
            _, pred = torch.max(model(images), 1)
            correct += (pred == labels).float().sum().item()
            loss += criterion(model(images), labels).item()
    return loss / len(test.dataset), correct / len(test.dataset)

In [59]:
class NoisyLabelDataset(torch.utils.data.Dataset):
    """Adds alpha uniformly noisy labels for every example in the original dataset"""
    
    def __init__(self, dataset, alpha):

        self.dataset = dataset
        self.alpha = alpha

    def is_noisy(self, idx):
        return idx % (self.alpha + 1) != 0

    def __len__(self):
        n = len(self.dataset)
        return n + (self.alpha * n)

    def __getitem__(self, idx):
        x, y = self.dataset[idx // (self.alpha + 1)]
        if self.is_noisy(idx):
            y = np.random.choice(len(self.dataset.classes))
        return x, y

In [60]:
def train(model, train_loader, val_loader, lr = 0.01, patience = 3, max_epochs = 100, verbose = False):
    
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adadelta(model.parameters(), lr = lr)

    log = {'train_loss': [], 'val_loss': [], 'val_acc': []}

    best_val_acc = float('inf')
    best_model = None

    for epoch in range(max_epochs):

        model.train()
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            loss = criterion(model(images), labels)
            loss.backward()
            optimizer.step()

        val_loss, val_acc = eval_model(model, val_loader)
        log['train_loss'].append(loss.item())
        log['val_loss'].append(val_loss)
        log['val_acc'].append(val_acc)

        if verbose: print(', '.join([f'Epoch {epoch + 1}'] + [f'{k}: {v[-1]:.4f}' for k, v in log.items()]))

        if val_acc < best_val_acc:
            best_val_acc = val_acc
            best_model = model.state_dict()

        # Early stopping: stop if val acc has not increased in the last `patience` epochs
        if epoch > patience and val_acc <= max(log['val_acc'][-patience-1:-1]): break 
    
    if best_model: model.load_state_dict(best_model)
    return model, log

In [65]:
learning_rates = [0.01, 0.05, 0.1, 0.5]
alphas = range(0, 125, 25)

lin_relu = lambda n_in, n_out: nn.Sequential(nn.Linear(n_in, n_out), nn.ReLU())
models = {
    'perceptron':nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10)),
    'MLP1':nn.Sequential(nn.Flatten(), lin_relu(28 * 28, 256), nn.Linear(256, 10)),
    'MLP2':nn.Sequential(nn.Flatten(), lin_relu(28 * 28, 256), lin_relu(256, 128), nn.Linear(128, 10)),
    'MLP4':nn.Sequential(nn.Flatten(), lin_relu(28 * 28, 256), lin_relu(256, 128), lin_relu(128, 64), nn.Linear(64, 10)),
    'Conv4':nn.Sequential(
        nn.Conv2d(1, 16, 3, 1), nn.ReLU(),
        nn.Conv2d(16, 32, 3, 1), nn.ReLU(),
        nn.Conv2d(32, 64, 3, 1), nn.ReLU(),
        nn.Conv2d(64, 128, 3, 1), nn.ReLU(),
        nn.Flatten(), nn.Linear(128, 10)
    )
}

In [66]:
batch_size = 128

transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
train_dataset = datasets.MNIST('data', download = True, train = True,   transform = transform)
test_dataset =  datasets.MNIST('data', download = True, train = False,  transform = transform)

noisy_train_dataset = NoisyLabelDataset(train_dataset, alpha = 5)
val_dataset, test_dataset = torch.utils.data.random_split(test_dataset, (0.2, 0.8), generator = torch.Generator().manual_seed(seed))

train_loader = DataLoader(noisy_train_dataset, batch_size = batch_size, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle = False)
test_loader = DataLoader(test_dataset, batch_size = batch_size, shuffle = False)

In [69]:
for alpha in alphas:

    noisy_train_dataset = NoisyLabelDataset(train_dataset, alpha = alpha)
    train_loader = DataLoader(noisy_train_dataset, batch_size = batch_size, shuffle = True)
    print(alpha, len(noisy_train_dataset))

    for (name, model), lr in itertools.product({'Conv4':models['Conv4']}.items(), learning_rates):

        start = time.time()
        model, log = train(model, train_loader, val_loader, lr = lr, verbose = True)
        test_loss, test_acc = eval_model(model, test_loader)
        log['test_loss'] = test_loss
        log['test_acc'] = test_acc

        print(f'{name} - alpha: {alpha}, lr: {lr}, test acc: {test_acc:.4f}, took: {time.time() - start:.2f}s')
        torch.save(log, f'logs/{name}_{alpha}_{lr}.pt')
        torch.save(model, f'models/{name}_{alpha}_{lr}.pt')

0 60000


RuntimeError: linear(): input and weight.T shapes cannot be multiplied (128x51200 and 128x10)

In [302]:
# Get name of files in logs/perceptron*
best_lr_file = lambda m, a: max([f for f in pathlib.Path('logs').glob(f'{m}_{a}*')], key = lambda f: torch.load(f)['test_acc'])
best_lr_file('perceptron', 10)
test_accs = [torch.load(best_lr_file('perceptron', a))['test_acc'] for a in range(0, 40, 10)]
test_accs

[0.9145, 0.724, 0.780625, 0.735]

Questions:

- does early stopping help?
- does order matter: intuitively if we put all the true labelled exampes at the start (of the epoch) it should perform worse and vice versa