In [None]:
# Code adapted from the Confident Learning CIFAR-10 reproduce folder:
# https://github.com/cgnorthcutt/confidentlearning-reproduce/blob/master/cifar10/cifar_create_label_errors.ipynb
# Northcutt, C.; Jiang, L.; and Chuang, I. 2021. 
# Confident learning: Estimating uncertainty in dataset labels. 
# Journal of Artificial Intelligence Research.

from cleanlab import noise_generation
import torchvision
from torchvision import transforms
import os
import sys
import numpy as np
import json
import pickle

In [None]:
# Generate class-conditional noisy labels for every (noise rate, sparsity, seed)
# combination and store, per combination, a JSON map of image path -> noisy
# label plus the pickled noise matrix that produced it.

###
dataset = "cifar10"
# Both lists are integer percentages: they are divided by 100 before being
# handed to cleanlab and are formatted with the 'd' spec to build file names.
sparsities = [0, 20, 40, 60]
noise_rates = [0, 10, 20]
###
num_seeds = 10

data_path = 'datasets/datasets/{}/{}/'.format(dataset, dataset)
noisy_label_path = dataset + '_noisy_labels/'
# Only `targets` and `imgs` are read in this cell; no image is indexed here,
# so the transform pipeline below is never applied by this cell.
train_dataset = torchvision.datasets.ImageFolder(
    root=data_path + 'train/',
    transform=transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ]),
)
y = train_dataset.targets
# Number of classes is parsed from the digits after the 5-character prefix of
# the dataset name (e.g. "cifar10" -> 10).
# NOTE(review): presumably this should equal len(train_dataset.classes) - confirm.
K = int(dataset[5:])

for noise_amount in noise_rates:
    for frac_zero_noise_rates in sparsities:
        # Sparsity is meaningless without noise: only the sparsity-0 setting
        # is generated when the noise rate is zero. The test does not depend
        # on the seed, so it is done once here instead of once per seed.
        if noise_amount < 1e-2 and frac_zero_noise_rates > 0:
            continue

        for seed in range(num_seeds):
            print('noise_amount', round(noise_amount, 2),
                  '| frac_zero_noise_rates', round(frac_zero_noise_rates, 2),
                  '| seed', seed)

            # Generate class-conditional noise.
            # The trace is computed with exact integer arithmetic: the float
            # form int(K * (1 - noise_amount/100.)) can truncate one too low
            # (e.g. K=10, noise_amount=90 gives 0.9999999999999998 -> 0).
            nm = noise_generation.generate_noise_matrix_from_trace(
                K=K,
                trace=K * (100 - noise_amount) // 100,
                valid_noise_matrix=False,
                frac_zero_noise_rates=frac_zero_noise_rates/100.,
                seed=seed,
            )

            # noise matrix is valid if diagonal maximizes row and column
            diagonal = np.arange(K)
            valid = bool(np.all(
                (nm.argmax(axis=0) == diagonal) & (nm.argmax(axis=1) == diagonal)
            ))
            print('\tValid:', valid)

            # Create noisy labels (re-seed the global NumPy RNG so that the
            # draw is repeatable for a given seed).
            np.random.seed(seed=seed)
            s = noise_generation.generate_noisy_labels(y, nm)

            # Fraction of noisy labels that agree with the given labels.
            print('\tAccuracy of s and y:', np.mean(np.asarray(s) == np.asarray(y)))

            # Map each image file path to its noisy label; labels are cast to
            # built-in int so the mapping is JSON-serializable.
            d = {path: int(label) for (path, _), label in zip(train_dataset.imgs, s)}

            # Base file name encodes both settings as zero-padded two-digit
            # percentages written after a literal "0.".
            wfn_base = '{}_noisy_labels__frac_zero_noise_rates__0.{}__noise_amount__0.{}'.format(
                dataset,
                '{0:0=2d}'.format(frac_zero_noise_rates),
                '{0:0=2d}'.format(noise_amount),
            )

            # One folder per setting (dots replaced by underscores), one
            # sub-folder per seed. exist_ok avoids the check-then-create race.
            folder_path = noisy_label_path + wfn_base.replace('.', '_') + '/seed_{}'.format(seed)
            os.makedirs(folder_path, exist_ok=True)

            # Store the dictionary as json
            wfn = os.path.join(folder_path, wfn_base)
            with open(wfn + ".json", 'w') as wf:
                json.dump(d, wf)

            # Store the noise matrix as well, under the same name with the
            # leading "<dataset>_noisy_labels" part swapped for
            # "<dataset>_noise_matrix".
            wfn_base = "{}_noise_matrix".format(dataset) + "__" + "__".join(wfn_base.split("__")[1:])
            wfn = os.path.join(folder_path, wfn_base)
            with open(wfn + ".pickle", 'wb') as wf:
                pickle.dump(nm, wf, protocol=pickle.HIGHEST_PROTOCOL)