In [10]:
import sys
import numpy as np
import pandas as pd
import pickle
from pathlib import Path
from autogluon.vision import ImageDataset

## Read original data

In [2]:
# Root folder of the CIFAR-10 images on disk.
CIFAR_10_DATA_PATH = "/Data/cifar10_png/"

# from_folders returns three values; only the first one is kept (as
# train_dataset), the other two are discarded.
train_dataset, _, _ = ImageDataset.from_folders(root=CIFAR_10_DATA_PATH)

In [3]:
# Lookup from original label name to integer index, following the order of
# train_dataset.classes.
# NOTE: that order is alphabetical when using AutoGluon!
label_name_to_idx_map = {
    class_name: class_idx
    for class_idx, class_name in enumerate(train_dataset.classes)
}
label_name_to_idx_map

{'airplane': 0,
 'automobile': 1,
 'bird': 2,
 'cat': 3,
 'deer': 4,
 'dog': 5,
 'frog': 6,
 'horse': 7,
 'ship': 8,
 'truck': 9}

## Read pickle files from cross-validation

In [4]:
# utility functions below (consider moving these to a separate utils.py file)

def load_pickle(pickle_file_name, verbose=1):
    """Read a pickle file and return the object stored in it.

    Parameters
    ----------
    pickle_file_name
        Path of the pickle file; passed straight to ``open``.
    verbose
        When truthy, print the name of the file before it is read.

    Returns
    -------
    The unpickled object.

    Notes
    -----
    Unpickling can execute arbitrary code, so only use this on files you
    produced yourself or otherwise trust.
    """
    if verbose:
        print(f"Loading {pickle_file_name}")

    with open(pickle_file_name, mode="rb") as pickle_stream:
        return pickle.load(pickle_stream)

# get the original label from file path (aka "true labels" y)
# The label name is the name of the folder that directly contains the image
# file (second-to-last path component), mapped through label_name_to_idx_map.
# `otypes` is set explicitly so the output dtype is known up front: without it
# np.vectorize raises on an empty input and calls the function one extra time
# on the first element just to infer the dtype.
get_orig_label_idx_from_file_path = np.vectorize(
    lambda f: label_name_to_idx_map[Path(f).parts[-2]],
    otypes=[int],
)

In [7]:
num_cv_folds = 5 # number K in stratified K-folds cross-validation
verbose = 0

models = [
    "resnet18",
    "resnet50d",
    "efficientnet_b1",
    "twins_pcpvt_base",
    "swin_base_patch4_window7_224"
]

# Objects read from each split's pickle files, in this order.
# NOTE: the "test_" prefix in the pickle name corresponds to the "test" split during cross-validation.
# (The "test_indices" pickle of each split is not read: nothing below uses it.)
pickle_object_names = ("test_pred_probs", "test_labels", "test_image_files")

results_list = []

for model in models:

    # Everything for this model lives under one folder: the per-split pickles
    # (read below) and the combined .npy files (written below).
    numpy_out_folder = f"./cifar10_train_dataset_noise_amount_0.2_sparsity_0.4_cv_{model}/"

    pred_probs = []
    labels = []
    images = []

    for split_num in range(num_cv_folds):

        out_subfolder = f"{numpy_out_folder}split_{split_num}/"

        # pickle files are named "<out_subfolder>_<object_name>_split_<split_num>"
        pred_probs_split, labels_split, images_split = (
            load_pickle(f"{out_subfolder}_{object_name}_split_{split_num}", verbose=verbose)
            for object_name in pickle_object_names
        )

        # append to list so we can combine data from all the splits
        pred_probs.append(pred_probs_split)
        labels.append(labels_split)
        images.append(images_split)

    # convert list to array
    pred_probs = np.vstack(pred_probs)
    labels = np.hstack(labels) # remember that this is the noisy labels (s)
    images = np.hstack(images)

    # the three arrays are compared and saved row-for-row, so they must line up
    assert len(pred_probs) == len(labels) == len(images), (
        f"{model}: mismatched lengths "
        f"(pred_probs={len(pred_probs)}, labels={len(labels)}, images={len(images)})"
    )

    # get the true labels (y) from the original file path
    true_labels = get_orig_label_idx_from_file_path(images)

    # save to Numpy files
    print(f"Saving to numpy files in this folder: {numpy_out_folder}")

    np.save(numpy_out_folder + "pred_probs", pred_probs)
    np.save(numpy_out_folder + "noisy_labels", labels)
    np.save(numpy_out_folder + "images", images)
    np.save(numpy_out_folder + "true_labels", true_labels)

    # check the accuracy (the argmax prediction is shared by both comparisons)
    pred_labels = pred_probs.argmax(axis=1)
    acc_labels = (pred_labels == labels).mean() # noisy labels (s)
    acc_true_labels = (pred_labels == true_labels).mean() # true labels (y)
    acc_noisy_vs_true_labels = (labels == true_labels).mean()

    print(f"Model: {model}")
    print(f"  Accuracy (argmax pred vs noisy labels): {acc_labels}")
    print(f"  Accuracy (argmax pred vs true labels) : {acc_true_labels}")
    print(f"  Accuracy (noisy vs true labels)       : {acc_noisy_vs_true_labels}")

    results = {
        "model": model,
        "Accuracy (argmax pred vs noisy labels)": acc_labels,
        "Accuracy (argmax pred vs true labels)": acc_true_labels,
        "Accuracy (noisy vs true labels)": acc_noisy_vs_true_labels
    }

    results_list.append(results)

Saving to numpy files in this folder: ./cifar10_train_dataset_noise_amount_0.2_sparsity_0.4_cv_resnet18/
Model: resnet18
  Accuracy (argmax pred vs noisy labels): 0.73976
  Accuracy (argmax pred vs true labels) : 0.92358
  Accuracy (noisy vs true labels)       : 0.80054
Saving to numpy files in this folder: ./cifar10_train_dataset_noise_amount_0.2_sparsity_0.4_cv_resnet50d/
Model: resnet50d
  Accuracy (argmax pred vs noisy labels): 0.75506
  Accuracy (argmax pred vs true labels) : 0.94312
  Accuracy (noisy vs true labels)       : 0.80054
Saving to numpy files in this folder: ./cifar10_train_dataset_noise_amount_0.2_sparsity_0.4_cv_efficientnet_b1/
Model: efficientnet_b1
  Accuracy (argmax pred vs noisy labels): 0.73582
  Accuracy (argmax pred vs true labels) : 0.91784
  Accuracy (noisy vs true labels)       : 0.80054
Saving to numpy files in this folder: ./cifar10_train_dataset_noise_amount_0.2_sparsity_0.4_cv_twins_pcpvt_base/
Model: twins_pcpvt_base
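  Accuracy (argmax pred vs noisy labels): 0.78098
  Accuracy (argmax pred vs true labels) : 0.97382
  Accuracy (noisy vs true labels)       : 0.80054
Saving to numpy files in this folder: ./cifar10_train_dataset_noise_amount_0.2_sparsity_0.4_cv_swin_base_patch4_window7_224/
Model: swin_base_patch4_window7_224
  Accuracy (argmax pred vs noisy labels): 0.79048
  Accuracy (argmax pred vs true labels) : 0.9871
  Accuracy (noisy vs true labels)       : 0.80054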

## Analyze model accuracy

In [11]:
# one row per model, one column per key of the `results` dicts collected in results_list
df = pd.DataFrame(results_list)

In [12]:
# display the first five rows of the accuracy table
df.head()

Unnamed: 0,model,Accuracy (argmax pred vs noisy labels),Accuracy (argmax pred vs true labels),Accuracy (noisy vs true labels)
0,resnet18,0.73976,0.92358,0.80054
1,resnet50d,0.75506,0.94312,0.80054
2,efficientnet_b1,0.73582,0.91784,0.80054
3,twins_pcpvt_base,0.78098,0.97382,0.80054
4,swin_base_patch4_window7_224,0.79048,0.9871,0.80054


## Read NumPy files

In [26]:
# Reload the combined arrays saved for the swin_base_patch4_window7_224 model.
numpy_out_folder = "./cifar10_train_dataset_noise_amount_0.2_sparsity_0.4_cv_swin_base_patch4_window7_224/"

pred_probs, labels, true_labels = (
    np.load(f"{numpy_out_folder}{array_name}.npy")
    for array_name in ("pred_probs", "noisy_labels", "true_labels")
)
# the image paths are loaded with allow_pickle=True (they come back as an object array)
images = np.load(f"{numpy_out_folder}images.npy", allow_pickle=True)

In [27]:
# fraction of argmax predictions that agree with the noisy labels
np.mean(pred_probs.argmax(axis=1) == labels)

0.79048

In [28]:
# fraction of argmax predictions that agree with the true labels
np.mean(pred_probs.argmax(axis=1) == true_labels)

0.9871

In [17]:
# sorted, de-duplicated image paths from the reloaded array
np.unique(images)

array(['/Data/cifar10_png/train/airplane/0001.png',
       '/Data/cifar10_png/train/airplane/0002.png',
       '/Data/cifar10_png/train/airplane/0003.png', ...,
       '/Data/cifar10_png/train/truck/4998.png',
       '/Data/cifar10_png/train/truck/4999.png',
       '/Data/cifar10_png/train/truck/5000.png'], dtype=object)