In [1]:
import sys
import numpy as np
import pandas as pd
import pickle
from pathlib import Path
from autogluon.vision import ImageDataset

## Read CSV file with true labels for each file name

See the notebook from the previous step: "1_Run_Cross_Val_Noisy_Labels.ipynb"

In [2]:
# Load the label table and add a `file_name` column holding each image path with its
# directory stripped; `file_name` is the key used further down to join the
# cross-validation outputs back to this table.
df_train = pd.read_csv("cifar-10n-dataset-noise-type-worse.csv").assign(
    file_name=lambda frame: frame["image"].map(lambda image_path: Path(image_path).name)
)
df_train.head(3)

Unnamed: 0,index,image,label,true_label,file_name
0,0,./cifar-10n-png/train/image_id_0.png,4,6,image_id_0.png
1,1,./cifar-10n-png/train/image_id_1.png,9,9,image_id_1.png
2,2,./cifar-10n-png/train/image_id_2.png,0,9,image_id_2.png


## Read pickle files from cross-validation

In [3]:
# Utility functions (consider moving these to a separate utils.py file)

def load_pickle(pickle_file_name, verbose=1):
    """Deserialize and return the object stored in a pickle file.

    Parameters
    ----------
    pickle_file_name
        Path of the pickle file; it is opened in binary read mode.
    verbose
        When truthy, a ``Loading <pickle_file_name>`` line is printed before reading.

    Returns
    -------
    object
        Whatever ``pickle.load`` reconstructs from the file.

    Notes
    -----
    Unpickling can execute arbitrary code, so only use this on files you trust.
    """
    if verbose:
        print(f"Loading {pickle_file_name}")

    with open(pickle_file_name, "rb") as pickle_stream:
        return pickle.load(pickle_stream)

# Vectorized lookup from file path to the index of the original label (aka "true labels" y):
# the second-to-last path component is used as the label name and translated
# through `label_name_to_idx_map`.
# NOTE(review): `label_name_to_idx_map` is not defined in any cell visible here; it has to
# exist in the kernel namespace before this function is first called — confirm where it comes from.
get_orig_label_idx_from_file_path = np.vectorize(
    lambda file_path: label_name_to_idx_map[Path(file_path).parts[-2]]
)

In [62]:
num_cv_folds = 5 # number K in stratified K-folds cross-validation
verbose = 0 # forwarded to load_pickle; 0 silences its per-file "Loading ..." line

models = [
    "resnet18",
    "resnet50d",
    "efficientnet_b1",
    "twins_pcpvt_base",
    "swin_base_patch4_window7_224"
]

# NOTE(review): nothing in this cell appends to or reads results_list; it is kept only
# in case another cell relies on the name.
results_list = []

for model in models:

    # root folder of this model's cross-validation run: the per-split pickles are read
    # from its split_<k>/ sub-folders and the combined .npy files are written into it
    numpy_out_folder = f"./cifar-10n-png_noise_type_worst_cv_{model}/"

    pred_probs = []
    labels = []
    images = []

    for split_num in range(num_cv_folds):

        out_subfolder = f"{numpy_out_folder}split_{split_num}/"

        def get_pickle_file_name(object_name):
            """Return the path of the pickle holding `object_name` for the current split."""
            # NOTE(review): out_subfolder ends with "/", so the file name itself starts
            # with "_" — presumably this matches what the previous notebook wrote; confirm.
            return f"{out_subfolder}_{object_name}_split_{split_num}"

        # NOTE: the "test_" prefix in the pickle name corresponds to the "test" split during cross-validation.
        pred_probs_split = load_pickle(get_pickle_file_name("test_pred_probs"), verbose=verbose)
        labels_split = load_pickle(get_pickle_file_name("test_labels"), verbose=verbose)
        images_split = load_pickle(get_pickle_file_name("test_image_files"), verbose=verbose)
        # NOTE(review): indices_split is loaded but not used anywhere below.
        indices_split = load_pickle(get_pickle_file_name("test_indices"), verbose=verbose)

        # append to list so we can combine data from all the splits
        pred_probs.append(pred_probs_split)
        labels.append(labels_split)
        images.append(images_split)

    # convert list to array
    pred_probs = np.vstack(pred_probs)
    labels = np.hstack(labels) # remember that this is the noisy labels
    images = np.hstack(images)

    # every held-out example must contribute exactly one row to each combined array,
    # otherwise the saved files would not line up with each other
    assert pred_probs.shape[0] == len(labels) == len(images), (
        f"{model}: row count mismatch between pred_probs ({pred_probs.shape[0]}), "
        f"labels ({len(labels)}) and images ({len(images)})"
    )

    # create temp DataFrame so we can join to df_train to get the true labels
    # note: stratified K-folds can reorder the files! this is why we need to join on file name
    # validate="many_to_one" raises if a file_name occurs more than once in df_train; such
    # duplicates would multiply rows in the join and misalign true_labels with pred_probs
    df_cv_temp = pd.DataFrame({
        "file_name": pd.Series(images).map(lambda f: Path(f).name),
        "noisy_labels": labels
    }).merge(df_train, how="left", on="file_name", validate="many_to_one")

    # sanity check: noisy labels should be equal to "label" from the original CSV file
    # (this also fails when a file name found no match, because "label" is then NaN)
    assert (df_cv_temp.noisy_labels.values == df_cv_temp.label.values).all(), (
        f"{model}: noisy labels from cross-validation do not match the 'label' column of df_train"
    )

    # get the true labels
    true_labels = df_cv_temp.true_label

    # sanity check: accuracy of noisy labels vs true labels
    print(f"Accuracy of noisy labels vs true labels: {(labels == true_labels).mean()}")

    # save to Numpy files
    print(f"Saving to numpy files in this folder: {numpy_out_folder}")
    print()

    np.save(numpy_out_folder + "pred_probs", pred_probs)
    np.save(numpy_out_folder + "noisy_labels", labels)
    np.save(numpy_out_folder + "images", images)
    # store a plain array rather than relying on np.save to convert the pandas Series
    np.save(numpy_out_folder + "true_labels", true_labels.to_numpy())

Accuracy of noisy labels vs true labels: 0.59792
Saving to numpy files in this folder: ./cifar-10n-png_noise_type_worst_cv_resnet18/

Accuracy of noisy labels vs true labels: 0.59792
Saving to numpy files in this folder: ./cifar-10n-png_noise_type_worst_cv_resnet50d/

Accuracy of noisy labels vs true labels: 0.59792
Saving to numpy files in this folder: ./cifar-10n-png_noise_type_worst_cv_efficientnet_b1/

Accuracy of noisy labels vs true labels: 0.59792
Saving to numpy files in this folder: ./cifar-10n-png_noise_type_worst_cv_twins_pcpvt_base/

Accuracy of noisy labels vs true labels: 0.59792
Saving to numpy files in this folder: ./cifar-10n-png_noise_type_worst_cv_swin_base_patch4_window7_224/



## Read numpy files

In [12]:
# Reload the combined arrays that were saved for one model.
model = "swin_base_patch4_window7_224"
numpy_out_folder = f"./cifar-10n-png_noise_type_worst_cv_{model}/"

pred_probs = np.load(f"{numpy_out_folder}pred_probs.npy")
labels = np.load(f"{numpy_out_folder}noisy_labels.npy")
# allow_pickle=True lets np.load read object arrays; presumably needed because the
# image file names may have been stored with object dtype — TODO confirm
images = np.load(f"{numpy_out_folder}images.npy", allow_pickle=True)