In [None]:
import sys
import os
import numpy as np
import pandas as pd
import datetime
import pickle
from pathlib import Path
from sklearn.metrics import roc_auc_score
from label_errors import get_label_errors

sys.path.insert(0, "../")
from eval_metrics import lift_at_k

## Get list of label errors (ground truth from manual review)

In [None]:
# Path of the manual-review annotation spreadsheet, handed to get_label_errors.
ANNOTATION_PATH = "andrew-ng-dcai-comp-2021-manual-review-for-label-errors.xlsx"
# NOTE(review): presumably a collection of image file names flagged as mislabeled
# (it is sliced, measured with len(), and tested with `in` elsewhere in this notebook)
# -- confirm against label_errors.get_label_errors.
label_errors = get_label_errors(annotation_path=ANNOTATION_PATH)

In [None]:
# Preview the first three entries of label_errors as a quick sanity check
label_errors[:3]

In [None]:
# Report how many entries get_label_errors returned
print(f"Number of label errors: {len(label_errors)}")

## Save to numpy files

In [None]:
# Utility functions (consider moving these to a separate utils.py file)

def load_pickle(pickle_file_name, verbose=1):
    """Read one pickled object from disk and return it.

    Parameters
    ----------
    pickle_file_name : str or path-like
        Location of the pickle file; it is opened in binary read mode.
    verbose : int or bool, default 1
        When truthy, print a "Loading <file>" message before reading.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.

    Notes
    -----
    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    if verbose:
        print(f"Loading {pickle_file_name}")

    # Return straight from the context manager; the file is still closed on exit.
    with open(pickle_file_name, 'rb') as pickle_stream:
        return pickle.load(pickle_stream)

In [None]:
num_cv_folds = 5  # number K in stratified K-fold cross-validation
verbose = 0  # forwarded to load_pickle; 0 silences its "Loading ..." messages

# Model names; each one selects the cross-validation output folder that is read below
models = [
    "resnet18",
    "resnet50d",
    "efficientnet_b1",
    "twins_pcpvt_base",
    "swin_base_patch4_window7_224",
]

# One summary dict per model, appended at the end of each iteration
results_list = []

for model in models:

    # Per-split outputs, collected in split order so rows stay aligned across the three lists
    pred_probs = []
    labels = []
    images = []

    for split_num in range(num_cv_folds):

        out_subfolder = f"./dcai_train_val_dataset_cv_{model}_20220329175851/split_{split_num}/"

        def get_pickle_file_name(object_name):
            """Return the pickle path for `object_name` inside the current split folder."""
            # The folder already ends with "/", so the file name itself starts with "_".
            return f"{out_subfolder}_{object_name}_split_{split_num}"

        # NOTE: the "test_" prefix in the pickle name corresponds to the "test" split during cross-validation.
        pred_probs_split = load_pickle(get_pickle_file_name("test_pred_probs"), verbose=verbose)
        labels_split = load_pickle(get_pickle_file_name("test_labels"), verbose=verbose)
        images_split = load_pickle(get_pickle_file_name("test_image_files"), verbose=verbose)
        # Loaded but not used in this cell; kept so the name stays available to other cells.
        indices_split = load_pickle(get_pickle_file_name("test_indices"), verbose=verbose)

        # append to list so we can combine data from all the splits
        pred_probs.append(pred_probs_split)
        labels.append(labels_split)
        images.append(images_split)

    # convert list to array: stack predictions row-wise, concatenate labels and image paths
    pred_probs = np.vstack(pred_probs)
    labels = np.hstack(labels)
    images = np.hstack(images)

    # Every example needs exactly one prediction row, one label and one image path;
    # fail loudly here instead of saving misaligned arrays.
    assert len(pred_probs) == len(labels) == len(images), (
        f"Misaligned outputs for {model}: "
        f"{len(pred_probs)} pred_probs, {len(labels)} labels, {len(images)} images"
    )

    # label error binary target: True where the image's file name is listed in label_errors
    label_errors_mask = pd.Series(images).map(lambda x: Path(x).name in label_errors).values

    # save to Numpy files
    numpy_out_folder = f"./dcai_train_val_dataset_cv_{model}/"

    print(f"Saving to numpy files in this folder: {numpy_out_folder}")

    # np.save does not create missing parent directories, so make sure the folder exists first.
    os.makedirs(numpy_out_folder, exist_ok=True)

    # np.save appends the ".npy" extension to each of these names
    np.save(os.path.join(numpy_out_folder, "pred_probs"), pred_probs)
    np.save(os.path.join(numpy_out_folder, "labels"), labels)
    np.save(os.path.join(numpy_out_folder, "images"), images)
    np.save(os.path.join(numpy_out_folder, "label_errors_mask"), label_errors_mask)

    # check the accuracy: fraction of examples whose argmax prediction equals the given label
    acc_labels = (pred_probs.argmax(axis=1) == labels).mean()

    print(f"Model: {model}")
    print(f"  Accuracy (argmax pred vs noisy labels): {acc_labels}")

    results = {
        "model": model,
        "Accuracy (argmax pred vs noisy labels)": acc_labels,
    }

    results_list.append(results)