In [10]:
import sys
import os
import numpy as np
import pandas as pd
import datetime
import pickle
from pathlib import Path
from sklearn.metrics import roc_auc_score
from label_errors import get_label_errors

sys.path.insert(0, "../")
from eval_metrics import lift_at_k

## Read pickle files from cross-validation

In [2]:
# Pickle written by the cross-validation run; resolved relative to the working directory.
PICKLE_FILE_NAME = "cv_pred_probs_data_deduped_20220202_v0.pickle"

# Deserialize the stored cross-validation experiment.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with Path(PICKLE_FILE_NAME).open("rb") as pickle_fh:
    cv_experiment = pickle.load(pickle_fh)

## Get list of label errors (ground truth from manual review)

In [11]:
# Spreadsheet with the manual-review annotations; resolved relative to the working directory.
ANNOTATION_PATH = "andrew-ng-dcai-comp-2021-manual-review-for-label-errors.xlsx"
# get_label_errors is a project helper from the label_errors module; presumably it returns the
# image file names flagged as mislabeled during manual review — confirm against the helper.
label_errors = get_label_errors(annotation_path=ANNOTATION_PATH)

In [12]:
# Preview the first three entries of the label-error list
label_errors[:3]

['abf15a58-ce5d-11eb-b317-38f9d35ea60f.png',
 'ac14e8ba-ce5d-11eb-b317-38f9d35ea60f.png',
 'ac156ed4-ce5d-11eb-b317-38f9d35ea60f.png']

In [13]:
# Report the total number of entries in the label-error list
print(f"Number of label errors: {len(label_errors)}")

Number of label errors: 373


## Save to numpy files

In [15]:
# Models whose cross-validated outputs are exported; each name is used as a key into
# `cv_experiment` and as the suffix of the output folder.
models = [
    "resnet18",
    "resnet50d",
    "efficientnet_b1",
    "twins_pcpvt_base",
    "swin_base_patch4_window7_224",
]

# Loop-invariant metadata lookup, read once instead of on every iteration.
# It is not used by the export below; kept in case other cells rely on `framework`.
framework = cv_experiment["metadata"]["framework"]

# Build the lookup set once so each per-image membership test is O(1) rather than a list scan.
label_error_names = set(label_errors)

for model in models:

    # get model output
    model_results = cv_experiment[model]

    pred_probs = model_results["pred_probs"]  # cross-validated predicted probabilities
    labels = model_results["labels"]  # labels
    images = model_results["images"]  # image file path

    # label error binary target: True where the image's file name is in the label-error list
    label_errors_mask = pd.Series(images).map(lambda x: Path(x).name in label_error_names).values

    # save to folder
    numpy_out_folder = f"./dcai_train_val_dataset_cv_{model}/"

    # Only an already-existing folder is tolerated (its files are overwritten below).
    # Any other OSError (permissions, invalid path, ...) propagates instead of being
    # misreported as "already exists".
    try:
        os.makedirs(numpy_out_folder, exist_ok=False)
    except FileExistsError:
        print(f"Folder {numpy_out_folder} already exists!")

    # np.save appends the ".npy" extension to each file name.
    np.save(os.path.join(numpy_out_folder, "pred_probs"), pred_probs)
    np.save(os.path.join(numpy_out_folder, "labels"), labels)
    np.save(os.path.join(numpy_out_folder, "images"), images)
    np.save(os.path.join(numpy_out_folder, "label_errors_mask"), label_errors_mask)

Folder ./dcai_train_val_dataset_cv_resnet18/ already exists!
Folder ./dcai_train_val_dataset_cv_resnet50d/ already exists!
Folder ./dcai_train_val_dataset_cv_efficientnet_b1/ already exists!
Folder ./dcai_train_val_dataset_cv_twins_pcpvt_base/ already exists!
Folder ./dcai_train_val_dataset_cv_swin_base_patch4_window7_224/ already exists!
