In [1]:
%load_ext autoreload
%autoreload 2

import cleanlab
from cleanlab.rank import get_label_quality_scores, get_label_quality_ensemble_scores
from cleanlab.internal.label_quality_utils import get_normalized_entropy
import sys
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, precision_recall_curve, accuracy_score, log_loss
from matplotlib import pyplot as plt
import copy

sys.path.insert(0, "../")
from eval_metrics import lift_at_k
from active_learning_scores import least_confidence

# experimental version of label quality ensemble scores with additional weighting schemes
from label_quality_ensemble_scores_experimental import get_label_quality_ensemble_scores_experimental

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [None]:
# Development aid: IPython help syntax that displays the docstring of the experimental ensemble scorer.
?get_label_quality_ensemble_scores_experimental

## Models


In [2]:
# Identifiers of the models whose cross-validation outputs are loaded below.
# The order is significant: experiments are appended per model in this order, and `models`
# is later paired positionally with the per-model weights when the weight DataFrames are built.
models = [
    "resnet18",
    "resnet50d",
    "efficientnet_b1",
    "twins_pcpvt_base",
    "swin_base_patch4_window7_224",
]


In [None]:
# Show the working directory: the dataset folders below are addressed with relative paths ("./...").
!pwd

## Dictionaries to map to display names


In [3]:
# Lookup tables that translate internal identifiers into display names for tables and plots.

# Keys have the form "<method>-<adjust_pred_probs>": the label quality scoring method joined
# with the string form of the adjust_pred_probs flag (the same string is built from the
# evaluation results further below). Values are the human-readable scoring method names.
method_adjust_pred_probs_display_dict = {
    "self_confidence-False": "Self Confidence",
    "self_confidence-True": "Adjusted Self Confidence",
    "normalized_margin-False": "Normalized Margin",
    "normalized_margin-True": "Adjusted Normalized Margin",
    "confidence_weighted_entropy-False": "Confidence Weighted Entropy",
    "entropy-False": "Entropy",
    "least_confidence-False": "Least Confidence",
}

# Model identifier (as listed in `models`) -> human-readable model name.
model_display_name_dict = {
    "swin_base_patch4_window7_224": "Swin Transformer",
    "twins_pcpvt_base": "Twins PCPVT",
    "efficientnet_b1": "EfficientNet-B1",
    "resnet50d": "ResNet-50d",
    "resnet18": "ResNet-18",
}


## Load files from experiments


**Note:** the loading code below could be made more concise; for now, it reads the `.npy` files of every dataset inside the per-model for-loop.


In [4]:
%%time

experiments = []

for model in models:

    #### Andrew Ng DCAI Roman Numerals ####

    # read numpy files
    numpy_out_folder = f"./roman-numeral/roman-numeral_train_val_dataset_cv_{model}/"
    pred_probs = np.load(numpy_out_folder + "pred_probs.npy")
    labels = np.load(numpy_out_folder + "labels.npy")
    images = np.load(numpy_out_folder + "images.npy", allow_pickle=True)
    label_errors_mask = np.load(numpy_out_folder + "label_errors_mask.npy")

    # store results of experiment
    experiment_results = {
        "dataset": "roman-numeral",
        "model": model,
        "pred_probs": pred_probs,
        "labels": labels,
        "images": images,
        "label_errors_mask": label_errors_mask
    }
    experiments.append(experiment_results)


    #### Cifar-10n-worst

    # read numpy files
    numpy_out_folder = f"./cifar-10n-worst/cifar-10n-png_noise_type_worst_cv_{model}/"
    pred_probs = np.load(numpy_out_folder + "pred_probs.npy")
    labels = np.load(numpy_out_folder + "noisy_labels.npy")
    true_labels = np.load(numpy_out_folder + "true_labels.npy")
    images = np.load(numpy_out_folder + "images.npy", allow_pickle=True)

    label_errors_mask = (true_labels != labels) # boolean mask of label errors

    # store results of experiment
    experiment_results = {
        "dataset": "cifar-10n-worst",
        "model": model,
        "pred_probs": pred_probs,
        "labels": labels,
        "images": images,
        "label_errors_mask": label_errors_mask
    }
    experiments.append(experiment_results)


    #### Cifar-10n-aggregate

    # read numpy files
    numpy_out_folder = f"./cifar-10n-aggregate/cifar-10n-png_noise_type_aggre_cv_{model}/"
    pred_probs = np.load(numpy_out_folder + "pred_probs.npy")
    labels = np.load(numpy_out_folder + "noisy_labels.npy")
    true_labels = np.load(numpy_out_folder + "true_labels.npy")
    images = np.load(numpy_out_folder + "images.npy", allow_pickle=True)

    label_errors_mask = (true_labels != labels) # boolean mask of label errors

    # store results of experiment
    experiment_results = {
        "dataset": "cifar-10n-aggregate",
        "model": model,
        "pred_probs": pred_probs,
        "labels": labels,
        "images": images,
        "label_errors_mask": label_errors_mask    
    }
    experiments.append(experiment_results)


    #### Cifar-10

    # synthetic noise amount 20% and sparsity 40% (as defined in confident learning paper)

    # read numpy files
    numpy_out_folder = f"./cifar-10/cifar10_train_dataset_noise_amount_0.2_sparsity_0.4_cv_{model}/"
    pred_probs = np.load(numpy_out_folder + "pred_probs.npy")
    labels = np.load(numpy_out_folder + "noisy_labels.npy")
    true_labels = np.load(numpy_out_folder + "true_labels.npy")
    images = np.load(numpy_out_folder + "images.npy", allow_pickle=True)

    label_errors_mask = (true_labels != labels) # boolean mask of label errors

    # store results of experiment
    experiment_results = {
        "dataset": "cifar-10",
        "model": model,
        "pred_probs": pred_probs,
        "labels": labels,
        "images": images,
        "label_errors_mask": label_errors_mask    
    }
    experiments.append(experiment_results)


    #### Food-101n

    # we only have verified labels for ~50K images so we have to evaluate within this subset

    # read numpy files
    numpy_out_folder = f"./food-101n/food-101n_cv_{model}/"
    pred_probs = np.load(numpy_out_folder + "pred_probs.npy")
    labels = np.load(numpy_out_folder + "noisy_labels.npy")
    images = np.load(numpy_out_folder + "images.npy", allow_pickle=True)

    # read verified training labels
    path_verified_train = "./food-101n/verified_train.tsv"
    df_verified_train = pd.read_csv(path_verified_train, sep='\t')

    # instantiate DataFrame with all training data
    df_image_paths = pd.DataFrame({
        "class_name/key": pd.Series(images).map(lambda f: "/".join(Path(f).parts[-2:]))
    })

    # join to append verification_label column
    df_image_paths_w_verified = df_image_paths.merge(df_verified_train, on="class_name/key", how="left")

    # subset of data with verified labels
    verified_subset_mask = ~df_image_paths_w_verified.verification_label.isnull().values

    # filter on verified subset
    pred_probs = pred_probs[verified_subset_mask]
    labels = labels[verified_subset_mask]
    images = images[verified_subset_mask]

    # boolean mask of label errors
    label_errors_mask = df_image_paths_w_verified["verification_label"].values[verified_subset_mask] == 0

    # store results of experiment
    experiment_results = {
        "dataset": "food-101n",
        "model": model,
        "pred_probs": pred_probs,
        "labels": labels,
        "images": images,
        "label_errors_mask": label_errors_mask  
    }
    experiments.append(experiment_results)

CPU times: user 17.7 s, sys: 2.47 s, total: 20.2 s
Wall time: 20.4 s


## Prepare data for ensemble model


In [None]:
%%time

dataset_model_output = {}

for i, experiment in enumerate(experiments):

    # experiment results
    dataset = experiment["dataset"]
    model = experiment["model"]
    pred_probs = experiment["pred_probs"]
    labels = experiment["labels"]
    images = experiment["images"]
    label_errors_target = experiment["label_errors_mask"]

    # check
    if dataset not in dataset_model_output.keys():

        # init list of pred_probs and labels
        dataset_model_output[dataset] = {}
        dataset_model_output[dataset]["pred_probs_list"] = []
        dataset_model_output[dataset]["labels_list"] = []
        dataset_model_output[dataset]["images_list"] = []
        dataset_model_output[dataset]["label_errors_target_list"] = []

    # store model output on dataset as key
    dataset_model_output[dataset]["pred_probs_list"].append(pred_probs)
    dataset_model_output[dataset]["labels_list"].append(labels)
    dataset_model_output[dataset]["images_list"].append(images)
    dataset_model_output[dataset]["label_errors_target_list"].append(
        label_errors_target
    )

In [None]:
# Leftover inspection: displays `model` as last assigned by the loop in the previous cell.
model

## Evaluate ensemble model


In [None]:
# For every dataset, compare three ways of averaging the models' predicted probabilities
# (uniform, weighted by accuracy, weighted by 1 / exp(log loss)) by the accuracy of the
# resulting ensemble prediction against the given labels, and record the model weights.


def weighted_average_pred_probs(weights, pred_probs_list):
    """Return sum_i weights[i] * pred_probs_list[i].

    `weights` and `pred_probs_list` are paired positionally (one weight per model).
    """
    return sum([w * p for w, p in zip(weights, pred_probs_list)])


ensemble_accuracies = []
df_model_weights_list = []

for dataset_key in dataset_model_output.keys():

    # get list of pred_probs, labels for dataset
    pred_probs_list = dataset_model_output[dataset_key]["pred_probs_list"]
    labels_list = dataset_model_output[dataset_key]["labels_list"]
    images_list = dataset_model_output[dataset_key]["images_list"]
    label_errors_target_list = dataset_model_output[dataset_key]["label_errors_target_list"]

    # sanity check: labels and images from each model should be the same because they were
    # generated from the same cross-val procedure. np.array_equal also compares shapes, so a
    # length mismatch fails the assertion instead of erroring inside the `==` comparison.
    for labels_temp, images_temp in zip(labels_list[1:], images_list[1:]):
        assert np.array_equal(labels_list[0], labels_temp)
        assert np.array_equal(images_list[0], images_temp)

    # take the first (the others are the same)
    labels = labels_list[0]
    label_errors_target = label_errors_target_list[0]

    # Class indices implied by the pred_probs columns. Passing them to log_loss keeps it
    # working even if some class never occurs in `labels`. Labels are already assumed to be
    # column indices by the argmax comparison below.
    class_labels = np.arange(pred_probs_list[0].shape[1])

    # compute accuracy and log-loss of individual models for weighting
    accuracy_list = []
    inv_log_loss_list = []
    for pred_probs in pred_probs_list:

        # accuracy of single model
        accuracy_list.append((pred_probs.argmax(axis=1) == labels).mean())

        # inverse of exp(log-loss) of single model
        exp_log_loss = np.exp(log_loss(labels, pred_probs, labels=class_labels))
        inv_log_loss_list.append(1 / exp_log_loss)

    # normalize each weighting scheme so the weights sum to 1
    acc_weights = np.array(accuracy_list) / sum(accuracy_list)
    inv_log_loss_weights = np.array(inv_log_loss_list) / sum(inv_log_loss_list)

    # uniform, accuracy-weighted and inv-log-loss-weighted average predictions
    pred_probs_avg = sum(pred_probs_list) / len(pred_probs_list)
    pred_probs_avg_acc_weighted = weighted_average_pred_probs(acc_weights, pred_probs_list)
    pred_probs_avg_inv_log_loss_weighted = weighted_average_pred_probs(inv_log_loss_weights, pred_probs_list)

    ensemble_accuracies.append({
        "dataset": dataset_key,
        "ensemble accuracy (avg models)": (pred_probs_avg.argmax(axis=1) == labels).mean(),
        "ensemble accuracy (avg models weighted by accuracy)": (pred_probs_avg_acc_weighted.argmax(axis=1) == labels).mean(),
        "ensemble accuracy (avg models weighted by inv log loss)": (pred_probs_avg_inv_log_loss_weighted.argmax(axis=1) == labels).mean(),
    })

    # one row per model; `models` is in the same order as pred_probs_list
    df_model_weights = pd.DataFrame({
        "dataset": dataset_key,
        "models": models,
        "model_weights_by_accuracy": acc_weights,
        "model_weights_by_inv_exp_log_loss": inv_log_loss_weights
    })

    df_model_weights_list.append(df_model_weights)

In [None]:
# pd.concat(df_model_weights_list).reset_index().to_csv("model_weights.csv")

In [None]:
# One row per dataset: accuracy of each averaging scheme against the given labels.
pd.DataFrame(ensemble_accuracies)

In [None]:
# Stand-alone consistency check: for every dataset, all models must have been evaluated on
# identical labels and images. Prints each dataset name once it passes.
for dataset_key in dataset_model_output.keys():

    # get list of pred_probs, labels for dataset
    pred_probs_list = dataset_model_output[dataset_key]["pred_probs_list"]
    labels_list = dataset_model_output[dataset_key]["labels_list"]
    images_list = dataset_model_output[dataset_key]["images_list"]
    label_errors_target_list = dataset_model_output[dataset_key]["label_errors_target_list"]

    # labels and images from each model should be the same because they were generated from
    # the same cross-val procedure. np.array_equal also compares shapes, so a length mismatch
    # fails the assertion instead of erroring inside the `==` comparison.
    for labels_temp, images_temp in zip(labels_list[1:], images_list[1:]):
        assert np.array_equal(labels_list[0], labels_temp)
        assert np.array_equal(images_list[0], images_temp)

    # take the first (the others are the same)
    labels = labels_list[0]
    label_errors_target = label_errors_target_list[0]

    print(dataset_key)

In [None]:
%%time

# args to pass to get_label_quality_scores()
score_params = \
[
    ("self_confidence", False),
    ("self_confidence", True),
    ("normalized_margin", False),
    ("normalized_margin", True),
    ("confidence_weighted_entropy", False)
]


ensemble_evaluations = []

dataset_best_weights = []

for dataset_key in dataset_model_output.keys():

    # get list of pred_probs, labels for dataset
    pred_probs_list = dataset_model_output[dataset_key]["pred_probs_list"]
    labels_list = dataset_model_output[dataset_key]["labels_list"]
    images_list = dataset_model_output[dataset_key]["images_list"]
    label_errors_target_list = dataset_model_output[dataset_key]["label_errors_target_list"]
    
    # use for sanity check (noisy labels and images from each model should be the same because they were generated from the same cross-val procedure
    for i, (labels_temp, images_temp) in enumerate(zip(labels_list, images_list)):

        if i == 0:
            labels_temp_previous = copy.deepcopy(labels_temp)
            images_temp_previous = copy.deepcopy(images_temp)       
        else:
            assert (labels_temp_previous == labels_temp).all()
            assert (images_temp_previous == images_temp).all()    
    
    # take the first (the others are the same)
    labels = labels_list[0]
    label_errors_target = label_errors_target_list[0]
    
    # compute accuracy
    accuracy_list = []
    for pred_probs in pred_probs_list:
        
        # accuracy of single model
        accuracy = (pred_probs.argmax(axis=1) == labels).mean()
        accuracy_list.append(accuracy)
        
    # accuracy weights
    acc_weights = np.array(accuracy_list) / sum(accuracy_list)    
    
    # average predictions
    pred_probs_avg = sum(pred_probs_list) / len(pred_probs_list)
    
    #### can refactor below to a function that accepts weights and pred_probs_list
    
    # accuracy-weighted predictions
    pred_probs_avg_acc_weighted = sum([acc_weights[i] * p for i, p in enumerate(pred_probs_list)])
    

    #### find best t in T for exp-log-loss weighting
    T = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 2e2]

    pred_probs_avg_log_loss_weighted = None
    inv_log_loss_weights = None
    best_eval_log_loss = float("inf")
    best_t = None

    for t in T:

        log_loss_list = []

        # pred_probs for each model
        for pred_probs in pred_probs_list:
            log_loss_ = np.exp(t * (-log_loss(labels, pred_probs)))
            log_loss_list.append(log_loss_)

        # weights using log loss
        inv_log_loss_weights_temp = np.array(log_loss_list) / sum(log_loss_list)

        # weighted average
        pred_probs_avg_log_loss_weighted_temp = sum([inv_log_loss_weights_temp[i] * p for i, p in enumerate(pred_probs_list)])

        # evaluate log_loss with this weighted average
        eval_log_loss = log_loss(labels, pred_probs_avg_log_loss_weighted_temp)


        # check if this is the best eval_log_loss so far
        if best_eval_log_loss > eval_log_loss:
            best_eval_log_loss = eval_log_loss
            best_t = t
            pred_probs_avg_log_loss_weighted = pred_probs_avg_log_loss_weighted_temp.copy()
            inv_log_loss_weights = inv_log_loss_weights_temp.copy()    

    df_temp = pd.DataFrame({
        "dataset": dataset_key,
        "models": models,
        "best_t": best_t,
        "best_eval_log_loss": best_eval_log_loss,
        "inv_log_loss_weights": inv_log_loss_weights,
    })
    
    # save the weights for analysis later
    dataset_best_weights.append(df_temp)
            
    print()
    print(dataset_key)
    print(best_eval_log_loss)
    print(inv_log_loss_weights)
    print(pred_probs_avg_log_loss_weighted)
    
    
    #### label quality scoring
    
    for score_param in score_params:
        
        # label quality scoring method
        method, adjust_pred_probs = score_param
    
        # compute scores
        
        # use average pred_probs
        label_quality_scores_avg = get_label_quality_scores(labels=labels, pred_probs=pred_probs_avg, method=method, adjust_pred_probs=adjust_pred_probs)
        
        # use average pred_probs weighted by accuracy
        label_quality_scores_avg_acc_weighted = get_label_quality_scores(labels=labels, pred_probs=pred_probs_avg_acc_weighted, method=method, adjust_pred_probs=adjust_pred_probs)
        
        # use average pred_probs weighted by log loss
        label_quality_scores_avg_log_loss_weighted = get_label_quality_scores(labels=labels, pred_probs=pred_probs_avg_log_loss_weighted, method=method, adjust_pred_probs=adjust_pred_probs)
        
        # use pred_probs_list (weighted by accuracy)
        label_quality_scores_agg_acc = get_label_quality_ensemble_scores_experimental(
            labels=labels, 
            pred_probs_list=pred_probs_list, 
            method=method, 
            adjust_pred_probs=adjust_pred_probs, 
            verbose=0,
            weight_ensemble_members_by="accuracy"
        )

        # use pred_probs_list (uniform_weights)
        label_quality_scores_agg_uni = get_label_quality_ensemble_scores_experimental(
            labels=labels, 
            pred_probs_list=pred_probs_list, 
            method=method, 
            adjust_pred_probs=adjust_pred_probs,
            verbose=0,
            weight_ensemble_members_by="uniform"
        )
        
        # use pred_probs_list (weight by inverse log loss)
        label_quality_scores_agg_log_loss = get_label_quality_ensemble_scores_experimental(
            labels=labels, 
            pred_probs_list=pred_probs_list, 
            method=method, 
            adjust_pred_probs=adjust_pred_probs,
            verbose=0, 
            weight_ensemble_members_by="custom",
            custom_weights=inv_log_loss_weights # custom weights!
        )        
        
        # compute accuracy of detecting label errors
        auroc_avg = roc_auc_score(label_errors_target, 1 - label_quality_scores_avg)
        auroc_avg_acc_weighted = roc_auc_score(label_errors_target, 1 - label_quality_scores_avg_acc_weighted)
        auroc_avg_log_loss_weighted = roc_auc_score(label_errors_target, 1 - label_quality_scores_avg_log_loss_weighted)        
        
        auroc_agg_acc = roc_auc_score(label_errors_target, 1 - label_quality_scores_agg_acc)
        auroc_agg_uni = roc_auc_score(label_errors_target, 1 - label_quality_scores_agg_uni)
        auroc_agg_log_loss = roc_auc_score(label_errors_target, 1 - label_quality_scores_agg_log_loss)        
        
        # lift at K where K = number of label errors
        lift_at_num_label_errors_avg = lift_at_k(label_errors_target, 1 - label_quality_scores_avg, k=label_errors_target.sum())
        lift_at_num_label_errors_avg_acc_weighted = lift_at_k(label_errors_target, 1 - label_quality_scores_avg_acc_weighted, k=label_errors_target.sum())
        lift_at_num_label_errors_avg_log_loss_weighted = lift_at_k(label_errors_target, 1 - label_quality_scores_avg_log_loss_weighted, k=label_errors_target.sum())
        
        lift_at_num_label_errors_agg_acc = lift_at_k(label_errors_target, 1 - label_quality_scores_agg_acc, k=label_errors_target.sum())
        lift_at_num_label_errors_agg_uni = lift_at_k(label_errors_target, 1 - label_quality_scores_agg_uni, k=label_errors_target.sum())
        lift_at_num_label_errors_agg_log_loss = lift_at_k(label_errors_target, 1 - label_quality_scores_agg_log_loss, k=label_errors_target.sum())        
        
        # lift at k=100
        lift_at_100_avg = lift_at_k(label_errors_target, 1 - label_quality_scores_avg, k=100)
        lift_at_100_avg_acc_weighted = lift_at_k(label_errors_target, 1 - label_quality_scores_avg_acc_weighted, k=100)
        lift_at_100_avg_log_loss_weighted = lift_at_k(label_errors_target, 1 - label_quality_scores_avg_log_loss_weighted, k=100)
        
        lift_at_100_agg_acc = lift_at_k(label_errors_target, 1 - label_quality_scores_agg_acc, k=100)
        lift_at_100_agg_uni = lift_at_k(label_errors_target, 1 - label_quality_scores_agg_uni, k=100)
        lift_at_100_agg_log_loss = lift_at_k(label_errors_target, 1 - label_quality_scores_agg_log_loss, k=100)        

        ensemble_evaluation_results_avg = {
            "ensemble_method": "avg_pred_probs",
            "method": method,
            "adjust_pred_probs": adjust_pred_probs,
            "dataset": dataset_key,
            "model": "ensemble",
            "dataset_num_samples": labels.shape[0],
            "dataset_num_label_errors": label_errors_target.sum(),
            "auroc": auroc_avg,
            "lift_at_num_label_errors": lift_at_num_label_errors_avg,
            "lift_at_100": lift_at_100_avg
        }
        
        ensemble_evaluation_results_avg_acc_weighted = {
            "ensemble_method": "avg_pred_probs_weighted_by_accuracy",
            "method": method,
            "adjust_pred_probs": adjust_pred_probs,
            "dataset": dataset_key,
            "model": "ensemble",
            "dataset_num_samples": labels.shape[0],
            "dataset_num_label_errors": label_errors_target.sum(),
            "auroc": auroc_avg_acc_weighted,
            "lift_at_num_label_errors": lift_at_num_label_errors_avg_acc_weighted,
            "lift_at_100": lift_at_100_avg_acc_weighted
        }        
        
        ensemble_evaluation_results_avg_log_loss_weighted = {
            "ensemble_method": "avg_pred_probs_weighted_by_inv_log_loss",
            "method": method,
            "adjust_pred_probs": adjust_pred_probs,
            "dataset": dataset_key,
            "model": "ensemble",
            "dataset_num_samples": labels.shape[0],
            "dataset_num_label_errors": label_errors_target.sum(),
            "auroc": auroc_avg_log_loss_weighted,
            "lift_at_num_label_errors": lift_at_num_label_errors_avg_log_loss_weighted,
            "lift_at_100": lift_at_100_avg_log_loss_weighted
        }                
        
        ensemble_evaluation_results_agg_acc = {
            "ensemble_method": "avg_scores_weighted_by_accuracy",
            "method": method,
            "adjust_pred_probs": adjust_pred_probs,
            "dataset": dataset_key,
            "model": "ensemble",
            "dataset_num_samples": labels.shape[0],
            "dataset_num_label_errors": label_errors_target.sum(),
            "auroc": auroc_agg_acc,
            "lift_at_num_label_errors": lift_at_num_label_errors_agg_acc,
            "lift_at_100": lift_at_100_agg_acc
        }
        
        ensemble_evaluation_results_agg_uni = {
            "ensemble_method": "avg_scores",
            "method": method,
            "adjust_pred_probs": adjust_pred_probs,
            "dataset": dataset_key,
            "model": "ensemble",
            "dataset_num_samples": labels.shape[0],
            "dataset_num_label_errors": label_errors_target.sum(),
            "auroc": auroc_agg_uni,
            "lift_at_num_label_errors": lift_at_num_label_errors_agg_uni,
            "lift_at_100": lift_at_100_agg_uni
        }
        
        ensemble_evaluation_results_agg_log_loss = {
            "ensemble_method": "avg_scores_weighted_by_inv_log_loss",
            "method": method,
            "adjust_pred_probs": adjust_pred_probs,
            "dataset": dataset_key,
            "model": "ensemble",
            "dataset_num_samples": labels.shape[0],
            "dataset_num_label_errors": label_errors_target.sum(),
            "auroc": auroc_agg_log_loss,
            "lift_at_num_label_errors": lift_at_num_label_errors_agg_log_loss,
            "lift_at_100": lift_at_100_agg_log_loss
        }
        
        # store evaluation results
        ensemble_evaluations.append(ensemble_evaluation_results_avg)
        ensemble_evaluations.append(ensemble_evaluation_results_avg_acc_weighted)
        ensemble_evaluations.append(ensemble_evaluation_results_avg_log_loss_weighted)
        
        ensemble_evaluations.append(ensemble_evaluation_results_agg_acc)
        ensemble_evaluations.append(ensemble_evaluation_results_agg_uni)
        ensemble_evaluations.append(ensemble_evaluation_results_agg_log_loss)

In [None]:
# dataset_best_weights_w_model = []

# for df_temp in dataset_best_weights:
#     df_temp["models"] = models
    
#     dataset_best_weights_w_model.append(df_temp)

In [None]:
# Stack the per-dataset weight tables into one DataFrame (one row per dataset and model).
# Each frame in `dataset_best_weights` already contains the "models" column, so it is
# concatenated directly. The previously referenced `dataset_best_weights_w_model` is only
# created by commented-out code and would raise NameError on a fresh kernel.
df_dataset_best_weights = pd.concat(dataset_best_weights)

df_dataset_best_weights.head(3)

In [None]:
# Turn the list of evaluation records into a DataFrame and derive three helper columns:
#   method_adjust_pred_probs - "<method>-<adjust_pred_probs>", the key of the display dict
#   dataset_model            - "<dataset> | <model>"
#   scoring_method           - display name of the scoring method (KeyError if the key is unknown)
df_evaluations_ensemble = pd.DataFrame(ensemble_evaluations).assign(
    method_adjust_pred_probs=lambda d: d["method"] + "-" + d["adjust_pred_probs"].astype(str),
    dataset_model=lambda d: d["dataset"] + " | " + d["model"],
    scoring_method=lambda d: d["method_adjust_pred_probs"].map(
        method_adjust_pred_probs_display_dict.__getitem__
    ),
)


In [None]:
# Preview the first evaluation records together with the derived columns.
df_evaluations_ensemble.head(3)

In [None]:
# Number of evaluation rows per ensemble method (one row is recorded per dataset and scoring configuration).
df_evaluations_ensemble.groupby("ensemble_method")["ensemble_method"].count()

In [None]:
# df_evaluations_ensemble.to_csv("evaluation_ensemble.csv")

In [None]:
# AUROC with one row per (scoring method, ensemble method) and one column per dataset
# (values are aggregated with pivot_table's default aggregation).
df_evaluations_ensemble_pivot = df_evaluations_ensemble.pivot_table(
    values="auroc",
    index=["scoring_method", "ensemble_method"],
    columns=["dataset"],
).reset_index()


In [None]:
# Preview the AUROC pivot (datasets as columns).
df_evaluations_ensemble_pivot.head(2)

In [None]:
def _pivot_by_ensemble_method(metric):
    """Pivot one metric of `df_evaluations_ensemble` into a wide table.

    The result has one row per (dataset, scoring_method), sorted by those two columns, one
    column per ensemble method holding `metric`, plus a "dataset_scoring_method" column of the
    form "<dataset> | <scoring_method>".
    """
    pivoted = (
        df_evaluations_ensemble
        .pivot_table(
            values=metric,
            index=["dataset", "scoring_method"],
            columns=["ensemble_method"],
        )
        .reset_index()
        .sort_values(by=["dataset", "scoring_method"])
    )
    pivoted["dataset_scoring_method"] = pivoted.dataset + " | " + pivoted.scoring_method
    return pivoted


# one wide table per evaluation metric
df_evaluations_ensemble_auroc = _pivot_by_ensemble_method("auroc")
df_evaluations_ensemble_lift_at_num_errors = _pivot_by_ensemble_method("lift_at_num_label_errors")
df_evaluations_ensemble_lift_at_100 = _pivot_by_ensemble_method("lift_at_100")

df_evaluations_ensemble_lift_at_100.head()

In [None]:
# Preview the AUROC table (ensemble methods as columns).
df_evaluations_ensemble_auroc.head(1)

In [None]:
# Single-model baseline: evaluation rows of the Swin Transformer only.
# NOTE(review): `df_evaluations` is not created in any earlier cell of this notebook, so this
# cell fails on a fresh kernel. Presumably it holds the per-model evaluation results produced
# elsewhere - TODO load it explicitly before this cell.
df_evaluations_swin = df_evaluations[df_evaluations.model_name == "Swin Transformer"]


def _pivot_swin_metric(metric):
    """Pivot one metric of `df_evaluations_swin` into a wide table.

    The result has one row per (dataset, scoring_method), sorted by those two columns, and one
    column per model_name holding `metric`.
    """
    return (
        df_evaluations_swin
        .pivot_table(
            values=metric,
            index=["dataset", "scoring_method"],
            columns=["model_name"],
        )
        .reset_index()
        .sort_values(by=["dataset", "scoring_method"])
    )


# one wide table per evaluation metric
df_evaluations_swin_auroc = _pivot_swin_metric("auroc")
df_evaluations_swin_lift_at_num_errors = _pivot_swin_metric("lift_at_num_label_errors")
df_evaluations_swin_lift_at_100 = _pivot_swin_metric("lift_at_100")

df_evaluations_swin_lift_at_100.head()

In [None]:
# combine ensemble results with single best model results (Swin Transformer)

merge_keys = ["dataset", "scoring_method"]


def _with_single_model_baseline(df_ensemble, df_single_model):
    """Left-join the single-model metric columns onto the ensemble table.

    Baseline columns already present in `df_ensemble` (i.e. this cell was run before) are
    dropped first. Without that, re-running the cell would merge the same column again and
    pandas would rename the duplicates with "_x"/"_y" suffixes, breaking the plots that read
    the "Swin Transformer" column. On the first run nothing is dropped.
    """
    baseline_columns = [c for c in df_single_model.columns if c not in merge_keys]
    return (
        df_ensemble
        .drop(columns=baseline_columns, errors="ignore")
        .merge(df_single_model, on=merge_keys, how="left")
    )


df_evaluations_ensemble_auroc = _with_single_model_baseline(
    df_evaluations_ensemble_auroc, df_evaluations_swin_auroc
)

df_evaluations_ensemble_lift_at_num_errors = _with_single_model_baseline(
    df_evaluations_ensemble_lift_at_num_errors, df_evaluations_swin_lift_at_num_errors
)

df_evaluations_ensemble_lift_at_100 = _with_single_model_baseline(
    df_evaluations_ensemble_lift_at_100, df_evaluations_swin_lift_at_100
)

df_evaluations_ensemble_lift_at_num_errors.shape

In [None]:
# Preview the AUROC table after the single-model baseline column has been merged in.
df_evaluations_ensemble_auroc.head()

In [None]:
# Dot plot of AUROC: one row per "dataset | scoring method", one marker per plotted series.
title = "AUROC"

# Default figure size for this and the following plots. It must be set before the figure is
# created, otherwise it only takes effect for later figures.
plt.rcParams["figure.figsize"] = (15, 10)

# pointer
df = df_evaluations_ensemble_auroc.sort_values(by=["dataset", "avg_scores_weighted_by_inv_log_loss"])
# df = df[df.scoring_method == "Self Confidence"]

labels = df["dataset_scoring_method"].tolist()
x = np.arange(len(labels))  # the label locations

# (column of df, legend label) of every series; comment entries in or out to toggle them
series_to_plot = [
    ("Swin Transformer", "Swin Transformer"),
    ("avg_pred_probs", "Ensemble: Avg Preds"),
    # ("avg_pred_probs_weighted_by_accuracy", "Ensemble: Avg Preds (Weighted By Accuracy)"),
    # ("avg_pred_probs_weighted_by_inv_log_loss", "Ensemble: Avg Preds (Weighted By Exp(-T*log_loss))"),
    # ("avg_scores", "Ensemble: Avg Scores"),
    # ("avg_scores_weighted_by_accuracy", "Ensemble: Avg Scores (Weighted By Accuracy)"),
    ("avg_scores_weighted_by_inv_log_loss", "Ensemble: Weighted Avg Scores (Weighted By Exponential Negative Log Loss)"),
]

jf = 0.15 # jitter factor
markersize = 8
alpha = 1
rng = np.random.default_rng(0)  # seeded so the figure is identical on every run

fig, ax = plt.subplots()
for column, series_label in series_to_plot:
    # one random vertical offset per series (a scalar, shared by all of its points) so that
    # markers of different series on the same row do not sit exactly on top of each other
    offset = rng.uniform(-jf, jf)
    _ = ax.plot(df[column].tolist(), x + offset, marker="o", linestyle="None", label=series_label, markersize=markersize, alpha=alpha)

# Add some text for labels, title and custom y-axis tick labels, etc.
ax.set_title(title, fontsize=24, fontweight="bold")
ax.set_xlabel(title, fontsize=12)
ax.set_ylabel("Dataset | Scoring Method", fontsize=12)
ax.set_yticks(x, labels)
ax.tick_params(axis='both', which='major', labelsize=12)
ax.legend(fontsize=12)
# fig.tight_layout()
plt.show()

In [None]:
# Dot plot of lift at K = number of label errors: one row per "dataset | scoring method",
# one marker per plotted series.
title = "Lift at # Errors"

# pointer
df = df_evaluations_ensemble_lift_at_num_errors.sort_values(by=["dataset", "avg_scores_weighted_by_inv_log_loss"])

labels = df["dataset_scoring_method"].tolist()
x = np.arange(len(labels))  # the label locations

# (column of df, legend label) of every series; comment entries in or out to toggle them
series_to_plot = [
    ("Swin Transformer", "Swin Transformer"),
    ("avg_pred_probs", "Ensemble: Avg Preds"),
    # ("avg_pred_probs_weighted_by_accuracy", "Ensemble: Avg Preds (Weighted By Accuracy)"),
    # ("avg_pred_probs_weighted_by_inv_log_loss", "Ensemble: Avg Preds (Weighted By Exp(-T*log_loss))"),
    # ("avg_scores", "Ensemble: Avg Scores"),
    # ("avg_scores_weighted_by_accuracy", "Ensemble: Avg Scores (Weighted By Accuracy)"),
    ("avg_scores_weighted_by_inv_log_loss", "Ensemble: Weighted Avg Scores (Weighted By Exponential Negative Log Loss)"),
]

jf = 0.15 # jitter factor
markersize = 8
alpha = 1
rng = np.random.default_rng(0)  # seeded so the figure is identical on every run

# explicit size, so the figure does not depend on rcParams having been changed by another cell
fig, ax = plt.subplots(figsize=(15, 10))
for column, series_label in series_to_plot:
    # one random vertical offset per series (a scalar, shared by all of its points) so that
    # markers of different series on the same row do not sit exactly on top of each other
    offset = rng.uniform(-jf, jf)
    _ = ax.plot(df[column].tolist(), x + offset, marker="o", linestyle="None", label=series_label, markersize=markersize, alpha=alpha)

# Add some text for labels, title and custom y-axis tick labels, etc.
ax.set_title(title, fontsize=24, fontweight="bold")
ax.set_xlabel(title, fontsize=12)
ax.set_ylabel("Dataset | Scoring Method", fontsize=12)
ax.set_yticks(x, labels)
ax.tick_params(axis='both', which='major', labelsize=12)
ax.legend(fontsize=12)
# fig.tight_layout()
plt.show()

In [None]:
title = "Lift at 100"

# working view of the ensemble lift-at-100 table, ordered by dataset and then
# by the lift of the log-loss-weighted score ensemble
df = df_evaluations_ensemble_lift_at_100.sort_values(
    by=["dataset", "avg_scores_weighted_by_inv_log_loss"]
)

labels = df["dataset_scoring_method"].tolist()
x = np.arange(len(labels))  # one y-axis position per row of df

# single-model baseline
s0 = df["Swin Transformer"].tolist()

# ensemble variants built from averaged pred probs
s1 = df["avg_pred_probs"].tolist()
s2 = df["avg_pred_probs_weighted_by_accuracy"].tolist()
s3 = df["avg_pred_probs_weighted_by_inv_log_loss"].tolist()

# ensemble variants built from averaged scores
s4 = df["avg_scores"].tolist()
s5 = df["avg_scores_weighted_by_accuracy"].tolist()
s6 = df["avg_scores_weighted_by_inv_log_loss"].tolist()

jf = 0.15  # jitter factor
markersize = 8
alpha = 1

# s2-s5 are extracted above but deliberately left out of the figure
plotted_series = [
    (s0, "Swin Transformer"),
    (s1, "Ensemble: Avg Preds"),
    (s6, "Ensemble: Weighted Avg Scores (Weighted By Exponential Negative Log Loss)"),
]

fig, ax = plt.subplots()
for series_values, series_label in plotted_series:
    # np.random.uniform() is called without `size`, so it yields one scalar:
    # every point of a series is shifted vertically by the same random offset
    _ = ax.plot(
        series_values,
        x + np.random.uniform(-jf, jf),
        marker="o",
        linestyle="None",
        label=series_label,
        markersize=markersize,
        alpha=alpha,
    )

# title, y tick labels (one per row of df) and legend
ax.set_title(title, fontsize=24, fontweight="bold")
ax.set_yticks(x, labels)
ax.tick_params(axis='both', which='major', labelsize=12)
ax.legend(fontsize=12)
plt.show()

## Evaluate all experiments (all dataset-model pairs)


In [None]:
%%time

# args to pass to get_label_quality_scores()
score_params = \
[
    ("self_confidence", False),
    ("self_confidence", True),
    ("normalized_margin", False),
    ("normalized_margin", True),
    ("confidence_weighted_entropy", False)
]

evaluations = []
precision_recall_curves = [] # store this separately
accuracy_list = []

for experiment in experiments:
    
    # experiment results
    dataset = experiment["dataset"]
    model = experiment["model"]
    pred_probs = experiment["pred_probs"]
    labels = experiment["labels"]
    images = experiment["images"]
    label_errors_target = experiment["label_errors_mask"]
    
    accuracy = {
        "dataset": dataset,
        "model": model,
        "cv_accuracy": (pred_probs.argmax(axis=1) == labels).mean()
    }
    
    accuracy_list.append(accuracy)
    
    #### label quality scoring
    
    for score_param in score_params:
        
        # label quality scoring method
        method, adjust_pred_probs = score_param    
    
        # compute scores
        label_quality_scores = get_label_quality_scores(labels=labels, pred_probs=pred_probs, method=method, adjust_pred_probs=adjust_pred_probs)

        # compute accuracy of detecting label errors
        auroc = roc_auc_score(label_errors_target, 1 - label_quality_scores)
        
        # lift at K where K = number of label errors
        lift_at_num_label_errors = lift_at_k(label_errors_target, 1 - label_quality_scores, k=label_errors_target.sum())
        
        # lift at k=100
        lift_at_100 = lift_at_k(label_errors_target, 1 - label_quality_scores, k=100)

        evaluation_results = {
            "method": method,
            "adjust_pred_probs": adjust_pred_probs,
            "dataset": dataset,
            "model": model,
            "dataset_num_samples": labels.shape[0],
            "dataset_num_label_errors": label_errors_target.sum(),
            "auroc": auroc,
            "lift_at_num_label_errors": lift_at_num_label_errors,
            "lift_at_100": lift_at_100            
        }

        # store evaluation results
        evaluations.append(evaluation_results)
        

        # compute precision-recall curve using label quality scores
        precision, recall, thresholds = precision_recall_curve(label_errors_target, 1 - label_quality_scores)        
        
        precision_recall_curve_results = {
            "method": method,
            "adjust_pred_probs": adjust_pred_probs,
            "dataset": dataset,
            "model": model,
            "label_quality_scores": label_quality_scores,
            "precision": precision,
            "recall": recall,
            "thresholds": thresholds
        }
        
        # store precision-recall curve results
        precision_recall_curves.append(precision_recall_curve_results)
        

    #### active learning scores to use as comparison
    
    al_scoring_funcs = {
        "entropy": get_normalized_entropy,
        "least_confidence": least_confidence
    }
    
    for al_method in al_scoring_funcs.keys():
        
        # active learning scoring function
        scoring_func = al_scoring_funcs[al_method]
    
        # score
        al_scores = scoring_func(pred_probs)

        # compute accuracy of detecting label errors
        auroc = roc_auc_score(label_errors_target, al_scores)

        # lift at K where K = number of label errors
        lift_at_num_label_errors = lift_at_k(label_errors_target, al_scores, k=label_errors_target.sum())
        
        # lift at k=100
        lift_at_100 = lift_at_k(label_errors_target, al_scores, k=100)

        evaluation_results = {
            "method": al_method,
            "adjust_pred_probs": False,
            "dataset": dataset,
            "model": model,
            "dataset_num_samples": labels.shape[0],
            "dataset_num_label_errors": label_errors_target.sum(),
            "auroc": auroc,
            "lift_at_num_label_errors": lift_at_num_label_errors,
            "lift_at_100": lift_at_100,
        }

        # store evaluation results
        evaluations.append(evaluation_results)

In [None]:
# cross-validation accuracy: one row per (dataset, model) experiment
df_cv_accuracy = pd.DataFrame(accuracy_list)

# wide view with one row per model and one column per dataset,
# ordered by descending accuracy on the "roman-numeral" column
df_cv_accuracy_pivot = (
    df_cv_accuracy.pivot_table(
        values="cv_accuracy", index=["model"], columns=["dataset"]
    )
    .reset_index()
    .sort_values(by="roman-numeral", ascending=False)
)

# swap raw model identifiers for display names (an unknown identifier raises KeyError)
df_cv_accuracy_pivot["model"] = df_cv_accuracy_pivot["model"].map(
    lambda model_id: model_display_name_dict[model_id]
)


In [None]:
# Display the cross-validation accuracy table (one row per model, one column per dataset).
df_cv_accuracy_pivot


In [None]:
# Preview the long-format accuracy table (one row per dataset-model experiment).
df_cv_accuracy.head()

In [None]:
# master table with AUROC and Lift at K evaluation metrics for all methods, datasets, and models;
# the left merge attaches each experiment's cv accuracy to every one of its metric rows
df_evaluations = pd.merge(
    pd.DataFrame(evaluations),
    df_cv_accuracy,
    how="left",
    on=["dataset", "model"],
)

In [None]:
# Add the derived key / display-name columns used by the pivots and plots below.
# Later entries refer to columns created by earlier ones, hence the lambdas.
df_evaluations = df_evaluations.assign(
    # lookup key in the form "<method>-<adjust_pred_probs>", e.g. "self_confidence-True"
    method_adjust_pred_probs=lambda d: d["method"] + "-" + d["adjust_pred_probs"].astype(str),
    dataset_model=lambda d: d["dataset"] + " | " + d["model"],
    # human-readable names; a key missing from either dict raises KeyError
    scoring_method=lambda d: d["method_adjust_pred_probs"].map(
        lambda key: method_adjust_pred_probs_display_dict[key]
    ),
    model_name=lambda d: d["model"].map(
        lambda model_id: model_display_name_dict[model_id]
    ),
    # model display name followed by its cv accuracy rounded to 4 decimals
    model_name_w_acc=lambda d: d["model_name"] + " (" + d["cv_accuracy"].round(4).astype(str) + ") ",
)



In [None]:
# Optional export of the master evaluation table (currently disabled):
# df_evaluations.to_csv("evaluation_all_experiments.csv")

# Preview the first three rows of the master evaluation table.
df_evaluations.head(3)


In [None]:
def _pivot_metric_by_scoring_method(metric: str) -> pd.DataFrame:
    """
    Pivot one metric column of df_evaluations into a wide table: one row per
    (dataset, model_name, model_name_w_acc), one column per scoring method,
    sorted by dataset and model name.
    """
    wide = pd.pivot_table(
        df_evaluations,
        values=metric,
        index=["dataset", "model_name", "model_name_w_acc"],
        columns=["scoring_method"],
    )
    return wide.reset_index().sort_values(by=["dataset", "model_name"])


df_evaluations_auroc = _pivot_metric_by_scoring_method("auroc")

df_evaluations_lift_at_num_errors = _pivot_metric_by_scoring_method("lift_at_num_label_errors")

df_evaluations_lift_at_100 = _pivot_metric_by_scoring_method("lift_at_100")


In [None]:
# Preview the last rows of the AUROC pivot (one column per scoring method).
df_evaluations_auroc.tail()

In [None]:
# pointer
df = df_evaluations_auroc

# y tick labels: one "<dataset> | <model display name>" entry per row.
# The pivot table has no "dataset_model" column, so the label is built here
# from its "dataset" and "model_name" index columns.
dataset_model_labels = (df["dataset"] + " | " + df["model_name"]).tolist()

# Draw plot
plt.figure(figsize=(10, 10), dpi=80)

s = 60
alpha = 0.9
marker = "o"

# The pivot's value columns are named after the "scoring_method" display names
# (e.g. "Self Confidence"), not the raw "<method>-<adjust_pred_probs>" keys,
# so the display names must be used to select them.
s0 = plt.scatter(
    df["Confidence Weighted Entropy"], df.index, s=s, alpha=alpha, marker=marker
)
s1 = plt.scatter(df["Self Confidence"], df.index, s=s, alpha=alpha, marker=marker)
s2 = plt.scatter(
    df["Adjusted Self Confidence"], df.index, s=s, alpha=alpha, marker=marker
)
s3 = plt.scatter(df["Normalized Margin"], df.index, s=s, alpha=alpha, marker=marker)
s4 = plt.scatter(
    df["Adjusted Normalized Margin"], df.index, s=s, alpha=alpha, marker=marker
)
# The "Entropy" and "Least Confidence" columns are not plotted here.

plt.title("AUROC", fontsize=18)
plt.yticks(df.index, dataset_model_labels)
plt.legend(
    (s0, s1, s2, s3, s4),
    (
        "Confidence Weighted Entropy (False)",
        "Self Confidence (False)",
        "Self Confidence (True)",
        "Normalized Margin (False)",
        "Normalized Margin (True)",
    ),
    loc="upper center",
    bbox_to_anchor=(0.5, 1.2),
    ncol=3,
    fancybox=True,
    shadow=True,
    fontsize=12,
)
plt.show()


In [None]:

title = "AUROC"

# sorted copy of the AUROC pivot; adding the label column here leaves the
# original pivot table untouched
df = df_evaluations_auroc.sort_values(by=["dataset", "Self Confidence"])
df["dataset_model"] = df["dataset"] + " | " + df["model_name_w_acc"]

labels = df["dataset_model"].tolist()
x = np.arange(len(labels))  # one y-axis position per row of df

# one series per label quality scoring method
s0 = df["Confidence Weighted Entropy"].tolist()
s1 = df["Self Confidence"].tolist()
s2 = df["Adjusted Self Confidence"].tolist()
s3 = df["Normalized Margin"].tolist()
s4 = df["Adjusted Normalized Margin"].tolist()

jf = 0.15  # jitter factor
markersize = 8
alpha = 1

plotted_series = [
    (s0, "Confidence Weighted Entropy"),
    (s1, "Self Confidence"),
    (s2, "Adjusted Self Confidence"),
    (s3, "Normalized Margin"),
    (s4, "Adjusted Normalized Margin"),
]

fig, ax = plt.subplots()
for series_values, series_label in plotted_series:
    # np.random.uniform() is called without `size`, so it yields one scalar:
    # every point of a series is shifted vertically by the same random offset
    _ = ax.plot(
        series_values,
        x + np.random.uniform(-jf, jf),
        marker="o",
        linestyle="None",
        label=series_label,
        markersize=markersize,
        alpha=alpha,
    )

# title, y tick labels (one per row of df) and legend
ax.set_title(title, fontsize=24, fontweight="bold")
ax.set_yticks(x, labels)
ax.tick_params(axis='both', which='major', labelsize=12)
ax.legend(fontsize=12)
plt.show()

In [None]:

title = "Lift at # Errors"

# sorted copy of the lift pivot; adding the label column here leaves the
# original pivot table untouched
df = df_evaluations_lift_at_num_errors.sort_values(by=["dataset", "Self Confidence"])
df["dataset_model"] = df["dataset"] + " | " + df["model_name_w_acc"]

labels = df["dataset_model"].tolist()
x = np.arange(len(labels))  # one y-axis position per row of df

# one series per label quality scoring method
s0 = df["Confidence Weighted Entropy"].tolist()
s1 = df["Self Confidence"].tolist()
s2 = df["Adjusted Self Confidence"].tolist()
s3 = df["Normalized Margin"].tolist()
s4 = df["Adjusted Normalized Margin"].tolist()

jf = 0.15  # jitter factor
markersize = 8
alpha = 1

plotted_series = [
    (s0, "Confidence Weighted Entropy"),
    (s1, "Self Confidence"),
    (s2, "Adjusted Self Confidence"),
    (s3, "Normalized Margin"),
    (s4, "Adjusted Normalized Margin"),
]

fig, ax = plt.subplots()
for series_values, series_label in plotted_series:
    # np.random.uniform() is called without `size`, so it yields one scalar:
    # every point of a series is shifted vertically by the same random offset
    _ = ax.plot(
        series_values,
        x + np.random.uniform(-jf, jf),
        marker="o",
        linestyle="None",
        label=series_label,
        markersize=markersize,
        alpha=alpha,
    )

# title, y tick labels (one per row of df) and legend
ax.set_title(title, fontsize=24, fontweight="bold")
ax.set_yticks(x, labels)
ax.tick_params(axis='both', which='major', labelsize=12)
ax.legend(fontsize=12)
plt.show()

In [None]:

title = "Lift at 100"

# sorted copy of the lift-at-100 pivot (ordered by Confidence Weighted Entropy
# within each dataset); adding the label column leaves the original pivot untouched
df = df_evaluations_lift_at_100.sort_values(by=["dataset", "Confidence Weighted Entropy"])
df["dataset_model"] = df["dataset"] + " | " + df["model_name_w_acc"]

labels = df["dataset_model"].tolist()
x = np.arange(len(labels))  # one y-axis position per row of df

# one series per label quality scoring method
s0 = df["Confidence Weighted Entropy"].tolist()
s1 = df["Self Confidence"].tolist()
s2 = df["Adjusted Self Confidence"].tolist()
s3 = df["Normalized Margin"].tolist()
s4 = df["Adjusted Normalized Margin"].tolist()

jf = 0.15  # jitter factor
markersize = 8
alpha = 1

plotted_series = [
    (s0, "Confidence Weighted Entropy"),
    (s1, "Self Confidence"),
    (s2, "Adjusted Self Confidence"),
    (s3, "Normalized Margin"),
    (s4, "Adjusted Normalized Margin"),
]

fig, ax = plt.subplots()
for series_values, series_label in plotted_series:
    # np.random.uniform() is called without `size`, so it yields one scalar:
    # every point of a series is shifted vertically by the same random offset
    _ = ax.plot(
        series_values,
        x + np.random.uniform(-jf, jf),
        marker="o",
        linestyle="None",
        label=series_label,
        markersize=markersize,
        alpha=alpha,
    )

# title, y tick labels (one per row of df) and legend
ax.set_title(title, fontsize=24, fontweight="bold")
ax.set_yticks(x, labels)
ax.tick_params(axis='both', which='major', labelsize=12)
ax.legend(fontsize=12)
plt.show()

In [None]:
# Preview the first rows of the lift-at-100 pivot (one column per scoring method).
df_evaluations_lift_at_100.head()

## Summarize evaluation results across all dataset-model pairs


In [None]:
def df_min_max_norm_group_by(
    df: pd.DataFrame,
    group_by_cols: list,
    val_col: str,
) -> pd.DataFrame:
    """
    Add a column holding the min-max normalization of a value column, where the
    min and max are computed separately within each group.

    The input DataFrame is not modified; a new DataFrame with the extra column
    is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Table containing ``group_by_cols`` and ``val_col``.
    group_by_cols : list
        Column names that define the groups.
    val_col : str
        Name of the numeric column to normalize.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with one additional column named
        ``min_max_norm_by_{'_'.join(group_by_cols)}_{val_col}``, equal to
        ``(value - group_min) / (group_max - group_min)``.
        Rows in a group whose values are all equal get NaN (0 / 0).
    """

    # group once and broadcast each group's extremes back onto its rows
    grouped_vals = df.groupby(group_by_cols)[val_col]
    group_max = grouped_vals.transform("max")
    group_min = grouped_vals.transform("min")

    norm_col = f"min_max_norm_by_{'_'.join(group_by_cols)}_{val_col}"
    normalized = (df[val_col] - group_min) / (group_max - group_min)

    # assign() builds a new frame, so the caller's DataFrame (and any alias of
    # it) is left untouched
    return df.assign(**{norm_col: normalized})


In [None]:
# add a min-max normalized version of each metric, normalizing separately
# within every dataset-model pair (these columns are averaged in the next step)
df_eval_results = df_evaluations
for metric_col in ("auroc", "lift_at_num_label_errors", "lift_at_100"):
    df_eval_results = df_min_max_norm_group_by(
        df=df_eval_results,
        group_by_cols=["dataset", "model"],
        val_col=metric_col,
    )


In [None]:
# Preview the first 7 rows; presumably one full dataset-model group
# (5 label quality scores + 2 active learning scores) — verify against row order.
df_eval_results.head(7)


In [None]:
#### aggregate evaluation metrics across all dataset-model pairs

# min-max normalized metric columns to average per scoring method
norm_metric_cols = [
    "min_max_norm_by_dataset_model_auroc",
    "min_max_norm_by_dataset_model_lift_at_num_label_errors",
    "min_max_norm_by_dataset_model_lift_at_100",
]

df_eval_results_agg = (
    df_eval_results.groupby(["scoring_method"])
    # named aggregation: output column "avg_<col>" holds the mean of <col>
    .agg(**{f"avg_{col}": (col, "mean") for col in norm_metric_cols})
    .reset_index()
    .sort_values(by="avg_min_max_norm_by_dataset_model_auroc", ascending=False)
)

# persist the summary table next to the notebook
df_eval_results_agg.to_csv("evaluation_all_experiments_aggregated.csv")

df_eval_results_agg


## Evaluation metrics for the single best model (Swin Transformer)


In [None]:
# keep only the rows produced by the Swin Transformer model
is_swin = df_eval_results["model"] == "swin_base_patch4_window7_224"
df_eval_results_swin = df_eval_results[is_swin]
df_eval_results_swin.head()


In [None]:
# AUROC of the Swin model: one row per scoring method, one column per dataset
df_eval_results_swin_auroc = df_eval_results_swin.pivot_table(
    values="auroc",
    index=["scoring_method"],
    columns=["dataset"],
).reset_index()

df_eval_results_swin_auroc


In [None]:
# lift at K = number of label errors for the Swin model:
# one row per scoring method, one column per dataset
df_eval_results_swin_lift_at_num_errors = df_eval_results_swin.pivot_table(
    values="lift_at_num_label_errors", index=["scoring_method"], columns=["dataset"]
).reset_index()

df_eval_results_swin_lift_at_num_errors


In [None]:
# lift at K = 100 for the Swin model: one row per scoring method, one column per dataset
df_eval_results_swin_lift_at_100 = df_eval_results_swin.pivot_table(
    values="lift_at_100", index=["scoring_method"], columns=["dataset"]
).reset_index()

df_eval_results_swin_lift_at_100


## Evaluate filter_by options for the model with the best overall cross-validation accuracy (Swin Transformer)


In [None]:
# the filter_by options are evaluated on a single model only
selected_model = "swin_base_patch4_window7_224"

# filter_by options passed to cleanlab.filter.find_label_issues()
filter_by_list = [
    "prune_by_noise_rate",
    "prune_by_class",
    "both",
    "confident_learning",
    "predicted_neq_given",
]

results = []  # one record per (dataset, filter_by option)

for experiment in experiments:

    # unpack the stored outputs of this dataset-model run
    dataset, model = experiment["dataset"], experiment["model"]
    pred_probs, labels = experiment["pred_probs"], experiment["labels"]
    images = experiment["images"]
    label_errors_target = experiment["label_errors_mask"]

    # skip every experiment that does not belong to the selected model
    if model != selected_model:
        continue

    print(model)

    for filter_by in filter_by_list:

        # mask of examples flagged as label issues by this option
        label_issues = cleanlab.filter.find_label_issues(
            labels=labels,
            pred_probs=pred_probs,
            filter_by=filter_by,
        )

        # compare the flagged mask against the label-error mask
        precision = precision_score(label_errors_target, label_issues)
        recall = recall_score(label_errors_target, label_issues)
        f1 = f1_score(label_errors_target, label_issues)
        acc = accuracy_score(label_errors_target, label_issues)

        result = dict(
            dataset=dataset,
            model=selected_model,
            filter_by=filter_by,
            precision=precision,
            recall=recall,
            f1=f1,
            accuracy=acc,
            num_est_label_issues=label_issues.sum(),
            dataset_num_samples=labels.shape[0],
            dataset_num_label_errors=label_errors_target.sum(),
        )

        results.append(result)


In [None]:
# Collect the per-(dataset, filter_by) result records into one table.
df_filter_by_eval = pd.DataFrame(results)


In [None]:
# Display the full filter_by evaluation table.
df_filter_by_eval


In [None]:
# F1 score of each filter_by option: one row per option, one column per dataset
df_filter_by_eval_f1 = df_filter_by_eval.pivot_table(
    values="f1",
    index=["filter_by"],
    columns=["dataset"],
).reset_index()


In [None]:
# Display the F1 table (one row per filter_by option, one column per dataset).
df_filter_by_eval_f1


## Generate Precision-Recall curve


**Note:** this code could be refactored to be more concise, but for now we use a for-loop and only plot certain methods.


In [None]:
plt.rcParams["figure.figsize"] = (15, 10)

# fields stored in every precision-recall record, in the order they are unpacked
record_keys = (
    "method",
    "adjust_pred_probs",
    "dataset",
    "model",
    "label_quality_scores",
    "precision",
    "recall",
    "thresholds",
)

df_list = []  # one DataFrame per stored precision-recall curve

for data in precision_recall_curves:

    # pull out everything stored for this curve
    (
        method,
        adjust_pred_probs,
        dataset,
        model,
        label_quality_scores,
        precision,
        recall,
        thresholds,
    ) = (data[key] for key in record_keys)

    # precision/recall have one more entry than thresholds: the final point is
    # always (precision=1, recall=0) and has no threshold, so it is dropped
    # https://stackoverflow.com/questions/31639016/in-scikits-precision-recall-curve-why-does-thresholds-have-a-different-dimensi
    curve_columns = {
        "precision": precision[:-1],
        "recall": recall[:-1],
        "thresholds": thresholds,
        "model": model,
        "method": method,
        "adjust_pred_probs": adjust_pred_probs,
    }
    df_temp = pd.DataFrame(curve_columns)

    df_list.append(df_temp)


In [None]:
# Display the precision column of the most recently built curve DataFrame.
df_temp["precision"]


In [None]:
%%time

# Plot precision-recall curves for a subset of label quality scoring methods and
# overlay one (recall, precision) marker per filter_by option on the same axes.
#
# NOTE(review): `labels`, `pred_probs`, `label_errors_target` and `model` are not
# assigned in this cell; presumably they hold whatever an earlier loop over the
# experiments left behind — confirm that this matches the dataset/model in the title.
# NOTE(review): `df_filter_by` is not defined in the nearby cells (only
# `df_filter_by_eval` is) — confirm it exists on a fresh Restart & Run All.

plt.rcParams["figure.figsize"] = (15, 10)

# (method, adjust_pred_probs) pairs to plot; the commented-out pairs are skipped
score_params = \
[
    # ("self_confidence", False),
    ("self_confidence", True),
    # ("normalized_margin", False),
    ("normalized_margin", True),
    ("confidence_weighted_entropy", False)
]

# one DataFrame per plotted curve (collected but not combined below)
df_list = []

for score_param in score_params:

    method, adjust_pred_probs = score_param
    
    print(f"Scoring label quality...")
    print(f"  method: {method}")
    print(f"  adjust_pred_probs: {adjust_pred_probs}")

    label_quality_scores = get_label_quality_scores(labels, pred_probs, method=method, adjust_pred_probs=adjust_pred_probs)

    # compute accuracy of detecting label errors
    # (auroc is computed here but not used further in this cell)
    auroc = roc_auc_score(label_errors_target, 1 - label_quality_scores)

    # compute precision-recall curve on the flipped scores (1 - quality score)
    precision, recall, thresholds = precision_recall_curve(label_errors_target, 1 - label_quality_scores)

    # save to DataFrame
    # ignore last precision, recall value because it's always 1, 0 respectively with no corresponding threshold
    # https://stackoverflow.com/questions/31639016/in-scikits-precision-recall-curve-why-does-thresholds-have-a-different-dimensi
    df_temp = pd.DataFrame({
        "precision": precision[:-1],
        "recall": recall[:-1],
        "thresholds": thresholds,
        "model": model,
        "method": method,
        "adjust_pred_probs": adjust_pred_probs
    })

    df_list.append(df_temp)
    
    # plot the full curve, labelled "<method>-<adjust_pred_probs>"
    plt.plot(recall, precision, label=f"{method}-{str(adjust_pred_probs)}")
    
# combine DataFrames
# df_all = pd.concat(df_list)

# plot single dot (precision, recall) for each filter_by option
for index, row in df_filter_by.iterrows():
    filter_by = row["filter_by"]
    precision = row["precision"]
    recall = row["recall"]
    plt.plot(recall, precision, marker="o", markersize=10, label=filter_by)

plt.xlabel("Recall", fontsize=14)
plt.ylabel("Precision", fontsize=14)
# NOTE(review): the dataset and model names in the title are hard-coded strings.
plt.title("Precision-Recall Curve: Label Error Detection on CIFAR-10N-Worst \n Model: swin_base_patch4_window7_224", fontsize=20, fontweight="bold")
# plt.suptitle("")
plt.legend()
plt.show()