# Report Side Benchmarks
- Annotator vs Model Accuracy
- Annotator quality scores
- Annotator Agreement with Consensus vs. Label Accuracy

TODO: Add visualization code?

In [None]:
import numpy as np
from IPython.display import display # pretty display
from utils.eval import (get_annotator_mask, 
                        get_labels_error_mask,
                        get_model_vs_consensus_accuracy,
                        plt_annotator_accuracy, 
                        plt_labels_multiannotator)
from cleanlab.multiannotator import get_majority_vote_label

In [None]:
!wget -nc "https://cleanlab-public.s3.amazonaws.com/Multiannotator/cifar-10h/benchmark_data.tar.gz" 
!tar -xf benchmark_data.tar.gz benchmark_data/
!mkdir benchmark_results

In [None]:
def load_data(dataset, model):
    """Load the saved benchmark arrays for one dataset/model pair.

    Reads three ``.npy`` files from ``benchmark_data/{dataset}/{model}/``
    (paths are relative to the current working directory).

    Args:
        dataset: Name of the dataset subdirectory.
        model: Name of the model subdirectory inside the dataset directory.

    Returns:
        A 3-tuple ``(multiannotator_labels, true_labels, pred_probs)`` holding
        the contents of ``annotator_labels.npy``, ``true_labels.npy`` and
        ``pred_probs.npy``, in that order.
    """
    # Files are read in the same order as they are returned.
    return (
        np.load(f"benchmark_data/{dataset}/{model}/annotator_labels.npy"),
        np.load(f"benchmark_data/{dataset}/{model}/true_labels.npy"),
        np.load(f"benchmark_data/{dataset}/{model}/pred_probs.npy"),
    )

In [None]:
# Model subdirectory names; each one is passed to load_data as the `model`
# path component for every dataset.
model_names = [
    "resnet18",
    "swin",
    "truelabels",
]

# Dataset subdirectory names under benchmark_data/; the benchmark loop runs
# once per entry.
datasets = [
    "worst_annotators",
    "uniform_1_5",
    "complete",
]

In [None]:
# Benchmark report: for every dataset, print per-model accuracy tables
# (section [1]) and then the model-agnostic annotator reports (sections [2]
# and [3]).
for dataset in datasets:
    print('-'*80)
    print(f'- DATASET {dataset}')

    # Placeholders that are overwritten inside the model loop below. The values
    # left over from the LAST model iteration are reused by the model-agnostic
    # sections after the loop.
    # NOTE(review): if model_names were empty these would stay None (and
    # consensus_labels would be undefined or stale from a previous dataset).
    multiannotator_labels = None # Same for all models of specific dataset
    true_labels = None # Same for all models of specific dataset

    for model_name in model_names:
        print(f'- MODEL {model_name}')
        multiannotator_labels, true_labels, pred_probs = load_data(dataset, model_name)
        # Consensus is computed without model predictions (pred_probs=None).
        consensus_labels = get_majority_vote_label(multiannotator_labels, pred_probs=None)

        print('--- [1] Annotator vs Model accuracy ---')
        # Report how much the consensus labels and model predictions differ from true labels
        df = get_model_vs_consensus_accuracy(pred_probs, consensus_labels, true_labels)
        display(df)

    # Everything below uses only the annotator labels, true labels and the
    # consensus labels carried over from the last model iteration.
    print('- MODEL AGNOSTIC\n')
    # Helper semantics live in utils.eval — presumably a mask of which
    # annotator labeled which example, and a mask of labels that disagree with
    # true_labels; TODO confirm.
    annotator_mask = get_annotator_mask(multiannotator_labels)
    labels_error_mask = get_labels_error_mask(multiannotator_labels, true_labels)

    print('--- [2] Distribution of individual annotator accuracy vs ground truth ---')
    # fig_title is a per-dataset .pdf name — presumably the helper saves the
    # figure under it; confirm in utils.eval.
    plt_title = f"{dataset}_annotator_accuracy_vs_gt_plot.pdf"
    annotator_accuracy_df = plt_annotator_accuracy(labels_error_mask, annotator_mask, plot=True, fig_title=plt_title)
    # Show summary statistics of the returned per-annotator accuracy frame.
    display(annotator_accuracy_df.describe())

    print('--- [3] Annotator Agreement with Consensus vs. Label Accuracy ---')
    # Plots the distribution of annotator agreement for correct/incorrect labels
    plt_title = f"{dataset}_annotator_agreement_with_consensus_plot.pdf"
    plt_labels_multiannotator(multiannotator_labels, consensus_labels, true_labels, fig_title=plt_title)