# CL for multi-annotator data [cifar10h] [benchmarking]
- This notebook uses the results from the model_train_pred notebook to evaluate model performance. It also evaluates the overall health of the multi-annotator dataset.

In [None]:
from cleanlab.multiannotator import get_label_quality_multiannotator, get_multiannotator_stats

In [None]:
%load_ext autoreload
%autoreload 2

import cleanlab
from cleanlab.rank import get_label_quality_scores, get_label_quality_ensemble_scores
from cleanlab.internal.label_quality_utils import get_normalized_entropy
from cleanlab.filter import find_label_issues
import sys
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, precision_recall_curve, roc_curve, accuracy_score, log_loss
from matplotlib import pyplot as plt
from IPython.display import Image, display
import os

sys.path.insert(0, "../")
from utils.eval_metrics import lift_at_k
from utils.active_learning_scores import least_confidence
# experimental version of label quality ensemble scores with additional weighting schemes
from utils.label_quality_ensemble_scores_experimental import get_label_quality_ensemble_scores_experimental

# current working directory; get_label_issues() prepends it to the image paths it builds
path = os.getcwd()
# disable pandas truncation so DataFrames are displayed in full (all rows, columns and cell contents)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [None]:
!pwd

### Load/Analyze Cifar10h Data

In [None]:
# Load the CIFAR-10H annotator data.
# If either load below fails, make sure the cifar10h-raw data was downloaded and unzipped into ./data/cifar10h/

pred_probs_multiannotator = np.load('./data/cifar10h/cifar10h-probs.npy')
df = pd.read_csv('./data/cifar10h/cifar10h-raw.csv')
# rows whose test index is -99999 are attention-check trials; keep everything else
is_real_trial = df['cifar10_test_test_idx'] != -99999
df = df[is_real_trial]
df.head()

In [None]:
df.describe()

#### restructure dataset information
- num_datapoints (N), num_annotators (M)
- hlabels: (N,M)
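- hlabels_error_mask: (N,M) filled from the `correct_guess` column, so True = the annotator's label was correct (despite the name, it is summed to compute annotator accuracy)
- hannotator_mask: (N,M) where True = annotator x annotated that datapoint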
- htrue_labels: (K=10,000,) # indexed same way as cifar10
- htrue_images: (K=10,000,)

In [None]:
# Turn the long-format annotation table into dense matrices
# (rows = CIFAR-10 test image index, cols = annotator id)

num_datapoints = df['cifar10_test_test_idx'].max() + 1
num_annotators = df['annotator_id'].max() + 1

hlabels = np.full((num_datapoints, num_annotators), np.nan) # label chosen by each annotator, NaN = no annotation
hlabels_error_mask = np.zeros((num_datapoints, num_annotators), dtype=bool) # filled from the `correct_guess` column
hannotator_mask = np.zeros((num_datapoints, num_annotators), dtype=bool) # mask of what each person annotated

# hlabels contains NaN for missing annotations, so use nansum (a plain sum would always print nan)
print(hlabels.shape, np.nansum(hlabels), hlabels_error_mask.shape, hlabels_error_mask.sum(), hannotator_mask.shape, hannotator_mask.sum())

# one vectorized scatter instead of re-filtering the whole DataFrame once per annotator
row_idx = df['cifar10_test_test_idx'].to_numpy()
col_idx = df['annotator_id'].to_numpy()

hlabels[row_idx, col_idx] = df['chosen_label'].to_numpy()
hlabels_error_mask[row_idx, col_idx] = df['correct_guess'].to_numpy().astype(bool)
hannotator_mask[row_idx, col_idx] = True

print(hlabels.shape, np.nansum(hlabels), hlabels_error_mask.shape, hlabels_error_mask.sum(), hannotator_mask.shape, hannotator_mask.sum())

In [None]:
# Collect the true label and image file name of every datapoint, indexed by CIFAR-10 test index

htrue_labels = np.zeros((num_datapoints,))
htrue_images = np.empty((num_datapoints,), dtype=object)

# every (index, label, file name) triple is repeated once per annotation, so de-duplicate via a set
idx_to_label = list(set(zip(df['cifar10_test_test_idx'], df['true_label'], df['image_filename'])))

# split the triples into three parallel lists (lists, not tuples, so numpy treats them as fancy indices)
idx = [triple[0] for triple in idx_to_label]
true_label = [triple[1] for triple in idx_to_label]
htrue_image = [triple[2] for triple in idx_to_label]

htrue_labels[idx] = true_label
htrue_images[idx] = htrue_image

#### get accuracy of individual annotators

In [None]:
# Accuracy of each individual annotator
def plt_annotator_accuracy(labels_error_mask, annotator_mask):
    """Box-plot the per-annotator accuracy and return it as a one-column DataFrame.

    Args:
        labels_error_mask: (N, M) boolean matrix whose column sums are used as the
            numerator. In this notebook it is filled from `correct_guess`, so a
            column sum is the number of correct labels given by that annotator.
        annotator_mask: (N, M) boolean matrix, True where the annotator labeled the example.

    Returns:
        DataFrame with a single column 'score' holding one ratio per annotator.
    """
    flagged_per_annotator = labels_error_mask.sum(axis=0)
    labeled_per_annotator = annotator_mask.sum(axis=0)
    annotator_accuracy = flagged_per_annotator / labeled_per_annotator

    plt.boxplot(annotator_accuracy)
    plt.show()

    return pd.DataFrame(annotator_accuracy, columns=['score'])

df_describe = plt_annotator_accuracy(hlabels_error_mask, hannotator_mask)
df_describe.describe()

#### get accuracy of consensus labels

In [None]:
# Compute the consensus_labels
# TODO: conditional based on consensus_method, consensus_method can be a List[str], add dawid-skene
def get_consensus_labels(labels_multiannotator, pred_probs):
    """Return one consensus label per example: the majority vote, ties broken by pred_probs.

    Args:
        labels_multiannotator: DataFrame with one row per example and one column per
            annotator; NaN marks a missing annotation.
        pred_probs: per-example class scores, indexed as pred_probs[example][class].

    Returns:
        1-D numpy array of integer consensus labels.
    """
    # row-wise modes; rows with fewer tied modes are padded with NaN on the right
    modes_per_example = labels_multiannotator.mode(axis=1)
    consensus_labels = []
    for i in range(len(modes_per_example)):
        row_modes = modes_per_example.iloc[i]
        tied_classes = row_modes.dropna().astype(int).to_numpy()
        # among the tied classes pick the one with the highest predicted probability
        winner = pred_probs[i][tied_classes].argmax()
        consensus_labels.append(int(row_modes[winner]))
    return np.array(consensus_labels)

def get_consensus_accuracy_report(labels, true_labels, annotator_mask, pred_probs_multiannotator):
    """Report consensus-label accuracy grouped by the number of annotators per example.

    Args:
        labels: (N, M) annotator label matrix, NaN where an annotator gave no label.
        true_labels: length-N array of ground-truth labels.
        annotator_mask: (N, M) boolean matrix, True where an annotation exists.
        pred_probs_multiannotator: class scores passed to get_consensus_labels.

    Returns:
        DataFrame with columns 'num_a' (annotators per example), 'consense'
        (number of correct consensus labels), 'total_seen' (number of examples)
        and 'consensus_acc' (their ratio).
    """
    consensus_labels = get_consensus_labels(pd.DataFrame(labels), pred_probs_multiannotator)

    per_example = pd.DataFrame({
        'consense': (true_labels == consensus_labels) + 0,  # 1 if the consensus label is correct
        'num_a': annotator_mask.sum(axis=1),
        'total_seen': 1,  # every example counts once
    })

    report = per_example.groupby('num_a')[["consense", "total_seen"]].sum().reset_index()
    report['consensus_acc'] = report['consense'] / report['total_seen']
    return report

# per example, number of annotators that agree with the consensus label (% agreement = x with lower confidence bound = confidence interval for the true proportion of annotators that agreed (jonas share))
# plot accuracy of consensus label given number of annotators that agree
# if acc to num annotators and num annotator agreement % then we perform best

def plot_labels_multiannotator(labels, true_labels, pred_probs_multiannotator=None):
    """Box-plot annotator agreement with the consensus label, split by consensus correctness.

    Args:
        labels: (N, M) annotator label matrix, NaN where an annotator gave no label.
        true_labels: length-N array of ground-truth labels.
        pred_probs_multiannotator: optional class scores used by get_consensus_labels
            to break ties. When omitted, the empirical distribution of the
            annotators' votes is used.

    Returns:
        DataFrame with columns 'bin_consense' (1 = consensus label is correct) and
        'annotator_agreement' (summed number of agreeing annotators per group).
    """
    labels = np.asarray(labels, dtype=float)
    labels_multiannotator = pd.DataFrame(labels)

    if pred_probs_multiannotator is None:
        # Empirical class distribution of the votes. The previous fallback divided the raw
        # class ids by a NaN row sum, which is not a probability over classes.
        num_classes = int(np.nanmax(labels)) + 1
        class_votes = np.stack([(labels == k).sum(axis=1) for k in range(num_classes)], axis=1)
        votes_per_example = class_votes.sum(axis=1, keepdims=True)
        # guard against examples without any annotation
        pred_probs_multiannotator = class_votes / np.maximum(votes_per_example, 1)

    consensus_labels = get_consensus_labels(labels_multiannotator, pred_probs_multiannotator)
    # number of annotators whose label matches the consensus (NaN never matches)
    annotator_agreement = (labels == consensus_labels[:, np.newaxis]).sum(axis=1)
    bin_consensus = (true_labels == consensus_labels) + 0
    consensus_accuracy = pd.DataFrame(zip(annotator_agreement, bin_consensus), columns=['annotator_agreement', 'bin_consense'])
    _ = consensus_accuracy.boxplot(by=['bin_consense'], figsize=(7, 7))
    consensus_accuracy = consensus_accuracy.groupby('bin_consense')[['annotator_agreement']].sum().reset_index()

    return consensus_accuracy

In [None]:
plot_labels_multiannotator(hlabels, htrue_labels)

In [None]:
consensus_accuracy = get_consensus_accuracy_report(hlabels, 
                              htrue_labels, 
                              hannotator_mask, 
                              pred_probs_multiannotator)
consensus_accuracy

In [None]:
consensus_accuracy[['num_a','consensus_acc']].plot(kind='line',x='num_a',y='consensus_acc',color='pink')

In [None]:
# TODO: make a precision/recall plot of whether each example's consensus label is correct, using the model's predictions as the scores
# high level: with few annotators, the consensus label is unreliable

### Try random dropout of random number of examples
- randomly flip some 1 bits to 0 in each row of the annotator mask
- apply the annotator mask to labels (NaN where the mask is 0) and
- to labels_error_mask (False where the mask is 0)
- calculate new multiannotator pred_probs

In [None]:
# labels            - (N, M) annotator labels, NaN where an annotator gave no label
# labels_error_mask - (N, M) boolean per-annotation flags
# annotator_mask    - (N, M) boolean mask of what each person annotated

def get_sample_labels(x_sample, y_sample, labels, labels_error_mask, annotator_mask):
    """Return copies of the annotation matrices with the selected annotations removed.

    Args:
        x_sample: row (example) indices of the annotations to drop.
        y_sample: column (annotator) indices of the annotations to drop,
            paired element-wise with x_sample.
        labels: (N, M) float label matrix, NaN where no annotation exists.
        labels_error_mask: (N, M) boolean per-annotation flag matrix.
        annotator_mask: (N, M) boolean matrix, True where an annotation exists.

    Returns:
        Tuple (s_labels, s_labels_error_mask, s_annotator_mask). Every position
        absent from the reduced annotator mask is NaN in s_labels and False in
        s_labels_error_mask. The inputs are not modified.
    """
    s_annotator_mask = annotator_mask.copy()
    s_annotator_mask[(x_sample, y_sample)] = False
    s_labels_error_mask = s_annotator_mask & labels_error_mask
    s_labels = labels.copy()
    # blank out everything that is NOT annotated any more; the previous code used the
    # un-negated mask and therefore erased the kept labels instead of the dropped ones
    s_labels[~s_annotator_mask] = np.nan
    print('Total idxs dropped: ', annotator_mask.sum() - s_annotator_mask.sum())
    return s_labels, s_labels_error_mask, s_annotator_mask

**randomly drop x percent of all labels**

In [None]:
# Drop a fixed fraction of all annotations, chosen uniformly at random without replacement
percent_dropped = 0.4

# coordinates (example, annotator) of every existing annotation
x, y = np.nonzero(hannotator_mask == 1)
num_to_drop = int(len(x) * percent_dropped)
drop_idx = np.random.choice(np.arange(len(x)), num_to_drop, replace=False)
x_sample, y_sample = x[drop_idx], y[drop_idx]
s_labels, s_labels_error_mask, s_annotator_mask = get_sample_labels(
    x_sample, y_sample, hlabels, hlabels_error_mask, hannotator_mask
)

In [None]:
df_describe = plt_annotator_accuracy(s_labels_error_mask, s_annotator_mask)
df_describe.describe()

In [None]:
# Consensus accuracy on the sub-sampled annotations.
# NOTE(review): pred_probs_multiannotator is still the array loaded from cifar10h-probs.npy,
# it is not recomputed from the sub-sampled labels -- confirm this is intended.
consensus_accuracy = get_consensus_accuracy_report(s_labels, 
                              htrue_labels, 
                              s_annotator_mask, 
                              pred_probs_multiannotator)
consensus_accuracy

In [None]:
consensus_accuracy[['num_a','consensus_acc']].plot(kind='line',x='num_a',y='consensus_acc',color='pink')
# x axis can be lower bound on consensus labels (i.e. how many annotators agreed)

In [None]:
plot_labels_multiannotator(s_labels, htrue_labels)

**Drop x rows per every annotator**

In [None]:
# Drop a fixed number of annotations from every annotator.
# The constant 200 is the assumed number of annotations per annotator.
percent_dropped = 0.4
rows_dropped = int(200 * percent_dropped)
print('rows dropped: ', rows_dropped)

# coordinates (x = example, y = annotator) of every existing annotation
x,y = np.where(hannotator_mask == 1)
df_delete = pd.DataFrame(zip(x,y),columns=['x','y'])
# Sample the annotations to retain (200 - rows_dropped per annotator) and remove them from the table.
# Despite its name, df_keep therefore holds the annotations that get DROPPED below.
# groupby(...).sample raises if an annotator has fewer than 200 - rows_dropped annotations.
df_keep = df_delete.drop(df_delete.groupby('y').sample(n=200 - rows_dropped).index)
x_sample,y_sample = df_keep['x'].values, df_keep['y'].values
s_labels, s_labels_error_mask, s_annotator_mask = get_sample_labels(x_sample, y_sample, hlabels, hlabels_error_mask, hannotator_mask)

In [None]:
df_describe = plt_annotator_accuracy(s_labels_error_mask, s_annotator_mask)
df_describe.describe()

In [None]:
consensus_accuracy = get_consensus_accuracy_report(s_labels, 
                              htrue_labels, 
                              s_annotator_mask, 
                              pred_probs_multiannotator)
consensus_accuracy

In [None]:
consensus_accuracy[['num_a','consensus_acc']].plot(kind='line',x='num_a',y='consensus_acc',color='pink')

In [None]:
plot_labels_multiannotator(s_labels, htrue_labels)

**Drop x values for every row**

In [None]:
# Drop a fixed number of annotations from every example (row).
# Every row is assumed to have at least 50 annotations.
percent_dropped = 0.4
vals_dropped = int(50 * percent_dropped)
print('vals dropped: ', vals_dropped)

# coordinates (x = example, y = annotator) of every existing annotation
x, y = np.where(hannotator_mask == 1)
df_delete = pd.DataFrame(zip(x, y), columns=['x', 'y'])
# Sample exactly vals_dropped annotations per example to remove. The previous version kept
# 50 - vals_dropped annotations per example, so rows with more than 50 annotations lost
# more than vals_dropped of them.
df_drop = df_delete.groupby('x').sample(n=vals_dropped)
x_sample, y_sample = df_drop['x'].values, df_drop['y'].values
s_labels, s_labels_error_mask, s_annotator_mask = get_sample_labels(x_sample, y_sample, hlabels, hlabels_error_mask, hannotator_mask)

In [None]:
consensus_accuracy = get_consensus_accuracy_report(s_labels, 
                              htrue_labels, 
                              s_annotator_mask, 
                              pred_probs_multiannotator)
consensus_accuracy

In [None]:
plot_labels_multiannotator(s_labels, htrue_labels)

### Load models and data

In [None]:
# dictionaries to map to display name
# keys follow the pattern "<method>-<adjust_pred_probs>"
method_adjust_pred_probs_display_dict = {
    "self_confidence-False": "Self Confidence",
    "self_confidence-True": "Adjusted Self Confidence",
    "normalized_margin-False": "Normalized Margin",
    "normalized_margin-True": "Adjusted Normalized Margin",
    "confidence_weighted_entropy-False": "Confidence Weighted Entropy",
    "entropy-False": "Entropy",
    "least_confidence-False": "Least Confidence",
}

# model identifier -> human readable model name
model_display_name_dict = {"resnet18": "ResNet-18",}

models = ["resnet18"] # can also be: "resnet50d", "efficientnet_b1", "twins_pcpvt_base", "swin_base_patch4_window7_224"

# args to pass to get_label_quality_scores(): (method, adjust_pred_probs) pairs
score_params = \
[
    ("self_confidence", False),
    ("self_confidence", True),
    ("normalized_margin", False),
    ("normalized_margin", True),
    ("confidence_weighted_entropy", False)
]

In [None]:
# read numpy files from model_train_pred
numpy_out_folder = './data/model_data_070622/'
pred_probs = np.load(numpy_out_folder + "test_pred_probs.npy")
pred_labels = np.load(numpy_out_folder + "test_preds.npy")
true_labels = np.load(numpy_out_folder + "test_labels.npy")
# NOTE: allow_pickle=True unpickles objects stored in the file -- only load files you trust
images = np.load(numpy_out_folder + "test_images.npy", allow_pickle=True)
# parse an integer from the 4 characters that precede the last 4 characters of each file name
# (presumably the image's CIFAR-10 test index in front of a 4-character extension -- TODO confirm)
idxs = [int(image.split('/')[-1][-8:-4]) for image in images]

# boolean mask: True where the label under evaluation differs from the true label
labels = pred_labels # labels can change to annotator labels!!
label_errors_target = labels != true_labels

# set all cifar10h annotator data to the correct indexing
# NOTE: the arrays are overwritten with their reordered versions, so running this cell a second
# time reorders them again; the assert below compares the result against true_labels
htrue_labels = htrue_labels[idxs]
htrue_images = htrue_images[idxs]
hlabels = hlabels[idxs]
hlabels_error_mask = hlabels_error_mask[idxs]
hannotator_mask = hannotator_mask[idxs]

assert np.array_equal(htrue_labels, true_labels) # check cifar10h sort matches what our model predicted on

In [None]:
%%time
plt.rcParams["figure.figsize"] = (15, 10)

# One pass per (method, adjust_pred_probs) pair: score the labels, evaluate how well the scores
# detect label errors (AUROC, Lift@K) and draw the precision-recall and ROC curves.
results_list = []
precision_recall_curves = []  # TODO: store precision_recall_curve_results here (and the ROC curves in a separate list)
for method, adjust_pred_probs in score_params:
    # compute scores
    label_quality_scores = get_label_quality_scores(labels=labels, pred_probs=pred_probs, method=method, adjust_pred_probs=adjust_pred_probs)
    # a low quality score means "likely label error", so the detection score is its complement
    error_scores = 1 - label_quality_scores
    # compute accuracy of detecting label errors
    auroc = roc_auc_score(label_errors_target, error_scores)
    # compute Lift@K evaluation metric
    lift_at_k_dict = {}
    for k in range(1000, 11000, 1000):
        lift_at_k_dict[f"lift_at_{k}"] = lift_at_k(label_errors_target, error_scores, k=k)
    # save results
    results = {
        "dataset": "cifar10",
        "model": "resnet18",
        "noise_config": "Noise Amount: 0.2 | Sparsity: 0.4",
        "method": method,
        "adjust_pred_probs": adjust_pred_probs,
        "auroc": auroc
    }
    # add the lift at k metrics
    results.update(lift_at_k_dict)
    results_list.append(results)

    # compute precision-recall curve using label quality scores
    precision, recall, pr_thresholds = precision_recall_curve(label_errors_target, error_scores)

    # compute ROC curve using label quality scores
    # (kept in its own variable: it used to overwrite the precision-recall thresholds stored below)
    fpr, tpr, roc_thresholds = roc_curve(label_errors_target, error_scores)

    precision_recall_curve_results = {
        "method": method,
        "adjust_pred_probs": adjust_pred_probs,
        "dataset": "cifar10",
        "model": "resnet18",
        "label_quality_scores": label_quality_scores,
        "precision": precision,
        "recall": recall,
        "thresholds": pr_thresholds
    }

    curve_label = f"{method}-{str(adjust_pred_probs)}"

    # plot prc
    plt.subplot(1, 2, 1)
    plt.plot(recall, precision, label=curve_label)
    plt.xlabel("Recall", fontsize=14)
    plt.ylabel("Precision", fontsize=14)
    plt.title("Precision-Recall Curve: Label Error Detection on CIFAR-10h \n Model: resnet-18", fontsize=14, fontweight="bold")
    plt.legend()

    # plot roc
    plt.subplot(1, 2, 2)
    plt.plot(fpr, tpr, label=curve_label)
    plt.xlabel("False Positive Rate", fontsize=14)
    plt.ylabel("True Positive Rate", fontsize=14)
    plt.title("AU ROC Curve: Label Error Detection on CIFAR-10h \n Model: resnet-18", fontsize=14, fontweight="bold")
    plt.legend()

plt.show()

# use better legend (i.e. self_confidence-True .. what is true?)
# but overall no interest in showing different scores of single labels, show multiannotator

In [None]:
# Create dataframe and export to csv

df_result = pd.DataFrame(results_list)
# DataFrame.to_csv does not create missing directories, so make sure the output folder exists
results_dir = Path("./data/benchmark_results")
results_dir.mkdir(parents=True, exist_ok=True)
df_result.to_csv(results_dir / "label_quality_scores_evaluation.csv", index=False)
df_result

#### Notes
- ROC curves summarize the trade-off between the true positive rate and the false positive rate of a predictive model across different probability thresholds.
- Precision-recall curves summarize the trade-off between the true positive rate (recall) and the positive predictive value (precision) of a predictive model across different probability thresholds.
- Precision-recall curves are better suited to imbalanced datasets (they are more sensitive to the positive class); ROC curves are better suited to balanced datasets.

#### Analyze Consensus label vs model performance

In [None]:
# Compare the annotators' consensus labels and the model's predictions against the true labels
hlabels_df = pd.DataFrame(hlabels)
consensus_labels = get_consensus_labels(hlabels_df, pred_probs)

num_examples = len(true_labels)  # use the actual dataset size instead of a hard-coded 10000
print('Probability annotators alone correctly predict labels: ', (true_labels == consensus_labels).sum() / num_examples)
model_pred_labels = np.argmax(pred_probs, axis=1) # class the model considers most likely
print('Probability model alone correctly predicts labels: ', (model_pred_labels == true_labels).sum() / num_examples) # fraction of examples where the model matches the true label
print('Similar prediction between model preds and consensus_labels: ', (model_pred_labels == consensus_labels).sum()) # count (not a fraction) of matching examples

## Analyze label issues

In [None]:
def get_label_issues(labels, pred_probs):
    """Run find_label_issues and tabulate the flagged examples.

    Reads the notebook globals `images`, `true_labels` and `path`.

    Args:
        labels: one given label per example.
        pred_probs: model predicted probabilities passed to find_label_issues.

    Returns:
        DataFrame with one row per flagged example, in the order returned by
        find_label_issues (ranked by 'self_confidence'), and columns 'label'
        (given label), 'true_label', 'is_issue' (1 if the given label differs
        from the true label) and 'image_png' (image path).
    """
    flagged_idx = find_label_issues(
        labels=labels,
        pred_probs=pred_probs,
        return_indices_ranked_by='self_confidence',
    )

    flagged_labels = labels[flagged_idx]
    flagged_true_labels = true_labels[flagged_idx]
    flagged_is_issue = (flagged_labels != flagged_true_labels) + 0
    # rebuild each path from the working directory plus the last five path components
    flagged_image_paths = [
        path + '/' + '/'.join(image.split('/')[-5:])
        for image in images[flagged_idx]
    ]

    print('Number of label issues detected: ', len(flagged_idx))
    print('Number of true label issues: ', true_labels.shape[0] - np.sum(labels == true_labels))
    print('Number of true label issues detected: ', np.sum(flagged_is_issue))

    return pd.DataFrame(
        zip(flagged_labels, flagged_true_labels, flagged_is_issue, flagged_image_paths),
        columns=['label', 'true_label', 'is_issue', 'image_png'],
    )

def visualize_label_issues(issues_df, classes, scale=100):
    """Print and display every example of an issues table.

    Args:
        issues_df: DataFrame with columns 'is_issue', 'label', 'true_label' and
            'image_png' (as produced by get_label_issues).
        classes: mapping from class index to class name.
        scale: display width passed to IPython's Image.
    """
    print('Visualizing', len(issues_df), 'issues\n')

    for _, row in issues_df.iterrows():
        print('Correctly identified: ', bool(row['is_issue']), '\nGiven label: ', classes[row['label']], '\nTrue label: ', classes[row['true_label']],)
        # build the image once (it used to be constructed twice, the first copy unused)
        display(Image(filename=row['image_png'], width=scale))

# CIFAR-10 class index -> class name
classes = dict(enumerate([
    "airplane",
    "automobile",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
]))

#### Run find_label_issues on the consensus labels and visualize label issues.

In [None]:
# Build the consensus labels from the annotators and look for label issues in them
hlabels_df = pd.DataFrame(hlabels)
consensus_labels = get_consensus_labels(hlabels_df, pred_probs)
# pass the per-example consensus labels, not the (N, M) annotator matrix `hlabels`
issues_df = get_label_issues(consensus_labels, pred_probs)
issues_df.head()

In [None]:
true_issues_df = issues_df[issues_df['is_issue'] == 1] # keep only flagged examples whose given label really differs from the true label
# visualize_label_issues(issues_df, classes)  # alternative: show every flagged example
visualize_label_issues(true_issues_df, classes)

#### Run find_label_issues on an individual annotator's labels and visualize label issues.

In [None]:
# Per-annotator accuracy: flagged annotations (filled from `correct_guess`) over annotations given
num_flagged = hlabels_error_mask.sum(axis=0)
num_annotated = hannotator_mask.sum(axis=0)
annotator_accuracy = num_flagged / num_annotated

# annotators with the lowest and the highest accuracy
worst_annotator = annotator_accuracy.argmin()
best_annotator = annotator_accuracy.argmax()
print('worst annotator: ', worst_annotator, 'accuracy: ', annotator_accuracy[worst_annotator])
print('best annotator: ', best_annotator, 'accuracy: ', annotator_accuracy[best_annotator])

In [None]:
# Look for label issues in the labels of one annotator.
annotator_id = 1 # worst = 2561, best = 1957

# column of the chosen annotator in each (N, M) matrix
a_hlabels = hlabels[:,annotator_id]
a_hannotator_mask = hannotator_mask[:,annotator_id]
a_hlabels_error_mask = hlabels_error_mask[:,annotator_id]

# start from the true labels and overwrite only the examples this annotator labeled,
# so every example has a label
a_labels = true_labels.copy()
a_labels[a_hannotator_mask] = a_hlabels[a_hannotator_mask]

# accuracy restricted to the examples the annotator labeled, computed two ways
print('Annotator accuracy: ', np.sum(a_labels[a_hannotator_mask] == true_labels[a_hannotator_mask]) / a_hannotator_mask.sum())
print('Annotator accuracy: ', annotator_accuracy[annotator_id])
# the next two figures also count the examples that were filled in with the true label
print('Num correctly labeled points for annotator ', annotator_id, ': ', np.sum(a_labels == true_labels))
print('Annotator + True label accuracy: ', np.sum(a_labels == true_labels) / len(a_labels))
issues_df = get_label_issues(a_labels, pred_probs)
issues_df.head()

## Using multiannotator library to analyze data

In [None]:
# score the multi-annotator labels with cleanlab; annotator statistics are not requested here
hlabels_df = pd.DataFrame(hlabels)
scores = get_label_quality_multiannotator(hlabels_df, pred_probs, return_annotator_stats=False)

In [None]:
# `stats` does not exist yet at this point (it is created in a later cell), so preview the
# result computed in the previous cell instead
scores.head()

In [None]:
# annotator statistics computed against the consensus labels, using the 'agreement' option
stats = get_multiannotator_stats(hlabels_df, pred_probs, consensus_labels, 'agreement')

In [None]:
label_quality_scores = get_label_quality_scores(consensus_labels, pred_probs)

In [None]:
pd.DataFrame(label_quality_scores, columns=['']).plot.hist(bins=30,)

In [None]:
stats

In [None]:
stats['worst_class'].plot.hist(bins=10,)

In [None]:
print(stats['overall_quality'].min(), stats['overall_quality'].max())