In [3]:
import cleanlab
import sys
import numpy as np
import pandas as pd
import datetime
import copy
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

sys.path.insert(0, "../")
from eval_metrics import lift_at_k

from cleanlab.rank import order_label_issues, get_label_quality_scores, get_label_quality_ensemble_scores
from cleanlab.filter import find_label_issues

# Lift pandas' display limits (rows, columns, cell width) so that the result
# tables shown further down are rendered in full instead of being elided.
for display_option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(display_option, None)

## Evaluate scores

In [4]:
%%time

# Model architectures whose saved cross-validation outputs are evaluated.
models = [
    "resnet18",
    "resnet50d",
    "efficientnet_b1",
    "twins_pcpvt_base",
    "swin_base_patch4_window7_224",
]

# (method, adjust_pred_probs) args to pass to get_label_quality_scores()
score_params = [
    ("self_confidence", False),
    ("self_confidence", True),
    ("normalized_margin", False),
    ("normalized_margin", True),
    ("confidence_weighted_entropy", False),
]

# cutoffs K at which Lift@K is reported: 1000, 2000, ..., 20000
lift_k_values = range(1000, 21000, 1000)

results_list = []  # one dict of metrics per (model, method, adjust_pred_probs)
pred_probs_list = []  # use for ensemble scoring
labels_list = []  # use for sanity check (labels from each model should be the same because they were generated from the same cross-val procedure)

for model in models:

    # read numpy files
    # images.npy is deliberately not loaded here: nothing in this loop reads it,
    # and loading it required allow_pickle=True on every iteration.
    numpy_out_folder = f"./cifar-10n-png_noise_type_aggre_cv_{model}/"
    pred_probs = np.load(numpy_out_folder + "pred_probs.npy")
    labels = np.load(numpy_out_folder + "noisy_labels.npy")
    true_labels = np.load(numpy_out_folder + "true_labels.npy")

    # boolean mask of label errors (True where the noisy label differs from the true label)
    label_errors_target = labels != true_labels

    # save to lists for ensemble scoring and for the label sanity check
    pred_probs_list.append(pred_probs)
    labels_list.append(labels)

    for method, adjust_pred_probs in score_params:

        # compute scores
        label_quality_scores = get_label_quality_scores(
            labels=labels,
            pred_probs=pred_probs,
            method=method,
            adjust_pred_probs=adjust_pred_probs,
        )

        # compute accuracy of detecting label errors; the scores are flipped
        # (1 - score) because the positive class passed to roc_auc_score is
        # "label error"
        auroc = roc_auc_score(label_errors_target, 1 - label_quality_scores)

        # compute Lift@K evaluation metric at every cutoff
        lift_at_k_dict = {
            f"lift_at_{k}": lift_at_k(label_errors_target, 1 - label_quality_scores, k=k)
            for k in lift_k_values
        }

        # save results: run description first, then the lift at k metrics
        results_list.append(
            {
                "dataset": "cifar-10n",
                "model": model,
                "noise_type": "aggre_label",
                "method": method,
                "adjust_pred_probs": adjust_pred_probs,
                "auroc": auroc,
                **lift_at_k_dict,
            }
        )

CPU times: user 29 s, sys: 52 ms, total: 29.1 s
Wall time: 29.1 s


In [5]:
# Sanity check: the noisy labels stored for every model should be the same,
# because they were generated from the same cross-val procedure.
# np.array_equal is used instead of `(a == b).all()` so that arrays of different
# shape simply fail the assertion rather than being broadcast or raising inside
# the element-wise comparison.
for i, labels_temp in enumerate(labels_list):

    if i == 0:
        # the first model's labels serve as the reference for all later ones
        labels_temp_previous = copy.deepcopy(labels_temp)
    else:
        assert np.array_equal(labels_temp_previous, labels_temp), (
            f"noisy labels at index {i} of labels_list differ from those at index 0"
        )

## Evaluate ensemble scores

In [6]:
%%time

# Repeat the evaluation for every (method, adjust_pred_probs) pair, this time
# scoring with the predicted probabilities of all models combined.
# NOTE(review): `labels` and `label_errors_target` are not assigned in this
# cell; they hold whatever an earlier cell bound last - confirm this is intended.
for method, adjust_pred_probs in score_params:

    print("Scoring label quality...")
    print(f"  method: {method}")
    print(f"  adjust_pred_probs: {adjust_pred_probs}")

    label_quality_ensemble_scores = get_label_quality_ensemble_scores(
        labels,
        pred_probs_list,
        method=method,
        adjust_pred_probs=adjust_pred_probs,
    )

    # accuracy of detecting label errors; scores are flipped (1 - score) because
    # the positive class handed to roc_auc_score is "label error"
    auroc = roc_auc_score(label_errors_target, 1 - label_quality_ensemble_scores)

    # Lift@K evaluation metric for K = 1000, 2000, ..., 20000
    lift_at_k_dict = {
        f"lift_at_{k}": lift_at_k(label_errors_target, 1 - label_quality_ensemble_scores, k=k)
        for k in range(1000, 21000, 1000)
    }

    # one result row: run description first, then the lift at k metrics
    results = {
        "dataset": "cifar-10n",
        "model": "ensemble (all)",
        "noise_type": "aggre_label",
        "method": method,
        "adjust_pred_probs": adjust_pred_probs,
        "auroc": auroc,
        **lift_at_k_dict,
    }

    results_list.append(results)

Scoring label quality...
  method: self_confidence
  adjust_pred_probs: False
Weighting scheme for ensemble: accuracy
Ensemble members will be weighted by: their relative accuracy
  Model 0 accuracy : 0.87448
  Model 0 weights  : 0.1974788967124488
  Model 1 accuracy : 0.87878
  Model 1 weights  : 0.1984499415114877
  Model 2 accuracy : 0.87154
  Model 2 weights  : 0.19681497305915244
  Model 3 accuracy : 0.89714
  Model 3 weights  : 0.2025960769790119
  Model 4 accuracy : 0.90628
  Model 4 weights  : 0.20466011173789922
Scoring label quality...
  method: self_confidence
  adjust_pred_probs: True
Weighting scheme for ensemble: accuracy
Ensemble members will be weighted by: their relative accuracy
  Model 0 accuracy : 0.87448
  Model 0 weights  : 0.1974788967124488
  Model 1 accuracy : 0.87878
  Model 1 weights  : 0.1984499415114877
  Model 2 accuracy : 0.87154
  Model 2 weights  : 0.19681497305915244
  Model 3 accuracy : 0.89714
  Model 3 weights  : 0.2025960769790119
  Model 4 accurac

## Create DataFrame with results

In [7]:
# one row per entry of results_list; the dict keys become the columns
df = pd.DataFrame(data=results_list)

In [8]:
# display the full results table built from results_list
df

Unnamed: 0,dataset,model,noise_type,method,adjust_pred_probs,auroc,lift_at_1000,lift_at_2000,lift_at_3000,lift_at_4000,lift_at_5000,lift_at_6000,lift_at_7000,lift_at_8000,lift_at_9000,lift_at_10000,lift_at_11000,lift_at_12000,lift_at_13000,lift_at_14000,lift_at_15000,lift_at_16000,lift_at_17000,lift_at_18000,lift_at_19000,lift_at_20000
0,cifar-10n,resnet18,aggre_label,self_confidence,False,0.9773,11.009989,10.721421,10.210877,9.311876,8.279689,7.349242,6.510227,5.829634,5.263288,4.782464,4.393099,4.04458,3.754802,3.498494,3.278579,3.079218,2.902004,2.748181,2.607045,2.477802
1,cifar-10n,resnet18,aggre_label,self_confidence,True,0.975124,10.54384,10.432852,10.136885,9.242508,8.253052,7.306696,6.495957,5.812986,5.249723,4.773585,4.376955,4.036256,3.74968,3.490566,3.27266,3.075749,2.900046,2.740782,2.601203,2.473363
2,cifar-10n,resnet18,aggre_label,normalized_margin,False,0.977165,10.9101,10.449501,9.877913,9.175916,8.27525,7.336293,6.553036,5.860155,5.264521,4.805771,4.396126,4.055679,3.76334,3.504836,3.279319,3.081992,2.908533,2.749414,2.608213,2.478912
3,cifar-10n,resnet18,aggre_label,normalized_margin,True,0.976235,10.876804,10.460599,9.881613,9.167592,8.266371,7.336293,6.551451,5.854606,5.27192,4.786903,4.390072,4.051054,3.757364,3.504836,3.278579,3.079218,2.902657,2.746331,2.605877,2.477248
4,cifar-10n,resnet18,aggre_label,confidence_weighted_entropy,False,0.973501,10.799112,10.510544,9.877913,8.959489,8.046615,7.149464,6.418265,5.765816,5.212727,4.759156,4.37191,4.034406,3.747119,3.492152,3.27044,3.075749,2.899393,2.743865,2.60354,2.475583
5,cifar-10n,resnet50d,aggre_label,self_confidence,False,0.979977,10.99889,10.804661,10.425453,9.564373,8.468368,7.462079,6.613287,5.907325,5.305216,4.812431,4.400161,4.049205,3.758217,3.500872,3.279319,3.081299,2.904616,2.747564,2.606461,2.477802
6,cifar-10n,resnet50d,aggre_label,self_confidence,True,0.97862,10.63263,10.660377,10.27007,9.547725,8.506104,7.486127,6.616458,5.903163,5.315082,4.81909,4.396126,4.050129,3.751387,3.498494,3.27414,3.075055,2.89874,2.740165,2.600619,2.472808
7,cifar-10n,resnet50d,aggre_label,normalized_margin,False,0.979807,10.799112,10.499445,10.070292,9.295228,8.459489,7.476878,6.638655,5.908713,5.318782,4.830189,4.423368,4.068627,3.767609,3.513556,3.288938,3.089623,2.911144,2.752497,2.609381,2.480577
8,cifar-10n,resnet50d,aggre_label,normalized_margin,True,0.979087,10.732519,10.504994,10.08879,9.36182,8.481687,7.506474,6.660853,5.923973,5.32988,4.835738,4.422359,4.071402,3.766755,3.506421,3.279319,3.079911,2.903963,2.743865,2.601787,2.472808
9,cifar-10n,resnet50d,aggre_label,confidence_weighted_entropy,False,0.977013,10.887902,10.665927,10.170181,9.395117,8.321865,7.378838,6.576819,5.867092,5.279319,4.798002,4.380991,4.037181,3.739435,3.481846,3.260821,3.070893,2.896781,2.742015,2.601787,2.473363


## Export results to csv

In [9]:
# Export the results table to a CSV file. The file name carries the current
# timestamp (YYYYmmddHHMMSS), so every run writes to a new file name.
ts = format(datetime.datetime.now(), "%Y%m%d%H%M%S")
csv_path = f"label_quality_scores_evaluation_{ts}.csv"
df.to_csv(csv_path, index=False)

## Misc

In [11]:
# Reload the saved arrays of a single model for the ad-hoc checks that follow.
model = "swin_base_patch4_window7_224"

# folder holding this model's numpy files
numpy_out_folder = f"./cifar-10n-png_noise_type_aggre_cv_{model}/"

# the three plain arrays are read in the same order as they are unpacked
pred_probs, labels, true_labels = (
    np.load(numpy_out_folder + file_name)
    for file_name in ("pred_probs.npy", "noisy_labels.npy", "true_labels.npy")
)

# NOTE(review): allow_pickle=True lets numpy unpickle object arrays, which
# should only be done with files from a trusted source.
images = np.load(numpy_out_folder + "images.npy", allow_pickle=True)

# boolean mask of label errors (True where the noisy label differs from the true label)
label_errors_target = np.not_equal(labels, true_labels)

In [12]:
# fraction of examples whose noisy label differs from the true label
np.mean(label_errors_target)

0.0901

In [21]:
# number of examples whose noisy label differs from the true label
np.sum(label_errors_target)

4505

In [13]:
# flag label issues with the "predicted_neq_given" filter, calling the directly
# imported find_label_issues (the same function as cleanlab.filter.find_label_issues)
label_issues = find_label_issues(
    labels=labels,
    pred_probs=pred_probs,
    filter_by="predicted_neq_given",
)

In [14]:
# number of examples flagged as label issues
np.sum(label_issues)

4686

In [15]:
# number of datapoints where the pred_probs argmax does not equal the noisy label
np.sum(np.argmax(pred_probs, axis=1) != labels)

4686

In [16]:
# fraction of datapoints where the pred_probs argmax equals the true label
np.mean(np.argmax(pred_probs, axis=1) == true_labels)

0.98368

## Evaluate different filter_by options

In [17]:
# filter_by options of find_label_issues() to compare against each other
filter_by_list = [
    "prune_by_noise_rate",
    "prune_by_class",
    "both",
    "confident_learning",
    "predicted_neq_given",
]

# one dict of metrics per filter_by option
results = []

for filter_by in filter_by_list:

    label_issues = find_label_issues(
        labels=labels,
        pred_probs=pred_probs,
        filter_by=filter_by,
    )

    # Calculate evaluation metrics, treating label_errors_target as the ground
    # truth and the flagged label issues as the predictions
    f1 = f1_score(label_errors_target, label_issues)
    precision = precision_score(label_errors_target, label_issues)
    recall = recall_score(label_errors_target, label_issues)

    result = {
        "filter_by": filter_by,
        # np.sum counts the flagged entries in one vectorized pass (the builtin
        # sum() iterates element by element); int() keeps a plain Python int in
        # the printed dict instead of a numpy scalar
        "num_label_issues": int(np.sum(label_issues)),
        "f1_score": f1,
        "precision": precision,
        "recall": recall,
    }

    print(result)

    results.append(result)

{'filter_by': 'prune_by_noise_rate', 'num_label_issues': 2865, 'f1_score': 0.762550881953867, 'precision': 0.9808027923211169, 'recall': 0.6237513873473918}
{'filter_by': 'prune_by_class', 'num_label_issues': 2866, 'f1_score': 0.7741147741147743, 'precision': 0.9954640614096302, 'recall': 0.6332963374028857}
{'filter_by': 'both', 'num_label_issues': 2329, 'f1_score': 0.6786654960491658, 'precision': 0.995706311721769, 'recall': 0.5147613762486126}
{'filter_by': 'confident_learning', 'num_label_issues': 2462, 'f1_score': 0.6995837519735897, 'precision': 0.9898456539398862, 'recall': 0.5409544950055494}
{'filter_by': 'predicted_neq_given', 'num_label_issues': 4686, 'f1_score': 0.928952235882929, 'precision': 0.91101152368758, 'recall': 0.9476137624861265}


In [18]:
# Print a short description of the experiment setup, then show the per-filter_by
# metrics collected in `results` as a table.
summary_lines = [
    f"model: {model}",
    "cross-val procedure: stratified k-folds (k=5)",
    "dataset: cifar-10n",
    "noise type: aggregate (10% noise rate)",
    "",
    "Label Error Detection: Evaluating Different filter_by:",
    "",
]
print("\n".join(summary_lines))
pd.DataFrame(results)

model: swin_base_patch4_window7_224
cross-val procedure: stratified k-folds (k=5)
dataset: cifar-10n
noise type: aggregate (10% noise rate)

Label Error Detection: Evaluating Different filter_by:



Unnamed: 0,filter_by,num_label_issues,f1_score,precision,recall
0,prune_by_noise_rate,2865,0.762551,0.980803,0.623751
1,prune_by_class,2866,0.774115,0.995464,0.633296
2,both,2329,0.678665,0.995706,0.514761
3,confident_learning,2462,0.699584,0.989846,0.540954
4,predicted_neq_given,4686,0.928952,0.911012,0.947614
