In [1]:
import sys
import numpy as np
import pandas as pd
import datetime
from sklearn.metrics import roc_auc_score

sys.path.insert(0, "../")
from eval_metrics import lift_at_k

# import cleanlab from a local clone of its repository (set PATH_TO_CLEANLAB to the checkout location)
PATH_TO_CLEANLAB = "/cleanlab/"
sys.path.insert(0, PATH_TO_CLEANLAB)
from cleanlab.rank import order_label_issues, get_label_quality_scores, get_label_quality_ensemble_scores
from cleanlab.filter import find_label_issues

# Disable truncation of rows, columns and cell contents when DataFrames are displayed.
for option_name in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(option_name, None)

## Evaluate scores

In [2]:
%%time

# Models whose saved cross-validation outputs are evaluated; each one has its own folder.
models = [
    "resnet18",
    "resnet50d",
    "efficientnet_b1",
    "twins_pcpvt_base",
    "swin_base_patch4_window7_224",
]

# (method, adjust_pred_probs) argument pairs to pass to get_label_quality_scores()
score_params = [
    ("self_confidence", False),
    ("self_confidence", True),
    ("normalized_margin", False),
    ("normalized_margin", True),
    ("confidence_weighted_entropy", False),
]

# cut-offs K at which the Lift@K metric is reported: 1000, 2000, ..., 10000
lift_k_values = range(1000, 11000, 1000)

results_list = []  # one dict per (model, method, adjust_pred_probs) evaluation
pred_probs_list = []  # use for ensemble scoring
labels_list = []  # use for sanity check (labels from each model should be the same because they were generated from the same cross-val procedure)

for model in models:

    # read numpy files
    # NOTE: images.npy is deliberately not loaded. Nothing in this notebook reads it, and
    # loading it required allow_pickle=True, which can execute arbitrary code from the file.
    numpy_out_folder = f"./cifar10_train_dataset_noise_amount_0.2_sparsity_0.4_cv_{model}/"
    pred_probs = np.load(numpy_out_folder + "pred_probs.npy")
    labels = np.load(numpy_out_folder + "noisy_labels.npy")
    true_labels = np.load(numpy_out_folder + "true_labels.npy")

    # the element-wise comparison below is only meaningful for equally shaped label arrays
    assert labels.shape == true_labels.shape, (
        f"noisy/true label shapes differ for model '{model}': {labels.shape} vs {true_labels.shape}"
    )

    # boolean mask of label errors (True where the given noisy label differs from the true label)
    label_errors_target = labels != true_labels

    # save to list for ensemble scoring
    pred_probs_list.append(pred_probs)

    # save to list for the cross-model sanity check
    labels_list.append(labels)

    for method, adjust_pred_probs in score_params:

        # compute scores
        label_quality_scores = get_label_quality_scores(labels=labels, pred_probs=pred_probs, method=method, adjust_pred_probs=adjust_pred_probs)

        # flip the scores once so that a higher value means "more likely a label error",
        # matching the positive class of label_errors_target
        label_error_scores = 1 - label_quality_scores

        # compute accuracy of detecting label errors
        auroc = roc_auc_score(label_errors_target, label_error_scores)

        # compute Lift@K evaluation metric at every cut-off
        lift_at_k_dict = {
            f"lift_at_{k}": lift_at_k(label_errors_target, label_error_scores, k=k)
            for k in lift_k_values
        }

        # collect the results for this (model, method, adjust_pred_probs) combination
        results = {
            "dataset": "cifar10",
            "model": model,
            "noise_config": "Noise Amount: 0.2 | Sparsity: 0.4",
            "method": method,
            "adjust_pred_probs": adjust_pred_probs,
            "auroc": auroc
        }

        # add the lift at k metrics
        results.update(lift_at_k_dict)

        # save results
        results_list.append(results)

CPU times: user 25.8 s, sys: 79.8 ms, total: 25.9 s
Wall time: 25.9 s


In [3]:
# Sanity check: the noisy labels loaded for every model should be the same, because they
# were generated from the same cross-val procedure.
# np.array_equal also compares shapes, so arrays of different length fail the check
# instead of being broadcast against each other as `(a == b).all()` would allow.
if labels_list:
    reference_labels = labels_list[0]
    for i, labels_temp in enumerate(labels_list[1:], start=1):
        # labels_list is filled once per entry of `models`, in the same order
        assert np.array_equal(reference_labels, labels_temp), (
            f"noisy labels for model '{models[i]}' differ from those for model '{models[0]}'"
        )

## Evaluate ensemble scores

In [4]:
%%time

# Ensemble evaluation: score the labels from the predicted probabilities of all models
# together, once per (method, adjust_pred_probs) setting.
#
# This cell relies on `labels` and `label_errors_target` left over from the per-model
# loop in the cell above (i.e. the values from the last model's files). That is only
# valid because every model folder is expected to hold the same labels; the sanity-check
# cell verifies this for the noisy labels.
ensemble_model_name = "ensemble (all)"

# Drop ensemble rows left by a previous run of this cell so that re-running it does not
# append duplicate rows to results_list (no-op on a fresh Run All).
results_list[:] = [row for row in results_list if row["model"] != ensemble_model_name]

for method, adjust_pred_probs in score_params:

    print("Scoring label quality...")
    print(f"  method: {method}")
    print(f"  adjust_pred_probs: {adjust_pred_probs}")

    label_quality_ensemble_scores = get_label_quality_ensemble_scores(labels, pred_probs_list, method=method, adjust_pred_probs=adjust_pred_probs)

    # flip the scores once so that a higher value means "more likely a label error",
    # matching the positive class of label_errors_target
    label_error_ensemble_scores = 1 - label_quality_ensemble_scores

    # compute accuracy of detecting label errors
    auroc = roc_auc_score(label_errors_target, label_error_ensemble_scores)

    # compute Lift@K evaluation metric for K = 1000, 2000, ..., 10000
    lift_at_k_dict = {
        f"lift_at_{k}": lift_at_k(label_errors_target, label_error_ensemble_scores, k=k)
        for k in range(1000, 11000, 1000)
    }

    # collect the results for this ensemble setting
    results = {
        "dataset": "cifar10",
        "model": ensemble_model_name,
        "noise_config": "Noise Amount: 0.2 | Sparsity: 0.4",
        "method": method,
        "adjust_pred_probs": adjust_pred_probs,
        "auroc": auroc
    }

    # add the lift at k metrics
    results.update(lift_at_k_dict)

    # save results
    results_list.append(results)

Scoring label quality...
  method: self_confidence
  adjust_pred_probs: False
Weighting scheme for ensemble: accuracy
Ensemble members will be weighted by: accuracy of member / (sum of accuracy from all members)
  Model 0 accuracy : 0.73976
  Model 0 weights  : 0.19456616080587044
  Model 1 accuracy : 0.75506
  Model 1 weights  : 0.1985902527550564
  Model 2 accuracy : 0.73582
  Model 2 weights  : 0.19352989137581864
  Model 3 accuracy : 0.78098
  Model 3 weights  : 0.20540753793955974
  Model 4 accuracy : 0.79048
  Model 4 weights  : 0.2079061571236948
Scoring label quality...
  method: self_confidence
  adjust_pred_probs: True
Weighting scheme for ensemble: accuracy
Ensemble members will be weighted by: accuracy of member / (sum of accuracy from all members)
  Model 0 accuracy : 0.73976
  Model 0 weights  : 0.19456616080587044
  Model 1 accuracy : 0.75506
  Model 1 weights  : 0.1985902527550564
  Model 2 accuracy : 0.73582
  Model 2 weights  : 0.19352989137581864
  Model 3 accuracy :

## Create DataFrame with results

In [5]:
# Build one table from the collected result dicts: one row per evaluation appended to
# results_list (the per-model rows followed by the ensemble rows).
df = pd.DataFrame(results_list)

In [6]:
# Display the results table as the cell output.
df

Unnamed: 0,dataset,model,noise_config,method,adjust_pred_probs,auroc,lift_at_1000,lift_at_2000,lift_at_3000,lift_at_4000,lift_at_5000,lift_at_6000,lift_at_7000,lift_at_8000,lift_at_9000,lift_at_10000
0,cifar10,resnet18,Noise Amount: 0.2 | Sparsity: 0.4,self_confidence,False,0.996882,5.013537,5.013537,5.011865,5.009776,5.005515,5.001003,4.990618,4.969668,4.911595,4.767873
1,cifar10,resnet18,Noise Amount: 0.2 | Sparsity: 0.4,self_confidence,True,0.990242,4.928306,4.928306,4.938334,4.948361,4.94435,4.915773,4.846657,4.660709,4.64532,4.414419
2,cifar10,resnet18,Noise Amount: 0.2 | Sparsity: 0.4,normalized_margin,False,0.995195,5.013537,5.008523,5.001838,4.994736,4.98145,4.962566,4.936185,4.881305,4.797397,4.660584
3,cifar10,resnet18,Noise Amount: 0.2 | Sparsity: 0.4,normalized_margin,True,0.995232,5.008523,5.008523,5.005181,5.001003,4.995488,4.980949,4.95409,4.902612,4.815223,4.659581
4,cifar10,resnet18,Noise Amount: 0.2 | Sparsity: 0.4,confidence_weighted_entropy,False,0.995044,5.008523,5.003509,5.005181,4.993482,4.983455,4.972593,4.946928,4.919533,4.855332,4.701695
5,cifar10,resnet50d,Noise Amount: 0.2 | Sparsity: 0.4,self_confidence,False,0.997504,5.013537,5.01103,5.010194,5.006016,5.002507,4.99766,4.99205,4.969668,4.931092,4.793944
6,cifar10,resnet50d,Noise Amount: 0.2 | Sparsity: 0.4,self_confidence,True,0.991531,4.958388,4.953374,4.945018,4.942094,4.928306,4.902403,4.775752,4.643788,4.643649,4.475584
7,cifar10,resnet50d,Noise Amount: 0.2 | Sparsity: 0.4,normalized_margin,False,0.996162,5.003509,5.006016,4.996825,4.983455,4.977439,4.962566,4.941198,4.908252,4.848647,4.720245
8,cifar10,resnet50d,Noise Amount: 0.2 | Sparsity: 0.4,normalized_margin,True,0.996745,5.013537,5.01103,5.005181,5.003509,4.998496,4.99014,4.974144,4.93332,4.859231,4.736789
9,cifar10,resnet50d,Noise Amount: 0.2 | Sparsity: 0.4,confidence_weighted_entropy,False,0.996068,5.003509,5.001003,4.995154,4.980949,4.974431,4.960894,4.949077,4.926426,4.884299,4.758348


## Export results to csv

In [7]:
# Write the results table to a CSV file (without the index column). The file name carries
# the current local time as YYYYmmddHHMMSS, so runs started at different seconds write to
# different files.
ts = format(datetime.datetime.now(), "%Y%m%d%H%M%S")
output_csv_name = f"label_quality_scores_evaluation_{ts}.csv"
df.to_csv(output_csv_name, index=False)