In [1]:
from cleanlab.filter import find_label_issues
#from cleanlab.filter_test import find_label_issues_test
import numpy as np
from time import time

In [2]:
def normalize(arr):
    """Scale every entry of ``arr`` by the total of its first-axis slice.

    For a 2-D input this divides each row by that row's sum, so each row of
    the result sums to (approximately) 1.

    Parameters
    ----------
    arr : array-like
        Values to normalize. Anything ``np.asarray`` accepts.

    Returns
    -------
    np.ndarray
        Array with the same shape as ``arr`` and dtype ``np.float16``.

    Notes
    -----
    A slice whose total is 0 produces NaN/inf entries (NumPy emits a
    RuntimeWarning); no special handling is applied.
    """
    arr = np.asarray(arr)
    # Reduce over every axis except the first so each arr[i] is divided by
    # its own total; keepdims lets the result broadcast back against arr.
    slice_totals = arr.sum(axis=tuple(range(1, arr.ndim)), keepdims=True)
    # The division is carried out in NumPy's default float precision and only
    # then narrowed to float16 (this creates one full-size temporary).
    return (arr / slice_totals).astype(np.float16)

In [3]:
def run_test(test_name, pred_probs, labels, multi_label=False, filter_by='prune_by_noise_rate', n_jobs=4):
    """Time ``find_label_issues`` with one job and with ``n_jobs`` jobs, then compare.

    Prints a banner with ``test_name``, the elapsed time of the ``n_jobs=1``
    call, the elapsed time of the ``n_jobs=n_jobs`` call, and how many entries
    differ between the two results.

    Parameters
    ----------
    test_name : str
        Label printed in the banner for this run.
    pred_probs, labels
        Forwarded unchanged to ``find_label_issues``.
    multi_label : bool, default False
        Forwarded to ``find_label_issues``.
    filter_by : str, default 'prune_by_noise_rate'
        Forwarded to ``find_label_issues``.
    n_jobs : int, default 4
        Job count used for the second ("multi") call.

    Returns
    -------
    None
        Results are only printed.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time() follows the system wall clock, which can be
    # adjusted while a benchmark is running.
    from time import perf_counter

    shared_kwargs = dict(pred_probs=pred_probs, labels=labels, multi_label=multi_label, filter_by=filter_by)

    print("-------------------------------------\n")
    print(f"running test: {test_name}\n")

    # NOTE(review): the very first call in a fresh kernel may include one-time
    # start-up cost, so the first "new single" figure can look inflated.
    start = perf_counter()
    issues_new_single = find_label_issues(n_jobs=1, **shared_kwargs)
    print(f"new single: took {perf_counter()-start} s\n")

    start = perf_counter()
    issues_new_multi = find_label_issues(n_jobs=n_jobs, **shared_kwargs)
    print(f"new multi: took {perf_counter()-start} s")

    # XOR marks positions where the two results disagree; assumes both calls
    # return boolean (or integer) arrays of equal shape -- TODO confirm for
    # every filter_by option.
    print(f"diff from new single {np.sum(issues_new_single ^ issues_new_multi)}\n")

    # A commented-out comparison against a previous ("old") implementation used
    # to follow here. It has been dropped because it called the same
    # find_label_issues as above; restore it from version control together
    # with the old implementation's import if that comparison is needed again.

In [4]:
# (n, m) pairs used by the benchmark loop as the shape of the generated
# pred_probs array: n rows by m columns.
n_m = [
    (100, 5),
    (200000, 100),
]

# Each dataset is benchmarked once per entry: without and with multi_label.
multi_labels = [
    False,
    True,
]

# Values passed as `filter_by` to find_label_issues, one benchmark run each.
filter_ops = [
    'prune_by_noise_rate',
    'prune_by_class',
    'both',
    'confident_learning',
    'predicted_neq_given',
]

In [5]:
# Dedicated, seeded generator so every run benchmarks the same synthetic data
# and the printed timings/diffs can be compared between runs.
rng = np.random.default_rng(0)

for n, m in n_m:
    test_name = f"n={n}, m={m}, "
    # Random integer scores in [1, 100), scaled per row by normalize().
    pred_probs = normalize(rng.integers(low=1, high=100, size=[n, m], dtype=np.uint8))
    for multi_label in multi_labels:
        if multi_label:
            # One list of distinct class ids per example. The number of draws
            # is random in [0, m), so a list may be empty (a size-0 draw
            # already yields [], no special case needed).
            labels = []
            for _ in range(n):
                num_labels = rng.integers(m)
                labels.append(list(set(rng.integers(m, size=num_labels))))
        else:
            # Block-wise class ids 0..m-1. Repeat with ceil(n / m) and trim so
            # len(labels) == n even when m does not divide n; identical to
            # repeating n // m times whenever it does.
            labels = np.repeat(np.arange(m), -(-n // m))[:n]
        for filter_op in filter_ops:
            label_mode = "multi label" if multi_label else "no multi label"
            run_test(test_name + filter_op + ", " + label_mode, pred_probs, labels, multi_label, filter_op, n_jobs=10)
            

-------------------------------------

running test: n=100, m=5, prune_by_noise_rate, no multi label

new single: took 2.1493027210235596 s

new multi: took 0.1927788257598877 s
diff from new single 0

-------------------------------------

running test: n=100, m=5, prune_by_class, no multi label

new single: took 0.0019636154174804688 s

new multi: took 0.10764098167419434 s
diff from new single 0

-------------------------------------

running test: n=100, m=5, both, no multi label

new single: took 0.0027358531951904297 s

new multi: took 0.21303081512451172 s
diff from new single 0

-------------------------------------

running test: n=100, m=5, confident_learning, no multi label

new single: took 0.002103090286254883 s

new multi: took 0.0012431144714355469 s
diff from new single 0

-------------------------------------

running test: n=100, m=5, predicted_neq_given, no multi label

new single: took 0.0012843608856201172 s

new multi: took 0.0010936260223388672 s
diff from new si

KeyboardInterrupt: 