In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt

from cleanlab.multiannotator import get_majority_vote_label, get_label_quality_multiannotator, get_active_learning_scores
from cleanlab.internal.label_quality_utils import get_normalized_entropy

from utils.model_training import fit_predict_proba
from utils.active_learning import setup_next_iter_data, get_idx_to_label

## Load Files

The data files can either be generated yourself by running [0_create_data.ipynb](0_create_data.ipynb), or downloaded as pre-generated files using the commands:

```
wget -nc 'https://cleanlab-public.s3.amazonaws.com/ActiveLearning/Benchmark/SingleAnnotator/data.tar.gz'
tar -xf data.tar.gz data/
```

In [2]:
# Number of train / select / label rounds run within one repetition
# (also the length of each saved accuracy array).
num_rounds = 10
# Number of independent repetitions of each experiment; one results file
# is written per repetition.
num_iter = 5
# Number of examples taken from the unlabeled pool and added to the
# labeled set in every round.
batch_size_to_label = 100

In [3]:
def get_data(data_dir="data"):
    """Load the benchmark arrays stored as ``.npy`` files in ``data_dir``.

    Parameters
    ----------
    data_dir : str or path-like, default "data"
        Directory that contains the ``.npy`` files. The default keeps the
        previously hard-coded ``data/`` location, so existing calls with no
        arguments behave exactly as before.

    Returns
    -------
    tuple of np.ndarray
        ``(X_labeled, X_unlabeled, X_test, y_labeled, y_unlabeled,
        true_labels_labeled, true_labels_unlabeled, true_labels_test)``.
        ``y_labeled`` and ``y_unlabeled`` are read from the
        ``noisy_labels_*`` files; the ``true_labels_*`` arrays are read from
        the files of the same name.

    Raises
    ------
    FileNotFoundError
        If any of the expected files is missing from ``data_dir``.
    """
    # File stems listed in the exact order of the returned tuple.
    file_stems = (
        "X_labeled",
        "X_unlabeled",
        "X_test",
        "noisy_labels_labeled",
        "noisy_labels_unlabeled",
        "true_labels_labeled",
        "true_labels_unlabeled",
        "true_labels_test",
    )
    return tuple(np.load(f"{data_dir}/{stem}.npy") for stem in file_stems)

In [4]:
# entropy
# Benchmark: in every round, label the unlabeled examples whose predicted
# class probabilities have the highest normalized entropy.

# np.save does not create missing directories; create the output directory
# up front so a repetition cannot fail after all of its training is done.
os.makedirs("results", exist_ok=True)

for i in range(num_iter):
    # Each repetition starts again from the data stored on disk.
    (
        X_labeled,
        X_unlabeled,
        X_test,
        y_labeled,
        y_unlabeled,
        true_labels_labeled,
        true_labels_unlabeled,
        true_labels_test,
    ) = get_data()

    # One test-accuracy slot per round; NaN marks rounds that did not run.
    entropy_model_accuracy_arr = np.full(num_rounds, np.nan)

    for k in range(num_rounds):
        # Project helper; its two return values are used here as predicted
        # probabilities for the labeled and the unlabeled examples
        # (presumably cross-validated on the labeled set -- confirm in
        # utils.model_training).
        pred_probs, pred_probs_unlabeled = fit_predict_proba(
            ExtraTreesClassifier(),
            X_labeled,
            y_labeled,
            cv_n_folds=5,
            X_unlabeled=X_unlabeled,
        )

        # Accuracy is measured with a separate model fit on all currently
        # labeled data and evaluated against the true test labels.
        single_model = ExtraTreesClassifier()
        single_model.fit(X_labeled, y_labeled)
        single_pred_labels = single_model.predict(X_test)
        entropy_model_accuracy_arr[k] = np.mean(single_pred_labels == true_labels_test)

        # Negated so that the highest-entropy examples get the lowest score
        # and therefore come first in the ascending argsort below.
        quality_of_consensus = - get_normalized_entropy(pred_probs_unlabeled)

        # No already-labeled example is relabeled in this benchmark.
        relabel_idx = np.array([])
        relabel_idx_unlabeled = np.argsort(quality_of_consensus)[:batch_size_to_label]

        # Append the given (noisy) labels of the selected examples; this must
        # happen before y_unlabeled is replaced by the helper call below.
        y_labeled = np.concatenate((y_labeled, y_unlabeled[relabel_idx_unlabeled]))

        # Project helper that returns the updated pools for the next round
        # (presumably moves the selected rows from the unlabeled to the
        # labeled arrays -- confirm in utils.active_learning).
        (
            relabel_idx_combined, X_labeled, X_unlabeled, true_labels_labeled,
            true_labels_unlabeled, pred_probs_labeled,
            pred_probs_unlabeled, y_unlabeled
        ) = setup_next_iter_data(
            relabel_idx, relabel_idx_unlabeled, X_labeled, X_unlabeled,
            true_labels_labeled, true_labels_unlabeled, pred_probs,
            pred_probs_unlabeled, y_unlabeled
        )

    np.save(f"results/entropy_{i}.npy", entropy_model_accuracy_arr)

In [5]:
# random
# Baseline benchmark: in every round, label a uniformly random batch of the
# unlabeled examples.

# np.save does not create missing directories; create the output directory
# up front so a repetition cannot fail after all of its training is done.
os.makedirs("results", exist_ok=True)

for i in range(num_iter):
    # Each repetition starts again from the data stored on disk.
    (
        X_labeled,
        X_unlabeled,
        X_test,
        y_labeled,
        y_unlabeled,
        true_labels_labeled,
        true_labels_unlabeled,
        true_labels_test,
    ) = get_data()

    # One test-accuracy slot per round; NaN marks rounds that did not run.
    random_model_accuracy_arr = np.full(num_rounds, np.nan)

    for k in range(num_rounds):
        # Project helper; its two return values are used here as predicted
        # probabilities for the labeled and the unlabeled examples. They do
        # not influence the random selection but are still passed on to
        # setup_next_iter_data below.
        pred_probs, pred_probs_unlabeled = fit_predict_proba(
            ExtraTreesClassifier(),
            X_labeled,
            y_labeled,
            cv_n_folds=5,
            X_unlabeled=X_unlabeled,
        )

        # Accuracy is measured with a separate model fit on all currently
        # labeled data and evaluated against the true test labels.
        single_model = ExtraTreesClassifier()
        single_model.fit(X_labeled, y_labeled)
        single_pred_labels = single_model.predict(X_test)
        random_model_accuracy_arr[k] = np.mean(single_pred_labels == true_labels_test)

        # One uniform random score per unlabeled example; taking the lowest
        # scores below therefore picks a random batch.
        quality_of_consensus = np.random.rand(len(pred_probs_unlabeled))

        # No already-labeled example is relabeled in this benchmark.
        relabel_idx = np.array([])
        relabel_idx_unlabeled = np.argsort(quality_of_consensus)[:batch_size_to_label]

        # Append the given (noisy) labels of the selected examples; this must
        # happen before y_unlabeled is replaced by the helper call below.
        y_labeled = np.concatenate((y_labeled, y_unlabeled[relabel_idx_unlabeled]))

        # Project helper that returns the updated pools for the next round
        # (presumably moves the selected rows from the unlabeled to the
        # labeled arrays -- confirm in utils.active_learning).
        (
            relabel_idx_combined, X_labeled, X_unlabeled, true_labels_labeled,
            true_labels_unlabeled, pred_probs_labeled,
            pred_probs_unlabeled, y_unlabeled
        ) = setup_next_iter_data(
            relabel_idx, relabel_idx_unlabeled, X_labeled, X_unlabeled,
            true_labels_labeled, true_labels_unlabeled, pred_probs,
            pred_probs_unlabeled, y_unlabeled
        )

    np.save(f"results/random_{i}.npy", random_model_accuracy_arr)

In [6]:
# crowdlab
# Benchmark: in every round, label the unlabeled examples with the lowest
# scores returned by cleanlab's get_active_learning_scores.

# np.save does not create missing directories; create the output directory
# up front so a repetition cannot fail after all of its training is done.
os.makedirs("results", exist_ok=True)

for i in range(num_iter):
    # Each repetition starts again from the data stored on disk.
    (
        X_labeled,
        X_unlabeled,
        X_test,
        y_labeled,
        y_unlabeled,
        true_labels_labeled,
        true_labels_unlabeled,
        true_labels_test,
    ) = get_data()

    # One test-accuracy slot per round; NaN marks rounds that did not run.
    crowdlab_model_accuracy_arr = np.full(num_rounds, np.nan)

    for k in range(num_rounds):
        # Project helper; its two return values are used here as predicted
        # probabilities for the labeled and the unlabeled examples
        # (presumably cross-validated on the labeled set -- confirm in
        # utils.model_training).
        pred_probs, pred_probs_unlabeled = fit_predict_proba(
            ExtraTreesClassifier(),
            X_labeled,
            y_labeled,
            cv_n_folds=5,
            X_unlabeled=X_unlabeled,
        )

        # Accuracy is measured with a separate model fit on all currently
        # labeled data and evaluated against the true test labels.
        single_model = ExtraTreesClassifier()
        single_model.fit(X_labeled, y_labeled)
        single_pred_labels = single_model.predict(X_test)
        crowdlab_model_accuracy_arr[k] = np.mean(single_pred_labels == true_labels_test)

        # The call returns two score arrays; the first one is discarded here
        # and only the second, taken as the scores for the unlabeled pool,
        # drives the selection.
        _, quality_of_consensus = get_active_learning_scores(
            y_labeled, pred_probs, pred_probs_unlabeled
        )

        # No already-labeled example is relabeled in this benchmark.
        relabel_idx = np.array([])
        # Ascending argsort: the lowest-scored unlabeled examples are chosen.
        relabel_idx_unlabeled = np.argsort(quality_of_consensus)[:batch_size_to_label]

        # Append the given (noisy) labels of the selected examples; this must
        # happen before y_unlabeled is replaced by the helper call below.
        y_labeled = np.concatenate((y_labeled, y_unlabeled[relabel_idx_unlabeled]))

        # Project helper that returns the updated pools for the next round
        # (presumably moves the selected rows from the unlabeled to the
        # labeled arrays -- confirm in utils.active_learning).
        (
            relabel_idx_combined, X_labeled, X_unlabeled, true_labels_labeled,
            true_labels_unlabeled, pred_probs_labeled,
            pred_probs_unlabeled, y_unlabeled
        ) = setup_next_iter_data(
            relabel_idx, relabel_idx_unlabeled, X_labeled, X_unlabeled,
            true_labels_labeled, true_labels_unlabeled, pred_probs,
            pred_probs_unlabeled, y_unlabeled
        )

    np.save(f"results/crowdlab_{i}.npy", crowdlab_model_accuracy_arr)