In [1]:
import numpy as np
import pandas as pd
from scipy.io import arff

from sklearn.model_selection import train_test_split
from cleanlab.benchmarking.noise_generation import (
    generate_noise_matrix_from_trace,
    generate_noisy_labels,
)
from cleanlab.multiannotator import get_majority_vote_label

In [2]:
# Seed shared by the synthetic-noise generator and by every train_test_split call below.
SEED = 0
# Forwarded to get_synthetic_labels, which requests a noise matrix whose trace is
# noise_rate * num_classes.
# NOTE(review): used that way, this is the average per-class probability of a label
# staying correct (the printed accuracies are ~0.61), not the fraction corrupted -- confirm.
noise_rate = 0.6

In [3]:
def get_synthetic_labels(
    true_labels,
    noise_rate=0.8,
    seed=None,
):
    """Return a synthetically corrupted copy of ``true_labels``.

    A class-conditional noise matrix is generated with cleanlab's
    ``generate_noise_matrix_from_trace`` and then applied to the labels with
    ``generate_noisy_labels``.

    Parameters
    ----------
    true_labels : array-like of int
        Ground-truth class labels. They must be exactly the integers
        ``0 .. m-1`` with every class present, because the class prior is
        taken from ``np.bincount`` while the class count comes from
        ``np.unique``.
    noise_rate : float, default 0.8
        Average diagonal entry requested for the noise matrix; the trace
        passed to cleanlab is ``noise_rate * m``.
        NOTE(review): despite the name this looks like the expected share of
        labels that stay correct, not the share corrupted -- confirm against
        the cleanlab documentation.
    seed : int, optional
        Seed forwarded to ``generate_noise_matrix_from_trace``. ``None`` (the
        default) falls back to the notebook-level ``SEED``, which is what
        every call did before this parameter existed.
        NOTE(review): repeated calls with the same seed and inputs presumably
        return identical labels; pass distinct seeds for independent draws.

    Returns
    -------
    The noisy labels produced by ``generate_noisy_labels``.

    Raises
    ------
    ValueError
        If the labels are not exactly the classes ``0 .. m-1``.
    """
    n = len(true_labels)
    m = len(np.unique(true_labels))  # num classes

    class_counts = np.bincount(true_labels)
    # bincount has max(label) + 1 entries, so it only lines up with the m x m
    # noise matrix when every class 0 .. m-1 occurs at least once.
    if len(class_counts) != m:
        raise ValueError(
            "true_labels must contain every class 0..m-1 exactly "
            f"(found {m} distinct labels but max label + 1 == {len(class_counts)})"
        )
    py = class_counts / float(n)

    noise_matrix = generate_noise_matrix_from_trace(
        m,
        trace=noise_rate * m,
        py=py,
        valid_noise_matrix=True,
        # Look SEED up at call time so re-running the config cell still takes effect.
        seed=SEED if seed is None else seed,
    )

    labels = generate_noisy_labels(true_labels, noise_matrix)

    return labels

In [4]:
# Load the wall-robot subset; the first CSV column becomes the index.
num_annotators = 30
wallrobot = pd.read_csv("data/wall_robot_subset.csv", index_col=0)

# Target is the "class" column; every other column is a feature.
y = wallrobot["class"].to_numpy()
X = wallrobot.drop(columns="class").to_numpy()

# Hold out one third of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1 / 3,
    random_state=SEED,
)

# Corrupt the training labels with synthetic noise.
# NOTE(review): both calls pass identical arguments and the helper seeds with SEED,
# so extra_noisy_labels presumably equals noisy_labels -- confirm this is intended.
noisy_labels = get_synthetic_labels(y_train, noise_rate=noise_rate)
extra_noisy_labels = get_synthetic_labels(y_train, noise_rate=noise_rate)

# Split the training rows 25% "labeled" / 75% "unlabeled"; features, true labels
# and noisy labels are passed to one call so they are partitioned together.
X_labeled, X_unlabeled, y_labeled, y_unlabeled, noisy_labeled, noisy_unlabeled = (
    train_test_split(
        X_train, y_train, noisy_labels, test_size=0.75, random_state=SEED
    )
)

# Fraction of noisy labels that agree with the truth in the labeled pool.
accuracy = (noisy_labeled == y_labeled).mean()
print(f"base single label accuracy = {accuracy}")

# Same agreement measure on the unlabeled pool.
accuracy = (noisy_unlabeled == y_unlabeled).mean()
print(f"base extra single label accuracy = {accuracy}")

base single label accuracy = 0.614
base extra single label accuracy = 0.606


In [5]:
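# Disabled so that re-running the notebook does not overwrite files under data/;
# uncomment the np.save calls below to persist the splits.
# np.save("data/X_labeled.npy", X_labeled)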
# np.save("data/X_unlabeled.npy", X_unlabeled)
# np.save("data/X_test.npy", X_test)

# np.save("data/true_labels_labeled.npy", y_labeled)
# np.save("data/true_labels_unlabeled.npy", y_unlabeled)
# np.save("data/true_labels_test.npy", y_test)

# np.save("data/noisy_labels_labeled.npy", noisy_labeled)
# np.save("data/noisy_labels_unlabeled.npy", noisy_unlabeled)