In [1]:
import numpy as np
import pandas as pd
from scipy.io import arff

from sklearn.model_selection import train_test_split
from cleanlab.benchmarking.noise_generation import generate_noise_matrix_from_trace, generate_noisy_labels
from cleanlab.multiannotator import get_majority_vote_label

## Get full dataset

Download wall robot dataset [here](https://www.openml.org/search?type=data&sort=runs&status=any&qualities.NumberOfClasses=gte_2&qualities.NumberOfInstances=between_1000_10000&id=1526).

In [2]:
SEED = 111

In [3]:
data = arff.loadarff('data/wall-robot.arff')
df = pd.DataFrame(data[0])

df["class"] = df["Class"].astype("int64") - 1
df = df.loc[:, df.columns != "Class"]
df

Unnamed: 0,V1,V2,V3,V4,class
0,1.687,0.445,2.332,0.429,3
1,1.687,0.449,2.332,0.429,3
2,1.687,0.449,2.334,0.429,3
3,1.687,0.449,2.334,0.429,3
4,1.687,0.449,2.334,0.429,3
...,...,...,...,...,...
5451,1.024,0.657,1.087,1.562,0
5452,0.894,0.649,1.071,1.085,1
5453,0.873,0.642,1.053,1.105,1
5454,0.967,0.635,1.034,1.118,0


In [4]:
subset0 = df[df["class"] == 0].sample(n=1100, random_state=SEED)
subset1 = df[df["class"] == 1].sample(n=900, random_state=SEED)
subset2 = df[df["class"] == 2].sample(n=300, random_state=SEED)
subset3 = df[df["class"] == 3].sample(n=700, random_state=SEED)

subset = pd.concat([subset0, subset1, subset2, subset3])
wallrobot = subset.reset_index(drop=True)
wallrobot

Unnamed: 0,V1,V2,V3,V4,class
0,3.242,0.770,1.486,0.467,0
1,1.428,0.707,1.404,0.788,0
2,1.309,0.606,2.138,0.625,0
3,1.041,0.714,2.959,0.804,0
4,1.278,0.624,3.079,0.933,0
...,...,...,...,...,...
2995,1.324,0.477,1.807,0.548,3
2996,1.523,0.484,1.812,1.070,3
2997,1.528,0.466,1.802,0.921,3
2998,1.813,0.357,2.641,0.834,3


In [5]:
# wallrobot.to_csv("data/wall_robot_subset.csv")

## Get train/test split and annotator labels

Annotator labels are synthetically generated with noise.

In [6]:
def get_synthetic_multiannotator_labels(
    true_labels,
    num_annotators=30,
):
    n = len(true_labels)
    m = len(np.unique(true_labels)) # num classes
    py = np.bincount(true_labels) / float(len(true_labels))

    noise_matrix = generate_noise_matrix_from_trace(
        m,
        trace=0.8 * m,
        py=py,
        valid_noise_matrix=True,
        seed=SEED,
    )

    multiannotator_labels = np.vstack(
            [
                generate_noisy_labels(true_labels, noise_matrix) for i in range(num_annotators)
            ] 
        ).transpose()

    return multiannotator_labels

In [7]:
num_annotators = 30

X = wallrobot.loc[:, wallrobot.columns != "class"].to_numpy()
y = wallrobot["class"].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=SEED
)

multiannotator_labels = get_synthetic_multiannotator_labels(
    y_train, num_annotators=num_annotators
)
extra_labels_labeled = get_synthetic_multiannotator_labels(
    y_train, num_annotators=num_annotators
)


In [8]:
# Returns sample labels/error_mask/annotator_mask where x_drop, y_drop are idxs that are dropped
def get_sample_labels(x_drop, y_drop, labels, annotator_mask):
    s_annotator_mask = annotator_mask.copy()
    s_annotator_mask[(x_drop,y_drop)] = 0
    s_labels = labels.copy().astype("float64")
    np.copyto(s_labels, np.nan, where=(s_annotator_mask==0)) 
    print('Total idxs dropped: ', annotator_mask.sum() - s_annotator_mask.sum())
    return s_labels, s_annotator_mask

# Returns a list of labeled indices to drop 
def get_random_drop_per_row_min_annotators(annotator_mask, max_annotations = 5):
    x,y = np.where(annotator_mask == 1)
    xy = set([(x_idx,y_idx) for x_idx,y_idx in zip(x,y)])
    idx_df = pd.DataFrame(zip(x,y),columns=['x','y'])
    idx_keep = []
    for x_idx in range(idx_df['x'].max()+1):
        Y = idx_df[idx_df['x'] == x_idx]['y']
        y_keep = np.random.choice(list(y), max_annotations,replace=False)
        xy_keep = [(x_idx,y) for y in y_keep]
        idx_keep.extend(xy_keep)
    xy = xy.difference(set(idx_keep))
    x_drop = [xy_idx[0] for xy_idx in xy]
    y_drop = [xy_idx[1] for xy_idx in xy]
    return x_drop, y_drop

def get_random_drop_per_row(annotator_mask):
    x,y = np.where(annotator_mask == 1)
    idx_df = pd.DataFrame(zip(x,y),columns=['x','y'])
    for x_idx in range(idx_df['x'].max()+1):
        num_drop = np.random.randint(1, len(idx_df[idx_df['x'] == x_idx])+1)
        idx_df = idx_df.drop(idx_df[idx_df['x'] == x_idx].sample(num_drop).index)
    x_drop = idx_df['x'].values
    y_drop = idx_df['y'].values
    return x_drop, y_drop

def get_least_annotations(annotator_mask):
    annotations_per_example = annotator_mask.sum(axis=1)
    annotations_per_annotator = annotator_mask.sum(axis=0)

    temp_mask = annotator_mask.copy()

    for ex in range(temp_mask.shape[0]):
        if annotations_per_example[ex] < 2: # ignore dropping when there are very little annotations
            continue

        annotators = np.where(temp_mask[ex] == 1)[0] # annotators for example
        drop_y = np.random.choice(annotators, len(annotators)-1, replace=False)

        for y in drop_y:
            if np.random.uniform() > 0.03:
                x = np.where(temp_mask[:,y] == 1)[0]  # annotations for annotator y for all examples
                x = np.setdiff1d(x,np.array([ex])) # annotations for annotator y for all examples minus curent example
                if annotations_per_example[x].max() > 2: # number of total annotations by our annotator
                    temp_mask[ex][y] = 0
                    annotations_per_annotator[y] -= 1
                    annotations_per_example[ex] -= 1

    x_drop, y_drop = np.where(temp_mask == 0)
    
    return x_drop, y_drop

In [9]:
annotator_mask = np.full(multiannotator_labels.shape, 1)

x_drop, y_drop = get_random_drop_per_row_min_annotators(annotator_mask)
multiannotator_labels, annotator_mask = get_sample_labels(x_drop, y_drop, multiannotator_labels, annotator_mask)

x_drop, y_drop = get_least_annotations(annotator_mask)
multiannotator_labels, annotator_mask = get_sample_labels(x_drop, y_drop, multiannotator_labels, annotator_mask)

print(
    f"Make sure {annotator_mask.sum(axis=1).max()} <= max_annotations and { annotator_mask.sum(axis=1).min()} > 0: "
)

Total idxs dropped:  50658
Total idxs dropped:  7128
Make sure 4 <= max_annotations and 1 > 0: 


In [10]:
# Check if dataset creation conditions are met
for col in range(annotator_mask.shape[1]):
    annotator = np.where(annotator_mask[:,col] == 1)[0]
    intersects = False
    for j in range(annotator_mask.shape[1]):
        if j == col:
            continue
        annotator2 = np.where(annotator_mask[:,j] == 1)[0]
        if len(np.intersect1d(annotator, annotator2)) > 0:
            intersects = True
    if not intersects:
        print(f'annotator {col} does not intersect with any other annotator')

In [11]:
pd.DataFrame(multiannotator_labels).count(axis=1).value_counts()

1    1797
2     193
3       9
4       1
dtype: int64

In [12]:
anno_acc = pd.DataFrame(multiannotator_labels).apply(lambda s: np.mean(s[pd.notna(s)] == y_train[pd.notna(s)]))
print(f"min accuracy = {np.min(anno_acc)}")
print(f"max accuracy = {np.max(anno_acc)}")
print(f"avg accuracy = {np.mean(anno_acc)}")

consensus_label = get_majority_vote_label(multiannotator_labels)
accuracy = np.mean(consensus_label == y_train)
print(f"base consensus accuracy = {accuracy}")

min accuracy = 0.7121212121212122
max accuracy = 0.9594594594594594
avg accuracy = 0.8496965272329894
base consensus accuracy = 0.852


In [13]:
# np.save("data/X_labeled.npy", X_train)
# np.save("data/X_test.npy", X_test)

# np.save("data/true_labels_labeled.npy", y_train)
# np.save("data/true_labels_test.npy", y_test)

# np.save("data/multiannotator_labels_labeled.npy", multiannotator_labels)
# np.save("data/extra_labels_labeled.npy", extra_labels_labeled)