In [2]:
import numpy as np
import pandas as pd
import torch

from cleanlab.multiannotator import get_majority_vote_label, get_label_quality_multiannotator
from sklearn.metrics import confusion_matrix

### Preparing data for CROWDLAB

The aim is to build a DataFrame indexed by image_id, with one column per annotator.

Each annotator's column holds the labels that annotator gave to the images; images the annotator did not label are left empty (NaN).

In [5]:
# Read the CIFAR-10N side-information table and keep only the image-batch
# column plus the three worker-id columns.
side_info_path = '../cifar10n_no_gt_modified/data/side_info_cifar10N.csv'
worker_columns = ['Image-batch', 'Worker1-id', 'Worker2-id', 'Worker3-id']
data_ori = pd.read_csv(side_info_path)[worker_columns]
data_ori.head()

Unnamed: 0,Image-batch,Worker1-id,Worker2-id,Worker3-id
0,0--9,198,385,197
1,10--19,430,140,584
2,20--29,601,430,631
3,30--39,545,79,385
4,40--49,631,373,177


In [6]:
def _expand_batch(batch_spec):
    """Expand a batch string such as '0--9' into the list [0, 1, ..., 9]."""
    # Splitting on '-' leaves an empty piece between the two dashes; drop it.
    bounds = [piece for piece in batch_spec.split('-') if len(piece.strip()) > 0]
    first, last = int(bounds[0]), int(bounds[1])
    return list(range(first, last + 1))


# Replace each batch string by the list of image ids it covers ...
data_ori['Image-batch'] = data_ori['Image-batch'].map(_expand_batch)
# ... then give every image id its own row, repeating the worker ids of its batch.
# NOTE: data_ori is overwritten here, so this cell cannot be re-run without
# re-running the loading cell first.
data_ori = data_ori.explode('Image-batch')
data_ori.head()

Unnamed: 0,Image-batch,Worker1-id,Worker2-id,Worker3-id
0,0,198,385,197
0,1,198,385,197
0,2,198,385,197
0,3,198,385,197
0,4,198,385,197


In [7]:
# One two-column frame (image id, worker id) per worker slot, each with a fresh
# 0..n-1 index because explode() left duplicated index values behind.
w1, w2, w3 = (
    data_ori.loc[:, ['Image-batch', worker_col]].reset_index(drop=True)
    for worker_col in ('Worker1-id', 'Worker2-id', 'Worker3-id')
)
print(w1.shape, w2.shape, w3.shape)

(50000, 2) (50000, 2) (50000, 2)


In [9]:
# Load the CIFAR-10N human annotations and attach, to each worker-slot frame,
# the label given by that slot's annotator.
#
# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary
# code; only load this file if it comes from a trusted source.
#
# weights_only=False is passed explicitly: PyTorch >= 2.6 defaults to
# weights_only=True, which refuses to unpickle non-tensor payloads.
# (presumably the file stores NumPy label arrays, as in the public CIFAR-N
# release -- TODO confirm)
c10n = torch.load(
    '../cifar10n_no_gt_modified/data/CIFAR-10_human.pt',
    weights_only=False,
)
w1['ch_lbl'] = c10n['random_label1']
w2['ch_lbl'] = c10n['random_label2']
w3['ch_lbl'] = c10n['random_label3']

print(w1.shape, w2.shape, w3.shape)
w1.head()

(50000, 3) (50000, 3) (50000, 3)


Unnamed: 0,Image-batch,Worker1-id,ch_lbl
0,0,198,6
1,1,198,9
2,2,198,9
3,3,198,4
4,4,198,1


In [10]:
def _to_annotator_matrix(worker_df):
    """Pivot one worker-slot frame into an (image x annotator) label matrix.

    ``worker_df`` is read positionally: column 0 is the image id, column 1 the
    annotator id and column 2 the chosen label. The result has one row per
    image id (in sorted order, with a fresh 0..n-1 index) and one column per
    annotator id; cells without a label are missing values.
    """
    cols = worker_df.columns
    long_form = pd.DataFrame({
        'annot_id': worker_df[cols[1]],
        'chosen_label': worker_df[cols[2]],
        'img_id': worker_df[cols[0]],
    })

    # pivot_table uses its default aggregation for repeated (img_id, annot_id) pairs.
    wide = long_form.pivot_table(index='img_id', columns='annot_id', values='chosen_label')

    # Drop the img_id index in favour of a positional one, then store labels as
    # nullable integers (errors='ignore' leaves the frame untouched if the cast fails).
    return wide.reset_index(drop=True).astype('Int32', errors='ignore')


outDfs = [_to_annotator_matrix(worker_df) for worker_df in (w1, w2, w3)]

w1_multi, w2_multi, w3_multi = outDfs
print(w1_multi.shape, w2_multi.shape, w3_multi.shape)

(50000, 609) (50000, 625) (50000, 595)


In [11]:
# Fold the three worker-slot matrices into one: combine_first keeps the values
# already present and fills the gaps (and any new annotator columns) from the
# next matrix.
merged = w1_multi
for slot_matrix in (w2_multi, w3_multi):
    merged = merged.combine_first(slot_matrix)
merged  # rows are images (row index = image id); each column is one annotator's labels

annot_id,0,1,2,3,4,5,6,7,8,9,...,737,738,739,740,741,742,743,744,745,746
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,,,,,,,,,,,...,,,,,,,,,,
49996,,,,,,,,,,,...,,,,,,,,,,
49997,,,,,,,,,,,...,,,,,,,,,,
49998,,,,,,,,,,,...,,,,,,,,,,


In [13]:
# Persist the image x annotator matrix; the row index is not written to the file.
merged.to_csv(path_or_buf='annot_data.csv', index=False)

### CROWDLAB Step 1

In [14]:
# Reload the image x annotator label matrix written by the data-preparation step.
multiannotator_labels = pd.read_csv(filepath_or_buffer='annot_data.csv')
multiannotator_labels

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,737,738,739,740,741,742,743,744,745,746
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,,,,,,,,,,,...,,,,,,,,,,
49996,,,,,,,,,,,...,,,,,,,,,,
49997,,,,,,,,,,,...,,,,,,,,,,
49998,,,,,,,,,,,...,,,,,,,,,,


In [15]:
# CROWDLAB step 1: per-image majority vote over the annotators' labels.
majority_vote_label = get_majority_vote_label(multiannotator_labels)
print("Shape:", majority_vote_label.shape)

# Store the step-1 labels under their own key; later steps add to this file.
data = dict(c_10n_step1=majority_vote_label)
torch.save(data, 'c_10n_noise.pt')

Shape: (50000,)
Step 1 acc: 0.9059


### CROWDLAB Step 2

According to the CROWDLAB procedure, a model must be trained with cross-validation, using the majority-vote labels as training labels.

Then take the model's predicted class probabilities for each training sample and assign them to *pred_probs* below.

In [5]:
# getting probabilities for training set from the trained model
pred_probs = evaluate_getprobs()

# getting better consensus from cleanlab
better_consensus_with_step1model = get_label_quality_multiannotator(multiannotator_labels, pred_probs, verbose=False)

# saving better consensus to the same file as mojority votes
data = torch.load('./c_10n_noise.pt')
data['c_10n_step2'] = better_consensus_with_step1model["label_quality"].consensus_label.to_numpy()
torch.save(data, 'c_10n_noise.pt')

### CROWDLAB step 3

Train a model with the better consensus labels as training labels and use that model as the base model for PHICO.

### Base model noise matrix

In [None]:
# Row-normalised confusion matrix of the step-3 model on the test set.
#   labels      - test labels
#   predictions - step-3 model predictions on the test set
# NOTE(review): neither name is defined in this notebook; both must be supplied
# before running this cell.
confusion_matrix(y_true=labels, y_pred=predictions, normalize='true')