In [1]:
import sys
sys.path.append("..")

import cleanlab
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt
import sklearn_extensions.fuzzy_kmeans as Fuzz
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report, silhouette_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from joblib import dump, load
from sklearn.svm import SVC
from sklearn.naive_bayes import CategoricalNB, GaussianNB

import random
import torch
import torchvision
from torchvision import transforms
# Pick the compute device: CUDA if available, otherwise Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

  return torch._C._cuda_getDeviceCount() > 0


### Preparing Training and Testing Users

The aim is to build $L.C$ vectors for the users so that they can be clustered.

In [2]:
# Load the side-information CSV and keep only the image-batch range plus the
# three worker-id columns.
side_info_columns = ['Image-batch', 'Worker1-id', 'Worker2-id', 'Worker3-id']
data_ori = pd.read_csv('../cifar10n_no_gt_modified/data/side_info_cifar10N.csv')
data_ori = data_ori.loc[:, side_info_columns]
data_ori.head()

Unnamed: 0,Image-batch,Worker1-id,Worker2-id,Worker3-id
0,0--9,198,385,197
1,10--19,430,140,584
2,20--29,601,430,631
3,30--39,545,79,385
4,40--49,631,373,177


In [3]:
def _batch_to_indices(batch):
    """Turn a 'start--end' batch string into the inclusive list [start, ..., end]."""
    # "0--4" -> ["0", "4"]: splitting on '-' leaves an empty piece that is dropped
    bounds = [piece for piece in batch.split('-') if len(piece.strip()) > 0]
    # ["0", "4"] -> [0, 1, 2, 3, 4]
    return list(range(int(bounds[0]), int(bounds[1]) + 1))

data_ori['Image-batch'] = data_ori['Image-batch'].map(_batch_to_indices)
# one row per image index; the worker ids are duplicated onto every row
data_ori = data_ori.explode('Image-batch')
data_ori.head()

Unnamed: 0,Image-batch,Worker1-id,Worker2-id,Worker3-id
0,0,198,385,197
0,1,198,385,197
0,2,198,385,197
0,3,198,385,197
0,4,198,385,197


In [4]:
# One (image index, worker id) frame per annotation slot, each with a fresh 0..n-1 index.
w1, w2, w3 = (
    data_ori.loc[:, ['Image-batch', worker_col]].reset_index(drop=True)
    for worker_col in ('Worker1-id', 'Worker2-id', 'Worker3-id')
)
print(w1.shape, w2.shape, w3.shape)

(50000, 2) (50000, 2) (50000, 2)


In [5]:
# NOTE: torch.load unpickles its input; only load files from a trusted source.
c10n = torch.load('../cifar10n_no_gt_modified/data/CIFAR-10_human.pt')
step2_labels = torch.load('../cifar10n_no_gt_modified/c_10n_noise.pt')['c_10n_step2']

# (frame, key of that slot's labels in c10n, name of that slot's worker-id column)
annotation_slots = (
    (w1, 'random_label1', 'Worker1-id'),
    (w2, 'random_label2', 'Worker2-id'),
    (w3, 'random_label3', 'Worker3-id'),
)

# attach the label given by the worker and the reference label to every row
for frame, label_key, _ in annotation_slots:
    frame['chosen_label'] = c10n[label_key]
    frame['true_label'] = step2_labels

print(w1.shape, w2.shape, w3.shape)

# give the three frames a common schema so they can be stacked afterwards
for frame, _, worker_col in annotation_slots:
    frame.rename(columns={worker_col: 'annotator_id', 'Image-batch': 'c10_train_index'}, inplace=True)

(50000, 4) (50000, 4) (50000, 4)


In [6]:
# Stack the three annotation slots into one long table with a fresh index.
annotation_frames = [w1, w2, w3]
raw = pd.concat(annotation_frames, ignore_index=True)
print(raw.shape)
raw

(150000, 4)


Unnamed: 0,c10_train_index,annotator_id,chosen_label,true_label
0,0,198,6,6
1,1,198,9,9
2,2,198,9,9
3,3,198,4,4
4,4,198,1,1
...,...,...,...,...
149995,49995,169,2,2
149996,49996,169,6,6
149997,49997,169,9,9
149998,49998,169,1,1


In [7]:
# Summary statistics: annotations per annotator, then annotations per image.
n_unique_annotators = raw.annotator_id.unique()
annotations_per_person = raw.annotator_id.value_counts()
print("n_annotators:", len(n_unique_annotators))
print("max annotations by one person:", annotations_per_person.max())
print("min annotations by one person:", annotations_per_person.min(),)
print("mean annotations by one person:", annotations_per_person.mean(),"\n")

annotations_per_image = raw.c10_train_index.value_counts()
print("n_unique images annotated:", len(raw.c10_train_index.unique()))
print("max annotations per image:", annotations_per_image.max())
print("min annotations per image:", annotations_per_image.min())
print("mean annotations per image:", annotations_per_image.mean())

n_annotators: 747
max annotations by one person: 3070
min annotations by one person: 10
mean annotations by one person: 200.80321285140562 

n_unique images annotated: 50000
max annotations per image: 3
min annotations per image: 3
mean annotations per image: 3.0


In [8]:
min_labels_per_class = 20

# Keep only annotators who labelled at least `min_labels_per_class` images of
# every one of the 10 classes (classes are judged by `true_label`).
chosen_annotators = []
for annotator in n_unique_annotators:
    true_labels = raw.loc[raw.annotator_id == annotator, 'true_label']

    # skip annotators who did not label images from every class
    if len(true_labels.unique()) != 10:
        continue
    # skip annotators whose rarest class has too few labelled images
    if true_labels.value_counts().min() < min_labels_per_class:
        continue

    chosen_annotators.append(annotator)

chosen_annotators = np.array(chosen_annotators)
chosen_annotators.shape

(159,)

In [9]:
# splitting training/testing
np.random.seed(0)
# half of the chosen annotators (rounded down) build the noise matrices ...
n_train_users = len(chosen_annotators) // 2
usersToNoiseMat = np.random.choice(chosen_annotators, size=n_train_users, replace=False)
# ... and the remaining ones are held out for testing
usersToTest = list(set(chosen_annotators) - set(usersToNoiseMat))
print("Training:", len(usersToNoiseMat))
print("Testing:", len(usersToTest))

Training: 79
Testing: 80


In [10]:
def getUserLabels(userSet, seed=None, annotations=None, n_per_class=None):
    """Sample a fixed number of annotations per class for every user.

    For each annotator in `userSet` and each class 0..9 (by `true_label`),
    `n_per_class` of that annotator's `chosen_label` values are drawn without
    replacement.

    Parameters
    ----------
    userSet : iterable
        Annotator ids to process, matched against the `annotator_id` column.
    seed : int, optional
        When given, sampling uses a private `random.Random(seed)` so the result
        is reproducible. When None (default) the module-level `random` state is
        used, exactly as before.
    annotations : pandas.DataFrame, optional
        Table with `annotator_id`, `true_label` and `chosen_label` columns.
        Defaults to the notebook-level `raw` frame.
    n_per_class : int, optional
        Number of labels sampled per class. Defaults to the notebook-level
        `min_labels_per_class`.

    Returns
    -------
    (allUsers_chosen, allUsers_gt) : tuple of np.ndarray
        Both have shape (users, 10, n_per_class); the first holds the sampled
        chosen labels, the second the matching class index repeated.

    Raises
    ------
    ValueError
        If a user has fewer than `n_per_class` annotations for some class.
    """
    source = raw if annotations is None else annotations
    per_class = min_labels_per_class if n_per_class is None else n_per_class
    # a dedicated generator keeps a seeded call from disturbing global random state
    sampler = random if seed is None else random.Random(seed)

    allUsers_chosen = []
    allUsers_gt = []
    for annotator in userSet:
        rows = source.loc[source.annotator_id == annotator, ['true_label', 'chosen_label']]

        perUser_chosen = []
        perUser_gt = []
        for i in range(0, 10):
            pool = rows.loc[rows.true_label == i, 'chosen_label'].tolist()
            if len(pool) < per_class:
                raise ValueError(
                    "annotator {} has only {} annotations for class {}, need {}".format(
                        annotator, len(pool), i, per_class))
            # taking 'per_class' instances as the no. of images from each class
            perUser_chosen.append(sampler.sample(pool, per_class))
            perUser_gt.append([i] * per_class)

        allUsers_chosen.append(np.array(perUser_chosen))
        allUsers_gt.append(np.array(perUser_gt))

    return np.array(allUsers_chosen), np.array(allUsers_gt)

# getUserLabels samples with the stdlib `random` module, which np.random.seed does
# not affect; seed it here so the sampled labels are the same on every run.
random.seed(0)

trainUsers_chosen, trainUsers_gt = getUserLabels(usersToNoiseMat)
testUsers_chosen, testUsers_gt = getUserLabels(usersToTest)

print("Training", trainUsers_chosen.shape, trainUsers_gt.shape)
print("Testing", testUsers_chosen.shape, testUsers_gt.shape)

# 3dims -> users, classes, labels for each class

Training (79, 10, 20) (79, 10, 20)
Testing (80, 10, 20) (80, 10, 20)


### Fuzzy K-Means

In [11]:
# get annotator profiles based on labels provided by training users
np.random.seed(0)

# one flat row per training user: 10 classes x min_labels_per_class sampled labels
train_features = trainUsers_chosen.reshape(-1, min_labels_per_class*10)

user_cluster_relation = []
for n_clusters in range(2,5):
    fuzzy_kmeans = Fuzz.FuzzyKMeans(k=n_clusters, m=2)
    fuzzy_kmeans.fit(train_features)

    # hard assignment: index of the largest entry of fuzzy_labels_ for each user
    allocated_cluster = np.argmax(fuzzy_kmeans.fuzzy_labels_, axis=1)
    user_cluster_relation.append(allocated_cluster)

    score = silhouette_score(train_features, fuzzy_kmeans.labels_)
    cluster_sizes = np.unique(allocated_cluster, return_counts=True)[1]
    print("K={}, silhouette={}, user_distribution={}".format(n_clusters, score, cluster_sizes))

# row j holds the per-user cluster index for K = j + 2
user_cluster_relation = np.array(user_cluster_relation)

1 [79]
2 [34 45]
3 [23 31 25]


In [12]:
# For every K tried, map each 1-based cluster number to the positions of the
# training users assigned to that cluster.
k_to_cluster_to_users = [
    {cluster + 1: np.flatnonzero(assignment == cluster) for cluster in np.unique(assignment)}
    for assignment in user_cluster_relation
]

Set the best k based on the silhouette scores printed above.

In [13]:
selected_k = 2 # set the selected K from silhouette score

# Row j of `user_cluster_relation` holds the assignment for K = j + 2 (the sweep
# starts at K=2). Reject values outside the sweep: a negative index would
# otherwise silently pick a different K.
first_k = 2
if not first_k <= selected_k < first_k + user_cluster_relation.shape[0]:
    raise ValueError("selected_k={} was not evaluated by the clustering sweep".format(selected_k))

# Rebuild the cluster -> users mapping from `user_cluster_relation` rather than
# overwriting `k_to_cluster_to_users` with a slice of itself, so this cell can be
# re-run (or selected_k changed) without breaking.
selected_assignment = user_cluster_relation[selected_k - first_k]
k_to_cluster_to_users = [{
    cluster + 1: (selected_assignment == cluster).nonzero()[0]
    for cluster in np.unique(selected_assignment)
}]
k_to_cluster_to_users # shows the users in the clusters of selected K

[{1: array([ 0,  1,  4,  5, 10, 14, 15, 16, 18, 19, 20, 21, 28, 29, 31, 32, 35,
         37, 39, 40, 41, 43, 45, 48, 49, 51, 55, 56, 62, 65, 67, 69, 74, 76]),
  2: array([ 2,  3,  6,  7,  8,  9, 11, 12, 13, 17, 22, 23, 24, 25, 26, 27, 30,
         33, 34, 36, 38, 42, 44, 46, 47, 50, 52, 53, 54, 57, 58, 59, 60, 61,
         63, 64, 66, 68, 70, 71, 72, 73, 75, 77, 78])}]

### Creating augmented training and testing sets

These sets are used to train and validate $m_\theta$

In [14]:
# getting c10 test set labels
# (download=False: the dataset is read from the given root, not fetched)
transform = transforms.Compose([transforms.ToTensor()])
testset = torchvision.datasets.CIFAR10(
    root="../cifar10n_no_gt_modified/data/",
    train=False,
    download=False,
    transform=transform,
)
ori_test_labels = testset.targets

In [None]:
# add noise to training sets and create some users for each cluster in each k
np.random.seed(0)

n_aug = 3 # can set the number of times to augment. 3 is the default used
k_user_train = {}
k_user_test = {}

generate_noisy_labels = cleanlab.benchmarking.noise_generation.generate_noisy_labels

for k in range(len(k_to_cluster_to_users)):
    noisy_users_train_in_k = []
    noisy_users_test_in_k = []

    # iterating clusters
    for clus, members in k_to_cluster_to_users[k].items():
        # pool the sampled labels of every training user in this cluster
        chosen = np.array([trainUsers_chosen[u].reshape(-1,) for u in members]).flatten()
        gt = np.array([trainUsers_gt[u].reshape(-1,) for u in members]).flatten()

        # The cluster's noise matrix does not depend on the augmentation round, so
        # compute it once. confusion_matrix(..., normalize='true') is indexed
        # [true, chosen]; the transpose makes it [chosen, true].
        noise_matrix = np.transpose(np.round(confusion_matrix(gt, chosen, normalize='true'), decimals=5))

        # augmenting: each round draws a fresh noisy copy of the train and test labels
        for _ in range(n_aug):
            noisy_users_train_in_k.append(generate_noisy_labels(step2_labels, noise_matrix))
            noisy_users_test_in_k.append(generate_noisy_labels(ori_test_labels, noise_matrix))

        # just checking the accuracy of each cluster
        acc = classification_report(gt, chosen, output_dict=True)['accuracy']
        print("cluster:", clus, "acc:", acc)

    noisy_users_train_in_k.append(step2_labels) # consensus labels are appended at the end to ease of access when training
    # NOTE: results are keyed by `selected_k`, not by the loop index; if
    # k_to_cluster_to_users held more than one entry, later ones would overwrite earlier ones.
    k_user_train[selected_k] = np.array(noisy_users_train_in_k)
    k_user_test[selected_k] = np.array(noisy_users_test_in_k)
    print()

*format of k_user_train*

if selected_k is 2 and n_aug = 3, then k_user_train[2].shape would be (7, 50000).

interpretation of 7:
* 1,2,3 for augmentations of cluster 1 noise
* 4,5,6 for augmentations of cluster 2 noise
* 7 for consensus labels

50000 comes from number of training samples in cifar 10

*format of k_user_test*

shape would be (6, 10000)

interpretation of 6:
* 1,2,3 for augmentations of cluster 1 noise
* 4,5,6 for augmentations of cluster 2 noise

10000 comes from the number of test samples in cifar 10

In [16]:
# Persist the train and test label dictionaries.
# NOTE(review): both payloads are dicts, so reading them back presumably needs
# np.load(..., allow_pickle=True) - confirm against the consumer code.
outputs = (
    ('cifar10n_train_clustering_c10n.npy', k_user_train),
    ('cifar10n_test_clustering_c10n.npy', k_user_test),
)
for out_path, payload in outputs:
    with open(out_path, 'wb') as f:
        np.save(f, payload)