In [1]:
import sys
sys.path.append("..")

import cleanlab
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt
import sklearn_extensions.fuzzy_kmeans as Fuzz
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from joblib import dump, load
from sklearn.svm import SVC
from sklearn.naive_bayes import CategoricalNB, GaussianNB

import random
import torch
import torchvision
from torchvision import transforms
# Pick the compute backend in order of preference: CUDA GPU, Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device("cpu")

### Training Onboarding Classifier

In [13]:
# Load the clustering results saved at the end of clus_prep.ipynb.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, which can run
# code embedded in the file -- only load a file produced by a trusted run.
k_user = np.load('cifar10n_train_clustering_c10n.npy', allow_pickle=True).tolist()

In [3]:
def createSubArrays(large_array, n_sub, len_sub, rng=None):
    """Draw ``n_sub`` random sub-arrays of ``len_sub`` elements from ``large_array``.

    Used to randomly pick instances from a single class.

    Parameters
    ----------
    large_array : array-like
        Pool of values to sample from. Lists are accepted and converted to a
        NumPy array so that fancy indexing works.
    n_sub : int
        Number of sub-arrays (samples) to draw.
    len_sub : int
        Number of elements per sub-array. Elements are drawn without
        replacement inside one sub-array; different sub-arrays are drawn
        independently and may share elements.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global ``np.random`` state
        is used (so ``np.random.seed`` makes the draws reproducible).

    Returns
    -------
    numpy.ndarray
        The stacked sub-arrays; for a 1-D pool and ``n_sub >= 1`` the shape is
        ``(n_sub, len_sub)``.

    Raises
    ------
    ValueError
        Raised by ``choice`` when ``len_sub`` is larger than the pool, because
        sampling is done without replacement.
    """
    pool = np.asarray(large_array)
    # Fall back to the legacy global RNG so existing callers keep their behaviour.
    sampler = np.random if rng is None else rng
    subarrays = [
        pool[sampler.choice(len(pool), size=len_sub, replace=False)]
        for _ in range(n_sub)
    ]
    return np.array(subarrays)

def createDataset(n_samples, sample_len, instance_indexes, user_numLabels, n_aug):
    """Build a labelled frame of sampled label vectors for every row of ``user_numLabels``.

    Parameters
    ----------
    n_samples : int
        Number of data points (rows) generated per row of ``user_numLabels``.
    sample_len : int
        Number of instances drawn from each class for one data point.
    instance_indexes : sequence of array-like
        ``instance_indexes[c]`` holds the instance indexes belonging to class
        ``c``; the number of classes is taken from its length.
    user_numLabels : numpy.ndarray
        Array whose first axis enumerates label vectors; row ``i`` is indexed
        with the sampled instance indexes.
    n_aug : int
        Number of consecutive rows of ``user_numLabels`` that share one label
        (row ``i`` gets label ``i // n_aug``).

    Returns
    -------
    pandas.DataFrame
        ``len(instance_indexes) * sample_len`` feature columns plus a
        ``'label'`` column, with ``n_samples`` rows per row of
        ``user_numLabels``. Rows are not shuffled.

    Notes
    -----
    The instance indexes are sampled once and reused for every row of
    ``user_numLabels``.
    """
    n_classes = len(instance_indexes)

    # One (n_samples, sample_len) block of instance indexes per class.
    class_samples = [
        createSubArrays(instance_indexes[c], n_samples, sample_len)
        for c in range(n_classes)
    ]

    # Row j concatenates the j-th sample of every class, in class order.
    # Keeping this as a real 2-D array (not a Python list of arrays) avoids
    # NumPy's version-dependent interpretation of list-of-sequence indexes.
    sample_index = np.hstack(class_samples)

    n_rows = user_numLabels.shape[0]
    X = [user_numLabels[i][sample_index] for i in range(n_rows)]
    # Every group of n_aug consecutive rows shares one label.
    labels = [i // n_aug for i in range(n_rows)]

    data = pd.DataFrame(np.vstack(X))
    data['label'] = np.repeat(labels, n_samples)

    # Shuffling is intentionally left to the caller, e.g. data.sample(frac=1).
    return data

# validation set sampler
def create_test_set(n_samples, sample_len, k):
    """Sample a validation set from synthetic noisy labels, one block per cluster.

    For every ``n_aug``-th row of the notebook-level ``user_numLabels``, a noise
    matrix is estimated against ``gt_numLabels`` and used to generate a fresh
    noisy label vector, from which ``n_samples`` data points are sampled.

    Parameters
    ----------
    n_samples : int
        Number of data points sampled per selected row of ``user_numLabels``.
    sample_len : int
        Number of instances drawn from each class for one data point.
    k : object
        Unused; retained so existing calls keep working.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` where ``X`` has ``len(instance_indexes) * sample_len`` columns
        and ``y`` holds the label of each row (0 for the first selected row of
        ``user_numLabels``, 1 for the next, ...).

    Notes
    -----
    Reads the notebook globals ``user_numLabels``, ``gt_numLabels``,
    ``instance_indexes`` and ``n_aug``; they must be defined before calling.
    """
    n_classes = len(instance_indexes)
    testX = []
    testY = []
    for label, u in enumerate(range(0, len(user_numLabels), n_aug)):
        # confusion_matrix(normalize='true') normalises over true-label rows; the
        # transpose puts the true class on the columns.
        # NOTE(review): presumably this matches cleanlab's noise_matrix convention -- confirm.
        noise_matrix = np.transpose(np.round(confusion_matrix(gt_numLabels, user_numLabels[u], normalize='true'), decimals=2))
        totest = cleanlab.benchmarking.noise_generation.generate_noisy_labels(gt_numLabels, noise_matrix)

        # One (n_samples, sample_len) block of instance indexes per class,
        # joined so that each row lists its samples in class order.
        class_samples = [
            createSubArrays(instance_indexes[c], n_samples, sample_len)
            for c in range(n_classes)
        ]
        # Explicit 2-D index array: avoids NumPy's version-dependent handling
        # of a Python list of arrays used as an index.
        sample_index = np.hstack(class_samples)

        testX.append(totest[sample_index])
        testY.append(np.full(len(sample_index), label))

    return np.array(testX).reshape((-1, sample_len * n_classes)), np.array(testY).reshape(-1,)

In [4]:
k = 2  # set the selected K
# sample_len=20 # no. of samples to take from one class
# n_samples=5000 # no. data points per augmentation

gt_numLabels = k_user[k][-1]     # training consensus (last row)
user_numLabels = k_user[k][:-1]  # training labels (augmentations): every row but the last

# indexes of the instances belonging to each of the 10 classes
instance_indexes = [np.where(gt_numLabels == class_id)[0] for class_id in range(10)]

In [5]:
# k=k, s=n_samples, f=sample_len
def saveClusterClassifier(clf, k, n_samples, sample_len):
    """Persist a fitted classifier under ``onboarding/``.

    The file name encodes the settings used to train it:
    ``svm_k{k}_s{n_samples}_f{sample_len}.joblib``.

    Parameters
    ----------
    clf : object
        The classifier to serialise with ``joblib.dump``.
    k : int
        Selected K, stored in the file name.
    n_samples : int
        Number of data points sampled, stored in the file name.
    sample_len : int
        Number of samples taken from one class, stored in the file name.
    """
    import os  # local import: only this helper touches the filesystem layout

    clf_name = "svm_k{}_s{}_f{}.joblib".format(k, n_samples, sample_len)
    # Create the target directory first so saving also works on a fresh checkout.
    os.makedirs('onboarding', exist_ok=True)
    dump(clf, 'onboarding/'+clf_name)

Set `n_aug`, the number of augmentations per cluster, so that samples are drawn from every augmentation and each sample is assigned the correct cluster label.

In [6]:
n_samples_from_user = []
n_samples_from_class = []
accuracy = []
k_ = []
n_aug=3 # set the number of times augmented to take samples from every augmentation, and to set the cluster label accurately

# Seed NumPy's global RNG so the sampled train/test sets (and therefore the
# reported accuracies) are the same on every Restart & Run All.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

# n_samples -> no. data points to sample per user
# sample_len -> no. of samples to take from 1 class [for a row in data frame]
for k in [2]: # set the K
    user_numLabels = k_user[k][:-1]
    gt_numLabels = k_user[k][-1]

    # The per-class instance indexes depend only on the consensus labels of
    # this k, so compute them once instead of once per (n_samples, sample_len).
    instance_indexes = [np.where(gt_numLabels == i)[0] for i in range(0, 10)]

    for n_samples in [100]:
        for sample_len in [2, 5, 10, 15, 20]:

            dataset = createDataset(n_samples=n_samples, sample_len=sample_len, instance_indexes=instance_indexes, user_numLabels=user_numLabels, n_aug=n_aug)
            print("n_samples_from_aug:", n_samples, "n_samples_from_class:", sample_len, "train_shape", dataset.shape)

            clf = make_pipeline(StandardScaler(), SVC(probability=False))
            clf.fit(X=dataset.drop('label', axis=1).values, y=dataset['label'])

            n_samples_from_user.append(n_samples)
            n_samples_from_class.append(sample_len)
            k_.append(k)

            # Evaluate on 1000 freshly sampled data points per cluster.
            testset = create_test_set(1000, sample_len, n_aug)
            accuracy.append(classification_report(testset[1], clf.predict(testset[0]), output_dict=True)['accuracy'])

            # saveClusterClassifier(clf, k, n_samples, sample_len)

n_samples_from_aug: 100 n_samples_from_class: 2 train_shape (600, 21)
n_samples_from_aug: 100 n_samples_from_class: 5 train_shape (600, 51)
n_samples_from_aug: 100 n_samples_from_class: 10 train_shape (600, 101)
n_samples_from_aug: 100 n_samples_from_class: 15 train_shape (600, 151)
n_samples_from_aug: 100 n_samples_from_class: 20 train_shape (600, 201)


In [7]:
# Summarise the sweep: one row per trained configuration.
# "samples from a cluster" = samples per augmentation * n_aug, kept as text.
perf_comp = pd.DataFrame(
    {
        "k": k_,
        "samples from a cluster": np.multiply(n_samples_from_user, n_aug).astype(str),
        "samples from a class": n_samples_from_class,
        "accuracy": accuracy,
    }
)
perf_comp.head(10)
# perf_comp.to_csv('./onboarding/classifier_stats23287.csv', index=False)

Unnamed: 0,k,samples from a cluster,samples from a class,accuracy
0,2,300,2,0.536
1,2,300,5,0.5775
2,2,300,10,0.635
3,2,300,15,0.6565
4,2,300,20,0.6655
