In [1]:
import sys
sys.path.append("..")

from models.modeling import VisionTransformer, CONFIGS
import copy
import gc
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import numpy as np
import pandas as pd
from matrics import label_performance, overall_performance, label_performance_old, overall_performance_old, alterationsModel, modelChoices, meanAlterationsAnnotators
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report
import torchvision
from torchvision import transforms
from joblib import dump, load

# Pick the compute device in priority order: first CUDA GPU, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device("cpu")


### Helper methods

In [2]:
# Seed every RNG this notebook can draw from so a fresh "Restart & Run All" is repeatable.
# The onboarding subset is drawn with `random.sample` (see getOnboaringAndTestsets), which the
# torch seed alone does not control, so Python's `random` and NumPy are seeded as well.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.random.manual_seed(SEED)

validation_onboarding = np.load("validation_c10n.npy") # validation images for onboarding. This one has image indexes

# Test-time preprocessing: resize to 224x224, convert to tensor, normalise each channel with mean 0.5 / std 0.5.
transform_test = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# CIFAR-10 test split; download=False means the data must already be present under ./data/.
testset = torchvision.datasets.CIFAR10(root="./data/", train=False, download=False, transform=transform_test)
# Ground-truth test labels as a float32 tensor.
clean_testset = torch.tensor(np.array(testset.targets).astype(np.float32))

In [4]:
# helpers
def evaluateUserBefore(testloader):
    """Score the user's own labels against the ground truth, before any alteration.

    Args:
        testloader: iterable yielding ``(imgs, labels, n_labels)`` batches. The
            images are ignored; ``labels`` is used as the ground truth and
            ``n_labels`` as the user's predictions.

    Returns:
        dict: the ``classification_report(..., output_dict=True)`` result for
        the user labels.
    """
    # Each list starts with an empty default-dtype tensor, mirroring the running
    # accumulators this replaces, so the concatenated result keeps the same dtype.
    user_batches = [torch.tensor([]).to(device)]
    truth_batches = [torch.tensor([]).to(device)]

    for _imgs, labels, n_labels in testloader:
        truth_batches.append(labels.to(device))
        user_batches.append(n_labels.to(device))

    predictions = torch.cat(user_batches).detach().cpu().numpy()
    groundTruth = torch.cat(truth_batches).detach().cpu().numpy()

    return classification_report(y_true=groundTruth, y_pred=predictions, output_dict=True)

def getOnboaringAndTestsets(testset, noiseInjectedTest, validation_onboarding, onboarding_len):
    """Split one user's labels into an onboarding (validation) part and a test part.

    Args:
        testset: dataset exposing ``data`` and ``targets``. It is modified in
            place (the caller is expected to pass a copy).
        noiseInjectedTest: array of the user's labels, one per test sample.
        validation_onboarding: 2-D array of sample indexes; ``onboarding_len``
            indexes are drawn at random from every row.
        onboarding_len: number of indexes sampled per row.

    Returns:
        tuple: ``(userNoise, data_with_noise, val_acc)`` where ``userNoise`` holds
        the user's labels at the sampled indexes, ``data_with_noise`` is a dataset
        yielding each remaining item followed by the user's label for it, and
        ``val_acc`` is the user's accuracy on the sampled indexes.
    """
    # Draw the onboarding indexes row by row, then flatten them into one index vector.
    sampled_rows = [
        random.sample(row.tolist(), onboarding_len)
        for row in (validation_onboarding[r] for r in range(validation_onboarding.shape[0]))
    ]
    validationSet = np.array(sampled_rows).flatten()

    # Every candidate onboarding index is removed from the test part, not only the sampled ones.
    validationSetToRemove = validation_onboarding.flatten()

    noiseInjectedTest = noiseInjectedTest.astype(np.int64)
    userNoise = noiseInjectedTest[validationSet]  # user labels for the chosen validation samples

    # Must be computed before the targets are trimmed below.
    true_onboarding_labels = np.array(testset.targets)[validationSet]
    onboarding_report = classification_report(y_true=true_onboarding_labels, y_pred=userNoise, output_dict=True)
    val_acc = onboarding_report['accuracy']

    # Drop the onboarding pool from images, ground truth and user labels alike.
    testset.data = np.delete(testset.data, validationSetToRemove, axis=0)
    testset.targets = np.delete(testset.targets, validationSetToRemove, axis=0)
    noiseInjectedTest = np.delete(noiseInjectedTest, validationSetToRemove, axis=0)

    class DataWithNoise(torch.utils.data.Dataset):
        """Wraps a dataset so every item is followed by the matching user label."""

        def __init__(self, dataset, noise):
            self._dataset = dataset
            self.noise = noise

        def __len__(self):
            return len(self._dataset)

        def __getitem__(self, idx):
            base_item = self._dataset[idx]
            return (*base_item, self.noise[idx])

    return userNoise, DataWithNoise(testset, noiseInjectedTest), val_acc

def evaluate(model, dataloader, device):
    """Run the model over a dataloader and collect its outputs next to the inputs.

    Args:
        model: module called as ``model(imgs, one_hot_user_labels)`` and
            returning a ``(base, user, outputs)`` triple.
        dataloader: iterable yielding ``(imgs, labels, n_labels)`` batches, where
            ``n_labels`` are integer user labels (one-hot encoded here with 10 classes).
        device: device the batches are moved to.

    Returns:
        tuple: ``(report, groundTruth, predictions, n_base, n_user)``. ``report`` is
        the ``classification_report`` dict of ``predictions`` vs ``groundTruth``;
        the other four are NumPy arrays holding the true labels, the argmax of
        ``outputs``, the argmax of ``base`` and the user labels respectively.
    """
    def _as_numpy(batches):
        # One concatenation per stream, then move the result to host memory.
        return torch.cat(batches).detach().cpu().numpy()

    model.eval()
    with torch.no_grad():
        # Every stream starts with an empty default-dtype tensor, mirroring the
        # running accumulators this replaces, so the resulting dtype is unchanged.
        pred_batches = [torch.tensor([]).to(device)]
        truth_batches = [torch.tensor([]).to(device)]
        user_batches = [torch.tensor([]).to(device)]
        base_batches = [torch.tensor([]).to(device)]

        for imgs, labels, n_labels in dataloader:
            imgs = imgs.to(device)
            labels = labels.to(device)
            n_labels = n_labels.to(device)
            n_labels = F.one_hot(n_labels, num_classes=10).to(device, dtype=torch.float32)

            base, user, outputs = model(imgs, n_labels)
            preds = torch.max(outputs, 1)[1]  # index of the largest output per sample

            pred_batches.append(preds)
            truth_batches.append(labels)
            user_batches.append(torch.argmax(n_labels, 1))
            base_batches.append(torch.argmax(base, 1))

        predictions = _as_numpy(pred_batches)
        groundTruth = _as_numpy(truth_batches)
        n_user = _as_numpy(user_batches)
        n_base = _as_numpy(base_batches)

    report = classification_report(y_true=groundTruth, y_pred=predictions, output_dict=True)
    # ConfusionMatrixDisplay(confusion_matrix(groundTruth, predictions)).plot()
    return report, groundTruth, predictions, n_base, n_user

class AdaptedAI(nn.Module): # same model used in training script
    """Combines a frozen ViT's output with an encoding of the user's (noisy) label.

    The first element of the base model's output and a 10-dim encoding of the
    one-hot user label are concatenated into a 20-dim vector and mapped to 10
    output scores by a small MLP.
    """

    def __init__(self):
        super().__init__()

        # Frozen backbone, placed on the notebook-level `device`.
        vit_config = CONFIGS['ViT-B_16']
        backbone = VisionTransformer(vit_config, 224, zero_head=True, num_classes=10)
        backbone.to(device)
        for weight in backbone.parameters():
            weight.requires_grad = False
        self.base_model = backbone

        # Encoder for the noisy label: 10 -> 32 -> 10.
        label_encoder_layers = [
            nn.Linear(10, 32),
            nn.ReLU(),
            nn.Linear(32, 10),
        ]
        self.n_l_encoder = nn.Sequential(*label_encoder_layers)

        # Head over the concatenated [image output, encoded noisy label]: 20 -> 64 -> 32 -> 10.
        head_layers = [
            nn.Linear(20, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 10),
        ]
        self.a_ai = nn.Sequential(*head_layers)

    def forward(self, imgs, n_l):
        """Return ``(img_features, n_l, out)``.

        ``img_features`` is the first element of the base model's output, ``n_l``
        is passed through unchanged, and ``out`` is the head's output for the
        concatenated image/label features.
        """
        img_features = self.base_model(imgs)[0]  # [0] is because of how vit is designed
        encoded_label = self.n_l_encoder(n_l)

        fused = torch.cat((img_features, encoded_label), dim=1)
        return img_features, n_l, self.a_ai(fused)
    
def loadModel(name):
    """Build an ``AdaptedAI`` on the notebook-level ``device``, freeze it and load its weights.

    Args:
        name: path of the saved state dict.

    Returns:
        AdaptedAI: the model with all parameters frozen and the weights loaded.
    """
    adapt_model = AdaptedAI().to(device)
    # Freeze every parameter of the model in a single call.
    adapt_model.requires_grad_(False)

    state = torch.load(name, map_location=torch.device(device))
    adapt_model.load_state_dict(state)
    return adapt_model

def getOurEval(data_records, gt, pred, base_pred, user_pred):
    """Compute this annotator's metrics and append them to the global result tables.

    Args:
        data_records: dict describing the annotator; must contain ``'annot_id'``.
            It is updated in place with the overall metrics.
        gt, pred, base_pred, user_pred: ground truth, team, base-model and user
            predictions, forwarded to ``overall_performance``.

    Side effects:
        Appends one row to the notebook-level ``class_results`` and
        ``overall_results`` frames (creating them when they are ``None``).
    """
    global class_results, overall_results

    overall, class_result = overall_performance(pred, user_pred, base_pred, gt)

    # Per-class results: one row, with the annotator id as the first column.
    label_row = pd.DataFrame([class_result])
    label_row.insert(0, 'annot_id', [data_records['annot_id']])
    class_results = (
        label_row
        if class_results is None
        else pd.concat([class_results, label_row], ignore_index=True)
    )

    # Overall results: merge the metrics into the record and replace missing values with 0.
    data_records.update(overall)
    overall_row = pd.DataFrame([data_records]).fillna(value=0)
    overall_results = (
        overall_row
        if overall_results is None
        else pd.concat([overall_results, overall_row], ignore_index=True)
    )

def recordOutputs(gt, pred, base_pred, user_pred):
    """Append one row of raw per-sample outputs to the global ``raw_outputs`` frame.

    Each argument is a NumPy array; it is cast to ``int`` and stored as a Python
    list in a single cell, under the columns ``gt``, ``base``, ``user`` and ``pred``.
    """
    global raw_outputs

    columns = (('gt', gt), ('base', base_pred), ('user', user_pred), ('pred', pred))
    row = pd.DataFrame([{key: values.astype(int).tolist() for key, values in columns}])

    if raw_outputs is not None:
        raw_outputs = pd.concat([raw_outputs, row], ignore_index=True)
    else:
        raw_outputs = row


### Loading models

In [5]:
# loading the svm
# NOTE(review): joblib's load unpickles the file, so only point this at a trusted artifact.
clf = load('path/to/svm.joblib')

# loading cluster wise models (m_theta), one per cluster, in cluster-index order
clus_model = [
    loadModel(checkpoint)
    for checkpoint in ("path/to/K2C1_model", "path/to/K2C2_model")
]

# loading test sets for users: one entry of labels per test user
savedTestUsers = np.load('path/to/testsets.npy')

### Evaluate with each test user

In [None]:
# Result tables filled by getOurEval / recordOutputs; reset here so re-running the cell starts clean.
class_results = None
overall_results = None
raw_outputs = None

for i, annotator in enumerate(savedTestUsers): # iterate through each test user
    # Work on a copy of the test set: the helper trims the dataset it receives in place.
    userNoise, testsetForUser, valAcc = getOnboaringAndTestsets(
        copy.deepcopy(testset), annotator, validation_onboarding, onboarding_len=20
    )
    testloader = torch.utils.data.DataLoader(testsetForUser, batch_size=512, shuffle=False, num_workers=4)

    # Assign the user to a cluster from their onboarding labels.
    cluster_probs = clf.predict_proba(userNoise.reshape((1,-1)))
    clus_p = np.amax(cluster_probs, axis=1)[0] # prob of assigning to the cluster
    u_cluster = np.argmax(cluster_probs, axis=1)[0] # assigned cluster

    # User accuracy alone, then the accuracy of the cluster's model given the user's labels.
    accuracy_before = evaluateUserBefore(testloader)['accuracy']
    report, gt, pred, base_pred, user_pred = evaluate(clus_model[u_cluster], testloader, device)
    accuracy_after = report['accuracy']

    data_records = dict(
        annot_id=i,
        before_acc=accuracy_before,
        val_acc=valAcc,
        clus=u_cluster,
        clus_prob=clus_p,
        after_acc=accuracy_after,
    )

    recordOutputs(gt, pred, base_pred, user_pred)
    getOurEval(data_records, gt, pred, base_pred, user_pred)

    verdict = "yess" if accuracy_before<=accuracy_after else "noo"
    print(accuracy_before, accuracy_after, u_cluster, verdict, clus_p )

    # Release the loader and any cached GPU memory before the next user.
    del testloader
    torch.cuda.empty_cache()
    gc.collect()

In [None]:
# saving results
# NOTE(review): the save targets below are tagged k1 while the loads further down read k3 files - confirm which run is intended.
# overall_results.to_csv('./eval_c10n_k/overall_k1_vitb16.csv', index=False)
# class_results.to_csv('./eval_c10n_k/labelres_k1_vitb16.csv', index=False)
# raw_outputs.to_csv('./eval_c10n_k/rawout_k1_vitb16.csv', index=False)

# loading results
# NOTE(review): this rebinds the three result tables built by the evaluation loop with previously saved CSVs.
saved_csvs = {
    'overall': "./eval_c10n_k/overall_k3_vitb16.csv",
    'labels': "./eval_c10n_k/labelres_k3_vitb16.csv",
    'raw': "./eval_c10n_k/rawout_k3_vitb16.csv",
}
overall_results = pd.read_csv(saved_csvs['overall'])
class_results = pd.read_csv(saved_csvs['labels'])
raw_outputs = pd.read_csv(saved_csvs['raw'])

In [8]:
# Keep only the annotators whose onboarding (validation) accuracy is below the threshold.
conditions = overall_results.val_acc.lt(0.98150) # set base model accuracy here

over = overall_results[conditions]

# Split the selected annotators by how their accuracy changed.
got_worse = over[over.before_acc.gt(over.after_acc)]
unchanged = over[over.before_acc.eq(over.after_acc)]
got_better = over[over.before_acc.lt(over.after_acc)]

print("not", got_worse.shape)
print("same", unchanged.shape)
print("improved:", got_better.shape)
print("before:", over.before_acc.mean(), "after:", over.after_acc.mean())

not (0, 21)
same (0, 21)
improved: (3, 21)
before: 0.8570068027210884 after: 0.9898299319727891


In [10]:
# Ids of the annotators selected by `conditions`; used below as positional row indexes.
condition2 = overall_results.loc[conditions, 'annot_id']

print("For Annotators")
# The first column is skipped in both calls (annot_id, when the table comes from getOurEval).
alterations_all = meanAlterationsAnnotators(class_results.iloc[:, 1:])
alterations_selected = meanAlterationsAnnotators(class_results.iloc[condition2, 1:])
print(alterations_all)
print(alterations_selected)

For Annotators
(0.9520010613548688, 0.003973561521271203)
(0.9520010613548688, 0.003973561521271203)


In [11]:
# To get model choices
#
# Key legend for the returned dict:
#   b  -> base model
#   h  -> human
#   t  -> team
#
#   c  -> correct
#   nc -> not correct
selected_raw_outputs = raw_outputs.iloc[condition2]
modelChoices(selected_raw_outputs)

{'b:c, h:nc, t:c': 3977,
 'b:nc, h:c, t:c': 599,
 'b:c, h:c, t:c': 24497,
 'b:nc, h:nc, t:c': 28,
 'b:c, h:nc, t:nc': 98,
 'b:nc, h:c, t:nc': 100,
 'b:c, h:c, t:nc': 0,
 'b:nc, h:nc, t:nc': 101}