In [1]:
import torch
from PIL import Image
import numpy as np
import pandas as pd
import torch
import os
import sys
from torch.utils import data
from sklearn.metrics import confusion_matrix as cm
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
sys.path.append('../')
from scipy.spatial import distance

from utils import utils, model_fetch, dataloader
metrics_dict = {}
gdv_table = {
    "class-discrimination": {},
    "intra-class": {},
    "inter-class": {}
}
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Loading both test sets - CIFAR and CIFAR - 10H -- Images are same.

def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict


# CIFAR Data
file_path = '../data/data-cifar-10/cifar-10-batches-py/test_batch'
label_dict = unpickle(file_path)

# Labels for both data
cifar_labels = label_dict[b'labels'] 

# print(label_dict.keys())

In [3]:
def get_model(model_of_interest):

    if True:
        preds_path =  '../../results/preds/' +model_of_interest+ '.npy'
        preds = np.load(preds_path)
        pred_classes = np.argmax(preds, 1)

        embeds_path =  '../../results/embeddings/' +model_of_interest+ '.npy'
        embeds = np.load(embeds_path)
        
    return preds, pred_classes, embeds

def z_scoring(preds):
    return 0.5 * (preds - np.mean(preds, 0)) / np.std(preds, 0)


def bucketize_class(preds, pred_classes):
    bucket = {}
    for i in range(len(preds)):
        if pred_classes[i] not in bucket:
            bucket[pred_classes[i]] = []
        bucket[pred_classes[i]].append(preds[i])
        
    return bucket



In [4]:
def dist_func(a, b):
    return np.dot(a, b) / (np.sqrt(a.dot(a)) * np.sqrt(b.dot(b)))

def intra_class(class_bucket):
    class_dist = 0
    for i in range(len(class_bucket) - 1):
        for j in range(i, len(class_bucket)):
            class_dist += dist_func(class_bucket[i], class_bucket[j])
    class_dist /= (len(class_bucket)*(len(class_bucket) - 1))
    class_dist *= 2
    
    return class_dist
            

In [5]:
def inter_class(class_bucket1, class_bucket2):
    class_dist = 0
    for i in range(len(class_bucket1)):
        for j in range(i+1, len(class_bucket2)):
            class_dist += dist_func(class_bucket1[i],class_bucket2[j])
    class_dist /= (len(class_bucket1) * len(class_bucket2))
    return class_dist

In [18]:
model_of_interest = "baseline"

preds, pred_classes, _ = get_model(model_of_interest)
z_embeds = z_scoring(preds)
bucket = bucketize_class(z_embeds, cifar_labels)
dim = preds.shape[1]

intra_class_dist = {}
for class_label in tqdm(bucket):
    intra_class_dist[class_label] = intra_class(bucket[class_label])

100%|██████████| 10/10 [00:20<00:00,  2.03s/it]


In [19]:
num_classes = len(list(set(pred_classes)))
term1 = sum(list(intra_class_dist.values()))/num_classes

term2 = 0
classes = sorted(list(bucket.keys()))
for i in tqdm(range(len(classes) - 1)):
    for j in range(i+1, len(classes)):
        term2 += inter_class(bucket[i], bucket[j])
term2 /= (num_classes*(num_classes-1))
term2 *= 2   

gdv = 1/(dim**0.5) * (term1 - term2)

100%|██████████| 9/9 [01:54<00:00, 12.73s/it]


In [20]:
gdv_table["class-discrimination"].update({model_of_interest: gdv})
gdv_table["intra-class"].update({model_of_interest: term1})
gdv_table["inter-class"].update({model_of_interest: term2})

pd.DataFrame(gdv_table)

Unnamed: 0,class-discrimination,intra-class,inter-class
mixup,0.16058,0.481783,-0.026016
cutmix,0.154048,0.46217,-0.024972
cutout,0.158004,0.473913,-0.02574
augmented,0.155214,0.46574,-0.025091
baseline,0.161022,0.482504,-0.026693
