### Grid Approach to enhance or eliminate bias in the training subsets. 

In [1]:
import torch
import torchvision
import tarfile
from torchvision.datasets.utils import download_url
from torch.utils.data import random_split, DataLoader, SubsetRandomSampler, Subset, ConcatDataset
import os
from Truncate import truncate
from BSI_Entropy import BSIE
import numpy as np
from torchvision.datasets import ImageFolder
from torchvision.transforms import ToTensor
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#### As in the original notebook, we first define the helper classes/functions and prepare our datasets.

In [2]:
# Root folder of the CIFAR-10 images; presumably laid out as
# <data_dir>/{train,test}/<class_name>/<image files>, the layout ImageFolder reads — TODO confirm.
data_dir = './data/cifar10'
# ToTensor() is applied to each image when it is fetched from the dataset.
training_dataset = ImageFolder(data_dir+'/train', transform=ToTensor())
test_dataset = ImageFolder(data_dir+'/test', transform=ToTensor())

In [3]:
def get_default_device():
    """Return the CUDA device when one is usable, otherwise the CPU device."""
    backend = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(backend)
    
def to_device(data, device):
    """Transfer a tensor, or a (possibly nested) list/tuple of tensors, to `device`.

    Lists and tuples are walked recursively and always come back as a list;
    anything else is moved with a non-blocking `.to(device)` call.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved
class DeviceDataLoader():
    """Thin wrapper around a dataloader that places every batch on a fixed device."""

    def __init__(self, dl, device):
        # Keep both the wrapped loader and the target device for later iteration.
        self.dl, self.device = dl, device

    def __iter__(self):
        """Walk the wrapped loader, handing out each batch after moving it to the device."""
        for batch in self.dl:
            moved_batch = to_device(batch, self.device)
            yield moved_batch

    def __len__(self):
        """Batch count, as reported by the wrapped loader."""
        n_batches = len(self.dl)
        return n_batches

In [4]:
import torch.nn as nn
import torch.nn.functional as F

class ImageClassificationBase(nn.Module):
    """Shared training/validation helpers for image classifiers.

    Subclasses provide `forward`; batches are `(images, labels)` pairs.
    """

    def training_step(self, batch):
        """Return the cross-entropy loss of the model on one training batch."""
        inputs, targets = batch
        return F.cross_entropy(self(inputs), targets)

    def validation_step(self, batch):
        """Return the detached loss and the accuracy of the model on one validation batch."""
        inputs, targets = batch
        logits = self(inputs)
        batch_loss = F.cross_entropy(logits, targets)
        batch_acc = accuracy(logits, targets)
        return {'val_loss': batch_loss.detach(), 'val_acc': batch_acc}

    def validation_epoch_end(self, outputs):
        """Average the per-batch losses and accuracies into plain Python floats."""
        mean_loss = torch.stack([step['val_loss'] for step in outputs]).mean()
        mean_acc = torch.stack([step['val_acc'] for step in outputs]).mean()
        return {'val_loss': mean_loss.item(), 'val_acc': mean_acc.item()}

    def epoch_end(self, epoch, result):
        """Print a one-line summary of the epoch's training and validation metrics."""
        summary = "Epoch [{}], train_loss: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}".format(
            epoch, result['train_loss'], result['val_loss'], result['val_acc'])
        print(summary)
        
def accuracy(outputs, labels):
    """Fraction of rows whose highest-scoring class equals the label, as a 0-d tensor."""
    preds = torch.max(outputs, dim=1).indices
    n_correct = (preds == labels).sum().item()
    return torch.tensor(n_correct / len(preds))

In [5]:
@torch.no_grad()
def evaluate(model, val_loader):
    """Put `model` in eval mode, run its validation step on every batch, and return the epoch summary."""
    model.eval()
    step_results = []
    for batch in val_loader:
        step_results.append(model.validation_step(batch))
    return model.validation_epoch_end(step_results)

def fit(epochs, lr, model, train_loader, val_loader, opt_func=torch.optim.SGD):
    """Train `model` for `epochs` epochs and return the per-epoch metrics.

    Args:
        epochs: number of passes over `train_loader`.
        lr: learning rate handed to the optimizer.
        model: module exposing `training_step`, `validation_step`,
            `validation_epoch_end` and `epoch_end`.
        train_loader: iterable of training batches.
        val_loader: iterable of validation batches, evaluated after every epoch.
        opt_func: optimizer factory called as `opt_func(model.parameters(), lr)`.

    Returns:
        A list with one dict per epoch holding 'val_loss', 'val_acc' and 'train_loss'.
    """
    history = []
    optimizer = opt_func(model.parameters(), lr)
    for epoch in range(epochs):
        # Training Phase
        model.train()
        train_losses = []
        for batch in train_loader:
            loss = model.training_step(batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            # Keep only the value: storing the attached loss would hold on to
            # autograd history for every batch until the end of the epoch.
            train_losses.append(loss.detach())
        # Validation phase
        result = evaluate(model, val_loader)
        result['train_loss'] = torch.stack(train_losses).mean().item()
        model.epoch_end(epoch, result)
        history.append(result)
    return history

In [6]:
class Cifar10CnnModel(ImageClassificationBase):
    """Convolutional classifier: three conv/conv/max-pool stages followed by three linear layers.

    All layers live in one nn.Sequential (``self.network``), so each layer is
    addressable by position. The shape comments below assume 3 x 32 x 32 inputs,
    which is what the 256*4*4 input size of the first linear layer implies.
    """
    def __init__(self):
        super().__init__()
        # NOTE: layer positions matter — this notebook hooks network[18] and loads a
        # saved state_dict into this model — so avoid inserting or reordering layers.
        self.network = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2), # output: 64 x 16 x 16

            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2), # output: 128 x 8 x 8

            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2), # output: 256 x 4 x 4

            nn.Flatten(), 
            nn.Linear(256*4*4, 1024),
            nn.ReLU(),
            # position 18 in the Sequential: 64-dimensional layer ahead of the 10-way output
            nn.Linear(1024, 64),
            nn.ReLU(),
            nn.Linear(64, 10))
        
    def forward(self, xb):
        """Run a batch through the sequential network and return its 10 output scores per image."""
        return self.network(xb)

In [7]:
device = get_default_device()
device

device(type='cpu')

#### We now create a CNN model object from the class above and load the trained parameters we stored earlier into it. This CNN model serves as our embedding function, which we use to extract the feature space of the images.

In [8]:
modelCNN = to_device(Cifar10CnnModel(), device)

In [9]:
# map_location remaps the stored tensors onto the active device, so a checkpoint
# saved on a GPU machine still loads when only the CPU is available.
modelCNN.load_state_dict(torch.load('cifar10-cnn.pth', map_location=device)) # load our pretrained model parameters and assign to the new model object

<All keys matched successfully>

In [10]:
from torch.utils.data.dataloader import DataLoader
batch_size=128
test_loader = DeviceDataLoader(DataLoader(test_dataset, batch_size*2), device)
evaluate(modelCNN, test_loader) # this will make sure it is the same CNN model we trained initially. 

{'val_loss': 0.9372484087944031, 'val_acc': 0.7689453363418579}

#### The following function helps us obtain the intermediate features of an image before the final classification layer of the CNN model, which is our embedding function in this case.

In [11]:
# Module-level cache: hook name -> most recent (detached) output of the hooked layer.
features = dict()

def get_features(name):
    """Build a forward hook that stores the layer's detached output in `features[name]`."""
    def _capture(module, inputs, output):
        # Overwritten on every forward pass, so only the latest output is kept.
        features[name] = output.detach()
    return _capture

In [12]:
# Index 18 of the Sequential is nn.Linear(1024, 64), so the hook stores that layer's
# 64-dimensional output (before the following ReLU) in features['18'] on every forward pass.
modelCNN.network[18].register_forward_hook(get_features('18'))

<torch.utils.hooks.RemovableHandle at 0x17914098650>

#### In the following code, we obtain the intermediate feature space for each class in the training set.

In [13]:
class_feature_data = {label: [] for label in range(10)}

In [14]:
# Run every training image through the CNN one at a time; the forward hook on
# network[18] refreshes features['18'] on each call, and that vector is filed
# under the image's label.
modelCNN.eval()  # do not rely on an earlier evaluate() call having set eval mode
with torch.no_grad():  # inference only: skip autograd bookkeeping for the forward passes
    for image, label in training_dataset:
        if label in class_feature_data:
            modelCNN(to_device(image.unsqueeze(0), device))
            class_feature_data[label].append(features['18'])

In [15]:
# Stack each class's per-image feature rows into one matrix and transpose it so that
# columns correspond to images. The isinstance guard makes the cell safe to re-run:
# entries already converted to a tensor are left as they are instead of being
# passed to torch.cat a second time.
for label, per_image_features in class_feature_data.items():
    if isinstance(per_image_features, list):
        class_feature_data[label] = torch.cat(per_image_features, dim = 0).T

##### We then apply SVD to the feature space of each training class to obtain its vector of singular values.

In [16]:
# Singular values of each class's feature matrix (one 1-D tensor per class).
# torch.linalg.svdvals replaces the deprecated torch.svd(...)[1] and does not
# compute the singular vectors U and V, which are never used here.
singular_vector = {label: torch.linalg.svdvals(class_feature_data[label]) for label in class_feature_data}
print(singular_vector)

{0: tensor([8305.8789, 1585.3182, 1073.5406,  963.0099,  828.8642,  769.5621,
         616.4266,  580.3412,  516.1616,  488.6954,  402.4275,  341.2084,
         275.5750,  266.5753,  257.8827,  231.7271,  198.1219,  183.4829,
         180.9243,  177.5941,  156.9813,  144.6703,  141.9810,  132.6933,
         126.5861,  122.6644,  117.9465,  113.8699,  110.0822,  106.3363,
         103.7124,   93.6782,   91.7617,   89.5256,   84.1956,   82.4179,
          80.3859,   78.2227,   75.4202,   72.8308,   71.0707,   69.9203,
          65.3629,   64.8393,   62.6921,   60.4378,   58.1620,   56.9152,
          54.7706,   54.1711,   52.7214,   51.6815,   50.4430,   49.4335,
          47.0124,   45.7740,   43.8802,   43.6093,   42.2480,   40.5315,
          39.1501,   37.6497,   36.0807,   34.7374]), 1: tensor([12095.2061,  1403.4337,  1103.4655,   963.6672,   940.5552,   853.0527,
          744.2352,   670.6514,   567.7023,   496.4117,   435.3126,   407.3419,
          350.2057,   277.2058,   250.3

#### Note that we no longer truncate the vector of singular values before computing the BSIE value, because for each training class's feature space almost every singular value is clearly greater than 0.

In [17]:
# One entropy value per class, computed from that class's singular values.
# BSIE comes from the local BSI_Entropy module; presumably it returns a one-element
# tensor (hence .item() to get a plain Python number) — TODO confirm against that module.
entropy = {label: BSIE(singular_vector[label]).item() for label in singular_vector}
print(entropy)

{0: 0.06290901692797712, 1: 0.06283993894178475, 2: 0.06495570296841247, 3: 0.06314928805309994, 4: 0.06271682050760996, 5: 0.06349178769941977, 6: 0.06271585297092763, 7: 0.062195439924848994, 8: 0.06215472324653093, 9: 0.0626122551072612}


#### Now we want to select representative and unrepresentative subsamples for each class and combine them together to train CNN submodels

In [18]:
# Group the training-set indices by class label. Reading them from ImageFolder's
# `targets` avoids hard-coding "5000 images per class", and wrapping the already
# loaded training_dataset avoids re-scanning the image folder once per class.
_train_targets = np.asarray(training_dataset.targets)
_indices_by_class = {category: np.flatnonzero(_train_targets == category).tolist() for category in range(len(training_dataset.classes))}
# (start, stop) index range of each class, kept under its original name; ImageFolder
# lists samples class by class, so every class occupies one contiguous run.
classes = {category: (indices[0], indices[-1] + 1) for category, indices in _indices_by_class.items()}
training_data_by_class = {category: Subset(training_dataset, indices) for category, indices in _indices_by_class.items()}


In [19]:
def num_subsample_per_class(classified_training_data, num = 15, n_sample = 500, generator = None):
    """Draw `num` random subsamples of `n_sample` items from every class dataset.

    Args:
        classified_training_data: mapping of class label -> dataset (anything that
            supports len() and integer indexing).
        num: number of independent subsamples to draw per class.
        n_sample: number of items in each subsample.
        generator: optional torch.Generator forwarded to random_split so the draws
            can be reproduced; when None the global torch RNG is used.

    Returns:
        dict mapping each label to a list of `num` subsets of length `n_sample`.
        Subsamples of one class are drawn independently, so they may share items.

    Raises:
        ValueError: if `n_sample` is negative or larger than a class dataset.
    """
    # Only forward the generator when one was supplied, so the default path
    # consumes the global RNG exactly as a plain random_split call does.
    split_kwargs = {} if generator is None else {'generator': generator}
    subsample = {}
    for label, dataset in classified_training_data.items():
        total = len(dataset)
        # Without this check a too-large n_sample yields a negative remainder length,
        # which random_split does not reliably reject.
        if not 0 <= n_sample <= total:
            raise ValueError(
                "n_sample={} is invalid for class {!r} with {} items".format(n_sample, label, total))
        lengths = [n_sample, total - n_sample]
        subsample[label] = [random_split(dataset, lengths, **split_kwargs)[0] for _ in range(num)]
    return subsample


In [20]:
subsamples_per_class = num_subsample_per_class(training_data_by_class)
subsamples_per_class[2][13][499][0]

tensor([[[0.9137, 0.9059, 0.9098,  ..., 0.9333, 0.9255, 0.9216],
         [0.9059, 0.8980, 0.9020,  ..., 0.9294, 0.9255, 0.9176],
         [0.9098, 0.9020, 0.9059,  ..., 0.9333, 0.9294, 0.9255],
         ...,
         [0.8667, 0.8549, 0.8588,  ..., 0.8784, 0.8706, 0.8667],
         [0.8588, 0.8471, 0.8510,  ..., 0.8706, 0.8667, 0.8627],
         [0.8510, 0.8431, 0.8471,  ..., 0.8667, 0.8667, 0.8588]],

        [[0.9098, 0.9020, 0.9059,  ..., 0.9294, 0.9294, 0.9216],
         [0.9020, 0.8941, 0.8980,  ..., 0.9255, 0.9255, 0.9176],
         [0.9059, 0.8980, 0.9020,  ..., 0.9333, 0.9294, 0.9255],
         ...,
         [0.8431, 0.8314, 0.8353,  ..., 0.8706, 0.8627, 0.8588],
         [0.8353, 0.8235, 0.8275,  ..., 0.8627, 0.8588, 0.8549],
         [0.8275, 0.8196, 0.8235,  ..., 0.8588, 0.8588, 0.8510]],

        [[0.8902, 0.8902, 0.8980,  ..., 0.9294, 0.9294, 0.9216],
         [0.8863, 0.8784, 0.8863,  ..., 0.9255, 0.9255, 0.9176],
         [0.8902, 0.8824, 0.8863,  ..., 0.9333, 0.9294, 0.

In [21]:
def feature_space(dictionary):
    """Build one feature matrix per subsample by passing its images through modelCNN.

    Args:
        dictionary: mapping of class label -> list of subsamples, where each
            subsample yields (image, label) pairs.

    Returns:
        dict mapping each label to a list of tensors, one per subsample, in the
        same order as the input. Each tensor holds the features captured in
        features['18'], concatenated over the images and transposed so that
        columns correspond to images.

    Notes:
        Features are grouped by the dictionary key a subsample is stored under,
        not by each image's own label. For single-class subsamples this is the
        same thing; grouping by image label broke as soon as a subsample
        contained an image of another class.
        Relies on the notebook-level modelCNN, device, features and to_device.
    """
    feature_space_dict = {}
    with torch.no_grad():  # inference only: no autograd bookkeeping needed
        for category, subsamples in dictionary.items():
            matrices = []
            for subsample in subsamples:
                per_image = []
                for img, _ in subsample:
                    # The forward hook refreshes features['18'] on every call.
                    modelCNN(to_device(img.unsqueeze(0), device))
                    per_image.append(features['18'])
                matrices.append(torch.cat(per_image, dim = 0).T)
            feature_space_dict[category] = matrices

    return feature_space_dict

#### Below, we confirm that we correctly extracted a list of feature spaces for each of the classes 0 to 9.

In [22]:
feature_space_per_class = feature_space(subsamples_per_class)#[3][14]

In [23]:
feature_space_per_class[3][14].shape

torch.Size([64, 500])

In [24]:
feature_space_per_class[3][14]

tensor([[-10.4720, -12.0392,  -6.2620,  ..., -19.7578, -16.0811, -22.5578],
        [  7.0483,  -8.4155,   3.7559,  ..., -16.7447,  -2.4426,   8.0573],
        [ -6.7543,  -6.8985,  -7.8737,  ..., -10.5969,  -6.1091, -10.5488],
        ...,
        [ -9.8876, -11.9938, -16.3818,  ..., -15.9279,  -7.8179, -11.7522],
        [-10.7643, -12.9664, -18.5365,  ..., -19.1223,  -9.7846, -12.0340],
        [ -3.5994,  -4.4575,  -4.9504,  ...,  -8.8697,  -5.4898,  -4.8947]])

In [25]:
# Convert each subsample's feature matrix to its singular values, then to an entropy value
def entropy_values_per_class(dict):
    """Compute one BSIE entropy value per subsample feature matrix.

    Args:
        dict: mapping of class label -> list of 2-D feature tensors.

    Returns:
        Mapping of class label -> list of Python numbers, one per matrix, in the
        same order as the input list.
    """
    # torch.linalg.svdvals replaces the deprecated torch.svd(...)[1] and does not
    # compute the singular vectors, which are not needed for the entropy.
    return {
        label: [BSIE(torch.linalg.svdvals(matrix)).item() for matrix in matrices]
        for label, matrices in dict.items()
    }

In [26]:
entropies_per_class = entropy_values_per_class(feature_space_per_class) 

In [27]:
entropies_per_class # 15 entropy values for each class as each class has 15 subsamples of 500 images

{0: [0.06439395609530962,
  0.06292441962248518,
  0.06359862810676586,
  0.06362807091416656,
  0.06274896601813384,
  0.06429198410638515,
  0.06369383492051284,
  0.0632180096030941,
  0.06397535133417398,
  0.06295952256733506,
  0.06411708451153308,
  0.06307028822368876,
  0.06402714204867299,
  0.06376201956790517,
  0.06341724388991532],
 1: [0.061787274771073486,
  0.062338574506456745,
  0.061420278651014626,
  0.062023171278363654,
  0.06187684033652663,
  0.061814124572514784,
  0.062222152783827256,
  0.0625186695005735,
  0.0622038811690977,
  0.06167997572019335,
  0.06281738835150508,
  0.06264392127265628,
  0.061521433347196064,
  0.06220526068110588,
  0.0624334397412466],
 2: [0.06443611976374886,
  0.06560658777050044,
  0.06522065161595636,
  0.06532914793667866,
  0.06559189326477843,
  0.06577518870257715,
  0.0660935019043406,
  0.0667085081863984,
  0.0653809321747777,
  0.06509328707673245,
  0.0668100498183215,
  0.06541731207121182,
  0.06557156464085556,
 

#### Next, for each class we select the most representative of its 15 subsamples, and the least representative one as well.

In [28]:
def find_best_worst_per_class(class_entropy, class_subset_entropy):
    """Locate, per class, the subsample whose entropy is closest to / furthest from the class entropy.

    Args:
        class_entropy: mapping of label -> entropy of the whole class.
        class_subset_entropy: mapping of label -> list of subsample entropies.

    Returns:
        A tuple of
        - dict label -> [index of smallest relative difference, index of largest],
        - the sum over labels of the smallest relative differences,
        - the sum over labels of the largest relative differences.

    The relative differences are printed before the indices are computed.
    """
    entropy_diff = {}
    for label in class_entropy:
        reference = class_entropy[label]
        entropy_diff[label] = [np.abs(reference - value)/reference for value in class_subset_entropy[label]]
    print(entropy_diff)

    min_max_entropy_diff_index = {}
    sum_best_entropy = 0
    sum_worst_entropy = 0
    for label, diffs in entropy_diff.items():
        # list.index returns the first occurrence, so ties resolve to the earliest subsample
        best_at = diffs.index(min(diffs))
        worst_at = diffs.index(max(diffs))
        min_max_entropy_diff_index[label] = [best_at, worst_at]
        sum_best_entropy += diffs[best_at]
        sum_worst_entropy += diffs[worst_at]
    return min_max_entropy_diff_index, sum_best_entropy, sum_worst_entropy

In [29]:
min_max_entropy_diff_index, sum_best_entropy, sum_worst_entropy = find_best_worst_per_class(entropy, entropies_per_class)

{0: [0.0236045520951721, 0.00024484080756966855, 0.010962040299854918, 0.011430062355173328, 0.0025441648536094126, 0.021983608168465674, 0.012475445188314276, 0.004911739051187651, 0.016950422344346506, 0.000802836251848547, 0.019203409027024623, 0.0025635640737523445, 0.01777368611523495, 0.013559306464836779, 0.008078761785771537], 1: [0.016751514855647028, 0.00797843606742627, 0.022591687940456153, 0.012997588431423363, 0.015326218030707137, 0.01632424197961557, 0.009831106910046648, 0.0051125040320117564, 0.010121871271649324, 0.018459012550378098, 0.0003588576096574431, 0.003119316670725276, 0.020981968104872795, 0.010099918481251809, 0.006468803238569809], 2: [0.007999039051525306, 0.010020441198280786, 0.004078912788808356, 0.005749225259678798, 0.00979421770980361, 0.012616070594497188, 0.017516536407610587, 0.026984623949621617, 0.006546449148152793, 0.0021181220744681075, 0.028547868242004795, 0.007106521547828242, 0.009481256368552982, 0.014442734061571176, 0.00331158164998

In [30]:
min_max_entropy_diff_index

{0: [1, 0],
 1: [10, 2],
 2: [9, 10],
 3: [7, 6],
 4: [4, 5],
 5: [5, 11],
 6: [1, 5],
 7: [14, 0],
 8: [12, 7],
 9: [11, 1]}

In [31]:
sum_best_entropy, sum_worst_entropy

(0.009147526332022043, 0.2910108314774757)

#### We aggregate the most representative subsamples and most unrepresentative subsamples of each class to train CNN and observe the difference

In [32]:
def aggregate_best_worst_subsets(grid_subsets, entropy_diff_index):
    """Concatenate, across classes, the selected best and worst subsample of each class.

    Args:
        grid_subsets: mapping of label -> list of candidate subsamples.
        entropy_diff_index: mapping of label -> [best index, worst index] into that list.

    Returns:
        (best_subsets, worst_subsets) as two ConcatDataset objects, with the
        classes in the iteration order of `grid_subsets`.
    """
    best_parts, worst_parts = [], []
    for label, candidates in grid_subsets.items():
        picks = entropy_diff_index[label]
        best_parts.append(candidates[picks[0]])
        worst_parts.append(candidates[picks[1]])
    return ConcatDataset(best_parts), ConcatDataset(worst_parts)

In [33]:
best_agg_subsets, worst_agg_subsets = aggregate_best_worst_subsets(subsamples_per_class, min_max_entropy_diff_index)
len(best_agg_subsets), len(worst_agg_subsets)

(5000, 5000)

#### Now that we have the most representative and the most unrepresentative subsets of 5000 images each, we can use each of them to train a CNN submodel.

In [34]:

# Hold out 5000 images of the full training set for validation.
val_size = 5000
train_size = len(training_dataset) - val_size

# NOTE(review): val_ds is split off the same training_dataset that the per-class
# subsamples are drawn from, so validation images may also appear in
# best_agg_subsets / worst_agg_subsets. The split is also unseeded, so it changes
# on every run — confirm both are intended.
train_ds, val_ds = random_split(training_dataset, [train_size, val_size])


In [35]:
num_epochs = 15
opt_func = torch.optim.Adam
lr = 0.001

In [36]:
test_loader = DeviceDataLoader(DataLoader(test_dataset, batch_size*2), device)
train_loader = DeviceDataLoader(DataLoader(training_dataset, batch_size*2), device)

In [37]:
# Train a fresh CNN on the aggregated "most representative" subset.
cifar10bestsub = DeviceDataLoader(DataLoader(best_agg_subsets, batch_size = batch_size, shuffle = True, pin_memory = True), device)
# Wrap the validation loader as well, so its batches are moved to the same device as
# the model; a bare DataLoader yields CPU tensors, which fails once device is CUDA.
val_dl = DeviceDataLoader(DataLoader(val_ds, batch_size*2, num_workers=4, pin_memory=True), device)
best_model = to_device(Cifar10CnnModel(), device)
history = fit(num_epochs, lr, best_model, cifar10bestsub, val_dl, opt_func)

# Final metrics on the test set and on the full training set.
print(evaluate(best_model, test_loader), evaluate(best_model, train_loader))


Epoch [0], train_loss: 2.2516, val_loss: 2.1156, val_acc: 0.2048
Epoch [1], train_loss: 2.0535, val_loss: 1.9995, val_acc: 0.2532
Epoch [2], train_loss: 1.9484, val_loss: 1.9355, val_acc: 0.2608
Epoch [3], train_loss: 1.8459, val_loss: 2.0884, val_acc: 0.2511
Epoch [4], train_loss: 1.7684, val_loss: 1.7646, val_acc: 0.3345
Epoch [5], train_loss: 1.6837, val_loss: 1.7033, val_acc: 0.3529
Epoch [6], train_loss: 1.5882, val_loss: 1.7205, val_acc: 0.3798
Epoch [7], train_loss: 1.5163, val_loss: 1.4927, val_acc: 0.4412
Epoch [8], train_loss: 1.4171, val_loss: 1.9794, val_acc: 0.3595
Epoch [9], train_loss: 1.3806, val_loss: 1.5066, val_acc: 0.4356
Epoch [10], train_loss: 1.2812, val_loss: 1.5234, val_acc: 0.4585
Epoch [11], train_loss: 1.1946, val_loss: 1.3618, val_acc: 0.5064
Epoch [12], train_loss: 1.1239, val_loss: 1.3542, val_acc: 0.5098
Epoch [13], train_loss: 1.0552, val_loss: 1.4409, val_acc: 0.4906
Epoch [14], train_loss: 1.0076, val_loss: 1.4564, val_acc: 0.5070
{'val_loss': 1.52568

In [38]:
# Train a fresh CNN on the aggregated "most unrepresentative" subset.
cifar10worstsub = DeviceDataLoader(DataLoader(worst_agg_subsets, batch_size = batch_size, shuffle = True, pin_memory = True), device)
# Wrap the validation loader as well, so its batches are moved to the same device as
# the model; a bare DataLoader yields CPU tensors, which fails once device is CUDA.
val_dl = DeviceDataLoader(DataLoader(val_ds, batch_size*2, num_workers=4, pin_memory=True), device)
worst_model = to_device(Cifar10CnnModel(), device)
history = fit(num_epochs, lr, worst_model, cifar10worstsub, val_dl, opt_func)

# Final metrics on the test set and on the full training set.
print(evaluate(worst_model, test_loader), evaluate(worst_model, train_loader))

Epoch [0], train_loss: 2.2000, val_loss: 2.0648, val_acc: 0.2354
Epoch [1], train_loss: 2.0306, val_loss: 1.9801, val_acc: 0.2637
Epoch [2], train_loss: 1.9287, val_loss: 1.9137, val_acc: 0.2693
Epoch [3], train_loss: 1.7938, val_loss: 1.7456, val_acc: 0.3450
Epoch [4], train_loss: 1.6942, val_loss: 1.7970, val_acc: 0.3474
Epoch [5], train_loss: 1.5802, val_loss: 1.5736, val_acc: 0.4182
Epoch [6], train_loss: 1.4873, val_loss: 1.5404, val_acc: 0.4295
Epoch [7], train_loss: 1.3890, val_loss: 1.4947, val_acc: 0.4614
Epoch [8], train_loss: 1.2998, val_loss: 1.4780, val_acc: 0.4563
Epoch [9], train_loss: 1.2281, val_loss: 1.6579, val_acc: 0.4491
Epoch [10], train_loss: 1.1548, val_loss: 1.3855, val_acc: 0.5160
Epoch [11], train_loss: 1.0103, val_loss: 1.4861, val_acc: 0.4976
Epoch [12], train_loss: 0.9357, val_loss: 1.5349, val_acc: 0.4940
Epoch [13], train_loss: 0.7874, val_loss: 1.5272, val_acc: 0.5362
Epoch [14], train_loss: 0.6146, val_loss: 1.7607, val_acc: 0.5271
{'val_loss': 1.87224

Not finished yet, will update later this week. 