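This notebook clusters the dataset to expose potential bias. It trains a small convolutional network to compute an embedding for every sample of the dataset, reduces the embeddings with PCA, and then applies the k-means algorithm with a customized distance metric to obtain a clustering of the dataset. The notebook can be run sequentially, cell by cell, provided the cell under "Helper methods" at the end of the notebook is executed first: it defines `view_clusters`, `statistics` and `make_save_dict`, which earlier cells call.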

# Imports

In [2]:
from __future__ import print_function, division

import copy
import os
import random
import shutil
import sys
import time

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import datasets, models, transforms
from torch.utils.data import TensorDataset
import torchvision

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from math import ceil
from PIL import Image

import visdom
from IPython.display import clear_output
from PIL import Image
import nltk
from nltk.cluster.kmeans import KMeansClusterer

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity, pairwise_distances

sys.path.insert(0, '../Resnet/')
%load_ext autoreload
%autoreload 2
from model import *
from my_ImageFolder import *
from fairness_metrics import *

plt.ion()   # interactive mode

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Defining the inputs

In [4]:
# Run configuration. One assignment per constant: the previous single-line tuple
# unpacking listed BIAS twice and paired names with values purely by position.
W_PROTECTED = 1            # forwarded to my_ImageFolder (presumably the weight of protected samples -- confirm)
BIAS = 0.8                 # selects the training folder "train_{BIAS}"
VAL_MODE = False           # forwarded to train_model(val_mode=...)
START_EPOCH = 0            # forwarded to train_model(start_epoch=...)
NUM_EPOCH = 3              # forwarded to train_model(num_epochs=...)
SHOW_PROGRESS = False      # forwarded to train_model(show_progress=...)
ID = 0                     # not referenced by the cells of this notebook
DATASET = "basket_volley"  # not referenced by the cells of this notebook
NUM_TRIALS = 1             # not referenced by the cells of this notebook
PROTECTED = "gd"           # selects "train_test_split_{PROTECTED}"; "jc" switches the protected groups

# Importing the dataset

In [5]:
# Each sub-folder of the raw dataset is listed once; every path is immediately
# followed by the listing of the file names it contains.

# --- basket images ---
path_bask_r_f = '../Datasets/basket_volley/basket/basket_f_r/'
bask_r_f = os.listdir(path_bask_r_f)
path_bask_y_f = '../Datasets/basket_volley/basket/basket_f_y/'
bask_y_f = os.listdir(path_bask_y_f)
path_bask_r_m = '../Datasets/basket_volley/basket/basket_m_r/'
bask_r_m = os.listdir(path_bask_r_m)
path_bask_y_m = '../Datasets/basket_volley/basket/basket_m_y/'
bask_y_m = os.listdir(path_bask_y_m)

# --- volley images ---
path_voll_r_f = '../Datasets/basket_volley/volley/volley_f_r/'
voll_r_f = os.listdir(path_voll_r_f)
path_voll_y_f = '../Datasets/basket_volley/volley/volley_f_y/'
voll_y_f = os.listdir(path_voll_y_f)
path_voll_r_m = '../Datasets/basket_volley/volley/volley_m_r/'
voll_r_m = os.listdir(path_voll_r_m)
path_voll_y_m = '../Datasets/basket_volley/volley/volley_m_y/'
voll_y_m = os.listdir(path_voll_y_m)

# File names forming the protected group of each class. PROTECTED == "jc" uses
# the *_y basket folders and the *_r volley folders; any other value uses the
# *_f basket folders and the *_m volley folders.
if PROTECTED == "jc":
    class0_min = bask_y_m + bask_y_f
    class1_min = voll_r_m + voll_r_f
else:
    class0_min = bask_y_f + bask_r_f
    class1_min = voll_r_m + voll_y_m

protected_groups = {*class0_min, *class1_min}

In [6]:
# Preprocessing applied to both splits: resize, center-crop to 224x224, convert
# to a tensor and normalize each channel.
# The random augmentations of the training split are currently disabled (they
# are the two commented-out transforms below), so 'train' and 'test' use the
# same deterministic pipeline.
data_transforms = {
    'train': transforms.Compose([
        # transforms.RandomResizedCrop(224),
        # transforms.RandomHorizontalFlip(),
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'test': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

# The split directory depends on the protected attribute; the training folder
# additionally depends on the bias level ("train_{BIAS}"), the test folder is "test".
data_dir = f'../Datasets/basket_volley/train_test_split_{PROTECTED}'
# my_ImageFolder comes from the project module my_ImageFolder; it receives the
# folder, the transform, the protected file names and W_PROTECTED.
image_datasets = {
    x: my_ImageFolder(os.path.join(data_dir, f"train_{BIAS}" if x == "train" else x), data_transforms[x],
                      protected_groups, W_PROTECTED)
    for x in ['train', 'test']}

# Both loaders shuffle; the extraction cell below relies on the per-sample index
# yielded with each batch rather than on the iteration order.
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=4,
                                              shuffle=True, num_workers=4)
               for x in ['train', 'test']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'test']}
class_names = image_datasets['train'].classes

# Use the first GPU when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Defining convolutional network

In [7]:
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Small CNN: one convolution, one max-pooling and two linear layers.

    The output of ``fc2`` (512 features) is the embedding clustered later in
    the notebook; ``fc3`` maps it to the class scores.

    Parameters
    ----------
    num_classes : int, optional
        Number of output scores. When omitted, the notebook-level
        ``class_names`` list is used, which keeps ``Net()`` working as before.
    input_size : int, default 224
        Side length of the square input images. It determines the number of
        input features of ``fc2``; the default gives the former hard-coded
        value 12100.
    """

    def __init__(self, num_classes=None, input_size=224):
        super().__init__()
        if num_classes is None:
            # Backward-compatible default: size the head from the dataset classes.
            num_classes = len(class_names)

        kernel_size = 4
        self.conv1 = nn.Conv2d(3, 1, kernel_size)
        self.pool = nn.MaxPool2d(2, 2)

        # A valid convolution shrinks each side by kernel_size - 1, then the
        # 2x2 pooling halves it (rounding down); conv1 has one output channel.
        pooled_side = (input_size - kernel_size + 1) // 2
        self.fc2 = nn.Linear(pooled_side * pooled_side, 512)
        self.fc3 = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return the class scores for a batch of images."""
        x = self.pool(F.relu(self.conv1(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc2(x))
        return self.fc3(x)
    
# Most recent layer outputs captured by the hooks, keyed by the name passed to
# get_activation.
activation = {}


def get_activation(name):
    """Build a forward hook that stores the detached layer output under ``name``."""

    def _record(module, inputs, output):
        # Detach so the stored tensor does not keep the autograd graph alive.
        activation[name] = output.detach()

    return _record

In [8]:
net = Net()
# Every forward pass now stores the fc2 output in activation['fc2']; the
# extraction cell reads the embeddings from there.
net.fc2.register_forward_hook(get_activation('fc2'))

# weighted_cross_entropy_loss is provided by the star-imported project modules;
# the trailing comment records the standard loss it replaces.
criterion = weighted_cross_entropy_loss # nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
# Multiply the learning rate by 0.1 every 7 scheduler steps.
scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

### Training

In [9]:
# train_model is provided by the star-imported project modules; its internals
# are not visible in this notebook. The epoch range and the validation/progress
# switches come from the configuration cell.
net = train_model(net, criterion, optimizer, scheduler, dataloaders, dataset_sizes, device,
                             start_epoch=START_EPOCH,
                             num_epochs=NUM_EPOCH,
                             val_mode=VAL_MODE, show_progress=SHOW_PROGRESS)

Epoch 0/2
----------
train Loss: 0.6769 Acc: 0.5614
Epoch 1/2
----------
train Loss: 0.6876 Acc: 0.5757
Epoch 2/2
----------
train Loss: 0.6711 Acc: 0.5990
Training complete in 0m 29s
Best val Acc: 0.000000


### Evaluation

In [10]:
# accuracy is provided by the star-imported project modules; its result is
# converted to a plain float for printing.
print(f"Acc. on Training set: {float(accuracy(net, device, dataloaders['train']))}")
print(f"Acc. on Test set: {float(accuracy(net, device, dataloaders['test']))}")

Acc. on Training set: 0.6610608020698577
Acc. on Test set: 0.5634146341463414


In [10]:
# demographic_parity is provided by the star-imported project modules; it is
# given the training dataset and the protected file names of each class.
print(demographic_parity(net, device, image_datasets["train"], [class0_min, class1_min]))

          Group0    Group1
Class0  1.000000  0.571429
Class1  0.070423  0.709790


# Extracting representation

In [11]:
# Collect the fc2 embedding and the dataset index of every training sample,
# split by class label (label 0 -> *_0, any other label -> *_1).
# Rows are accumulated in Python lists and concatenated once at the end;
# concatenating inside the loop copied the whole array for every sample.
feature_dim = net.fc2.out_features
embedding_rows = {0: [], 1: []}
index_rows = {0: [], 1: []}

# Feed the inputs on whatever device the model parameters currently live on.
model_device = next(net.parameters()).device

with torch.no_grad():  # inference only: no autograd graph is needed
    for inputs, labels, clusters, index in dataloaders["train"]:
        net(inputs.to(model_device))  # the forward hook fills activation['fc2']
        batch_embeddings = activation['fc2'].cpu().numpy()
        for j, label in enumerate(labels):
            group = 1 if label else 0
            embedding_rows[group].append(batch_embeddings[j].reshape((1, -1)))
            index_rows[group].append(index[j].numpy().reshape(-1))

# The empty leading arrays keep the result dtypes of the previous implementation
# (float64 embeddings, integer indexes) and make an empty class yield a
# (0, feature_dim) array instead of an error.
X_transform_0 = np.concatenate([np.empty((0, feature_dim))] + embedding_rows[0])
X_transform_1 = np.concatenate([np.empty((0, feature_dim))] + embedding_rows[1])
indexes_0 = np.concatenate([np.array([]).astype(int)] + index_rows[0])
indexes_1 = np.concatenate([np.array([]).astype(int)] + index_rows[1])

# PCA reduction

In [12]:
# Number of principal components kept for each class.
N_PCA_COMPONENTS = 10
# Fixed seed: scikit-learn may select the randomized SVD solver for this data
# shape, in which case an unseeded PCA gives slightly different projections on
# every run.
PCA_SEED = 0

# One PCA per class, each fitted on the embeddings of that class only.
pca_0 = PCA(n_components=N_PCA_COMPONENTS, random_state=PCA_SEED)
pca_1 = PCA(n_components=N_PCA_COMPONENTS, random_state=PCA_SEED)

X_reducted_0 = pca_0.fit_transform(X_transform_0)
X_reducted_1 = pca_1.fit_transform(X_transform_1)

In [13]:
# Variance explained by each kept component of the two PCAs. These values are
# the per-coordinate weights used by my_distance_0 and my_distance_1.
eigenvalues_0 = pca_0.explained_variance_
eigenvalues_1 = pca_1.explained_variance_
eigenvalues_0, eigenvalues_1

(array([120.70412648,   9.16219355,   4.24154224,   2.02520958,
          1.13201802,   1.04790829,   0.89211207,   0.81012006,
          0.65167638,   0.54672269]),
 array([128.05055591,  24.94210482,  12.70354008,   5.15882912,
          2.91624167,   1.88255961,   1.70923716,   1.54939981,
          1.32936886,   1.1190326 ]))

# k-means clustering

In [14]:
def my_distance_0(v1, v2):
    """Weighted L1 distance between two class-0 vectors.

    Coordinate ``i`` contributes ``eigenvalues_0[i] * |v1[i] - v2[i]|``; the
    weights are read from the notebook-level ``eigenvalues_0`` at call time.
    """
    total = 0
    for axis, pair in enumerate(zip(v1, v2)):
        gap = float(abs(pair[0] - pair[1]))
        total = total + eigenvalues_0[axis] * gap
    return total

def my_distance_1(v1, v2):
    """Weighted L1 distance between two class-1 vectors.

    Same rule as ``my_distance_0`` but weighted by the notebook-level
    ``eigenvalues_1``: coordinate ``i`` adds ``eigenvalues_1[i] * |v1[i] - v2[i]|``.
    """
    weighted_gaps = [
        eigenvalues_1[position] * float(abs(left - right))
        for position, (left, right) in enumerate(zip(v1, v2))
    ]
    # Accumulate left to right, starting from the integer 0.
    dist = 0
    for term in weighted_gaps:
        dist += term
    return dist

In [15]:
# Number of clusters searched for inside each class and number of random
# restarts of k-means.
N_CLUSTERS = 2
N_REPEATS = 25
# k-means draws its initial means at random; a seeded generator per clusterer
# makes the resulting clustering reproducible from one run to the next.
KMEANS_SEED = 0

# Alternative metric tried previously: nltk.cluster.util.cosine_distance
km_0 = KMeansClusterer(N_CLUSTERS, distance=my_distance_0, repeats=N_REPEATS,
                       rng=random.Random(KMEANS_SEED))
km_1 = KMeansClusterer(N_CLUSTERS, distance=my_distance_1, repeats=N_REPEATS,
                       rng=random.Random(KMEANS_SEED))

# With assign_clusters=True, cluster() returns one cluster label per input vector.
kmeans_0 = km_0.cluster(X_reducted_0, assign_clusters=True)
kmeans_1 = km_1.cluster(X_reducted_1, assign_clusters=True)

# Create cluster folder

In [17]:
# view_clusters is defined in the "Helper methods" cell at the end of the
# notebook; that cell must have been executed before this one.
# It copies every training image into a folder named after its cluster and
# returns the list of cluster folders.
cluster_paths_0 = view_clusters("class_0/", kmeans_0, indexes_0)
cluster_paths_1 = view_clusters("class_1/", kmeans_1, indexes_1)

# Performance statistics

### Basket

In [18]:
# Composition of each class-0 cluster folder (the first argument is not used by statistics).
statistics("class_0/", cluster_paths_0)

--------------Cluster 0--------- 
 n. samples: 357
 n. of bask: 357 (100.0%)
 n. of volley: 0 (0.0%)
 n. of red: 176 (49.3%)
 n. of yellow: 181 (50.7%)
 n. of males: 291 (81.5%)
 n. of females: 66 (18.5%)
--------------Cluster 1--------- 
 n. samples: 32
 n. of bask: 32 (100.0%)
 n. of volley: 0 (0.0%)
 n. of red: 16 (50.0%)
 n. of yellow: 16 (50.0%)
 n. of males: 21 (65.6%)
 n. of females: 11 (34.4%)


### Volley

In [19]:
# Composition of each class-1 cluster folder (the first argument is not used by statistics).
statistics("class_1/", cluster_paths_1)

--------------Cluster 0--------- 
 n. samples: 313
 n. of bask: 0 (0.0%)
 n. of volley: 313 (100.0%)
 n. of red: 145 (46.3%)
 n. of yellow: 168 (53.7%)
 n. of males: 60 (19.2%)
 n. of females: 253 (80.8%)
--------------Cluster 1--------- 
 n. samples: 71
 n. of bask: 0 (0.0%)
 n. of volley: 71 (100.0%)
 n. of red: 35 (49.3%)
 n. of yellow: 36 (50.7%)
 n. of males: 16 (22.5%)
 n. of females: 55 (77.5%)


### Saving clustering

In [20]:
# Map every training image file name to its cluster label and write the mapping
# to disk. The result is no longer bound to the name `dict`, which replaced the
# builtin for the rest of the session, and the file name follows BIAS (it is
# "conv_0.8.txt" for BIAS = 0.8).
cluster_assignments = make_save_dict(
    image_datasets["train"].samples,
    [kmeans_0, kmeans_1],
    [indexes_0, indexes_1],
    save=True,
    name=f"conv_{BIAS}.txt",
)

# Helper methods 

In [16]:
def show_closest(list, top_k=5):
    """Show the first ``top_k`` entries of a ranked list of images.

    Parameters
    ----------
    list : sequence
        Entries whose item 0 is a score and item 1 a sample index, as returned
        by ``proximity`` or ``closest_to``.
    top_k : int, default 5
        Maximum number of entries to show. Fewer are shown when the sequence
        is shorter; previously that case raised ``IndexError``.
    """
    # The slice stops at the end of the sequence; max() keeps a negative top_k
    # meaning "show nothing", as with the former range() loop.
    for entry in list[:max(top_k, 0)]:
        img_show(entry[1])
        print(entry[0])
        
def img_show(i):
    """Display the training image stored at dataset index ``i``."""
    image_path = image_datasets["train"].samples[i][0]
    # Read the pixels inside a context manager so the file handle is closed as
    # soon as the data has been copied into the array.
    with Image.open(image_path, "r") as image:
        pixels = np.asarray(image)
    plt.imshow(pixels)
    plt.show()

def view_clusters(path, kmeans, indexes):
    """Copy every clustered training image into a folder named after its cluster.

    Parameters
    ----------
    path : str
        Root folder under which ``clustering_{K}/cluster_{k}`` folders are created.
    kmeans : sequence of int
        Cluster label of each sample.
    indexes : sequence of int
        Index of each sample in ``image_datasets["train"].samples``.

    Returns
    -------
    list of str
        The folders ``cluster_0`` ... ``cluster_{K-1}``, where ``K`` is the
        number of distinct labels.

    Notes
    -----
    Existing folders are reused, so files copied by an earlier run stay in
    place. Images sharing a file name overwrite one another.
    """
    K = len(set(kmeans))
    paths = [os.path.join(path, f"clustering_{K}/cluster_{k}") for k in range(K)]

    # Create a folder for every label that occurs as well as for range(K): when
    # a cluster is empty the labels are not exactly 0..K-1, and the copy below
    # used to fail on the missing destination folder.
    for label in set(range(K)) | set(kmeans):
        os.makedirs(os.path.join(path, f"clustering_{K}/cluster_{label}"), exist_ok=True)

    for label, sample_index in zip(kmeans, indexes):
        src = image_datasets["train"].samples[sample_index][0]
        # basename handles the platform's path separator, unlike split("/").
        dst = os.path.join(path, f"clustering_{K}/cluster_{label}", os.path.basename(src))
        shutil.copy(src, dst)

    return paths

def list_clusters(path, kmeans, indexes):
    """Group image file names by cluster label.

    Parameters
    ----------
    path : str
        Folder containing the ``basket`` and ``volley`` sub-folders.
    kmeans : sequence of int
        Cluster label of each sample.
    indexes : sequence of int
        Position of each sample in the concatenation "basket files, then
        volley files".

    Returns
    -------
    list of list of str
        ``lists[k]`` holds the file names assigned to cluster ``k``.

    Notes
    -----
    The listings are sorted because ``os.listdir`` returns names in arbitrary
    order, which made the index-to-file mapping unpredictable.
    NOTE(review): this presumes the dataset enumerates each folder in sorted
    file-name order -- confirm against my_ImageFolder.
    """
    bask = sorted(os.listdir(os.path.join(path, "basket")))
    voll = sorted(os.listdir(os.path.join(path, "volley")))

    # One list per label up to the largest label seen, so a label is always a
    # valid position even when some cluster is empty.
    n_lists = max(len(set(kmeans)), max(kmeans, default=-1) + 1)
    lists = [[] for _ in range(n_lists)]

    for label, sample_index in zip(kmeans, indexes):
        if sample_index < len(bask):
            src = bask[sample_index]
        else:
            src = voll[sample_index - len(bask)]
        lists[label].append(src)

    return lists

def proximity(pca, km, X_reducted, indexes, kmeans):
    """Rank the samples of each of two clusters by distance to their center.

    Parameters
    ----------
    pca : object
        Unused; kept so existing calls keep working.
    km : object
        Clusterer exposing ``cluster_centers_``; centers 0 and 1 are used.
        NOTE(review): the nltk clusterers built in this notebook may not
        provide that attribute -- confirm before calling with them.
    X_reducted : iterable
        Reduced representation of each sample.
    indexes : iterable
        Identifier of each sample.
    kmeans : iterable of int
        Cluster label of each sample; a falsy label selects cluster 0, any
        other label cluster 1.

    Returns
    -------
    tuple of list
        ``(list_0, list_1)``, each a list of ``(distance, identifier)`` pairs
        sorted in ascending order, where the distance is the mean squared
        error to the center of the sample's cluster.
    """
    # The unused projection of the centers back to the embedding space was
    # removed; it was computed on every call and never read.
    list_0, list_1 = [], []
    for rep, sample_id, label in zip(X_reducted, indexes, kmeans):
        if label:
            list_1.append((mse(rep, km.cluster_centers_[1]), sample_id))
        else:
            list_0.append((mse(rep, km.cluster_centers_[0]), sample_id))
    list_0.sort()
    list_1.sort()
    return list_0, list_1

def closest_to(emb, X_transform, indexes, metric=cosine_similarity, descending=False):
    """Score every embedding against ``emb`` and return the sorted scores.

    Both vectors are reshaped to a single row before being passed to
    ``metric``. The result is a list of ``(score, index)`` pairs sorted in
    ascending order, or in descending order when ``descending`` is true.

    NOTE(review): with the default similarity metric and the default ascending
    order, the least similar samples come first -- pass ``descending=True`` to
    get the most similar ones first.
    """
    scored = [
        (metric(emb.reshape((1, -1)), candidate.reshape((1, -1))), idx)
        for candidate, idx in zip(X_transform, indexes)
    ]
    return sorted(scored, reverse=descending)

def statistics(path, clusters):
    """Print the composition of every cluster folder.

    For each folder the report gives the number of files and, as a count and a
    percentage, how many belong to basket / volley, red / yellow and
    male / female, based on the notebook-level file listings
    (``bask_r_f`` ... ``voll_y_m``).

    Parameters
    ----------
    path : str
        Unused; kept so existing calls keep working.
    clusters : list of str
        Cluster folders, as returned by ``view_clusters``.

    Notes
    -----
    An empty folder is reported with 0.0% everywhere; it used to raise
    ``ZeroDivisionError``.
    """
    # (file names of the group, counters incremented for each matching file).
    # Sets make each membership test constant-time. A name present in several
    # groups is counted once per group, as with the former independent ifs.
    groups = [
        (set(bask_r_f), ("bask", "f", "r")),
        (set(bask_r_m), ("bask", "m", "r")),
        (set(bask_y_f), ("bask", "f", "y")),
        (set(bask_y_m), ("bask", "m", "y")),
        (set(voll_r_f), ("voll", "f", "r")),
        (set(voll_r_m), ("voll", "m", "r")),
        (set(voll_y_f), ("voll", "f", "y")),
        (set(voll_y_m), ("voll", "m", "y")),
    ]

    K = len(set(clusters))

    for k in range(K):
        cluster = os.listdir(clusters[k])
        counts = {"bask": 0, "voll": 0, "r": 0, "y": 0, "m": 0, "f": 0}
        for img in cluster:
            for names, tags in groups:
                if img in names:
                    for tag in tags:
                        counts[tag] += 1

        total = len(cluster)

        def pct(count):
            # Share of the cluster in percent; 0.0 for an empty cluster.
            return count / total * 100 if total else 0.0

        n_bask, n_voll = counts["bask"], counts["voll"]
        n_r, n_y = counts["r"], counts["y"]
        n_m, n_f = counts["m"], counts["f"]

        report = [
            f"--------------Cluster {k}--------- ",
            f" n. samples: {total}",
            f" n. of bask: {n_bask} ({pct(n_bask):.1f}%)",
            f" n. of volley: {n_voll} ({pct(n_voll):.1f}%)",
            f" n. of red: {n_r} ({pct(n_r):.1f}%)",
            f" n. of yellow: {n_y} ({pct(n_y):.1f}%)",
            f" n. of males: {n_m} ({pct(n_m):.1f}%)",
            f" n. of females: {n_f} ({pct(n_f):.1f}%)",
        ]
        print("\n".join(report))
        
def statistics_colors(clusters, K=2):
    """Return, for each of the first ``K`` clusters, the fraction of red images.

    Parameters
    ----------
    clusters : sequence of iterables of str
        File names of each cluster.
    K : int, default 2
        Number of clusters to summarise.

    Returns
    -------
    list of float
        ``n_red / n_total`` per cluster. A name counts as red when it appears
        in one of the ``*_r_*`` listings of the notebook, and as yellow
        otherwise. An empty cluster yields ``nan`` (the fraction is undefined)
        instead of raising ``ZeroDivisionError``.
    """
    # Build the red lookup once; set membership is constant-time.
    red_names = set(bask_r_f) | set(bask_r_m) | set(voll_r_f) | set(voll_r_m)

    stats = []
    for k in range(K):
        n_r, n_y = 0, 0
        for img in clusters[k]:
            if img in red_names:
                n_r += 1
            else:
                n_y += 1
        total = n_r + n_y
        stats.append(n_r / total if total else float("nan"))
    return stats

def make_save_dict(samples, k_means_list, indexes_list, save=False, name="dict.txt"):
    """Map every image file name to its cluster label, optionally saving the map.

    Parameters
    ----------
    samples : sequence
        Dataset samples; item 0 of each entry is the image path.
    k_means_list : list of sequences
        One sequence of cluster labels per clustering.
    indexes_list : list of sequences
        For each clustering, the position in ``samples`` of each labelled item.
    save : bool, default False
        When true, write ``str(mapping)`` to ``name``.
    name : str, default "dict.txt"
        Destination file.

    Returns
    -------
    dict
        ``{file name: cluster label}``. A file name seen several times keeps
        the label seen last.

    Notes
    -----
    The file is overwritten rather than appended to: appending concatenated the
    text of successive runs into one file that could no longer be parsed as a
    single mapping.
    """
    dic = {}
    for k_means, indexes in zip(k_means_list, indexes_list):
        for cluster, idx in zip(k_means, indexes):
            # basename handles the platform's path separator, unlike split("/").
            dic[os.path.basename(samples[idx][0])] = cluster

    if save:
        # The context manager closes the file even if the write fails.
        with open(name, "w") as handle:
            handle.write(str(dic))

    return dic