In [1]:
import os
import pandas as pd
import time

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from torchvision.io import read_image

import torchvision
import torchvision.models as models
import torchvision.transforms as transforms

from PIL import Image

Fino ad ora abbiamo definito:
- un dataset che restituisce 2 immagini transformed + il suo training loop
- una rete siamese (sia simmettrica che asimmetrica)

Oggi vedremo come usare una siamese asimmetrica.
Una volte estreatte le feature vorremo creare una loss che presi due valori di feature, ci renda un valore di loss.

Vogliamo una soluzione base:
- niente for loop (optional)
- vedremo insieme una versione più complessa

In [2]:
# Custom Dataset class
class CustomImageDataset(Dataset):
    def __init__(self, data, targets=None, transform=None, target_transform=None):
        self.imgs = data # Tensore di tutte le immagini
        self.targets = targets
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.imgs)

    def __getitem__(self, idx):
        img = self.imgs[idx] # Sampling randomico di emlementi del dataset
        if isinstance(img, str): # Può capitare che il dataset sia salvato come stringhe/path (da usare quando non è possibile salvarsi tutto il tensore del dataset)
          img = read_image(img_path) # Fuzione di Torchvision, trova un'immaigne dal path fornito
        else:
          img = Image.fromarray(img.astype('uint8'), 'RGB') # Preso un array restituisce un'immagine RGB, senza non si riesce a lavorare
        if self.targets:
          label = self.targets[idx] # Non utile nel caso di self-supervised ovviamente
        if self.transform:
          img1 = self.transform(img) # Utilizzo le trasformazioni
          img2 = self.transform(img)  # Già così genero due immagini augmented diverse, siccome le funzioni che trasformano sono randomiche (TODO, rivedi le variabili)
        if self.target_transform:
          label1 = self.target_transform(label)
          label2 = self.target_transform(label)
        return img1, img2, label # Concateno immaigni e labels

# Simmetric Siamese
class SiameseNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = models.resnet18()
        self.backbone.fc = nn.Identity()


    def forward(self, x1, x2):
        # buona pratica, concatenare x1 e x2 in un unico tensore
        x1 = self.backbone(x1)
        x2 = self.backbone(x2)
        return torch.cat((x1, x2), dim=0) # concatena x1 e x2

# Contrastive Loss

---
In this session, we are going to implement the SimCLR loss function (https://arxiv.org/abs/2002.05709).

This follows the InfoNCE loss, i.e., uses two different augmented versions of the same image as positive pair and the other images in the batch as negative samples, and the batch construction of the N-pair-mc loss.

I negative sono tutti i positive degli altri esempi.

Costruisco una matrice in cui metto nella prima posizione di ogni riga il sample di partenza, nella seconda i positive e nelle restanti righe i negative.

La cross entropy dovrà avere un'etichetta.

Non ho una z di projection, non cambia nulla siccome non addestriamo, però si può inserire.

In [3]:
torch.cat((torch.Tensor([1, 2, 3]), torch.Tensor([4, 5, 6])), dim=0)

tensor([1., 2., 3., 4., 5., 6.])

In [4]:
class ForContrastiveLoss(nn.Module):
    def __init__(self, temperature=0.07):
        super().__init__()
        self.temperature = temperature
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, features):
        ### features = torch.cat((x1,x2), dim=0)
        # normalize features to later compute cosine distance/similarity btw them
        features = F.normalize(features, dim=1)
        # compute the similarity matrix btw features
        # (consider that feature are normalized! so the cosine similarity is ...)
        batch_size = len(features)//2
        similarity_matrix = torch.matmul(features, features.T)

        ## TODO
        start = time.time()
        # create the logits tensor where:
        #   - in the first position there is the similarity of the positive pair
        #   - in the other 2N-1 positions there are the similarity w negatives
        # the shape of the tensor need to be 2Nx2N-1, with N is the batch size -> 2N-1 negative perché più negative ho meglio funziona l'apprendimento
        logits = torch.zeros(2*batch_size, 2*batch_size-1)

        for idx, val in enumerate(similarity_matrix):
          row = torch.zeros(2*batch_size - 1)

          pos_idx = idx + batch_size if idx < batch_size else idx - batch_size
          row[0] = val[pos_idx]
          row[1:] = torch.tensor([v for i, v in enumerate(val) if  i != idx and i!=pos_idx])

          logits[idx] = row

        logits = logits / self.temperature

        # to compute the contrastive loss using the CE loss, we just need to
        # specify where is the similarity of the positive pair in the logits tensor
        # since we put in the first position we create a gt of all zeros
        # N.B.: this is just one of the possible implementations!
        gt = torch.zeros(logits.shape[0], dtype=torch.long)
        loss = self.criterion(logits, gt)
        end = time.time()
        t = end - start
        return loss, t

Se voglio calcolare le tempistiche per bene devo iniziare a prendere il tempo dopo il calcolo della similarity matrix e la fine dopo il calcolo della loss.
C'è da rivedere il tutto.

Miglioro tutto se faccio operazioni in place in teoria.
Altra possibiità di miglioramento è ridurre le maschere a qualcosa di utile per le rappresentazioni sparse.

Con queste loss basate su contrasto, la memoria diventa una risorsa cruciale, soprattutto considerando che più negative uso, più funziona bene.

In [5]:
class ContrastiveLoss(nn.Module):
    def __init__(self, temperature=0.07):
        super().__init__()
        self.temperature = temperature
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, features):
        ### features = torch.cat((x1,x2), dim=0)
        # normalize features to later compute cosine distance/similarity btw them
        features = F.normalize(features, dim=1)
        # compute the similarity matrix btw features
        # (consider that feature are normalized! so the cosine similarity is ...)
        batch_size = len(features)//2
        similarity_matrix = torch.matmul(features, features.T)

        start = time.time()
        labels = torch.cat([torch.arange(features.shape[0]//2) for i in range(2)], dim=0)
        labels = (labels.unsqueeze(0) == labels.unsqueeze(1)).float() #simpatico trick
        mask = torch.eye(labels.shape[0], dtype=torch.bool) # maschera per togliere la diagonale
        labels = labels[~mask].view(labels.shape[0], -1) # Tolgo la diagonale principale sia da labels che dalla similarity matrix
        similarity_matrix = similarity_matrix[~mask].view(similarity_matrix.shape[0], -1)

        ## TODO

        positives = similarity_matrix[labels.bool()].view(labels.shape[0], -1)
        negatives = similarity_matrix[~labels.bool()].view(similarity_matrix.shape[0], -1)

        # create the logits tensor where:
        #   - in the first position there is the similarity of the positive pair
        #   - in the other 2N-1 positions there are the similarity w negatives
        # the shape of the tensor need to be 2Nx2N-1, with N is the batch size -> 2N-1 negative perché più negative ho meglio funziona l'apprendimento

        logits = torch.cat([positives, negatives], dim=1)
        logits = logits / self.temperature

        # to compute the contrastive loss using the CE loss, we just need to
        # specify where is the similarity of the positive pair in the logits tensor
        # since we put in the first position we create a gt of all zeros
        # N.B.: this is just one of the possible implementations!
        gt = torch.zeros(logits.shape[0], dtype=torch.long)
        loss = self.criterion(logits, gt)
        end = time.time()
        t = end - start
        return loss, t

In [6]:
class NContrastiveLoss(nn.Module):
    def __init__(self, temperature=0.07):
        super().__init__()
        self.temperature = temperature
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, features):
        ### features = torch.cat((x1,x2), dim=0)
        # normalize features to later compute cosine distance/similarity btw them
        features = F.normalize(features, dim=1)
        # compute the similarity matrix btw features
        # (consider that feature are normalized! so the cosine similarity is ...)
        batch_size = len(features)//2
        logits = torch.matmul(features, features.T)

        start = time.time()

        pos1 = logits.diag(batch_size)
        pos2 = logits.diag(-batch_size)
        logits = logits[~(torch.cat([torch.arange(features.shape[0]//2) for i in range(2)], dim=0).unsqueeze(0) == torch.cat([torch.arange(features.shape[0]//2) for i in range(2)], dim=0).unsqueeze(1))].view(2*batch_size, -1)
        logits = torch.cat((torch.cat((pos1, pos2), dim=0).unsqueeze(1), logits), dim=1)

        ## TODO


        # create the logits tensor where:
        #   - in the first position there is the similarity of the positive pair
        #   - in the other 2N-1 positions there are the similarity w negatives
        # the shape of the tensor need to be 2Nx2N-1, with N is the batch size -> 2N-1 negative perché più negative ho meglio funziona l'apprendimento

        logits = logits / self.temperature

        # to compute the contrastive loss using the CE loss, we just need to
        # specify where is the similarity of the positive pair in the logits tensor
        # since we put in the first position we create a gt of all zeros
        # N.B.: this is just one of the possible implementations!
        gt = torch.zeros(logits.shape[0], dtype=torch.long)
        loss = self.criterion(logits, gt)
        end = time.time()
        t = end - start
        return loss, t

Let's now use the Dataset which creates the two augmented views for each image and the Siamese Network from the past lab session [1](https://colab.research.google.com/drive/1NJwAFbRiD4MdwWf__6P2Lm0xYk_DNdVu?usp=sharing) and [2](https://colab.research.google.com/drive/1AMkh0q8L5nJScx7v6cMWoK336zqOqDY6?usp=sharing) and create a training loop

In [7]:
dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True)

s=1
size=32
color_jitter = transforms.ColorJitter(0.8 * s, 0.8 * s, 0.8 * s, 0.2 * s)
transform = transforms.Compose([transforms.RandomResizedCrop(size=size),
                                  transforms.RandomHorizontalFlip(),
                                  transforms.RandomApply([color_jitter], p=0.8),
                                  transforms.RandomGrayscale(p=0.2),
                                  transforms.GaussianBlur(kernel_size=3),
                                  transforms.ToTensor()])

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:03<00:00, 45.6MB/s]


Extracting ./data/cifar-10-python.tar.gz to ./data


In [14]:
trainset = CustomImageDataset(dataset.data, dataset.targets, transform=transform)
dataloader = DataLoader(trainset, batch_size=256, shuffle=True)

model = SiameseNet()
optimizer = optim.Adam(model.parameters())
criterion = NContrastiveLoss()


times = []
for idx, data in enumerate(dataloader):
    v1, v2, _ = data
    v1 = v1
    v2 = v2
    optimizer.zero_grad()
    output = model(v1, v2)
    loss, t = criterion(output)
    loss.backward()
    optimizer.step()
    times.append(t)

print(f"Our variant took on average:{np.mean(times)}")
      # if idx == 3:
      #     break

Our variant took on average:0.006391294148503517


In [15]:
trainset = CustomImageDataset(dataset.data, dataset.targets, transform=transform)
dataloader = DataLoader(trainset, batch_size=256, shuffle=True)

model = SiameseNet()
optimizer = optim.Adam(model.parameters())
criterion = ContrastiveLoss()


times = []
for idx, data in enumerate(dataloader):
    v1, v2, _ = data
    v1 = v1
    v2 = v2
    optimizer.zero_grad()
    output = model(v1, v2)
    loss, t = criterion(output)
    loss.backward()
    optimizer.step()
    times.append(t)

print(f"Mask variant took on average:{np.mean(times)}")
      # if idx == 3:
      #     break

Mask variant took on average:0.01670169465395869


In [16]:
trainset = CustomImageDataset(dataset.data, dataset.targets, transform=transform)
dataloader = DataLoader(trainset, batch_size=256, shuffle=True)

model = SiameseNet()
optimizer = optim.Adam(model.parameters())
criterion = ForContrastiveLoss()


times = []
for idx, data in enumerate(dataloader):
    v1, v2, _ = data
    v1 = v1
    v2 = v2
    optimizer.zero_grad()
    output = model(v1, v2)
    loss, t = criterion(output)
    loss.backward()
    optimizer.step()
    times.append(t)

print(f"For variant took on average:{np.mean(times)}")
      # if idx == 3:
      #     break

For variant took on average:0.797833861136923


Supposed CUDA
For:
0.053777456283569336
0.05588936805725098
0.05563950538635254
0.05593299865722656

mask:
0.0048694610595703125
0.0016055107116699219
0.002445697784423828
0.0035674571990966797

Supposed CPU
For:
0.01118779182434082
0.012949943542480469
0.011744260787963867
0.011047601699829102

mask:
0.0007627010345458984
0.0007863044738769531
0.0006840229034423828
0.0010235309600830078

Our variant took on average:
0.0008311143616581208 (64)
0.00207529958251797 (128)
0.006391294148503517 (256)

Better variant took on average:
0.001560683445552426 (64)
0.004945228776663466 (128)
0.01670169465395869 (256)

For variant took on average:
0.04765970017903906 (64)
0.2059327104817266 (128)
0.797833861136923 (256)

# DOMANDE
- Ottimizzazione in place -> velocizza davvero o è solo una questione di memoria
- Le due loss servono solo per convergenza più rapida?
- Perché con la gpu va un botto (5x) più lento