# Siamese Network



---

In this session, we are going to implement a Siamese Network.

It takes two augmented versions of the same image as input and produces two feature vectors as output, one for each version of the image.

For simplicity, we will use the same backbone to process both views, as in the SimCLR paper.



In [77]:
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

import torchvision
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from torchvision.io import read_image


from PIL import Image

In [78]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Backbone(nn.Module):
    """Small three-layer convolutional encoder (a lightweight stand-in for ResNet-18).

    Takes a batch of 3-channel images of shape ``(batch, 3, H, W)`` and
    returns one 256-dimensional feature vector per image, i.e. a tensor of
    shape ``(batch, 256)``.
    """

    def __init__(self):
        super().__init__()
        # All convolutions are 3x3 with stride 1 and padding 1, so they keep
        # the spatial resolution; only the max-pooling steps downsample.
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1)
        self.conv1 = nn.Conv2d(3, 64, **conv_kwargs)
        self.conv2 = nn.Conv2d(64, 128, **conv_kwargs)
        self.conv3 = nn.Conv2d(128, 256, **conv_kwargs)
        self.pool = nn.MaxPool2d(2, 2)
        # Averages every channel down to a single value (1x1 spatial map).
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))

    def forward(self, x):
        """Encode a batch of images into ``(batch, 256)`` feature vectors."""
        feats = F.relu(self.conv1(x))

        feats = F.relu(self.conv2(feats))
        feats = self.pool(feats)

        feats = F.relu(self.conv3(feats))
        feats = self.pool(feats)

        pooled = self.global_pool(feats)
        # Drop the trailing 1x1 spatial dimensions: (batch, 256, 1, 1) -> (batch, 256).
        return pooled.view(pooled.size(0), -1)


class SiameseNet(nn.Module):
    """Siamese network: one shared encoder applied to two views.

    Both inputs go through the same ``backbone`` and the same ``projection``
    head, so the two branches share all their weights. Each output is a
    10-dimensional embedding, L2-normalised along the feature dimension.
    """

    def __init__(self):
        super().__init__()
        self.backbone = Backbone()
        # MLP projection head: 256 -> 256 -> 128 -> 10.
        head_layers = [
            nn.Linear(256, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(inplace=True),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 10),
        ]
        self.projection = nn.Sequential(*head_layers)

    def _embed(self, view):
        """Run one view through the shared backbone and projection head."""
        return self.projection(self.backbone(view))

    def forward(self, x1, x2):
        """Return the normalised embeddings ``(z1, z2)`` of the two views."""
        z1 = self._embed(x1)
        z2 = self._embed(x2)
        return F.normalize(z1, dim=1), F.normalize(z2, dim=1)

# Smoke test: push two random batches of five 3x32x32 images through the
# network and print the shape of each returned embedding.
a = SiameseNet()
input1, input2 = torch.randn(5, 3, 32, 32), torch.randn(5, 3, 32, 32)
output1, output2 = a(input1, input2)

print("Output shapes:", output1.shape, output2.shape)




Output shapes: torch.Size([5, 10]) torch.Size([5, 10])


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Backbone(nn.Module):
    """Compact CNN encoder (a smaller substitute for ResNet-18).

    Maps images of shape ``(batch, 3, H, W)`` to feature vectors of shape
    ``(batch, 256)``.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(
            in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1
        )
        self.conv2 = nn.Conv2d(
            in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1
        )
        self.conv3 = nn.Conv2d(
            in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1
        )
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        # Collapses each of the 256 channels to one value.
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))

    def forward(self, x):
        """Return the ``(batch, 256)`` features of the input batch."""
        x = F.relu(self.conv1(x))
        # The second and third stages are each conv -> ReLU -> 2x2 max-pool.
        for conv in (self.conv2, self.conv3):
            x = self.pool(F.relu(conv(x)))
        x = self.global_pool(x)
        batch_size = x.size(0)
        return x.view(batch_size, -1)  # (batch, 256, 1, 1) -> (batch, 256)


class AsimmetricSiameseNet(nn.Module):
    """Two-branch network in which each view has its own encoder.

    The first view is processed by ``backbone`` + ``projection`` and the
    second by ``backbone1`` + ``projection1``; the branches do not share
    weights. Each output is a 10-dimensional embedding, L2-normalised along
    the feature dimension.

    TODO (carried over from the original): "make me with different encoders".
    The two branches currently have separate weights but identical
    architectures -- confirm whether different architectures were intended.
    """

    @staticmethod
    def _make_projection():
        """Build one MLP projection head: 256 -> 256 -> 128 -> 10."""
        return nn.Sequential(
            nn.Linear(256, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(inplace=True),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 10),
        )

    def __init__(self):
        super().__init__()
        # Branch for the first view.
        self.backbone = Backbone()
        self.projection = self._make_projection()
        # Independent branch for the second view.
        self.backbone1 = Backbone()
        self.projection1 = self._make_projection()

    def forward(self, x1, x2):
        """Return the normalised embeddings ``(z1, z2)`` of the two views."""
        z1 = self.projection(self.backbone(x1))
        z2 = self.projection1(self.backbone1(x2))
        return F.normalize(z1, dim=1), F.normalize(z2, dim=1)

# Check output of the asymmetric network defined in this cell. The original
# check instantiated SiameseNet here, so AsimmetricSiameseNet was never run.
a = AsimmetricSiameseNet()
input1 = torch.randn(5, 3, 32, 32)
input2 = torch.randn(5, 3, 32, 32)
output1, output2 = a(input1, input2)

print("Output shapes:", output1.shape, output2.shape)


In [80]:
class CustomImageDataset(Dataset):
    """Dataset that yields two independently transformed views of each image.

    Args:
        data: Indexable collection of images. An item may be a file path
            (``str``), an array exposing ``astype`` (converted to an RGB PIL
            image), or an already-loaded image object, which is used as-is.
        targets: Optional indexable collection of labels aligned with
            ``data``. Labels are not needed for self-supervised training.
        transform: Optional callable applied twice to each image. With random
            augmentations the two calls produce two different views.
        target_transform: Optional callable applied to the label.

    Each item is a tuple ``(view1, view2, label)``. When ``targets`` is not
    given, ``label`` is the placeholder ``-1`` so that items keep the same
    structure and can still be batched by a ``DataLoader``.
    """

    def __init__(self, data, targets=None, transform=None, target_transform=None):
        self.imgs = data  # all images (arrays/tensors) or their file paths
        self.targets = targets
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.imgs)

    def __getitem__(self, idx):
        img = self.imgs[idx]
        if isinstance(img, str):
            # The dataset stores paths (useful when the whole dataset cannot
            # be kept in memory). torchvision's read_image decodes the file
            # into a tensor, so `transform` must accept tensors on this path.
            img = read_image(img)
        elif hasattr(img, 'astype'):
            # Raw arrays are turned into RGB PIL images, which the
            # torchvision transforms can work with.
            img = Image.fromarray(img.astype('uint8'), 'RGB')
        # Any other object (e.g. an already-loaded image) is used unchanged.

        # Compare with None explicitly: the truth value of a multi-element
        # array or tensor of targets is ambiguous and raises.
        if self.targets is not None:
            label = self.targets[idx]
            if self.target_transform is not None:
                label = self.target_transform(label)
        else:
            label = -1  # placeholder for unlabeled (self-supervised) data

        if self.transform is not None:
            # Two separate calls, so random transforms give two distinct views.
            img1 = self.transform(img)
            img2 = self.transform(img)
        else:
            img1 = img2 = img
        return img1, img2, label

Let's now reuse the Dataset from the [past lab session](https://colab.research.google.com/drive/1NJwAFbRiD4MdwWf__6P2Lm0xYk_DNdVu?usp=sharing), which creates two augmented views of each image, and write a loop that runs a forward pass over a few batches.

In [81]:
# SimCLR-style data-augmentation pipeline
s = 1      # multiplier applied to every colour-jitter parameter
size = 32  # output size of the random resized crop
color_jitter = transforms.ColorJitter(
    brightness=0.8 * s,
    contrast=0.8 * s,
    saturation=0.8 * s,
    hue=0.2 * s,
)
transform = transforms.Compose([
    transforms.RandomResizedCrop(size=size),
    transforms.RandomHorizontalFlip(),
    transforms.RandomApply([color_jitter], p=0.8),
    transforms.RandomGrayscale(p=0.2),
    transforms.GaussianBlur(kernel_size=3),
    transforms.ToTensor(),
])

# CIFAR-10 training split, downloaded into ./data if needed.
data = torchvision.datasets.CIFAR10(root='./data', train=True, download=True)
# Wrap the raw images and labels so that every sample yields two augmented views.
trainset = CustomImageDataset(data.data, data.targets, transform=transform)
dataloader = DataLoader(trainset, batch_size=64, shuffle=True)

Files already downloaded and verified


In [82]:
model = SiameseNet()
# Unpack each batch directly in the loop header so the loop does not rebind
# the module-level name `data`, which holds the CIFAR-10 dataset.
for idx, (views1, views2, targets) in enumerate(dataloader):
    print(views1.shape)
    print(views2.shape)
    print(targets.shape)

    # Forward pass only: returns the pair of embeddings for the two views.
    output = model(views1, views2)

    # Four batches (idx 0..3) are enough for this demonstration.
    if idx == 3:
        break

torch.Size([64, 3, 32, 32])
torch.Size([64, 3, 32, 32])
torch.Size([64])
torch.Size([64, 3, 32, 32])
torch.Size([64, 3, 32, 32])
torch.Size([64])
torch.Size([64, 3, 32, 32])
torch.Size([64, 3, 32, 32])
torch.Size([64])
torch.Size([64, 3, 32, 32])
torch.Size([64, 3, 32, 32])
torch.Size([64])
