<a href="https://colab.research.google.com/github/gekoramy/uni.deep-learning/blob/attention/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%shell
tee requirements.txt << END
ftfy
jaxtyping
jupyter
matplotlib
pydantic
regex
torch
torchinfo
torchvision
tqdm
ultralytics
END

pip install -q -r requirements.txt
pip install -q git+https://github.com/openai/CLIP.git

ftfy
jaxtyping
jupyter
matplotlib
pydantic
regex
torch
torchinfo
torchvision
tqdm
ultralytics
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m953.0 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m609.6/609.6 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.9/121.9 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.9/84.9 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for clip (setup.py) ... [?25l[?25hdone




In [2]:
import clip
import json
import os
import pickle
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import numpy as np
import PIL
import itertools as it
import math

from datetime import datetime
from jaxtyping import Float, UInt, Int
from pydantic.dataclasses import dataclass
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchvision import transforms
from torchvision.utils import draw_bounding_boxes
from torchvision.io import read_image
from torchinfo import summary
from typing import Literal, Callable, Mapping, TypeVar
from tqdm import tqdm
from timeit import default_timer as timer
from torch.utils.tensorboard import SummaryWriter

In [3]:
device: Literal['cpu', 'cuda'] = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.set_default_device(device)
device

'cpu'

### Utils

In [4]:
def print_train_time(start: float, end: float, device: torch.device = None):
    """Prints difference between start and end time.

    Args:
        start (float): Start time of computation (preferred in timeit format).
        end (float): End time of computation.
        device ([type], optional): Device that compute is running on. Defaults to None.

    Returns:
        float: time between start and end in seconds (higher is longer).
    """
    total_time = end - start
    print(f"Train time on {device}: {total_time:.3f} seconds")
    return total_time

In [5]:
# args:
#  - predictionList: [Prediction]
#  - numPred: int :: if numPred==-1 (default) consider all the predictions in predictionList
def display_predictions(predictionList, numPred=-1):
  limit = 0
  for p in predictionList:
    if numPred!=-1 and limit >= numPred:
      return;
    limit += 1

    p_image = p.image

    if(not isinstance(p_image, torch.Tensor)):
      p_image = torchvision.transforms.PILToTensor()(p_image)

    p_description = p.description
    p_ground_truth_bbox = p.ground_truth_bbox
    p_output_bbox = p.output_bbox

    # TODO: concatenate
    p_image = draw_bounding_boxes(p_image, p_ground_truth_bbox.unsqueeze(0), colors="green", width=5)
    p_image = draw_bounding_boxes(p_image, p_output_bbox.unsqueeze(0), colors="red", width=5)

    tensor_to_pil = transforms.ToPILImage()
    image_pil = tensor_to_pil(p_image)
    display(image_pil)
    print(p_description)
    print("\n\n")

#### Dataset and type declaration

In [6]:
%%shell
if ! [ -d dataset ]; then
  mkdir dataset &&
  gdown 1P8a1g76lDJ8cMIXjNDdboaRR5-HsVmUb &&
  tar -xf refcocog.tar.gz -C dataset &&
  rm refcocog.tar.gz
fi

Downloading...
From: https://drive.google.com/uc?id=1P8a1g76lDJ8cMIXjNDdboaRR5-HsVmUb
To: /content/refcocog.tar.gz
100% 13.5G/13.5G [01:57<00:00, 114MB/s]




In [7]:
root = os.path.join("dataset", "refcocog", "")
data_instances = os.path.join(root, "annotations", "instances.json")
data_refs = os.path.join(root, "annotations", "refs(umd).p")
data_images = os.path.join(root, "images", "")

In [8]:
I = TypeVar("I")
P = TypeVar("P")
B = TypeVar("B")
T = TypeVar("T")

Img = UInt[torch.Tensor, "C W H"]
BBox = UInt[torch.Tensor, "4"]
Split = Literal["train", "test", "val"]

@dataclass
class Info:
    description: str  # This is stable 1.0 version of the 2014 MS COCO dataset.
    url: str  # http://mscoco.org/
    version: str  # 1.0
    year: int  # 2014
    contributor: str  # Microsoft COCO group
    date_created: datetime  # 2015-01-27 09:11:52.357475

@dataclass
class Image:
    license: int  # each image has an associated licence id
    file_name: str  # file name of the image
    coco_url: str  # example http://mscoco.org/images/131074
    height: int
    width: int
    flickr_url: str  # example http://farm9.staticflickr.com/8308/7908210548_33e
    id: int  # id of the imag
    date_captured: datetime  # example '2013-11-21 01:03:06'

@dataclass
class License:
    url: str  # example http://creativecommons.org/licenses/by-nc-sa/2.0/
    id: int  # id of the licence
    name: str  # example 'Attribution-NonCommercial-ShareAlike License

@dataclass
class Annotation:
    # segmentation: list[list[float]]  # description of the mask; example [[44.17, 217.83, 36.21, 219.37, 33.64, 214.49, 31.08, 204.74, 36.47, 202.68, 44.17, 203.2]]
    area: float  # number of pixel of the described object
    iscrowd: Literal[
        1, 0
    ]  # Crowd annotations (iscrowd=1) are used to label large groups of objects (e.g. a crowd of people)
    image_id: int  # id of the target image
    bbox: tuple[
        float, float, float, float
    ]  # bounding box coordinates [xmin, ymin, width, height]
    category_id: int
    id: int  # annotation id

@dataclass
class Category:
    supercategory: str  # example 'vehicle'
    id: int  # category id
    name: str  # example 'airplane'

@dataclass
class Instances:
    info: Info
    images: list[Image]
    licenses: list[License]
    annotations: list[Annotation]
    categories: list[Category]

@dataclass
class Sentence:
    tokens: list[str]  # tokenized version of referring expression
    raw: str  # unprocessed referring expression
    sent: str  # referring expression with mild processing, lower case, spell correction, etc.
    sent_id: int  # unique referring expression id

@dataclass
class Ref:
    image_id: int  # unique image id
    split: Split
    sentences: list[Sentence]
    file_name: str  # file name of image relative to img_root
    category_id: int  # object category label
    ann_id: int  # id of object annotation in instance.json
    sent_ids: list[int]  # same ids as nested sentences[...][sent_id]
    ref_id: int  # unique id for refering expression

In [9]:
class Prediction:
  def __init__(self, image, description, ground_truth_bbox, output_bbox):
    self.image = image
    self.description = description
    self.ground_truth_bbox = ground_truth_bbox
    self.output_bbox = output_bbox

In [10]:
def fix_ref(x: Ref) -> Ref:
    x.file_name = fix_filename(x.file_name)
    return x


def fix_filename(x: str) -> str:
    """
    :param x: COCO_..._[image_id]_[annotation_id].jpg
    :return:  COCO_..._[image_id].jpg

    >>> fix_filename('COCO_..._[image_id]_0000000001.jpg')
    'COCO_..._[image_id].jpg'

    """
    return re.sub("_\d+\.jpg$", ".jpg", x)

In [11]:
with open(data_refs, "rb") as f:
    raw = pickle.load(f)

refs: list[Ref] = [fix_ref(Ref(**ref)) for ref in raw]

In [12]:
with open(data_instances, "r") as f:
    raw = json.load(f)

instances: Instances = Instances(**raw)

id2annotation: Mapping[int, Annotation] = {x.id: x for x in instances.annotations}

In [13]:
class CocoDataset(Dataset[tuple[PIL.Image, list[str], Float[torch.Tensor, "4"]]]):
    def __init__(
        self,
        split: Split,
        limit: int = -1,
    ):
        self.__init__
        self.items: list[tuple[str, list[str], Float[torch.Tensor, "4"]]] = [
            (i, [s.sent for s in ss], xywh)
            for ref in refs
            if ref.split == split
            for i in [os.path.join(data_images, ref.file_name)]
            for ss in [ref.sentences]
            for xywh in [torch.tensor(id2annotation[ref.ann_id].bbox, dtype=torch.float)]
        ]
        self.len: int = len(self.items) if limit < 0 else min(limit, len(self.items))

    def __len__(self) -> int:
        return self.len

    def __getitem__(
        self, index: int
    ) -> tuple[PIL.Image, list[str], Float[torch.Tensor, "4"]]:
        i, ps, xywh = self.items[index]
        xyxy: Float[torch.Tensor, "4"] = torchvision.ops.box_convert(xywh, in_fmt="xywh", out_fmt="xyxy")
        with PIL.Image.open(i) as img:
            img.load()
            return img, ps, xyxy

In [14]:
class Coco4CLIPDataset(Dataset[tuple[list[PIL.Image], list[str]]]):
    def __init__(
        self,
        split: Split,
        limit: int = -1,
    ):
        self.__init__
        self.items: list[tuple[str, list[str], Float[torch.Tensor, "4"]]] = [
            (i, [s.sent for s in ss], xywh)
            for ref in refs
            if ref.split == split
            for i in [os.path.join(data_images, ref.file_name)]
            for ss in [ref.sentences]
            for xywh in [torch.tensor(id2annotation[ref.ann_id].bbox, dtype=torch.float)]
        ]
        self.len: int = len(self.items) if limit < 0 else min(limit, len(self.items))

    def __len__(self) -> int:
        return self.len

    def __getitem__(self, index: int) -> tuple[list[PIL.Image], list[str]]:
        i, ps, xywh = self.items[index]
        xyxy: Float[torch.Tensor, "4"] = torchvision.ops.box_convert(xywh, in_fmt="xywh", out_fmt="xyxy")
        with PIL.Image.open(i) as img:
            img.load()
            return [img.crop(xyxy.tolist())], ps

In [15]:
def unzip(batch: list[tuple[T, ...]]) -> tuple[list[T], ...]:
    return tuple(zip(*batch))

In [16]:
batch_size: int = 3
limit: int = 5 * batch_size

In [17]:
dl: DataLoader[tuple[list[PIL.Image], list[list[str]], list[Float[torch.Tensor, "4"]]]] = DataLoader(
    dataset=CocoDataset(split="test", limit=limit),
    batch_size=batch_size,
    collate_fn=unzip,
)

In [18]:
dl4clip: DataLoader[tuple[list[PIL.Image], list[str]]] = DataLoader(
    dataset=Coco4CLIPDataset(split="test", limit=limit),
    batch_size=batch_size,
    collate_fn=unzip,
    shuffle=True,
)

In [19]:
imgs: tuple[PIL.Image, ...]
promptss: tuple[list[str], ...]
true_xyxy: tuple[Float[torch.Tensor, "4"], ...]

for imgs, promptss, true_xyxy in dl:
    print(imgs)
    print(promptss)
    print(true_xyxy)
    print("-" * 50)

(<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x376 at 0x7F1EF76EF910>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x431 at 0x7F1EF74F8A30>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x426 at 0x7F1EF74F8A90>)
(['the man in yellow coat', 'skiier in red pants'], ['there is red colored truck in between the other trucks', 'a shiny red vintage pickup truck'], ['a apple desktop computer', 'the white imac computer that is also turned on'])
(tensor([374.3100,  65.0600, 510.3500, 267.0000]), tensor([ 93.9500,  83.2900, 598.5600, 373.8600]), tensor([338.8000,  82.1900, 486.1400, 239.5600]))
--------------------------------------------------
(<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480 at 0x7F1EF76EF940>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x275 at 0x7F1EF74F8FD0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x375 at 0x7F1EF74F9060>)
(['a girl wearing glasses and a pink shirt', 'an asian girl with a pin

In [20]:
cropss: tuple[list[PIL.Image], ...]
promptss: tuple[list[str], ...]

for cropss, promptss in dl4clip:
    print(cropss)
    print(promptss)
    print("-" * 50)

([<PIL.Image.Image image mode=RGB size=371x341 at 0x7F1EF76EF790>], [<PIL.Image.Image image mode=RGB size=189x430 at 0x7F1EF76EE6B0>], [<PIL.Image.Image image mode=RGB size=62x178 at 0x7F1F0334AA70>])
(['a brown horse wearing a mask getting rode by a jockey'], ['a blonde woman in a white shirt and long black skirt', 'there is one small girl wearing white top is touching the elephant'], ['a man standing next to a young girl on a grassy hillside', 'a man in a black jacket'])
--------------------------------------------------
([<PIL.Image.Image image mode=RGB size=255x196 at 0x7F1EF76EFF40>], [<PIL.Image.Image image mode=RGB size=185x213 at 0x7F1EF76EFC40>], [<PIL.Image.Image image mode=RGB size=136x202 at 0x7F1EF76EE860>])
(['the adult giraffe', 'a mother giraffe lickicking her baby'], ['a brown bear near a soda bottle', 'a without hairy brown color teddy bear'], ['the man in yellow coat', 'skiier in red pants'])
--------------------------------------------------
([<PIL.Image.Image image

## Yolov5

In [21]:
class Yolo_v5(torch.nn.Module):
  def __init__(self, device=device):
    super().__init__()

    # load yolo model
    self.yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
    self.yolo_model.to(device=device).eval()

  def forward(self, img):

    # yolo bboxes
    predictions = self.yolo_model(img)

    # xmin,      ymin,      xmax,      ymax,      confidence, class
    # 274.06390, 231.20389, 392.66345, 372.59018, 0.93251,    23.00000
    bboxes: list[Float[torch.Tensor, 'X 6']] = predictions.xyxy # bboxes[i] contains the bboxes highlighted by yolo in image i

    for image_idx, bbox_img in enumerate(bboxes):
      # if empty, put a bbox equal to image size
      if len(bbox_img) == 0:
          bboxes[image_idx] = torch.tensor([[0, 0, img[image_idx].size[0], img[image_idx].size[1], 0, 0]], dtype=torch.float)

    return bboxes

In [22]:
# instantiate the region proposal algorithm
yolo = Yolo_v5().to(device)

Downloading: "https://github.com/ultralytics/yolov5/zipball/master" to /root/.cache/torch/hub/master.zip
[31m[1mrequirements:[0m Ultralytics requirement ['gitpython>=3.1.30'] not found, attempting AutoUpdate...
Collecting gitpython>=3.1.30
  Downloading GitPython-3.1.32-py3-none-any.whl (188 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 188.5/188.5 kB 4.3 MB/s eta 0:00:00
Collecting gitdb<5,>=4.0.1 (from gitpython>=3.1.30)
  Downloading gitdb-4.0.10-py3-none-any.whl (62 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 62.7/62.7 kB 8.9 MB/s eta 0:00:00
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython>=3.1.30)
  Downloading smmap-5.0.0-py3-none-any.whl (24 kB)
Installing collected packages: smmap, gitdb, gitpython
Successfully installed gitdb-4.0.10 gitpython-3.1.32 smmap-5.0.0

[31m[1mrequirements:[0m AutoUpdate success ✅ 6.1s, installed 1 package: ['gitpython>=3.1.30']
[31m[1mrequirements:[0m ⚠️ [1mRestart runtime or rerun command for updates to take effect[0m

# Attention is all you need
In the following of this notebook we try to fine tune CLIP using a self-attention based approach. In this context, we try to refine the latent representations of both visual and textual prompts by means of single head attention mechanism.

In [74]:
class attention_CLIP(nn.Module):
  def __init__(self, device=device):
    super().__init__()

    # load clip model and preprocessing code
    model, preprocess = clip.load('RN50')

    # freeze all pretrained layers by setting requires_grad=False
    for param in model.parameters():
      param.requires_grad = False

    self.clip_visual_encoder = model.encode_image
    self.clip_text_encoder = model.encode_text
    self.clip_visual_preprocess = preprocess
    self.clip_text_preprocess = clip.tokenize

    # attention operator
    self.attention = nn.MultiheadAttention(embed_dim=1024, num_heads=1)

    # to be removed!!!
    self.fc1 = nn.Linear(1024, 1024)

  # preprocess input prompts as required by the visual encoder
  def visual_preprocess(self, _imgs):
    prep_images = torch.stack([
        self.clip_visual_preprocess(i)
        for i in _imgs
    ]).to(device)

    return prep_images

  # preprocess text prompts as required by the text encoder
  def text_preprocess(self, _txts):
    prep_texts = self.clip_text_preprocess(_txts)

    return prep_texts

  # visual encoder
  def visual_encoder(self, image):
    with torch.no_grad():
      clipFeatures = self.clip_visual_encoder(image)
    return clipFeatures

  # text encoder
  def text_encoder(self, text):
    with torch.no_grad():
      clipFeatures = self.clip_text_encoder(text)
    return clipFeatures

  def forward(self, image, text):
    # image and text preprocessing
    with torch.no_grad():
      image_pre = self.visual_preprocess(image)
      text_pre = self.text_preprocess(text)

    # get image and text feature representation
    image_features = self.visual_encoder(image_pre)
    text_features = self.text_encoder(text_pre)

    print("image_features")
    print(image_features)
    print("text_features")
    print(text_features)

    # store number of images and number of texts for later retrival
    ###num_images = len(image)
    ###num_texts = len(text)

    ####print("num_images")
    ####print(num_images)
    ####print("num_texts")
    ####print(num_texts)

    # concatenate image embeedings and prompt embeedings in the same latent context
    ####context_features = torch.cat((image_features, text_features), dim=0)

    # refine the latent representation of each text and image according to the overall context by means of the attention mechanism
    ####attn_output, _ = self.attention(context_features, context_features, context_features)


    # retrive image_features and text_features by means of the previously stored indexes
    #####image_features = attn_output[:num_images]
    #####text_features = attn_output[-num_texts:]

    image_features = self.fc1(image_features)

    return image_features, text_features

In [75]:
# instantiate the network and move it to the chosen device
net = attention_CLIP().to(device)

In [76]:
def get_optimizer(model, _lr, _wd, _momentum):
  optimizer = torch.optim.SGD(  params = model.parameters(),
                                lr = _lr,
                                weight_decay = _wd,
                                momentum = _momentum)
  return optimizer

In [77]:
def get_accuracy_function():
  def iou_accuracy(bbox_prediction, bbox_groundtruth):

    # compute intersection over union between ground truth bboxes and predicted bboxes
    iou_accuracy_matrix = torchvision.ops.box_iou(bbox_prediction[:, :4], bbox_groundtruth)

    # extract the diagonal elements
    iou_accuracy_matrix_diagonal = torch.diag(iou_accuracy_matrix)

    # compute the mean of the intersection over union
    mean_iou = iou_accuracy_matrix_diagonal.mean()

    # compute the iou accuracy
    iou_accuracy_output = mean_iou.item()

    return iou_accuracy_output
  return iou_accuracy

In [78]:
# input:
#   -> retrived_bboxes : bounding boxes proposed by the region proposal model
#   -> bbox_groundtruth : ground truth bounding box provided by the training sample
# output:
#   -> [3, 5] in this case for the first element in the batch the best bbox is the fourth, while for the second element in the batch the best bbox is the sixth. The best bbox is the one characterized by the largest IoU with the ground truth bbox
def best_bbox_one_hot_encoding(retrived_bboxes, bbox_groundtruth):
  batch_bbox_one_hot_encoding = []
  for batch_item_retrived_bboxes, batch_item_bbox_groundtruth in zip(retrived_bboxes, bbox_groundtruth):
    iou_matrix = torchvision.ops.box_iou(batch_item_retrived_bboxes[:,:4], batch_item_bbox_groundtruth.unsqueeze(0))
    batch_bbox_one_hot_encoding.append(torch.argmax(iou_matrix, dim=0))

  batch_bbox_one_hot_encoding = torch.cat(batch_bbox_one_hot_encoding, dim=0)

  return batch_bbox_one_hot_encoding

In [79]:
def cosine_similarity(images_z: torch.Tensor, texts_z: torch.Tensor):
  # normalise the image and the text
  images_z /= images_z.norm(dim=-1, keepdim=True)
  texts_z /= texts_z.norm(dim=-1, keepdim=True)

  # evaluate the cosine similarity between the sets of features
  similarity = (texts_z @ images_z.T)

  return similarity.cpu()

In [89]:
def training_step(  model: torch.nn.Module,
                    region_proposal_model: torch.nn.Module,
                    data_loader: torch.utils.data.DataLoader,
                    loss_fn: torch.nn.Module,
                    accuracy_fn,
                    optimizer: torch.optim.Optimizer,
                    device: torch.device = device):
  train_loss, iou_train_acc = 0, 0
  model.to(device)
  model.train()
  region_proposal_model.to(device)

  for batch_idx, (imgs, promptss, true_xyxy) in tqdm(enumerate(data_loader)):
    # send data to target device
    # todo: send data to target device

    print("imgs")
    print(imgs)

    print("promptss")
    print(promptss)

    print("true_xyxy")
    print(true_xyxy)

    with torch.no_grad():
      # i. region proposal
      bboxes = region_proposal_model(imgs)

      # ii. get best bounding box with respect to the ground truth
      bbox_groundtruth = best_bbox_one_hot_encoding(bboxes, true_xyxy)

      # from yolo bboxes to cropped images
      crops = []
      for batch_image, batch_image_bboxes in zip(imgs, bboxes):
        list_bboxes_image: list[Image] = [
            batch_image.crop((xmin, ymin, xmax, ymax))
            for bbox in batch_image_bboxes
            for [xmin, ymin, xmax, ymax, _, _] in [bbox.tolist()]
        ]

        crops.append(list_bboxes_image)

    print("bboxes")
    print(bboxes)

    # forward pass
    cropss_z = []
    promptss_z = []
    for c, p in zip(crops, promptss):
      model_output = model(c, p)
      model_output_image_features = model_output[0]
      model_output_text_features = model_output[1]

      cropss_z.append(model_output_image_features)
      promptss_z.append(model_output_text_features)

    print("cropss_z")
    print(cropss_z)

    print("promptss_z")
    print(promptss_z)

    # cosine similarity evaluation
    #   cropss_z :: list of BATCH_SIZE tensors: [tensor([bbox_img_1, 1024]), tensor([bbox_img_2, 1024]), ..., tensor([bbox_img_BATCH_SIZE, 1024])]
    #   promptss_z :: list of BATCH_SIZE tensors: [tensor([prompts_img_1, 1024]), tensor([prompts_img_2, 1024]), ..., tensor([prompts_img_BATCH_SIZE, 1024])]
    bbox_index_pred = []  # for each batch sample this list contains the index of the predicted bbox at the end of the iteration
    for c_z, p_z, y in zip(cropss_z, promptss_z, bbox_groundtruth):

      print("c_z")
      print(c_z)

      print("p_z")
      print(p_z)

      # rows :: prompts ; columns: crops
      cosine_similarity_matrix = cosine_similarity(c_z, p_z)

      print("cosine_similarity_matrix")
      print(cosine_similarity_matrix)

      # for each crop we set the average cosine similarity with the prompts
      crop_logits = torch.mean(cosine_similarity_matrix, dim=0)

      print("crop_logits")
      print(crop_logits)

      # calculate loss
      #####loss = loss_fn(crop_logits.to(device), y.to(device))
      loss = loss + loss_fn(crop_logits, y) ### ARRIVATO QUI

      print("loss PLEASE GRADIENT")
      print(loss)

      # get index of the predicted bounding box in order to compute IoU accuracy
      bbox_index_pred.append(crop_logits.argmax().item())

    ###loss = loss / len(bbox_groundtruth)  # avg loss

    print("big loss PLEASE GRADIENT")
    print(loss)

    train_loss += loss

    # optimizer zero grad
    optimizer.zero_grad()

    # loss backward
    loss.backward()

    # optimizer step
    optimizer.step()

    with torch.no_grad():
      # get predicted bounding box for each example in the batch
      bbox_pred = [batch_example_bboxes[idx] for batch_example_bboxes, idx in zip(bboxes, bbox_index_pred)]

      prediction_obj = Prediction(imgs[0], promptss[0], true_xyxy[0], bbox_pred[0][:4])
      display_predictions([prediction_obj])

      # calculate intersection over union train accuracy
      acc = accuracy_fn(torch.stack(bbox_pred, dim=0), torch.stack(list(true_xyxy), dim=0))
      iou_train_acc += acc

    # Adjust metrics and print out
    train_loss /= len(data_loader)
    iou_train_acc /= len(data_loader)
    print(f"Train loss: {train_loss:.5f} | IoU train accuracy: {iou_train_acc:.5f}\n")
    return train_loss, iou_train_acc

#### test loop

In [90]:
def test_step(  model: torch.nn.Module,
                region_proposal_model: torch.nn.Module,
                data_loader: torch.utils.data.DataLoader,
                loss_fn: torch.nn.Module,
                accuracy_fn,
                device: torch.device = device):
  test_loss, iou_test_acc = 0, 0
  model.to(device)
  model.eval()
  region_proposal_model.to(device)

  with torch.inference_mode():
    for batch_idx, (imgs, promptss, true_xyxy) in tqdm(enumerate(data_loader)):
      # send data to target device
      # todo: send data to target device

      # i. region proposal
      bboxes = region_proposal_model(imgs)

      # ii. get best bounding box with respect to the ground truth
      bbox_groundtruth = best_bbox_one_hot_encoding(bboxes, true_xyxy)

      # from yolo bboxes to cropped images
      crops = []
      for batch_image, batch_image_bboxes in zip(imgs, bboxes):
        list_bboxes_image: list[Image] = [
            batch_image.crop((xmin, ymin, xmax, ymax))
            for bbox in batch_image_bboxes
            for [xmin, ymin, xmax, ymax, _, _] in [bbox.tolist()]
        ]

        crops.append(list_bboxes_image)

      # forward pass
      cropss_z = []
      promptss_z = []
      for c, p in zip(crops, promptss):
        model_output = model(c, p)
        model_output_image_features = model_output[0]
        model_output_text_features = model_output[1]

        cropss_z.append(model_output_image_features)
        promptss_z.append(model_output_text_features)

      # cosine similarity evaluation
      #   cropss_z :: list of BATCH_SIZE tensors: [tensor([bbox_img_1, 1024]), tensor([bbox_img_2, 1024]), ..., tensor([bbox_img_BATCH_SIZE, 1024])]
      #   promptss_z :: list of BATCH_SIZE tensors: [tensor([prompts_img_1, 1024]), tensor([prompts_img_2, 1024]), ..., tensor([prompts_img_BATCH_SIZE, 1024])]
      bbox_index_pred = []  # for each batch sample this list contains the index of the predicted bbox at the end of the iteration
      for c_z, p_z, y in zip(cropss_z, promptss_z, bbox_groundtruth):
        crop_logits = []  # for each crop we set the average cosine similarity with the prompts
        for vector_c_z in c_z:
          vector_c_z_cos_similarities = []
          for vector_p_z in p_z:
            cosine_similarity = torch.nn.CosineSimilarity()(vector_c_z.unsqueeze(0), vector_p_z.unsqueeze(0)).item()
            vector_c_z_cos_similarities.append(cosine_similarity)

          mean_cosine_similarity = sum(vector_c_z_cos_similarities) / len(vector_c_z_cos_similarities)

          crop_logits.append(mean_cosine_similarity)

        # calculate loss
        loss = loss_fn(torch.tensor(crop_logits).to(device), y.to(device))

        # get index of the predicted bounding box in order to compute IoU accuracy
        bbox_index_pred.append(crop_logits.index(max(crop_logits)))

      loss = loss / len(bbox_groundtruth)  # avg loss
      test_loss += loss

      # get predicted bounding box for each example in the batch
      bbox_pred = [batch_example_bboxes[idx] for batch_example_bboxes, idx in zip(bboxes, bbox_index_pred)]

      prediction_obj = Prediction(imgs[0], promptss[0], true_xyxy[0], bbox_pred[0][:4])
      display_predictions([prediction_obj])

      # calculate intersection over union train accuracy
      acc = accuracy_fn(torch.stack(bbox_pred, dim=0), torch.stack(list(true_xyxy), dim=0))
      iou_test_acc += acc

    # Adjust metrics and print out
    test_loss /= len(data_loader)
    iou_test_acc /= len(data_loader)
    print(f"Test loss: {test_loss:.5f} | IoU test accuracy: {iou_test_acc:.5f}\n")
    return test_loss, iou_test_acc

#### main training-evaluation loop

In [91]:
# tensorboard logging utilities
def log_values_evaluation(writer, step, loss, accuracy, prefix):
  writer.add_scalar(f"{prefix}/loss", loss, step)
  writer.add_scalar(f"{prefix}/accuracy", accuracy, step)

In [92]:
# setting a manual seed allow us to provide reprudicible results in this notebook
torch.manual_seed(42)

# create a logger for the experiment
writer = SummaryWriter(log_dir="runs/exp1")

BATCH_SIZE = 3
LIMIT = 5 * BATCH_SIZE

# get dataset instance
train_dataset = CocoDataset(split="train", limit=LIMIT)
test_dataset = CocoDataset(split="test", limit=LIMIT)
val_dataset = CocoDataset(split="val", limit=LIMIT)
print(f"LEN_TRAIN_DATASET: {len(train_dataset)}, LEN_TEST_DATASET: {len(test_dataset)}, LEN_VALIDATION_DATASET: {len(val_dataset)}")

# get dataloaders
print(f"Creating DataLoader's with batch size {BATCH_SIZE}.") # todo: togliere NUM_WORKERS anche da contrastive learning code
train_loader: DataLoader[tuple[list[PIL.Image], list[list[str]], list[Float[torch.Tensor, "4"]]]] = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=unzip,
)
test_loader: DataLoader[tuple[list[PIL.Image], list[list[str]], list[Float[torch.Tensor, "4"]]]] = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=unzip,
)
val_loader: DataLoader[tuple[list[PIL.Image], list[list[str]], list[Float[torch.Tensor, "4"]]]] = DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=unzip,
)
print(f"LEN_TRAIN_DATALOADER: {len(train_loader)}, LEN_TEST_DATALOADER: {len(val_loader)}, LEN_VALIDATION_DATALOADER: {len(test_loader)}")

# instantiate the optimizer
learning_rate = 0.01
weight_decay = 0.000001
momentum = 0.9
optimizer = get_optimizer(net, learning_rate, weight_decay, momentum)

# define the cost function
loss_function = torch.nn.CrossEntropyLoss()

# define the accuracy function
accuracy_function = get_accuracy_function()

"""
print('Before training:')
train_loss, train_accuracy = test_step( model = net,
                        region_proposal_model = yolo,
                        data_loader = train_loader,
                        loss_fn = loss_function,
                        accuracy_fn = accuracy_function)

test_loss, test_accuracy = test_step( model = net,
                        region_proposal_model = yolo,
                        data_loader = test_loader,
                        loss_fn = loss_function,
                        accuracy_fn = accuracy_function)

val_loss, val_accuracy = test_step( model = net,
                        region_proposal_model = yolo,
                        data_loader = val_loader,
                        loss_fn = loss_function,
                        accuracy_fn = accuracy_function)

# log to TensorBoard
log_values_evaluation(writer, -1, train_loss, train_accuracy, "train")
log_values_evaluation(writer, -1, val_loss, val_accuracy, "validation")
log_values_evaluation(writer, -1, test_loss, test_accuracy, "test")

print('\tTraining loss {:.5f}, Training accuracy {:.5f}'.format(train_loss, train_accuracy))
print('\tValidation loss {:.5f}, Validation accuracy {:.5f}'.format(val_loss, val_accuracy))
print('\tTest loss {:.5f}, Test accuracy {:.5f}'.format(test_loss, test_accuracy))
print('-----------------------------------------------------')
"""

# measure time
train_time_start = timer()

EPOCHS = 3
for epoch in tqdm(range(EPOCHS)):
    train_loss, train_accuracy = training_step(
        model = net,
        region_proposal_model = yolo,
        data_loader = train_loader,
        loss_fn = loss_function,
        accuracy_fn = accuracy_function,
        optimizer = optimizer
    )

    val_loss, val_accuracy = test_step(
        model = net,
        region_proposal_model = yolo,
        data_loader = val_loader,
        loss_fn = loss_function,
        accuracy_fn = accuracy_function
    )

    # logs to TensorBoard
    log_values_evaluation(writer, epoch, train_loss, train_accuracy, "train")
    log_values_evaluation(writer, epoch, val_loss, val_accuracy, "validation")

    print('\tTraining loss {:.5f}, Training accuracy {:.5f}'.format(train_loss, train_accuracy))
    print('\tValidation loss {:.5f}, Validation accuracy {:.5f}'.format(val_loss, val_accuracy))
    print('-----------------------------------------------------')

train_time_end = timer()
total_train_time_model_1 = print_train_time(start=train_time_start,
                                            end=train_time_end,
                                            device=device)
# compute final evaluation results
print('After training:')
train_loss, train_accuracy = test_step( model = net,
                        region_proposal_model = yolo,
                        data_loader = train_loader,
                        loss_fn = loss_function,
                        accuracy_fn = accuracy_function)

test_loss, test_accuracy = test_step( model = net,
                        region_proposal_model = yolo,
                        data_loader = test_loader,
                        loss_fn = loss_function,
                        accuracy_fn = accuracy_function)

val_loss, val_accuracy = test_step( model = net,
                        region_proposal_model = yolo,
                        data_loader = val_loader,
                        loss_fn = loss_function,
                        accuracy_fn = accuracy_function)

# log to TensorBoard
log_values_evaluation(writer, EPOCHS, train_loss, train_accuracy, "train")
log_values_evaluation(writer, EPOCHS, val_loss, val_accuracy, "validation")
log_values_evaluation(writer, EPOCHS, test_loss, test_accuracy, "test")

print('\tTraining loss {:.5f}, Training accuracy {:.5f}'.format(train_loss, train_accuracy))
print('\tValidation loss {:.5f}, Validation accuracy {:.5f}'.format(val_loss, val_accuracy))
print('\tTest loss {:.5f}, Test accuracy {:.5f}'.format(test_loss, test_accuracy))
print('-----------------------------------------------------')

# closes the logger
writer.close()

LEN_TRAIN_DATASET: 15, LEN_TEST_DATASET: 15, LEN_VALIDATION_DATASET: 15
Creating DataLoader's with batch size 3.
LEN_TRAIN_DATALOADER: 5, LEN_TEST_DATALOADER: 5, LEN_VALIDATION_DATALOADER: 5


  0%|          | 0/3 [00:00<?, ?it/s]
0it [00:00, ?it/s][A

imgs
(<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480 at 0x7F1EAC5F2080>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=612x612 at 0x7F1EAC5F1FC0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x426 at 0x7F1EAC5F0BB0>)
promptss
(['two woman one in black eatting and the other has a white shirt at the desk', 'woman in white shirt looking down at laptop computer'], ['a tv with a woman being interviewed on it', 'a woman with sunglasses on her head on the television being interviewed'], ['a young boy doing a skateboard trick on a blue board', 'a man jumping with a skateboard'])
true_xyxy
(tensor([  0.00000,  45.95000, 238.92000, 454.59003]), tensor([213.72000, 456.51001, 405.72998, 590.26001]), tensor([ 93.82000,  45.79000, 442.27002, 275.54001]))
bboxes
[tensor([[0.00000e+00, 4.21853e+01, 2.42700e+02, 4.63766e+02, 8.32214e-01, 0.00000e+00],
        [1.87698e+02, 2.06740e+02, 4.95460e+02, 4.46894e+02, 7.16692e-01, 6.30000e+01],
        [1.35497e+02, 3.924

0it [00:09, ?it/s]
  0%|          | 0/3 [00:09<?, ?it/s]

image_features
tensor([[-0.02052,  0.03498, -0.00313,  ..., -0.08768, -0.02622, -0.06750],
        [-0.00253,  0.01081, -0.00388,  ..., -0.02971,  0.00254, -0.07382],
        [ 0.02408,  0.03914,  0.00347,  ..., -0.02745, -0.01212, -0.02312],
        [ 0.02563,  0.00623,  0.01962,  ..., -0.00854,  0.00791,  0.01737]])
text_features
tensor([[-0.14392, -0.02109, -0.13265,  ...,  0.23622,  0.12136, -0.08925],
        [-0.01313,  0.15650, -0.14343,  ...,  0.25467,  0.44390,  0.08826]])
cropss_z
[tensor([[-0.05367, -0.06149,  0.01372,  ..., -0.00717,  0.01416, -0.01856],
        [-0.02598, -0.04922,  0.02730,  ...,  0.02116,  0.02391,  0.00659],
        [-0.06484, -0.03503,  0.02063,  ..., -0.03896, -0.01666,  0.00303],
        ...,
        [-0.02982, -0.06851,  0.03154,  ..., -0.05120,  0.00577, -0.00577],
        [-0.03166, -0.09545, -0.02331,  ..., -0.04340, -0.01100, -0.00700],
        [-0.01432, -0.07083, -0.00480,  ..., -0.03041, -0.01674,  0.00979]], grad_fn=<AddmmBackward0>), tensor




UnboundLocalError: ignored

## Experiments

In [None]:
texts = ["ciao", "come", "va"]

model, preprocess = clip.load('RN50')

with torch.no_grad():
  texts_pre = clip.tokenize(texts)
  texts_z = model.encode_text(texts_pre)

print(texts_z)
print(texts_z.shape)

In [None]:
texts_z[-2:]

In [None]:
multihead_attn = nn.MultiheadAttention(1024, 1)
attn_output, attn_output_weights = multihead_attn(texts_z, texts_z, texts_z)


In [None]:
summary(multihead_attn)

In [None]:
attn_output

In [None]:
attn_output.shape

In [None]:
attn_output_weights