# Baseline algorithm
In this notebook we propose a training-free approach that combines zero-shot CLIP with a YOLO architecture. This method extracts
all the bounding boxes proposed by YOLO and uses CLIP to evaluate their similarity with the textual query.

## Initialization

In [None]:
%%shell
# Download the YOLOv5 v7.0 requirements list and place a copy in the torch hub cache directory.
# NOTE(review): presumably the copy is there for the torch.hub YOLOv5 load in a later cell — confirm.
wget "https://raw.githubusercontent.com/ultralytics/yolov5/v7.0/requirements.txt" -O "yolo-requirements.txt"
mkdir -p /root/.cache/torch/hub
cp yolo-requirements.txt /root/.cache/torch/hub/requirements.txt

# Write this notebook's own (unpinned) requirements to requirements.txt; tee also echoes them to the cell output.
# NOTE(review): pandas is imported and gdown is invoked in later cells but neither is listed here —
# confirm they are preinstalled in the target environment.
tee requirements.txt << END
ftfy
jaxtyping
jupyter
matplotlib
pydantic
regex
torch
torchvision
tqdm
END

# Install the notebook requirements, then CLIP from its GitHub repository, then the YOLOv5 requirements.
pip install -r requirements.txt
pip install git+https://github.com/openai/CLIP.git
pip install -r yolo-requirements.txt

In [None]:
import os
from pathlib import Path
import json
import pandas as pd
import torch
from torchvision.io import read_image
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import transforms
from typing import Tuple, Dict, List, Literal, Callable, Optional, Mapping, TypeVar
from jaxtyping import Array, Float, UInt, Int
import pickle
import itertools as it
import re
import matplotlib.pyplot as plt
import torchvision
from PIL import Image
from pkg_resources import packaging
import clip
import numpy as np
from pydantic.dataclasses import dataclass
from datetime import datetime
import torch
import torch.nn.functional as F

In [None]:
# Create new tensors on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    torch.set_default_device('cuda')
else:
    torch.set_default_device('cpu')

Download the dataset.
**TODO:** before downloading the dataset, check that it does not already exist.

In [None]:
%%shell
# Fetch and unpack the RefCOCOg archive only when the dataset/ directory is not already present.
# The steps are chained with &&, so a failure stops the chain; the archive is deleted after extraction.
if ! [ -d dataset ]; then
  mkdir dataset &&
  gdown 1P8a1g76lDJ8cMIXjNDdboaRR5-HsVmUb &&
  mv refcocog.tar.gz ./dataset/ &&
  tar -xf dataset/refcocog.tar.gz -C dataset &&
  rm dataset/refcocog.tar.gz
fi

Folder paths

In [None]:
# All dataset locations are derived from the main dataset folder.
data = Path("dataset") / "refcocog"  # main dataset folder
data_instances = data / "annotations" / "instances.json"  # instances.json
data_refs = data / "annotations" / "refs(umd).p"  # refs(umd).p
data_images = data / "images"  # image folder

Type declaration

In [None]:
# Name of the dataset partition a referring expression belongs to (see `Ref.split`).
Split = Literal['train', 'test', 'val']

@dataclass
class Info:
    """Dataset-level metadata, stored in the `info` field of `Instances`."""
    description: str  # example: 'This is stable 1.0 version of the 2014 MS COCO dataset.'
    url: str  # example: http://mscoco.org/
    version: str  # example: 1.0
    year: int  # example: 2014
    contributor: str  # example: Microsoft COCO group
    date_created: datetime  # example: 2015-01-27 09:11:52.357475

@dataclass
class Image:
    """
    Metadata record of one image, stored in the `images` list of `Instances`.

    NOTE: this class rebinds the name `Image`, which the import cell binds to
    `PIL.Image`; after this cell runs, `Image` refers to this dataclass.
    """
    license: int  # each image has an associated licence id
    file_name: str  # file name of the image
    coco_url: str  # example http://mscoco.org/images/131074
    height: int
    width: int
    flickr_url: str  # example http://farm9.staticflickr.com/8308/7908210548_33e
    id: int  # id of the image
    date_captured: datetime  # example '2013-11-21 01:03:06'

@dataclass
class License:
    """Licence record, stored in the `licenses` list of `Instances`."""
    url: str  # example http://creativecommons.org/licenses/by-nc-sa/2.0/
    id: int  # id of the licence
    name: str  # example 'Attribution-NonCommercial-ShareAlike License'

@dataclass
class Annotation:
    """
    Object annotation record, stored in the `annotations` list of `Instances`.

    `area` and `bbox` are typed as floats because COCO stores them as
    fractional numbers. Declaring them as `int` makes pydantic v2 reject any
    value with a fractional part, and makes pydantic v1 truncate it silently.
    Code that needs integer pixel coordinates converts explicitly.
    """
    # segmentation: list[list[float]]  # description of the mask; example [[44.17, 217.83, 36.21, 219.37, 33.64, 214.49, 31.08, 204.74, 36.47, 202.68, 44.17, 203.2]]
    area: float  # area, in pixels, of the described object
    iscrowd: Literal[1, 0]  # Crowd annotations (iscrowd=1) are used to label large groups of objects (e.g. a crowd of people)
    image_id: int  # id of the target image
    bbox: tuple[float, float, float, float]  # bounding box coordinates [xmin, ymin, width, height]
    category_id: int
    id: int  # annotation id

@dataclass
class Category:
    """Object category record, stored in the `categories` list of `Instances`."""
    supercategory: str  # example 'vehicle'
    id: int  # category id
    name: str  # example 'airplane'

@dataclass
class Instances:
    """Top-level structure of the instances JSON file; built with `Instances(**raw)` from the parsed JSON."""
    info: Info  # dataset-level metadata
    images: list[Image]  # one record per image
    licenses: list[License]  # licence records
    annotations: list[Annotation]  # one record per annotated object
    categories: list[Category]  # object category records

@dataclass
class Sentence:
    """A single referring expression attached to a `Ref`."""
    tokens: list[str]  # tokenized version of referring expression
    raw: str  # unprocessed referring expression
    sent: str  # referring expression with mild processing, lower case, spell correction, etc.
    sent_id: int  # unique referring expression id

@dataclass
class Ref:
    """A referred object: links an image, one object annotation and its referring expressions."""
    image_id: int  # unique image id
    split: Split  # dataset partition this reference belongs to
    sentences: list[Sentence]  # referring expressions describing the object
    file_name: str  # file name of image relative to img_root
    category_id: int  # object category label
    ann_id: int  # id of object annotation in instance.json
    sent_ids: list[int]  # same ids as nested sentences[...][sent_id]
    ref_id: int  # unique id for referring expression

# Generic placeholders for the outputs of the dataset transforms:
# I = transformed image, P = transformed prompts, B = transformed bounding box.
I = TypeVar('I')
P = TypeVar('P')
B = TypeVar('B')
T = TypeVar('T')  # not used in the cells visible in this notebook

# Image tensor as returned by torchvision's read_image: (channels, height, width).
Img = UInt[torch.Tensor, 'C H W']
# Bounding box with four coordinates.
# NOTE(review): the dataset builds its boxes with dtype=torch.int (signed) while this alias
# declares an unsigned dtype — confirm which one is intended.
BBox = UInt[torch.Tensor, '4']

Useful functions.
**TODO**: document what these functions do.

In [None]:
def fix_ref(x: Ref) -> Ref:
    """
    Rewrite the image file name of a reference in place.

    :param x: reference whose ``file_name`` is passed through ``fix_filename``
    :return:  the same object, after its ``file_name`` has been replaced
    """
    cleaned_name = fix_filename(x.file_name)
    x.file_name = cleaned_name
    return x


def fix_filename(x: str) -> str:
    """
    Strip the trailing ``_<digits>`` group that precedes the ``.jpg`` extension.

    Names that do not end in ``_<digits>.jpg`` are returned unchanged.

    :param x: COCO_..._[image_id]_[annotation_id].jpg
    :return:  COCO_..._[image_id].jpg
    """
    # Raw string: `\d` and `\.` are regex escapes, not valid Python string escapes.
    return re.sub(r'_\d+\.jpg$', '.jpg', x)

Read refs and annotations.

In [None]:
# NOTE: pickle.load can execute arbitrary code — only load a refs file obtained from a trusted source.
# The context manager closes the file even if unpickling raises.
with open(data_refs, 'rb') as f:
    raw = pickle.load(f)

# Validate every raw record as a Ref and normalise its image file name.
refs: list[Ref] = [
    fix_ref(Ref(**ref))
    for ref in raw
]

In [None]:
# The context manager closes the file even if JSON parsing raises.
with open(data_instances, 'r') as f:
    raw = json.load(f)

# Validate the parsed JSON document against the Instances schema.
instances: Instances = Instances(**raw)

Create a mapping between annotation_id => annotation_object

In [None]:
# Lookup table from annotation id to the annotation object itself.
id2annotation: Mapping[int, Annotation] = dict(
    (annotation.id, annotation)
    for annotation in instances.annotations
)

Define custom dataset

In [None]:
class CocoDataset(Dataset[tuple[I, P, B]]):
    """
    Dataset of (image, referring expressions, bounding box) triples for one split.

    The samples are built from the module-level `refs`, `id2annotation` and
    `data_images` objects; images are read from disk lazily in `__getitem__`.
    """

    def __init__(
        self,
        split: Split,
        img_transform: Callable[[Img], I] = lambda x: x,
        prompt_transform: Callable[[list[Sentence]], P] = lambda ps: [ p.sent for p in ps ],
        bb_transform: Callable[[UInt[torch.Tensor, '4']], B] = lambda x: x
    ):
        """
        :param split: which partition to keep: train, test or val
        :param img_transform: applied to every image tensor that is read (identity by default)
        :param prompt_transform: applied to the list of referring expressions of a sample
                                 (by default keeps only the `sent` text of each one)
        :param bb_transform: applied to the bounding box tensor (identity by default)
        """
        self.img_transform = img_transform
        self.prompt_transform = prompt_transform
        self.bb_transform = bb_transform

        # One entry per reference of the requested split:
        #   (image file path, list of referring expression objects, bounding box tensor)
        self.items: list[tuple[str, list[Sentence], UInt[torch.Tensor, '4']]] = [
            (
                os.path.join(data_images, Path(ref.file_name)),
                ref.sentences,
                torch.tensor(id2annotation[ref.ann_id].bbox, dtype=torch.int),
            )
            for ref in refs
            if ref.split == split
        ]

    def __len__(self):
        """Number of references in the selected split."""
        return len(self.items)


    def __getitem__(self, item: int) -> tuple[I, P, B]:
        """Read the image of sample `item` from disk and return the three transformed components."""
        image_path, sentences, bbox = self.items[item]
        image = read_image(image_path)
        transformed_image = self.img_transform(image)
        transformed_prompts = self.prompt_transform(sentences)
        transformed_bbox = self.bb_transform(bbox)
        return transformed_image, transformed_prompts, transformed_bbox

## Step 1: take an image from the dataset

**TODO**: the baseline is actually training-free, so splitting the dataset into training and test sets does not make much sense.

In [None]:
# Build one dataset per split, in the order train, test.
train_dataset, test_dataset = (
    CocoDataset(split=split_name)
    for split_name in ('train', 'test')
)

In [None]:
# Both loaders share the same settings: one sample per batch, loading in the
# main process (num_workers=0) and no shuffling.
train_dataloader, test_dataloader = (
    DataLoader(
        dataset=split_dataset,
        batch_size=1,
        num_workers=0,
        shuffle=False,
    )
    for split_dataset in (train_dataset, test_dataset)
)

In [None]:
print("INPUT")

# Take the first batch; batch_size is 1, so unpack the single image and the single box
(img,), prompts, (true_xywh,) = next(iter(train_dataloader))

# Ground-truth box converted from (x, y, width, height) to (xmin, ymin, xmax, ymax)
(true_xyxy,) = torchvision.ops.box_convert(
    boxes=true_xywh.unsqueeze(0),
    in_fmt='xywh',
    out_fmt='xyxy',
)

# PIL version of the image, used for display and as the input of later steps
img_pil = transforms.ToPILImage()(img)
img_pil.show()

print(prompts)

## Step 2: find the bounding boxes inside the image with yolo

Import the YOLOv5 model

In [None]:
# Load the small YOLOv5 variant through torch.hub from the ultralytics repository.
yolo_model = torch.hub.load(repo_or_dir='ultralytics/yolov5', model='yolov5s')

Execute the model to find the bounding boxes inside the input images.

In [None]:
# Run YOLO on the PIL image taken in Step 1 and display the detections it returns.
predictions = yolo_model(img_pil)
predictions.show()

## Step 3: compute the latent representation of the reference expression using clip

Load the CLIP model and set it to evaluation mode.

In [None]:
# clip.load places the model on CUDA when it is available and on the CPU otherwise,
# so no explicit .cuda() call is needed (it would raise on a CPU-only machine).
clip_model, preprocess = clip.load('RN50')
clip_model = clip_model.eval()

Prepare input text tokens.

In [None]:
# Every prompt is inserted into every template (templates vary slowest, prompts fastest).
# Each element of `prompts` is a 1-tuple because the DataLoader batches with batch_size=1.
prompts_tokens: Int[torch.Tensor, 'P 77'] = clip.tokenize([
    template.format(prompt)
    for template, (prompt, ) in it.product(["A photo of {}", "We can see {}"], prompts)
])

Execute the text encoder of CLIP to get the latent representation of the text tokens.

In [None]:
# Encode the tokenized prompts without tracking gradients.
# The embedding size is annotated as 1024 to match the image embeddings of the same model.
with torch.no_grad():
  prompts_z: Float[torch.Tensor, 'P 1024'] = clip_model.encode_text(prompts_tokens)

## Step 4: for each bounding box: i. compute the representation of the crop in the latent space; ii. evaluate the similarity with the reference expression; Finally, consider only the bounding box with the highest similarity.

Get cropped images

In [None]:
# `predictions.xyxy` is indexed per image below, so it is annotated as a list of tensors.
# Each row of a tensor describes one detected box:
# xmin,      ymin,      xmax,      ymax,      confidence, class
# 274.06390, 231.20389, 392.66345, 372.59018, 0.93251,    23.00000
bboxes: list[Float[torch.Tensor, 'X 6']] = predictions.xyxy

# Crop every box detected in the first (and only) image; only the four coordinates are needed.
crops = [
    img_pil.crop(tuple(bbox[:4].tolist()))
    for bbox in bboxes[0]
]

In [None]:
# Display every cropped region, one after the other.
for crop in crops:
    crop.show()

Compute the representation of the crops in the latent space.

In [None]:
# Apply the CLIP preprocessing to every crop and stack the results into one batch.
# The batch is moved to whatever device the CLIP model lives on, instead of assuming CUDA.
# NOTE: torch.stack raises on an empty list, i.e. when YOLO detected no box.
preprocess_crops: Float[torch.Tensor, 'X 3 224 224'] = torch.stack([
    preprocess(crop)
    for crop in crops
]).to(next(clip_model.parameters()).device)

In [None]:
# Encode the preprocessed crops with the CLIP image encoder, without tracking gradients.
# NOTE(review): `crops_z` is not referenced by any later cell visible in this notebook;
# the similarity cell below calls the full model on `preprocess_crops` again.
with torch.no_grad():
    crops_z: Float[torch.Tensor, 'X 1024'] = clip_model.encode_image(preprocess_crops)

Evaluate the cosine similarity between each bounding box and the reference expression. Finally, consider only the bounding box with the highest similarity score.

In [None]:
# Run the full CLIP model on crops and prompts; it returns a pair of logit matrices,
# one indexed crop-by-prompt and one prompt-by-crop.
with torch.no_grad():
    ass_z: tuple[Float[torch.Tensor, 'X P'], Float[torch.Tensor, 'P X']] = clip_model(
        image=preprocess_crops,
        text=prompts_tokens,
    )
    logits_per_crop = ass_z[0]
    logits_per_prompt = ass_z[1]

Get index of the bounding box which is characterized by the highest similarity score with respect to the input reference expression.

In [None]:
# Best score of every crop across all prompts (maximum over the prompt dimension)
logits_per_prompt.max(dim=0).values

In [None]:
# Index of the crop whose best prompt score is the highest, and the matching YOLO box
best_match = logits_per_prompt.max(dim=0).values.argmax()
best_bbox = bboxes[0][best_match]

# Random RGB colour; the same colour is reused when the ground-truth box is drawn later
r, g, b = torch.randint(low=0, high=256, size=[3]).tolist()
img_bbox = torchvision.utils.draw_bounding_boxes(
    img,
    best_bbox[:4].unsqueeze(0),
    colors=(r, g, b),
    width=2,
)

In [None]:
# Convert the annotated image tensor back to a PIL image, using the same torchvision
# converter as in Step 1 (no `tensor_to_pil` helper is defined in this notebook).
output_pil = transforms.ToPILImage()(img_bbox)
output_pil.show()

Output the groundtruth bounding box.

In [None]:
# Draw the ground-truth box on top of the image that already shows the predicted box.
# It uses the same colour as the prediction but a thicker outline (5 instead of 2).
img_bbox = torchvision.utils.draw_bounding_boxes(
    img_bbox,
    true_xyxy.unsqueeze(0),
    width=5,
    colors=(r, g, b),
)

In [None]:
# Convert the annotated image tensor back to a PIL image, using the same torchvision
# converter as in Step 1 (no `tensor_to_pil` helper is defined in this notebook).
output_pil = transforms.ToPILImage()(img_bbox)
output_pil.show()

## Step 5: evaluate the results

$$
J(A, B) = \frac {|A \cap B|} {|A \cup B|}
$$

Where:

- $A$ is the ground truth bbox
- $B$ is our bbox

In [None]:
# Intersection over union between the ground-truth box and the selected box (both as 1x4 xyxy tensors)
torchvision.ops.box_iou(
    boxes1=true_xyxy.unsqueeze(0),
    boxes2=best_bbox[:4].unsqueeze(0),
)

$$
\text{cosine similarity } A, B := \cos(\theta) = \frac{\mathbf{A} \cdot \mathbf{B}}{||\mathbf{A}||  ||\mathbf{B}||}
$$

Where:

- $\mathbf A$ is the ground truth bbox in CLIP latent space
- $\mathbf B$ is our bbox in CLIP latent space

In [None]:
# PIL's crop expects (left, upper, right, lower), so the ground-truth crop must use the
# xyxy box; the xywh box would be misread as corner coordinates.
ground_truth_crop = img_pil.crop(tuple(true_xyxy.tolist()))

# Truncate the predicted coordinates to integers before cropping.
best_crop = img_pil.crop(tuple(best_bbox[:4].to(torch.int).tolist()))

# Encode both crops on the device the CLIP model lives on, without tracking gradients.
clip_device = next(clip_model.parameters()).device
with torch.no_grad():
    A = clip_model.encode_image(preprocess(ground_truth_crop).unsqueeze(0).to(clip_device))
    B = clip_model.encode_image(preprocess(best_crop).unsqueeze(0).to(clip_device))

F.cosine_similarity(A, B)

$$
||\mathbf x - \mathbf y||
$$

In [None]:
# Display the two CLIP embeddings computed in the cosine-similarity cell.
A, B

In [None]:
# Euclidean distance between the two embeddings. detach().float() gives float32 tensors
# without re-wrapping existing tensors in torch.tensor, which warns and copies.
torch.cdist(A.detach().float(), B.detach().float(), p=2)

$$
\text{Pre} = \frac{TP}{TP + FP} \qquad \text{Rec} = \frac{TP}{TP + FN}
$$