Baseline algorithm
---

In this notebook we propose a training-free approach that combines zero-shot CLIP with a YOLO architecture.
The method extracts all the bounding boxes proposed by YOLO and uses CLIP to score the similarity between each box and the textual query.

Dependencies

In [None]:
%%shell
# Download the requirements file of YOLOv5 v7.0 and place a copy inside the torch.hub cache directory.
# NOTE(review): presumably the copy is there for the YOLOv5 hub code loaded later — confirm it is still needed.
wget "https://raw.githubusercontent.com/ultralytics/yolov5/v7.0/requirements.txt" -O "yolo-requirements.txt"
mkdir -p /root/.cache/torch/hub
cp yolo-requirements.txt /root/.cache/torch/hub/requirements.txt

# Write the notebook's own dependency list to requirements.txt (tee also echoes it to the cell output).
tee requirements.txt << END
ftfy
jaxtyping
jupyter
matplotlib
pydantic
regex
torch
torchvision
tqdm
END

# Install, in order: the notebook dependencies, CLIP from its GitHub repository, the YOLOv5 requirements.
pip install -q -r requirements.txt
pip install -q git+https://github.com/openai/CLIP.git
pip install -q -r yolo-requirements.txt

In [None]:
import clip
import json
import os
import pickle
import re
import torch
import torch.nn.functional as F
import torchvision

from datetime import datetime
from jaxtyping import Float, UInt, Int
from pydantic.dataclasses import dataclass
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchvision import transforms
from torchvision.io import read_image
from typing import Literal, Callable, Mapping, TypeVar
from tqdm import tqdm

In [None]:
# Run on the GPU when one is visible to torch, otherwise on the CPU,
# and make that device the default for newly created tensors.
_cuda_available: bool = torch.cuda.is_available()
device: Literal['cpu', 'cuda'] = 'cuda' if _cuda_available else 'cpu'
torch.set_default_device(device)

Download the dataset

In [None]:
%%shell
# Download and unpack the RefCOCOg archive unless it has already been extracted.
# The guard checks the extracted folder (dataset/refcocog, the root used by the path cell)
# rather than `dataset` itself: `dataset` is created before the download starts, so a
# failed download would otherwise make every later run skip the whole step.
# NOTE(review): relies on `gdown` being available in the environment — confirm.
if ! [ -d dataset/refcocog ]; then
  mkdir -p dataset &&
  gdown 1P8a1g76lDJ8cMIXjNDdboaRR5-HsVmUb &&
  tar -xf refcocog.tar.gz -C dataset &&
  rm refcocog.tar.gz
fi

Folder paths

In [None]:
# Dataset layout: <root>/annotations/{instances.json, refs(umd).p} and <root>/images/.
# Directory paths keep a trailing separator.
root = os.path.join('dataset', 'refcocog') + os.sep
_annotations_dir = os.path.join(root, 'annotations')
data_instances = os.path.join(_annotations_dir, 'instances.json')
data_refs = os.path.join(_annotations_dir, 'refs(umd).p')
data_images = os.path.join(root, 'images') + os.sep

Type declaration

In [None]:
# Generic placeholders for the outputs of the dataset transforms.
I = TypeVar('I')  # transformed image
P = TypeVar('P')  # transformed prompts
B = TypeVar('B')  # transformed bounding box
T = TypeVar('T')  # general-purpose placeholder

# Raw image tensor. torchvision's read_image returns channel-first data
# (channels, height, width), so the axis labels follow that order.
Img = UInt[torch.Tensor, 'C H W']
# Four-component bounding box tensor.
BBox = UInt[torch.Tensor, '4']
# Names of the dataset partitions.
Split = Literal['train', 'test', 'val']

@dataclass
class Info:
    """General metadata stored under the `info` entry of the instances file."""
    description: str  # example: This is stable 1.0 version of the 2014 MS COCO dataset.
    url: str  # example: http://mscoco.org/
    version: str  # example: 1.0
    year: int  # example: 2014
    contributor: str  # example: Microsoft COCO group
    date_created: datetime  # example: 2015-01-27 09:11:52.357475

@dataclass
class Image:
    """Metadata of one image listed under the `images` entry of the instances file."""
    license: int  # each image has an associated license id
    file_name: str  # file name of the image
    coco_url: str  # example: http://mscoco.org/images/131074
    height: int
    width: int
    flickr_url: str  # example: http://farm9.staticflickr.com/8308/7908210548_33e
    id: int  # id of the image
    date_captured: datetime  # example: '2013-11-21 01:03:06'

@dataclass
class License:
    """One license listed under the `licenses` entry of the instances file."""
    url: str  # example: http://creativecommons.org/licenses/by-nc-sa/2.0/
    id: int  # id of the license
    name: str  # example: 'Attribution-NonCommercial-ShareAlike License'

@dataclass
class Annotation:
    """
    One object annotation listed under the `annotations` entry of the instances file.

    NOTE(review): COCO-style files commonly store `area` and `bbox` as floats; confirm
    that the int declarations below validate with the pydantic version in use.
    """
    # segmentation: list[list[float]]  # description of the mask; example [[44.17, 217.83, 36.21, 219.37, 33.64, 214.49, 31.08, 204.74, 36.47, 202.68, 44.17, 203.2]]
    area: int  # number of pixels of the described object
    iscrowd: Literal[1, 0]  # crowd annotations (iscrowd=1) are used to label large groups of objects (e.g. a crowd of people)
    image_id: int  # id of the target image
    bbox: tuple[int, int, int, int]  # bounding box coordinates [xmin, ymin, width, height]
    category_id: int
    id: int  # annotation id

@dataclass
class Category:
    """One object category listed under the `categories` entry of the instances file."""
    supercategory: str  # example: 'vehicle'
    id: int  # category id
    name: str  # example: 'airplane'

@dataclass
class Instances:
    """Top-level structure of the instances file; built directly from the parsed JSON."""
    info: Info
    images: list[Image]
    licenses: list[License]
    annotations: list[Annotation]
    categories: list[Category]

@dataclass
class Sentence:
    """One referring expression attached to a `Ref`."""
    tokens: list[str]  # tokenized version of the referring expression
    raw: str  # unprocessed referring expression
    sent: str  # referring expression with mild processing: lower case, spell correction, etc.
    sent_id: int  # unique referring expression id

@dataclass
class Ref:
    """One entry of the refs file: an annotated object together with its referring expressions."""
    image_id: int  # unique image id
    split: Split
    sentences: list[Sentence]
    file_name: str  # file name of the image relative to img_root
    category_id: int  # object category label
    ann_id: int  # id of the object annotation in instances.json
    sent_ids: list[int]  # same ids as nested sentences[...][sent_id]
    ref_id: int  # unique id of the referring expression


Read the dataset infos

In [None]:
def fix_ref(x: Ref) -> Ref:
    """
    Replace the file name of a reference with its cleaned form (see `fix_filename`).

    :param x: reference to patch; it is modified in place
    :return:  the same reference object, for convenient chaining
    """
    cleaned_name = fix_filename(x.file_name)
    x.file_name = cleaned_name
    return x


def fix_filename(x: str) -> str:
    """
    Strip the trailing annotation id from an image file name.

    Names that do not end in `_<digits>.jpg` are returned unchanged.

    :param x: COCO_..._[image_id]_[annotation_id].jpg
    :return:  COCO_..._[image_id].jpg
    """
    # Raw string: `\d` and `\.` are regex escapes, not Python string escapes.
    return re.sub(r'_\d+\.jpg$', '.jpg', x)

In [None]:
# Load the raw referring-expression records.
# `with` closes the file even when unpickling raises.
# SECURITY: pickle.load can execute arbitrary code; only load this file from a trusted source.
# Note: the name `raw` is reused later for the parsed instances JSON, so run the cells in order.
with open(data_refs, 'rb') as f:
    raw = pickle.load(f)

In [None]:
# Turn every raw record into a `Ref` and clean up its file name.
refs: list[Ref] = []
for _record in raw:
    refs.append(fix_ref(Ref(**_record)))

In [None]:
# Load the raw instances description.
# `with` closes the file even when JSON parsing raises.
# Note: this rebinds `raw`, which previously held the unpickled refs records.
with open(data_instances, 'r') as f:
    raw = json.load(f)

In [None]:
# Build the typed `Instances` structure from the top-level keys of the parsed JSON.
instances: Instances = Instances(**raw)

In [None]:
# Index the annotations by their id, so a reference can reach its annotation through `ann_id`.
id2annotation: Mapping[int, Annotation] = dict(
    (annotation.id, annotation)
    for annotation in instances.annotations
)

Define custom dataset

In [None]:
class CocoDataset(Dataset[tuple[I, P, B]]):
    """
    References of one split, served as (image, prompts, bounding box) triples.

    Each component goes through its own transform when a sample is fetched.
    """

    def __init__(
        self,
        split: Split,
        img_transform: Callable[[Img], I] = lambda x: x,
        prompt_transform: Callable[[list[Sentence]], P] = lambda ps: [ p.sent for p in ps ],
        bb_transform: Callable[[UInt[torch.Tensor, '4']], B] = lambda x: x
    ):
        """
        :param split: train, test or val
        :param img_transform: transformation applied to each image after it is read
        :param prompt_transform: transformation applied to the list of sentences
        :param bb_transform: transformation applied to the bounding box
        """
        self.img_transform = img_transform
        self.prompt_transform = prompt_transform
        self.bb_transform = bb_transform

        # One entry per reference of the requested split:
        #   image path, referring-expression objects, bounding box tensor
        #   (built from the annotation's bbox field).
        # Images are not read here; only their paths are stored.
        self.items: list[tuple[str, list[Sentence], UInt[torch.Tensor, '4']]] = []
        for ref in refs:
            if ref.split != split:
                continue
            image_path = os.path.join(data_images, ref.file_name)
            bbox = torch.tensor(id2annotation[ref.ann_id].bbox, dtype=torch.int)
            self.items.append((image_path, ref.sentences, bbox))

    def __len__(self) -> int:
        """Number of references in the split."""
        return len(self.items)


    def __getitem__(self, item: int) -> tuple[I, P, B]:
        """Read the image from disk and return the transformed (image, prompts, bounding box)."""
        image_path, sentences, bbox = self.items[item]
        image = self.img_transform(read_image(image_path))
        prompts = self.prompt_transform(sentences)
        box = self.bb_transform(bbox)
        return image, prompts, box

**TODO**: the baseline is actually training-free, so splitting the dataset into training and test sets does not make much sense.

In [None]:
# Both datasets use the default transforms: raw image tensor, list of `sent` strings, raw bounding box.
train_dataset: Dataset[tuple[Img, list[str], UInt[torch.Tensor, '4']]] = CocoDataset(split='train')
test_dataset: Dataset[tuple[Img, list[str], UInt[torch.Tensor, '4']]] = CocoDataset(split='test')

In [None]:
# Both loaders share the same settings.
_loader_settings = dict(
    batch_size=1,   # one sample per batch; the evaluation loop unpacks exactly one
    num_workers=0,  # no worker subprocesses: data is loaded in the main process
    shuffle=False,  # keep the dataset order
)

train_dataloader = DataLoader(dataset=train_dataset, **_loader_settings)
test_dataloader = DataLoader(dataset=test_dataset, **_loader_settings)

Load yolo model

In [None]:
# Load the 'yolov5s' entry point of the ultralytics/yolov5 repository through torch.hub,
# then move the model to the selected device and switch it to evaluation mode.
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
yolo_model.to(device=device).eval()

Load clip model

In [None]:
# Load the 'RN50' CLIP model together with its image preprocessing function,
# then move the model to the selected device and switch it to evaluation mode.
clip_model, preprocess = clip.load('RN50')
clip_model = clip_model.to(device=device).eval()

Evaluate entire dataset

In [None]:
# Per-sample metrics, reset on every run of the cell and filled by the loop below.
ious: list[float] = []  # IoU between the ground-truth box and the predicted box
coss: list[float] = []  # cosine similarity between the CLIP embeddings of the two crops
euds: list[float] = []  # euclidean distance between the same embeddings

# read_image yields channel-first (C, H, W) images; the loader adds the leading batch axis.
batch: tuple[UInt[torch.Tensor, '1 C H W'], tuple[list[tuple[str]]], UInt[torch.Tensor, '1 4']]

# Every referring expression is scored under each of these templates.
prompt_templates: list[str] = ["{}", "A photo of {}", "We can see {}"]

with torch.no_grad():
    # Pass the loader itself (not iter(loader)) so tqdm can read its length and show a total.
    for batch in tqdm(train_dataloader):
        # batch_size is 1: unpack the single image and the single ground-truth box
        [img], prompts, [true_xywh] = batch

        [true_xyxy] = torchvision.ops.box_convert(true_xywh.unsqueeze(0), in_fmt='xywh', out_fmt='xyxy')

        # PIL version of the image (not annotated: the name `Image` in this notebook is the COCO dataclass)
        img_pil = transforms.ToPILImage()(img)

        # yolo bboxes
        predictions = yolo_model(img_pil)

        # xmin,      ymin,      xmax,      ymax,      confidence, class
        # 274.06390, 231.20389, 392.66345, 372.59018, 0.93251,    23.00000
        bboxes: Float[torch.Tensor, 'X 6'] = predictions.xyxy[0]

        # if empty, fall back to a single bbox covering the whole image;
        # img is (C, H, W), so xmax is the width and ymax is the height
        if len(bboxes) == 0:
            _channels, img_height, img_width = img.size()
            bboxes = torch.tensor([[0, 0, img_width, img_height, 0, 0]], dtype=torch.float)

        # from yolo bboxes to cropped images
        crops = [
            img_pil.crop((xmin, ymin, xmax, ymax))
            for xmin, ymin, xmax, ymax, _conf, _cls in bboxes.tolist()
        ]

        # clip preprocess on cropped images; moved explicitly to the model device
        preprocess_crops: Float[torch.Tensor, 'X 3 224 224'] = torch.stack([
            preprocess(crop)
            for crop in crops
        ]).to(device)

        # format each available prompt with each template;
        # with batch_size=1 the loader wraps every sentence in a 1-element sequence, hence `(prompt,)`
        prompts_tokens: Int[torch.Tensor, 'P 77'] = clip.tokenize([
            template.format(prompt)
            for template in prompt_templates
            for (prompt,) in prompts
        ])

        # clip scores: (crops x prompts) and its transpose (prompts x crops)
        clip_logits: tuple[Float[torch.Tensor, 'X P'], Float[torch.Tensor, 'P X']] = clip_model(preprocess_crops, prompts_tokens)
        logits_per_crop, logits_per_prompt = clip_logits

        # final prediction: the crop whose best score over all prompts is the highest
        best_match: int = torch.argmax(torch.max(logits_per_prompt, 0).values).item()
        prediction_bbox: Float[torch.Tensor, '4'] = bboxes[best_match][:4]

        # metrics
        iou: float = torchvision.ops.box_iou(true_xyxy.unsqueeze(0), prediction_bbox.unsqueeze(0)).item()
        ious.append(iou)

        # NOTE(review): a box that collapses to zero width/height after integer truncation
        # would produce an empty crop — confirm this cannot happen with this data.
        true_rectangle: list[int] = true_xyxy.tolist()
        ground_truth_crop = img_pil.crop(true_rectangle)

        # .to(torch.int) truncates like the previous torch.tensor(..., dtype=torch.int)
        # without copy-constructing a tensor from a tensor
        prediction_rectangle: list[int] = prediction_bbox.to(torch.int).tolist()
        prediction_crop = img_pil.crop(prediction_rectangle)

        # CLIP embeddings of the two crops, converted to float32
        # (the model may run in half precision)
        X: Float[torch.Tensor, '1 D'] = clip_model.encode_image(
            preprocess(ground_truth_crop).unsqueeze(0).to(device)
        ).float()
        Y: Float[torch.Tensor, '1 D'] = clip_model.encode_image(
            preprocess(prediction_crop).unsqueeze(0).to(device)
        ).float()

        cos: float = F.cosine_similarity(X, Y).item()
        coss.append(cos)

        eud: float = torch.cdist(X, Y, p=2).item()
        euds.append(eud)

        torch.cuda.empty_cache()

In [None]:
# Mean IoU over all evaluated samples.
torch.tensor(ious, dtype=torch.float).mean()

In [None]:
# Mean cosine similarity over all evaluated samples.
torch.tensor(coss, dtype=torch.float).mean()

In [None]:
# Mean euclidean distance over all evaluated samples.
torch.tensor(euds, dtype=torch.float).mean()

$$
J(X, Y) = \frac {|X \cap Y|} {|X \cup Y|}
$$

Where:

- $X$ is the ground truth bbox
- $Y$ is our bbox

$$
\text{cosine similarity } X, Y := \cos(\theta) = \frac{\mathbf{X} \cdot \mathbf{Y}}{||\mathbf{X}||  ||\mathbf{Y}||} \qquad \text{euclidean distance } X, Y := ||\mathbf X - \mathbf Y||
$$

Where:

- $\mathbf X$ is the ground truth bbox in CLIP latent space
- $\mathbf Y$ is our bbox in CLIP latent space

$$
\text{Pre} = \frac{TP}{TP + FP} \qquad \text{Rec} = \frac{TP}{TP + FN}
$$