In [1]:
%%shell
if ! [ -d dataset ]; then
  mkdir dataset &&
  gdown 1i-LHWSRp2F6--yhAi4IG3DiiCHmgE4cw &&
  tar -xf refcocog.tar -C dataset &&
  rm refcocog.tar
fi



In [2]:
%%shell
tee requirements.txt << END
jaxtyping
matplotlib
more-itertools
pandas
pydantic
torch
torchvision
tqdm
END

pip install -q -r requirements.txt

jaxtyping
matplotlib
more-itertools
pandas
pydantic
torch
torchvision
tqdm




In [3]:
import PIL.Image
import csv
import itertools as it
import os
import pandas as pd
import torch
import torchvision
import typing as t

from PIL.Image import Image
from collections import defaultdict
from jaxtyping import Float, UInt
from pydantic.dataclasses import dataclass
from torch.utils.data import DataLoader, Dataset
from torchvision.ops import box_iou
from tqdm import tqdm

In [4]:
path_root: str = os.path.join('dataset', 'refcocog', '')
path_annotations: str = os.path.join(path_root, 'annotations', '')
path_bboxes: str = os.path.join(path_root, 'bboxes', '')
path_images: str = os.path.join(path_root, 'images', '')

path_refs: str = os.path.join(path_annotations, 'refs.csv')
path_sentences: str = os.path.join(path_annotations, 'sentences.csv')

path_DETR: str = os.path.join(path_bboxes, 'bboxes[DETR].csv')
path_YOLOv5: str = os.path.join(path_bboxes, 'bboxes[YOLOv5].csv')
path_YOLOv8: str = os.path.join(path_bboxes, 'bboxes[YOLOv8].csv')

In [5]:
Split = t.Literal['train', 'test', 'val']

@dataclass
class Ref:
    ref_id: int  # unique id for refering expression
    file_name: str  # file name of image relative to img_root
    split: Split
    xmin: float
    ymin: float
    xmax: float
    ymax: float


with open(path_refs, 'r') as f:
    raw = csv.DictReader(f)
    refs: list[Ref] = [ Ref(**row) for row in raw ]

In [6]:
T = t.TypeVar('T')
K = t.TypeVar('K')
V = t.TypeVar('V')

def groupby(
    xs: list[T],
    map_key: t.Callable[[T], K],
    map_value: t.Callable[[T], V] = lambda x: x
) -> dict[K, list[V]]:
    return {
        k: [ map_value(v) for v in vs ]
        for k, vs in it.groupby(sorted(xs, key=map_key), key=map_key)
    }

In [7]:
@dataclass
class Sentence:
    ref_id: int  # unique id for refering expression
    sent: str


with open(path_sentences, 'r') as f:
    raw = csv.DictReader(f)
    sentences: list[Sentence] = [ Sentence(**row) for row in raw ]


id2sents: dict[int, list[str]] = groupby(sentences, lambda x: x.ref_id, lambda x: x.sent)

In [8]:
@dataclass
class BBox:
    file_name: str  # file name of image relative to img_root
    xmin: float
    ymin: float
    xmax: float
    ymax: float
    confidence: float


with open(path_DETR, 'r') as f:
    raw = csv.DictReader(f)
    bboxes: list[BBox] = [ BBox(**row) for row in raw ]

img2detr: dict[str, list[BBox]] = defaultdict(list, groupby(bboxes, lambda x: x.file_name))


with open(path_YOLOv5, 'r') as f:
    raw = csv.DictReader(f)
    bboxes: list[BBox] = [ BBox(**row) for row in raw ]

img2yolov5: dict[str, list[BBox]] = defaultdict(list, groupby(bboxes, lambda x: x.file_name))


with open(path_YOLOv8, 'r') as f:
    raw = csv.DictReader(f)
    bboxes: list[BBox] = [ BBox(**row) for row in raw ]

img2yolov8: dict[str, list[BBox]] = defaultdict(list, groupby(bboxes, lambda x: x.file_name))



In [9]:
class CocoMetricsDataset(Dataset[tuple[Float[torch.Tensor, 'X 5'], Float[torch.Tensor, '1 4']]]):

    def __init__(
        self,
        split: Split,
        img2bboxes: dict[str, list[BBox]],
        limit: int = -1,
    ):
        self.__init__
        self.items: list[tuple[Float[torch.Tensor, 'X 5'], Float[torch.Tensor, '1 4']]] = [
            (xyxys, xyxy)
            for ref in refs
            if ref.split == split
            for img in [os.path.join(path_images, ref.file_name)]
            for bboxes in [img2bboxes[ref.file_name]]
            for xyxys in [torch.tensor([ (bbox.xmin, bbox.ymin, bbox.xmax, bbox.ymax, bbox.confidence) for bbox in bboxes ], dtype=torch.float)]
            for xyxy in [torch.tensor([(ref.xmin, ref.ymin, ref.xmax, ref.ymax)], dtype=torch.float)]
        ]
        self.len: int = len(self.items) if limit < 0 else min(limit, len(self.items))


    def __len__(self) -> int:
        return self.len


    def __getitem__(self, index: int) -> tuple[Float[torch.Tensor, 'X 5'], Float[torch.Tensor, '1 4']]:
        return self.items[index]



In [10]:
def metrics(dataset: Dataset[tuple[Float[torch.Tensor, 'X 5'], Float[torch.Tensor, '1 4']]]) -> pd.DataFrame:

    dataloader: DataLoader[tuple[Float[torch.Tensor, 'X 5'], Float[torch.Tensor, '1 4']]] = DataLoader(dataset, batch_size=None)
    Z: Float[torch.Tensor, '1 5'] = torch.zeros(1, 5)

    ious: list[float] = [ torch.max(box_iou(true_xyxy, torch.cat((Z, xyxys))[:, :4])).item() for xyxys, true_xyxy in tqdm(dataloader) ]
    rs: list[int] = [ xyxys.shape[0] for xyxys, _ in tqdm(dataloader) ]

    return pd.DataFrame({'iou': ious, '#': rs})

In [11]:
splits: list[Split] = ['train', 'val', 'test']
report: pd.DataFrame = pd.concat(
    [
        pd.concat(
            [yolov5, yolov8, detr],
            axis=1,
            keys=['yolov5', 'yolov8', 'detr']
        ).describe()
        for split in splits
        for yolov5 in [metrics(CocoMetricsDataset(split, img2yolov5))]
        for yolov8 in [metrics(CocoMetricsDataset(split, img2yolov8))]
        for detr in [metrics(CocoMetricsDataset(split, img2detr))]
    ],
    axis=1,
    keys=splits
)

100%|██████████| 42226/42226 [00:12<00:00, 3271.47it/s]
100%|██████████| 42226/42226 [00:00<00:00, 56314.74it/s]
100%|██████████| 42226/42226 [00:12<00:00, 3443.90it/s]
100%|██████████| 42226/42226 [00:00<00:00, 60843.23it/s]
100%|██████████| 42226/42226 [00:12<00:00, 3435.47it/s]
100%|██████████| 42226/42226 [00:00<00:00, 56079.90it/s]
100%|██████████| 2573/2573 [00:00<00:00, 3886.52it/s]
100%|██████████| 2573/2573 [00:00<00:00, 59471.54it/s]
100%|██████████| 2573/2573 [00:00<00:00, 3918.80it/s]
100%|██████████| 2573/2573 [00:00<00:00, 58023.39it/s]
100%|██████████| 2573/2573 [00:00<00:00, 3836.92it/s]
100%|██████████| 2573/2573 [00:00<00:00, 58250.13it/s]
100%|██████████| 5023/5023 [00:01<00:00, 3290.59it/s]
100%|██████████| 5023/5023 [00:00<00:00, 38474.09it/s]
100%|██████████| 5023/5023 [00:02<00:00, 2440.85it/s]
100%|██████████| 5023/5023 [00:00<00:00, 35739.83it/s]
100%|██████████| 5023/5023 [00:01<00:00, 3024.15it/s]
100%|██████████| 5023/5023 [00:00<00:00, 35823.39it/s]


In [12]:
display(report)

Unnamed: 0_level_0,train,train,train,train,train,train,val,val,val,val,val,val,test,test,test,test,test,test
Unnamed: 0_level_1,yolov5,yolov5,yolov8,yolov8,detr,detr,yolov5,yolov5,yolov8,yolov8,detr,detr,yolov5,yolov5,yolov8,yolov8,detr,detr
Unnamed: 0_level_2,iou,#,iou,#,iou,#,iou,#,iou,#,iou,#,iou,#,iou,#,iou,#
count,42226.0,42226.0,42226.0,42226.0,42226.0,42226.0,2573.0,2573.0,2573.0,2573.0,2573.0,2573.0,5023.0,5023.0,5023.0,5023.0,5023.0,5023.0
mean,0.825308,11.371288,0.917952,11.666438,0.916711,25.652181,0.823866,11.614069,0.914574,12.156627,0.915542,26.676642,0.825951,11.19351,0.916911,11.474617,0.917533,25.201672
std,0.183443,9.895845,0.114324,9.746315,0.082985,23.022665,0.18291,9.82122,0.118846,10.222522,0.082728,23.541012,0.181452,9.891445,0.118269,9.782079,0.080795,22.871364
min,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.026681,1.0,0.103549,2.0,0.0,1.0,0.0,1.0,0.20701,1.0
25%,0.790363,5.0,0.916166,5.0,0.900589,8.0,0.778337,5.0,0.912336,5.0,0.899168,8.0,0.784575,5.0,0.916092,5.0,0.901468,8.0
50%,0.897469,8.0,0.952232,9.0,0.941296,18.0,0.89947,8.0,0.951201,9.0,0.940851,18.0,0.89694,8.0,0.953306,8.0,0.942469,17.0
75%,0.939805,15.0,0.971201,15.0,0.964255,37.0,0.939672,15.0,0.970959,16.0,0.963627,38.0,0.940734,14.0,0.971352,14.0,0.964712,35.0
max,0.999468,127.0,0.998281,117.0,0.998923,100.0,0.990354,72.0,0.996324,87.0,0.996266,99.0,0.995819,96.0,0.9972,99.0,0.998143,100.0
