## Import Dataset

In [None]:
import clip
import json
import torch
import pandas
import numpy as np

from PIL import Image, ImageDraw

In [None]:
# Root folder of the dataset; annotation files and images are read from below it.
BASE_PATH = "../refcocog/"
BASE_IMG = BASE_PATH + "images/"

# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load annotation pickles that come from a trusted source.
annotations = pandas.read_pickle(BASE_PATH + "annotations/refs(umd).p")

# Keep only the reference fields that the rest of the notebook uses.
ann_dt = pandas.DataFrame.from_records(annotations).filter(items=["image_id", "split", "sentences", "ann_id"])
display(ann_dt[ann_dt.split == 'train'])

# Use a context manager so the file handle is closed as soon as the JSON is parsed
# (the previous `json.load(open(...))` left the handle open).
with open(BASE_PATH + "annotations/instances.json", 'r') as instances_file:
    instances = json.load(instances_file)
print(instances.keys())
instances_dt = pandas.DataFrame.from_records(instances['annotations'])
display(instances_dt)

images_dt = pandas.DataFrame.from_records(instances['images'])
display(images_dt)


# Per-split lists of the raw annotation records.
train_ann = [ann for ann in annotations if ann['split'] == 'train']
test_ann = [ann for ann in annotations if ann['split'] == 'test']

Merge the two pandas DataFrames (the referring-expression annotations and the instance annotations) into a single DataFrame that holds all the information needed for the computations below.

In [None]:
# Attach the bounding box and area of each reference by matching its `ann_id`
# against the `id` of the instance annotations.
instance_boxes = instances_dt[["id", "bbox", "area"]]
a_dt = pandas.merge(ann_dt, instance_boxes, left_on="ann_id", right_on="id")
display(a_dt.head(1))

## Run YOLO prediction on image

### Load YOLO model

In [None]:
# Load the detector through torch.hub from the named repository and entry point.
YOLO_REPO = 'ultralytics/yolov5'
YOLO_VARIANT = 'yolov5x'
model = torch.hub.load(YOLO_REPO, YOLO_VARIANT)

### Run prediction

In [None]:
def YoloBBoxes(img):
    """Run the detector on ``img`` and return its detections as a DataFrame.

    The frame is ``result.pandas().xyxy[0]`` with its index reset (the former
    index is kept as an ``index`` column). Two placeholder columns are added
    for the later pipeline stages:

    * ``tconfidence`` -- filled with NaN.
    * ``crop`` -- object-dtype column filled with ``None``.

    Side effect: ``result.show()`` is called, which renders the detections.
    """
    result = model(img)
    result.show()
    bbox = result.pandas().xyxy[0]
    bbox = bbox.reset_index()
    bbox["tconfidence"] = np.nan
    # The crop column later stores image objects. Creating it from NaN would
    # give it a float64 dtype, and writing an image into a float column is
    # deprecated in recent pandas (and slated to raise), so make it an object
    # column from the start.
    bbox["crop"] = pandas.Series([None] * len(bbox), index=bbox.index, dtype=object)
    return bbox


In [None]:
def CropImage(image, boxs):
    """Cut one crop out of ``image`` for every row of ``boxs``.

    Args:
        image: object exposing ``crop(box)`` where ``box`` is
            ``(xmin, ymin, xmax, ymax)``.
        boxs: DataFrame with ``xmin``, ``ymin``, ``xmax`` and ``ymax`` columns.
            It is modified in place: each crop is stored in its ``crop`` column.

    Returns:
        The list of crops, in row order.
    """
    # The crop column stores arbitrary image objects, so it has to be an
    # object column. Callers may hand in a NaN (float64) placeholder column or
    # no column at all; normalise both cases before writing into it.
    if 'crop' in boxs.columns:
        boxs['crop'] = boxs['crop'].astype(object)
    else:
        boxs['crop'] = pandas.Series([None] * len(boxs), index=boxs.index, dtype=object)

    crops = []

    for index, row in boxs.iterrows():
        box = (
            row['xmin'],
            row['ymin'],
            row['xmax'],
            row['ymax'],
        )

        crop = image.crop(box)

        crops.append(crop)
        boxs.at[index, 'crop'] = crop

    return crops


### Compute text similarity

In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Load the CLIP model together with its image preprocessing transform.
modelCLIP, preprocess = clip.load("ViT-B/32", device=device)

In [None]:
def ExtractSent(annotation):
    """Turn every sentence of ``annotation`` into an "a photo of a ..." prompt.

    ``annotation['sentences']`` must be an iterable of mappings that each
    carry the sentence text under the ``'sent'`` key.
    """
    prompts = []
    for entry in annotation['sentences']:
        prompts.append(f"a photo of a {entry['sent']}")
    return prompts

def ClipSimilarity(image, text):
    """Return the cosine-similarity matrix between image and text embeddings.

    Both inputs are encoded with ``modelCLIP``, cast to float, scaled to unit
    length along the last dimension and multiplied, so the result has one row
    per image and one column per text.
    """
    img_emb = modelCLIP.encode_image(image).float()
    txt_emb = modelCLIP.encode_text(text).float()

    # Unit-normalise so that the dot product below is a cosine similarity.
    img_emb = img_emb / img_emb.norm(dim=-1, keepdim=True)
    txt_emb = txt_emb / txt_emb.norm(dim=-1, keepdim=True)

    return img_emb @ txt_emb.T

def ComputeTextSimilarity(c, boxes, sentences=None):
    """Score every crop in ``boxes`` against the given sentences with CLIP.

    For each row, the crop stored in the ``crop`` column is compared with every
    sentence and the highest similarity is written to the row's
    ``tconfidence`` column (``boxes`` is modified in place).

    Args:
        c: unused; kept so existing calls keep working.
        boxes: DataFrame with a ``crop`` column holding the images to score.
        sentences: iterable of prompt strings. When omitted, the notebook-level
            ``test_sent`` variable is used, as the original implementation did.
    """
    if sentences is None:
        # Backward compatibility: earlier callers assign the global
        # `test_sent` before calling this function.
        sentences = test_sent

    # Tokenise once per sentence instead of once per (crop, sentence) pair.
    tokens = [clip.tokenize(sent).to(device) for sent in sentences]

    # Pure inference: no autograd graph is needed for the scores.
    with torch.no_grad():
        for index, row in boxes.iterrows():
            # The crop does not depend on the sentence, so preprocess it once.
            image = preprocess(row['crop']).unsqueeze(0).to(device)

            # `.item()` copies the scalar to the host, so this also works when
            # the model runs on CUDA (`.numpy()` raises on CUDA tensors).
            scores = [ClipSimilarity(image, text).max().item() for text in tokens]

            boxes.at[index, "tconfidence"] = max(scores) if scores else np.nan

def ExtractBestMatch(boxes):
    """Return the rows of ``boxes`` whose ``tconfidence`` equals the column maximum.

    Ties are all kept, so the result can contain more than one row.
    """
    top_score = boxes["tconfidence"].max()
    is_best = boxes["tconfidence"] == top_score
    return boxes.loc[is_best]


In [None]:
def CalculateIntersectionArea(fx1, fy1, fx2, fy2, sx1, sy1, sx2, sy2):
    """Return the overlap area of two axis-aligned rectangles.

    Args:
        fx1, fy1, fx2, fy2: two opposite corners of the first rectangle.
        sx1, sy1, sx2, sy2: two opposite corners of the second rectangle.

    The corners of each rectangle may be given in any order; they are
    normalised first so that swapped min/max coordinates do not silently
    produce an area of 0.

    Returns:
        The intersection area, or 0 when the rectangles do not overlap.
    """
    # Normalise each rectangle to (left, top, right, bottom).
    f_left, f_right = min(fx1, fx2), max(fx1, fx2)
    f_top, f_bottom = min(fy1, fy2), max(fy1, fy2)
    s_left, s_right = min(sx1, sx2), max(sx1, sx2)
    s_top, s_bottom = min(sy1, sy2), max(sy1, sy2)

    dx = min(f_right, s_right) - max(f_left, s_left)
    dy = min(f_bottom, s_bottom) - max(f_top, s_top)

    if dx >= 0 and dy >= 0:
        return dx * dy
    return 0

def VisualizeIntersections(image, best, ann):
    """Display ``image`` with the predicted box in red and the annotated box in blue.

    Args:
        image: image to draw on; a copy is annotated so the caller's image is
            left untouched.
        best: predicted box with ``xmin``, ``ymin``, ``xmax`` and ``ymax``
            entries, either as a Series or as a DataFrame whose first row is
            used. An empty DataFrame draws no predicted box.
        ann: annotated box as ``[x, y, width, height]``.
    """
    # Use the arguments, not the notebook globals, so the function works for
    # any image/box pair and not only for the last one computed.
    if isinstance(best, pandas.DataFrame):
        row = best.iloc[0] if len(best) > 0 else None
    else:
        row = best

    canvas = image.copy()
    draw = ImageDraw.Draw(canvas)

    if row is not None:
        # Plain floats: the drawing call expects numbers, not one-element Series.
        predicted = [float(row[key]) for key in ("xmin", "ymin", "xmax", "ymax")]
        draw.rectangle(predicted, outline="red", width=3)

    # Convert [x, y, width, height] to the two corner points.
    draw.rectangle(
        [ann[0], ann[1], ann[0] + ann[2], ann[1] + ann[3]],
        outline="blue",
        width=3
    )
    display(canvas)

def CalculateIntersection(box, annotation):
    """Placeholder that is not implemented yet; it ignores its arguments.

    TODO: implement, or remove if it stays unused.
    """
    return None


In [None]:
# Run the whole pipeline on the first row of the merged annotation table.
img = a_dt.head(1)

image_id = str(img['image_id'].values[0]).zfill(12)
test_img = Image.open(BASE_IMG + "COCO_train2014_" + image_id + ".jpg")

bbox = YoloBBoxes(test_img)
crops = CropImage(test_img, bbox)

# Score the crops against the sentences of the SAME annotation as the image.
# (Previously the sentences came from `test_ann[0]`, an unrelated annotation.)
# `test_sent` stays a notebook-level name because ComputeTextSimilarity reads it.
test_sent = ExtractSent(img.iloc[0])
ComputeTextSimilarity(crops, bbox)
display(bbox)

best_match = ExtractBestMatch(bbox)

bbox_annotation = img['bbox'].values[0]
display(bbox_annotation)

if best_match.empty:
    # No detections (or no scored crop): nothing to display or intersect.
    print("No bounding box matched")
    area = 0
else:
    print("Best BBox Match")
    display(best_match)
    # Positional access: the best row is not necessarily labelled 1.
    display(best_match['crop'].iloc[0])

    VisualizeIntersections(test_img, best_match, bbox_annotation)

    # Corners are passed as (xmin, ymin, xmax, ymax); the annotation is
    # [x, y, width, height] and is converted to corners here.
    area = CalculateIntersectionArea(
        best_match.xmin.values[0], best_match.ymin.values[0], best_match.xmax.values[0], best_match.ymax.values[0],
        bbox_annotation[0], bbox_annotation[1], bbox_annotation[0] + bbox_annotation[2], bbox_annotation[1] + bbox_annotation[3]
    )

print(area)