<a href="https://colab.research.google.com/github/francesco-vaccari/ProjectDL/blob/main/Transformer%2BMLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Downloads

In [1]:
# Install ftfy, regex and tqdm from PyPI.
!pip install ftfy regex tqdm
# Install the CLIP package straight from OpenAI's GitHub repository.
!pip install git+https://github.com/openai/CLIP.git
# Download a file from Google Drive by id.
# NOTE(review): presumably this is the refcocog.tar.gz archive extracted below — confirm.
!gdown 1xijq32XfEm6FPhUb7RsZYWHc2UuwVkiq
# Unpack the archive; the absolute /content path assumes a Colab runtime.
!tar -xf /content/refcocog.tar.gz
# Quietly install the packages listed in the YOLOv5 requirements file.
!pip install -qr https://raw.githubusercontent.com/ultralytics/yolov5/master/requirements.txt

Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.1.1
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-7l2z9d4g
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-7l2z9d4g
  Resolved https://github.com/openai/CLIP.git to commit a9b1bf5920416aaeaec965c25dd9e8f98c864f16
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369370 sha256=9968d157d40a51e9d6ebf63e42f389dadc8f842c340ca7954bfb

### Dataset

In [2]:
import json
import clip
import torch
import pandas
import numpy as np
import matplotlib.pyplot as plt

from torch.utils.data import Dataset, DataLoader
from typing import Sequence, Union

from PIL import Image, ImageDraw



class RefcocogDataset(Dataset):
    """Map-style dataset pairing RefCOCOg images with their referring sentences.

    Each row of ``self.annotations`` joins one entry of ``refs(umd).p`` with
    the matching entry of ``instances.json`` (on ``ann_id`` == ``id``) and so
    carries ``image_id``, ``split``, ``sentences``, ``ann_id``, ``bbox`` and
    ``area``.

    Args:
        base_path: Directory containing ``annotations/`` and ``images/``.
        split: Optional split name; compared lower-cased against the
            ``split`` column. ``None`` keeps every row.
        transform: Optional callable applied to the PIL image in
            ``__getitem__``.
        tokenization: Optional callable applied to each sentence string in
            ``__getitem__``.
    """

    def __init__(self, base_path, split=None, transform=None, tokenization=None):
        annotation_path = base_path + "/annotations/"

        self.IMAGES_PATH = base_path + "/images/"
        self.transform = transform
        self.tokenization = tokenization

        # NOTE(review): read_pickle unpickles the file, which can execute
        # arbitrary code; only use annotation archives from a trusted source.
        tmp_annotations = pandas.read_pickle(annotation_path + "refs(umd).p")

        # Use a context manager so the JSON file handle is closed right away
        # instead of being left for the garbage collector.
        with open(annotation_path + "instances.json", "r") as instances_file:
            tmp_instances = json.load(instances_file)

        annotations_dt = pandas.DataFrame.from_records(tmp_annotations) \
            .filter(items=["image_id", "split", "sentences", "ann_id"])

        instances_dt = pandas.DataFrame.from_records(tmp_instances['annotations'])

        self.annotations = annotations_dt \
            .merge(instances_dt[["id", "bbox", "area"]], left_on="ann_id", right_on="id") \
            .drop(columns="id")

        if split is not None:
            self.annotations = self.__get_annotations_by_split(split.lower())

    def getImage(self, sample):
        """Return the untransformed PIL image for a batched sample.

        ``sample['idx'][0]`` must support ``.item()`` (e.g. the ``idx`` entry
        of a sample collated by a ``DataLoader``); only the first element of
        the batch is used.
        """
        row_position = sample['idx'][0].item()
        item = self.annotations.iloc[row_position]

        return self.__getimage(item.image_id)

    def getSentences(self, sample):
        """Return the prompt strings for the first element of a batched sample."""
        row_position = sample['idx'][0].item()
        item = self.annotations.iloc[row_position]

        return self.__extract_sentences(item.sentences)

    def showImage(self, train_features, train_bbox):
        """Open the sample's image, draw ``train_bbox`` on it in red and show it.

        ``train_bbox`` is read as (x, y, width, height); every element must
        support ``.item()``.
        """
        img = self.getImage(train_features)
        x, y = train_bbox[0].item(), train_bbox[1].item()
        width, height = train_bbox[2].item(), train_bbox[3].item()

        ImageDraw.Draw(img).rectangle([(x, y), (x + width, y + height)], outline="red")
        img.show()

    def __get_annotations_by_split(self, split):
        # reset_index() keeps the old row labels as an extra "index" column.
        return self.annotations[self.annotations["split"] == split].reset_index()

    def __getimage(self, image_id):
        # File names are "COCO_train2014_" + the image id zero-padded to 12 digits.
        return Image.open(self.IMAGES_PATH + "COCO_train2014_" + str(image_id).zfill(12) + ".jpg")

    def __extract_sentences(self, sentences):
        # Prefix every raw sentence with the "a photo of" prompt.
        return [f"a photo of {s['sent']}" for s in sentences]

    def __tokenize_sents(self, sentences):
        return [self.tokenization(s) for s in sentences]

    def __len__(self):
        """Number of annotation rows in the selected split."""
        return self.annotations.shape[0]

    def __getitem__(self, idx):
        """Return ``(sample, bbox)`` for row ``idx``.

        ``sample`` is a dict with keys ``'idx'``, ``'image'`` (transformed when
        a transform was given) and ``'sentences'`` (tokenized when a tokenizer
        was given); ``bbox`` is the row's ``bbox`` value.
        """
        item = self.annotations.iloc[idx]
        image = self.__getimage(item.image_id)
        sentences = self.__extract_sentences(item.sentences)

        if self.transform:
            image = self.transform(image)

        if self.tokenization:
            sentences = self.__tokenize_sents(sentences)

        sample = {'idx': idx, 'image': image, 'sentences': sentences}

        return sample, item.bbox

In [3]:
# Only the preprocessing transform is needed here; the model returned by
# clip.load is discarded.
_, preprocess = clip.load("RN50")
REFCOCOG_PATH = "refcocog"

# One dataset per split, all sharing the same transform and tokenizer.
train_dataset, val_dataset, test_dataset = (
    RefcocogDataset(
        REFCOCOG_PATH,
        split=split_name,
        transform=preprocess,
        tokenization=clip.tokenize,
    )
    for split_name in ("train", "val", "test")
)

# Loaders keep the DataLoader defaults apart from an explicit shuffle=False,
# so samples come back in dataset order.
train_dataloader, val_dataloader, test_dataloader = (
    DataLoader(split_dataset, shuffle=False)
    for split_dataset in (train_dataset, val_dataset, test_dataset)
)

100%|████████████████████████████████████████| 244M/244M [00:02<00:00, 120MiB/s]


### Models

In [10]:
class CustomModel(torch.nn.Module):
    """Linear projection followed by a ReLU.

    Args:
        in_features: Size of the last dimension of the input. Defaults to
            2048, the size the notebook feeds it (image embedding concatenated
            with crop embedding in ``get_embeddings``).
        out_features: Size of the last dimension of the output. Defaults to
            1024.
            NOTE(review): presumably chosen to match the CLIP text embedding
            it is compared against in ``compute_loss`` — confirm.
    """

    def __init__(self, in_features=2048, out_features=1024):
        super().__init__()
        self.linear = torch.nn.Linear(in_features, out_features)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        """Return ``relu(linear(x))``; the output is therefore non-negative."""
        return self.relu(self.linear(x))

In [11]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# YOLOv5x from torch.hub; used by extract_crops to propose boxes.
modelYOLO = torch.hub.load('ultralytics/yolov5', 'yolov5x')

# CLIP RN50 plus its preprocessing transform, placed on the chosen device.
modelCLIP, preprocessCLIP = clip.load("RN50", device=device)

# The small trainable head, moved to the same device.
customModel = CustomModel().to(device)

Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2023-7-7 Python-3.10.12 torch-2.0.1+cu118 CUDA:0 (Tesla T4, 15102MiB)

Fusing layers... 
YOLOv5x summary: 444 layers, 86705005 parameters, 0 gradients
Adding AutoShape... 


### Functions

In [17]:
def extract_crops(image):
    """Detect objects in ``image`` with ``modelYOLO`` and cut each one out.

    Args:
        image: Image passed to ``modelYOLO``; it must also provide ``.crop``.

    Returns:
        ``(crops, crops_bboxes)``: the cropped images and, in the same order,
        the four-number box each crop was taken from (the first four values
        of every row of ``results.xyxy[0]``).
    """
    detections = modelYOLO(image)

    # Convert each detection row to a plain list once and reuse it.
    boxes = [row[:4].tolist() for row in detections.xyxy[0]]
    regions = [image.crop(box) for box in boxes]

    return regions, boxes

def convert_bbox_from_xywh_to_xyxy(bbox):
    """Turn an (x, y, width, height) box into [x1, y1, x2, y2].

    Only the first four elements of ``bbox`` are read; a new list is returned.
    """
    left, top = bbox[0], bbox[1]
    right = left + bbox[2]
    bottom = top + bbox[3]
    return [left, top, right, bottom]

def intersection_over_union(bbox1, bbox2):
    """Intersection-over-union of two [x1, y1, x2, y2] boxes.

    Args:
        bbox1: First box as four numbers (left, top, right, bottom).
        bbox2: Second box in the same layout.

    Returns:
        A float in [0, 1] for well-formed boxes. Returns 0.0 when the boxes
        do not overlap, and also when the union area is not positive (for
        example two identical zero-area boxes), which would otherwise divide
        by zero.
    """
    x_left = max(bbox1[0], bbox2[0])
    y_top = max(bbox1[1], bbox2[1])
    x_right = min(bbox1[2], bbox2[2])
    y_bottom = min(bbox1[3], bbox2[3])

    # No overlap along at least one axis.
    if x_right < x_left or y_bottom < y_top:
        return 0.0

    intersection_area = (x_right - x_left) * (y_bottom - y_top)
    bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
    union_area = bbox1_area + bbox2_area - intersection_area

    # Degenerate boxes: touching zero-area boxes pass the overlap test above
    # but have an empty union.
    if union_area <= 0:
        return 0.0

    return intersection_area / float(union_area)


def get_labels(crops, crops_bboxes, bbox, image):
    """Build a 0/1 label per crop marking the one that matches the target box.

    Args:
        crops: List of cropped images; may be appended to in place.
        crops_bboxes: List of [x1, y1, x2, y2] boxes aligned with ``crops``;
            may be appended to in place.
        bbox: Target box as (x, y, width, height); every element must support
            ``.item()``.
        image: Full image, used to cut out the target box when no crop
            matches it.

    Returns:
        ``(crops, crops_bboxes, labels)`` where ``labels`` is a float tensor
        on ``device`` with exactly one entry set to 1: the first crop whose
        IoU with the target exceeds 0.5, or, if there is none, a crop of the
        target box itself appended at the end of both lists.
    """
    target_box = convert_bbox_from_xywh_to_xyxy([coord.item() for coord in bbox])
    labels = torch.zeros(len(crops), device=device)

    # Only the first sufficiently-overlapping crop is marked positive.
    for i, crop_bbox in enumerate(crops_bboxes):
        if intersection_over_union(target_box, crop_bbox) > 0.5:
            labels[i] = 1
            return crops, crops_bboxes, labels

    # No detection matched: add the ground-truth region as the positive crop.
    crops.append(image.crop(target_box))
    crops_bboxes.append(target_box)
    # torch.ones keeps the dtype and device of the appended label equal to
    # those of `labels` instead of relying on implicit promotion in torch.cat.
    labels = torch.cat((labels, torch.ones(1, device=device)))

    return crops, crops_bboxes, labels

def get_embeddings(preprocessed_image, crops):
    """Encode the full image and every crop with CLIP and pair them up.

    Args:
        preprocessed_image: Already-preprocessed image tensor accepted by
            ``modelCLIP.encode_image`` (the caller moves it to ``device``).
        crops: Iterable of images accepted by ``preprocessCLIP``.

    Returns:
        A list with one 1-D float tensor per crop: the image embedding
        concatenated with that crop's embedding. The tensors carry no
        autograd history.
    """
    embeddings = []

    # The optimizer in this notebook only receives customModel's parameters,
    # so tracking gradients through CLIP would only cost memory and leave
    # never-cleared .grad buffers on the CLIP weights.
    with torch.no_grad():
        image_embedding = modelCLIP.encode_image(preprocessed_image).squeeze().float()

        for crop in crops:
            crop_preprocessed = preprocessCLIP(crop).unsqueeze(0).to(device)
            crop_embedding = modelCLIP.encode_image(crop_preprocessed).squeeze().float()
            embeddings.append(torch.cat((image_embedding, crop_embedding)))

    return embeddings

def pass_embeddings_through_model(embeddings):
    """Apply ``customModel`` to each embedding and return the results as a list."""
    return [customModel(vector) for vector in embeddings]

def compute_loss(outputs, labels, sentences):
    """Cosine-similarity loss between crop outputs and sentence embeddings.

    For every sentence, the similarity between its CLIP text embedding and
    each output is subtracted from the loss when the output's label is 1 and
    added otherwise, so minimizing the loss pulls the positive crop towards
    the sentence and pushes the other crops away.

    Args:
        outputs: Sequence of 1-D tensors produced by ``customModel``.
        labels: Per-output labels (tensor or sequence); 1 marks the target.
        sentences: Iterable of tokenized sentences; ``sent[0]`` is what gets
            passed to ``modelCLIP.encode_text``.

    Returns:
        A scalar tensor on ``device``. It is a plain zero with no autograd
        history when there are no sentences or no outputs.
    """
    # Read the labels once instead of comparing tensor elements in the loop.
    label_values = labels.tolist() if torch.is_tensor(labels) else list(labels)
    is_target = [value == 1 for value in label_values]

    loss = torch.zeros((), device=device)
    for sent in sentences:
        # The text encoder is not being trained (only customModel is handed
        # to the optimizer), so its forward pass needs no autograd graph.
        # The cast to float matches the image-side embeddings.
        with torch.no_grad():
            target_vector = modelCLIP.encode_text(sent.to(device)[0]).squeeze().float()

        for i, output in enumerate(outputs):
            similarity = torch.cosine_similarity(output, target_vector, dim=0)
            if is_target[i]:
                loss = loss - similarity
            else:
                loss = loss + similarity

    return loss

### Main loop

In [18]:
# Data source for this run.
loader = val_dataloader
dataset = val_dataset

learning_rate = 0.001
# Only customModel's parameters are optimized.
optimizer = torch.optim.Adam(customModel.parameters(), lr=learning_rate)

for data_features, data_bbox in loader:

    # Open the raw image once and reuse it for detection and for the
    # ground-truth crop, instead of reading it from disk twice.
    image = dataset.getImage(data_features)

    crops, crops_bboxes = extract_crops(image)

    crops, crops_bboxes, labels = get_labels(crops, crops_bboxes, data_bbox, image)

    intput_embeddings = get_embeddings(data_features['image'].to(device), crops)

    output_embeddings = pass_embeddings_through_model(intput_embeddings)

    loss = compute_loss(output_embeddings, labels, data_features['sentences'])

    print(loss)

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # Stop after the first sample: the cell currently runs a single step.
    break

tensor(-0.83568, device='cuda:0', grad_fn=<AddBackward0>)
