In [5]:
import os
import gc
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torchvision
from PIL import Image
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import models, transforms as T
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from transformers import (
    Trainer,
    TrainingArguments,
    VisionTextDualEncoderModel,
    VisionTextDualEncoderProcessor,
    AutoTokenizer,
    AutoImageProcessor,
    EarlyStoppingCallback
)
from tqdm.notebook import tqdm
import albumentations as A
from albumentations.pytorch import ToTensorV2

In [6]:
pd.set_option('display.max_colwidth', None)

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7f5dd054cd70>

In [7]:
data = pd.read_json(os.path.join("advanced", "vlm.jsonl"), lines=True)
data.head(5)

Unnamed: 0,image,annotations
0,image_0.jpg,"[{'caption': 'grey missile', 'bbox': [912, 164, 48, 152]}, {'caption': 'red, white, and blue light aircraft', 'bbox': [1032, 80, 24, 28]}, {'caption': 'green and black missile', 'bbox': [704, 508, 76, 64]}, {'caption': 'white and red helicopter', 'bbox': [524, 116, 112, 48]}]"
1,image_1.jpg,"[{'caption': 'grey camouflage fighter jet', 'bbox': [1112, 172, 64, 36]}, {'caption': 'grey and white fighter plane', 'bbox': [1108, 512, 144, 48]}, {'caption': 'white and black drone', 'bbox': [356, 452, 48, 32]}, {'caption': 'white and black fighter jet', 'bbox': [404, 156, 48, 36]}, {'caption': 'white missile', 'bbox': [544, 112, 40, 40]}, {'caption': 'black and white commercial aircraft', 'bbox': [808, 504, 68, 68]}]"
2,image_2.jpg,"[{'caption': 'grey drone', 'bbox': [552, 296, 56, 52]}, {'caption': 'white and black drone', 'bbox': [992, 504, 92, 48]}, {'caption': 'yellow, red, and grey helicopter', 'bbox': [304, 88, 56, 32]}, {'caption': 'yellow commercial aircraft', 'bbox': [808, 464, 76, 60]}, {'caption': 'black cargo aircraft', 'bbox': [948, 96, 44, 36]}, {'caption': 'yellow helicopter', 'bbox': [452, 108, 40, 36]}]"
3,image_3.jpg,"[{'caption': 'white and black light aircraft', 'bbox': [476, 324, 44, 44]}, {'caption': 'grey and black fighter plane', 'bbox': [760, 260, 56, 40]}, {'caption': 'yellow helicopter', 'bbox': [984, 500, 108, 44]}, {'caption': 'red fighter plane', 'bbox': [1016, 324, 208, 68]}, {'caption': 'yellow, red, and grey helicopter', 'bbox': [680, 340, 72, 44]}, {'caption': 'white missile', 'bbox': [1016, 176, 40, 48]}, {'caption': 'blue helicopter', 'bbox': [496, 512, 44, 40]}]"
4,image_4.jpg,"[{'caption': 'white, red, and green fighter plane', 'bbox': [260, 232, 60, 56]}, {'caption': 'black camouflage fighter jet', 'bbox': [640, 172, 36, 32]}, {'caption': 'green light aircraft', 'bbox': [632, 328, 76, 48]}]"


In [8]:
data_splits = {}
data_splits["train"], data_splits["val"] = random_split(data.to_dict(orient="index"), [0.9, 0.1])
# data_splits["train"][0]

In [14]:
model = VisionTextDualEncoderModel.from_vision_text_pretrained(
    "openai/clip-vit-base-patch32", "FacebookAI/roberta-base"
)
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)
config = model.config
config

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The projection layer and logit scale weights `['visual_projection.weight', 'text_projection.weight', 'logit_scale']` are newly initialized. You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VisionTextDualEncoderConfig {
  "logit_scale_init_value": 2.6592,
  "model_type": "vision-text-dual-encoder",
  "projection_dim": 512,
  "text_config": {
    "_name_or_path": "FacebookAI/roberta-base",
    "add_cross_attention": false,
    "architectures": [
      "RobertaForMaskedLM"
    ],
    "attention_probs_dropout_prob": 0.1,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": 0,
    "chunk_size_feed_forward": 0,
    "classifier_dropout": null,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 2,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
  

In [15]:
# transforms = {
#     "train": T.Compose([
#         T.Resize(config.vision_config.image_size),
#         T.CenterCrop(config.vision_config.image_size),
#         T.ToTensor(),
#         T.Normalize(image_processor.image_mean, image_processor.image_std),
#     ]),
#     "valid": T.Compose([
#         T.Resize(config.vision_config.image_size),
#         T.CenterCrop(config.vision_config.image_size),
#         T.ToTensor(),
#         T.Normalize(image_processor.image_mean, image_processor.image_std),
#     ])
# }

transform = A.Compose([
    A.SmallestMaxSize(config.vision_config.image_size),
    A.Rotate(limit=15, p=0.3),
    A.Blur(blur_limit=(3, 5), p=0.3),
    A.CenterCrop(height=config.vision_config.image_size, width=config.vision_config.image_size),
    # A.RGBShift(r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.5),
    # A.RandomBrightnessContrast(p=0.5),
    A.Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
    ToTensorV2(),
])

In [16]:
def crop(row):
    img = Image.open(os.path.join("advanced", "images", row["image"])).convert("RGB")
    bboxes = [anno["bbox"] for anno in row["annotations"]]
    crop_imgs = [img.crop([bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]]) for bbox in bboxes]
    
    return crop_imgs

In [22]:
class CLIPDataset(Dataset):
    def __init__(self, data, tokenizer, transform=None):   
        self.data = pd.DataFrame([row for row in data])
        self.tokenizer = tokenizer
        self.transform = transform

        # extract features
        self.crop_imgs = []
        self.data.apply(lambda row: self.crop_imgs.extend(crop(row)), axis=1)
        self.img_texts = []
        self.data["annotations"].apply(lambda annos: [self.img_texts.append(anno["caption"]) for anno in annos])
                
    def __getitem__(self, idx):
        # load image standardized to RGB and transform image
        img = self.crop_imgs[idx]
        text = self.img_texts[idx]
        
        if self.transform:
            img = self.transform(image=np.array(img))["image"]
            
        text_inputs = self.tokenizer(text, padding="max_length", truncation=True)
        target = {
            "input_ids": text_inputs.input_ids,
            "attention_mask": text_inputs.attention_mask
        }

        return img, target

    def __len__(self):
        return len(self.crop_imgs)
    
    def __iter__(self):
        worker_info = torch.utils.data.get_worker_info()
        
        if worker_info is None:
            return map(self.__getitem__, range(self.__len__()))

        per_worker = int(math.ceil((self.__len__()) / float(worker_info.num_workers)))
        worker_id = worker_info.id
        iter_start = worker_id * per_worker
        iter_end = min(iter_start + per_worker, self.__len__())
        return map(self.__getitem__, range(iter_start, iter_end))

In [23]:
datasets = {
    "train": CLIPDataset(data_splits["train"], tokenizer=tokenizer, transform=transform),
    "val": CLIPDataset(data_splits["val"], tokenizer=tokenizer, transform=transform)
}
print(datasets["train"].crop_imgs[0])
print(datasets["train"].img_texts[0])
print(len(datasets["train"].crop_imgs))
print(len(datasets["train"].img_texts))
print(len(datasets["val"].crop_imgs))
print(len(datasets["val"].img_texts))

<PIL.Image.Image image mode=RGB size=36x36 at 0x7F5CCBED9D50>
black camouflage fighter jet
25179
25179
2734
2734


In [24]:
print(datasets["train"][0])

(tensor([[[ 0.8063,  0.8063,  0.8063,  ...,  0.8209,  0.8209,  0.8209],
         [ 0.8063,  0.8063,  0.8063,  ...,  0.8209,  0.8209,  0.8209],
         [ 0.8063,  0.8063,  0.8063,  ...,  0.8063,  0.8063,  0.8063],
         ...,
         [ 0.7041,  0.7187,  0.7187,  ..., -1.2959, -1.3105, -1.3105],
         [ 0.6895,  0.6895,  0.7041,  ..., -1.2959, -1.3105, -1.3105],
         [ 0.6749,  0.6749,  0.6895,  ..., -1.3105, -1.3105, -1.3105]],

        [[ 1.0393,  1.0393,  1.0393,  ...,  1.0243,  1.0243,  1.0243],
         [ 1.0393,  1.0393,  1.0393,  ...,  1.0243,  1.0243,  1.0243],
         [ 1.0393,  1.0393,  1.0393,  ...,  1.0243,  1.0243,  1.0243],
         ...,
         [ 0.9343,  0.9343,  0.9493,  ..., -1.0918, -1.0918, -1.0918],
         [ 0.9193,  0.9193,  0.9343,  ..., -1.0918, -1.0918, -1.0918],
         [ 0.9043,  0.9043,  0.9043,  ..., -1.0918, -1.0918, -1.0918]],

        [[ 1.3496,  1.3496,  1.3496,  ...,  1.3496,  1.3496,  1.3496],
         [ 1.3496,  1.3496,  1.3496,  ...,  

In [28]:
def collate_fn(batch):
    imgs = torch.stack([img for img, target in batch])
    input_ids = torch.tensor([target["input_ids"] for img, target in batch], dtype=torch.long)
    att_masks = torch.tensor([target["attention_mask"] for img, target in batch], dtype=torch.long)
    
    return {
        "pixel_values": imgs,
        "input_ids": input_ids,
        "attention_mask": att_masks,
        "return_loss": True
    }

In [31]:
out_dir = "models/clip"
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 8
epochs = 10
lr = 1e-5
wd = 0.1

In [32]:
torch.cuda.empty_cache() if device == "cuda" else None
    
training_args = TrainingArguments(
    output_dir=out_dir,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    learning_rate=lr,
    weight_decay=wd,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["val"],
    data_collator=collate_fn,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)

trainer.train()
metrics = trainer.evaluate()
print(metrics)

trainer.save_model(os.path.join(out_dir, "best_model"))
tokenizer.save_pretrained(os.path.join(out_dir, "best_model"))
image_processor.save_pretrained(os.path.join(out_dir, "best_model"))

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Epoch,Training Loss,Validation Loss
1,0.1123,0.065024
2,0.0637,0.038983
3,0.0558,0.031773
4,0.0436,0.028868
5,0.0449,0.024279
6,0.0423,0.023101
7,0.0445,0.022984
8,0.0421,0.02162
9,0.0463,0.020464
10,0.0428,0.020674


{'eval_loss': 0.020595738664269447, 'eval_runtime': 92.8829, 'eval_samples_per_second': 29.435, 'eval_steps_per_second': 3.682, 'epoch': 10.0}


['models/clip/best_model/preprocessor_config.json']