# CLIP - Dataset

## 0. imports

In [1]:
%load_ext jupyter_black

In [20]:
import os
import random

import cv2
import numpy as np
import pandas as pd

import albumentations as A
from albumentations.pytorch import ToTensorV2

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import transformers
from transformers import AutoTokenizer, AutoModel

from sklearn.model_selection import train_test_split

In [40]:
# Turn off the Hugging Face tokenizers' own parallelism for this process.
# NOTE(review): presumably set to avoid the fork warning/deadlock risk when
# DataLoader worker processes are spawned later on - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## 1. Dataset - Line by Line

### 1.1 image - transform

In [3]:
def get_transform(img_size: int, max_pixel_value: float = 255.0, stage: str = "train"):
    """Build the albumentations preprocessing pipeline for one image.

    The pipeline resizes to a square ``img_size`` x ``img_size`` image,
    normalizes it (albumentations' default mean/std, scaled by
    ``max_pixel_value``) and converts the result to a ``torch`` tensor.

    Args:
        img_size: Target height and width in pixels.
        max_pixel_value: Maximum possible pixel value of the input image,
            forwarded to ``A.Normalize``.
        stage: Pipeline flavour. Every stage currently gets the same
            deterministic pipeline; the argument is kept so callers can keep
            requesting ``"train"`` / ``"test"`` pipelines and so train-only
            augmentations can be added later without changing the interface.

    Returns:
        An ``A.Compose`` callable; use it as ``transform(image=img)["image"]``.
    """
    # ``p=1.0`` replaces the deprecated ``always_apply=True`` flag: both make
    # the step unconditional, but ``p`` is accepted by every albumentations
    # release while ``always_apply`` triggers warnings on recent ones.
    steps = [
        A.Resize(img_size, img_size, p=1.0),
        A.Normalize(max_pixel_value=max_pixel_value, p=1.0),
        ToTensorV2(),
    ]

    return A.Compose(steps)

In [4]:
# Default ("train") pipeline producing 224x224 image tensors.
transform = get_transform(img_size=224)

### 1.2 text - tokenizer

In [5]:
# Hugging Face checkpoint whose tokenizer is used to encode the captions.
tokenizer_name = "distilbert-base-uncased"

# Loads the tokenizer files for that checkpoint (downloaded on first use).
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

### 1.3 data load

In [6]:
# Locations of the image folder and of the caption table.
img_dir = "../data/Flickr-8k/Images/"
caption_path = "../data/Flickr-8k/captions.txt"
# Despite the .txt extension the caption file is parsed as CSV; later cells
# read its "image" and "caption" columns.
info_df = pd.read_csv(caption_path)

In [7]:
info_df.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


### 1.4 img & text - preprocess

In [8]:
# Draw one random row position and pull that row's file name and caption.
idx = random.choice(range(len(info_df)))

fname, caption = info_df.iloc[idx][["image", "caption"]]

In [9]:
# Upper bound on tokens per caption; longer captions are truncated.
max_length = 200

# Single-caption variant, kept for reference:
# encoded_caption = tokenizer(
#     caption, padding=True, truncation=True, max_length=max_length
# )

# Encode every caption in one call so all rows share one padded length and
# the result can be indexed by row position later on.
captions = info_df["caption"].tolist()

encoded_captions = tokenizer(
    captions, padding=True, truncation=True, max_length=max_length, return_tensors="pt"
)

In [10]:
encoded_captions["input_ids"].shape

torch.Size([40455, 42])

In [11]:
# Read the sampled image from disk and run it through the transform.
img_path = os.path.join(img_dir, fname)

# OpenCV decodes to BGR channel order; convert to RGB before transforming.
bgr_pixels = cv2.imread(img_path)
rgb_pixels = cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB)

img = transform(image=rgb_pixels)["image"]

In [12]:
# One sample: the tokenizer outputs for row `idx`, plus its image tensor and
# the raw caption string.
item = {field: values[idx] for field, values in encoded_captions.items()}
item.update(image=img, caption=caption)

In [13]:
item

{'input_ids': tensor([  101,  1037,  2158,  1998,  1037,  2450,  2265,  2037, 11937,  3406,
         29099,  8072,  2006,  2037, 12150,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'image': tensor([[[ 1.9235,  1.9235,  1.9235,  ...,  1.1872,  1.1700,  1.1529],
          [ 1.9235,  1.9235,  1.9235,  ...,  1.1700,  1.1700,  1.1529],
          [ 1.9235,  1.9235,  1.9235,  ...,  1.1700,  1.1700,  1.1529],
          ...,
          [-1.7069, -1.6898, -1.7412,  ...,  0.5193,  0.7419,  0.8104],
          [-1.7240, -1.7240, -1.6555,  ...,  0.8447,  0.7933,  0.8618],
          [-1.7754, -1.6898, -1.6042,  ...,  0.9303,  0.9303,  0.7248]],
 
         [[ 1.9909,  1.

## 2. Dataset - Class

In [14]:
class CLIPDataset(Dataset):
    """Image/caption pairs prepared for CLIP-style training.

    All captions are tokenized once at construction time; images are read
    from disk and transformed lazily in ``__getitem__``.

    Args:
        df: Table with an ``"image"`` column (file name relative to
            ``img_dir``) and a ``"caption"`` column.
        img_dir: Directory containing the image files.
        tokenizer: Callable used as ``tokenizer(list_of_str, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` that
            returns a mapping of per-row indexable values.
        transform: Callable used as ``transform(image=array)["image"]``.
        txt_max_length: Maximum number of tokens kept per caption.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        img_dir: str,
        tokenizer,
        transform,
        txt_max_length: int = 200,
    ):
        self.img_dir = img_dir
        self.tokenizer = tokenizer
        self.transform = transform
        self.txt_max_length = txt_max_length

        # dataframe (rows are always addressed by position, so a shuffled or
        # non-contiguous index coming out of a train/test split is fine)
        self.data = df

        # encoded_captions: tokenized in one batch so every row has the same
        # padded length and row i of the encoding matches row i of the table
        captions = self.data["caption"].tolist()
        self.encoded_captions = self.tokenizer(
            captions,
            padding=True,
            truncation=True,
            max_length=self.txt_max_length,
            return_tensors="pt",
        )

    def __len__(self):
        """Return the number of image/caption rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at row position ``idx``.

        Returns:
            A dict holding every tokenizer output for that row plus
            ``"image"`` (the transformed image) and ``"caption"`` (raw text).

        Raises:
            FileNotFoundError: If the image file does not exist.
            OSError: If the file exists but cannot be decoded as an image.
        """
        row = self.data.iloc[idx]
        fname = row["image"]
        caption = row["caption"]

        # txt prep
        item = {k: v[idx] for k, v in self.encoded_captions.items()}

        # img prep
        img_path = os.path.join(self.img_dir, fname)

        # cv2.imread signals failure by returning None instead of raising,
        # which would otherwise surface as an opaque cvtColor error.
        if not os.path.isfile(img_path):
            raise FileNotFoundError(f"image file not found: {img_path}")

        img = cv2.imread(img_path)
        if img is None:
            raise OSError(f"could not decode image file: {img_path}")

        # OpenCV decodes to BGR; the transform expects RGB channel order.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = self.transform(image=img)["image"]

        item["image"] = img
        item["caption"] = caption

        return item

In [15]:
# Settings shared by the dataset cells below.
data_path = "../data/Flickr-8k/captions.txt"
img_dir = "../data/Flickr-8k/Images"
tokenizer_name = "distilbert-base-uncased"
img_size = 224
txt_max_length = 200


# Rebuild the tokenizer and image transform from those settings.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
transform = get_transform(img_size=img_size)

In [23]:
test_size = 0.2
val_size = 0.2

df = pd.read_csv(data_path)

# Hold out the test rows first, then carve the validation rows out of what is
# left. `val_size` is therefore a fraction of the remaining (non-test) rows.
train_df, test_df = train_test_split(df, test_size=test_size, shuffle=True)
train_df, val_df = train_test_split(train_df, test_size=val_size, shuffle=True)

In [27]:
# Training dataset; all training captions are tokenized here, up front.
dataset = CLIPDataset(train_df, img_dir, tokenizer, transform, txt_max_length)

In [28]:
batch_size = 32

# Reshuffled batches of `batch_size` samples, loaded in the main process
# (no num_workers given).
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [29]:
# Pull a single batch to inspect what the default collate function produces.
batch = next(iter(dataloader))

In [19]:
batch

{'input_ids': tensor([[ 101, 2093, 2111,  ...,    0,    0,    0],
         [ 101, 2048, 2312,  ...,    0,    0,    0],
         [ 101, 2048, 6077,  ...,    0,    0,    0],
         ...,
         [ 101, 2048, 4268,  ...,    0,    0,    0],
         [ 101, 1037, 2158,  ...,    0,    0,    0],
         [ 101, 1037, 2450,  ...,    0,    0,    0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'image': tensor([[[[-1.8782, -1.8268, -1.8268,  ..., -1.5699, -1.7069, -1.8097],
           [-1.8610, -1.8439, -1.8097,  ..., -1.7583, -1.5357, -1.7925],
           [-1.8610, -1.8268, -1.8268,  ..., -1.2617, -1.6042, -1.7412],
           ...,
           [-2.0665, -2.0665, -2.0665,  ..., -2.1179, -2.1179, -2.1179],
           [-2.0665, -2.0494, -2.0665,  ..., -2.1179, -2.1179, -2.1179],
           [-2.0837

## 3. DataModule

In [49]:
class CLIPDataModule:
    """Bundle the train/val/test datasets and their dataloaders.

    Construction immediately runs ``setup()``, which reads the caption table,
    loads the tokenizer, builds the image transforms, splits the rows and
    creates one ``CLIPDataset`` per split.

    Args:
        data_path: Path of the CSV caption table.
        img_dir: Directory containing the image files.
        tokenizer_name: Hugging Face checkpoint name for ``AutoTokenizer``.
        img_size: Side length images are resized to.
        txt_max_length: Maximum number of tokens kept per caption.
        val_size: Fraction of the rows remaining after the test split that
            is used for validation.
        test_size: Fraction of all rows held out for testing.
        batch_size: Batch size of every dataloader.
        num_workers: Worker processes used by every dataloader.
    """

    def __init__(
        self,
        data_path: str,
        img_dir: str,
        tokenizer_name: str,
        img_size: int = 224,
        txt_max_length: int = 200,
        val_size: float = 0.2,
        test_size: float = 0.2,
        batch_size: int = 32,
        num_workers: int = 4,
    ):
        self.data_path = data_path
        self.img_dir = img_dir
        self.tokenizer_name = tokenizer_name
        self.img_size = img_size
        self.txt_max_length = txt_max_length
        self.val_size = val_size
        self.test_size = test_size
        self.batch_size = batch_size
        self.num_workers = num_workers

        self.setup()

    def setup(self):
        """Load the data and build the tokenizer, transforms and datasets."""
        # load data
        self.df = pd.read_csv(self.data_path)

        # tokenizer & transform
        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)
        self.train_transform = get_transform(img_size=self.img_size, stage="train")
        self.test_transform = get_transform(img_size=self.img_size, stage="test")

        # train/val/test split: hold out test first, then take validation
        # from the remainder using val_size (previously test_size was reused
        # here, so the val_size argument had no effect).
        # NOTE(review): rows are split individually; if the CSV holds several
        # captions per image, one image can end up in more than one split -
        # confirm whether that is acceptable.
        train_df, test_df = train_test_split(
            self.df, test_size=self.test_size, shuffle=True
        )
        train_df, val_df = train_test_split(
            train_df, test_size=self.val_size, shuffle=True
        )

        # train/val/test set
        self.trainset = self._make_dataset(train_df, self.train_transform)
        self.valset = self._make_dataset(val_df, self.test_transform)
        self.testset = self._make_dataset(test_df, self.test_transform)

    def _make_dataset(self, df, transform):
        """Create a CLIPDataset over ``df`` with the shared tokenizer/settings."""
        return CLIPDataset(
            df,
            self.img_dir,
            self.tokenizer,
            transform,
            self.txt_max_length,
        )

    def _make_loader(self, dataset, shuffle: bool):
        """Create a DataLoader with the module-wide batch size and workers."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=shuffle,
        )

    def train_dataloader(self):
        """Return a reshuffling dataloader over the training split."""
        return self._make_loader(self.trainset, shuffle=True)

    def val_dataloader(self):
        """Return a dataloader over the validation split in fixed order."""
        # Evaluation loaders keep a fixed order so validation batches (and
        # any batch-dependent metric) are comparable from epoch to epoch.
        return self._make_loader(self.valset, shuffle=False)

    def test_dataloader(self):
        """Return a dataloader over the test split in fixed order."""
        return self._make_loader(self.testset, shuffle=False)

In [50]:
# Keyword arguments for CLIPDataModule, kept in one dict for easy tweaking.
dm_params = dict(
    data_path="../data/Flickr-8k/captions.txt",
    img_dir="../data/Flickr-8k/Images",
    tokenizer_name="distilbert-base-uncased",
    img_size=224,
    txt_max_length=200,
    val_size=0.2,
    test_size=0.2,
    batch_size=32,
    num_workers=4,
)

# Building the module runs setup(): CSV load, split and caption tokenization.
dm = CLIPDataModule(**dm_params)

In [51]:
# Smoke test: fetch the first batch from each split's dataloader.
train_batch = next(iter(dm.train_dataloader()))
val_batch = next(iter(dm.val_dataloader()))
test_batch = next(iter(dm.test_dataloader()))