In [1]:
import evaluate
import open_clip
import pandas as pd
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model, _, transform = open_clip.create_model_and_transforms(
  model_name="coca_ViT-L-14",
  pretrained="logs/2023_04_19-19_25_33-model_coca_ViT-L-14-lr_1e-05-b_6-j_2-p_amp/checkpoints/epoch_1.pt"
)

In [54]:
data = pd.read_parquet('metadata.parquet')
data = data[data['part_id'].between(0, 100)]
X_train, X_test  = train_test_split(data, test_size=0.2, random_state=42)
X_train = X_train[['image_name', 'prompt']].rename(columns={'image_name': 'filepath', 'prompt': 'title'})
X_test = X_test[['image_name', 'prompt']].rename(columns={'image_name': 'filepath', 'prompt': 'title'})
X_train['filepath'] = 'diff2m/' + X_train['filepath']
X_test['filepath'] = 'diff2m/' + X_test['filepath']

In [55]:
X_train.to_csv('diff2m_data.csv', index=False, sep="\t")
X_test.to_csv('val_data.csv', index=False, sep="\t")

## Compute metrics

In [56]:
! python -m training.main \
    --val-data="val_data.csv"  \
    --model 'coca_ViT-L-14' \
    --pretrained 'mscoco_finetuned_CoCa-ViT-L-14-laion2B-s13B-b90k.bin'

2023-04-19,18:14:21 | INFO | Running with a single process. Device cuda:0.
2023-04-19,18:14:21 | INFO | Loaded coca_ViT-L-14 model config.
2023-04-19,18:14:23 | INFO | Loading pretrained coca_ViT-L-14 weights (mscoco_finetuned_CoCa-ViT-L-14-laion2B-s13B-b90k.bin).
2023-04-19,18:14:25 | INFO | Model:
2023-04-19,18:14:25 | INFO | CoCa(
  (text): TextTransformer(
    (token_embedding): Embedding(49408, 768)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
           

In [59]:
! python -m training.main \
    --val-data="val_data.csv"  \
    --model 'coca_ViT-L-14' \
    --pretrained 'logs/2023_04_12-16_21_27-model_coca_ViT-L-14-lr_1e-05-b_6-j_2-p_amp/checkpoints/epoch_3.pt'

2023-04-19,18:23:39 | INFO | Running with a single process. Device cuda:0.
2023-04-19,18:23:39 | INFO | Loaded coca_ViT-L-14 model config.
2023-04-19,18:23:42 | INFO | Loading pretrained coca_ViT-L-14 weights (logs/2023_04_12-16_21_27-model_coca_ViT-L-14-lr_1e-05-b_6-j_2-p_amp/checkpoints/epoch_3.pt).
2023-04-19,18:23:47 | INFO | Model:
2023-04-19,18:23:47 | INFO | CoCa(
  (text): TextTransformer(
    (token_embedding): Embedding(49408, 768)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu)

In [1]:
! python -m training.main \
    --val-data="val_data.csv"  \
    --model 'coca_ViT-L-14' \
    --pretrained 'logs/2023_04_19-19_25_33-model_coca_ViT-L-14-lr_1e-05-b_6-j_2-p_amp/checkpoints/epoch_1.pt'

2023-04-20,09:24:43 | INFO | Running with a single process. Device cuda:0.
2023-04-20,09:24:43 | INFO | Loaded coca_ViT-L-14 model config.
2023-04-20,09:24:46 | INFO | Loading pretrained coca_ViT-L-14 weights (logs/2023_04_19-19_25_33-model_coca_ViT-L-14-lr_1e-05-b_6-j_2-p_amp/checkpoints/epoch_1.pt).
2023-04-20,09:24:50 | INFO | Model:
2023-04-20,09:24:50 | INFO | CoCa(
  (text): TextTransformer(
    (token_embedding): Embedding(49408, 768)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu)

In [3]:
import os
import pandas as pd
from torchvision.io import read_image
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class ImageDataset(Dataset):
    def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):
        self.img_labels = pd.read_csv(annotations_file, sep='\t').iloc[:2560]#.iloc[:38400]
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.img_labels)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])
        # image = read_image(img_path)
        image = Image.open(img_path).convert("RGB")
        label = self.img_labels.iloc[idx, 1]
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return image, label

In [4]:
img_dataset = ImageDataset('val_data.csv', '', transform=transform)
train_dataloader = DataLoader(img_dataset, batch_size=256, shuffle=False)

In [5]:
device = 'cuda'
model = model.to(device)

In [6]:
from PIL import Image
import torch
from tqdm import tqdm

# im = Image.open("img.jpg").convert("RGB")
# im = transform(im).unsqueeze(0)
output_text = []
orig_text = []
for batch in tqdm(train_dataloader):
    train_features, train_labels = batch

    with torch.no_grad(), torch.cuda.amp.autocast():
        train_features = train_features.to(device)
        generated = model.generate(train_features)

    for i in range(len(generated)):
        output_text.append(open_clip.decode(generated[i]).split("<end_of_text>")[0].replace("<start_of_text>", ""))
    orig_text.extend(train_labels)

100%|██████████| 10/10 [04:58<00:00, 29.90s/it]


In [7]:
bleu = evaluate.load("bleu")
results = bleu.compute(predictions=output_text, references=list(map(lambda x: [x], orig_text)))
print(results)

{'bleu': 0.08682854120861037, 'precisions': [0.3798662577732345, 0.18555725741057752, 0.13211110048761832, 0.10652273537349152], 'brevity_penalty': 0.48926025088869046, 'length_ratio': 0.5831377370440743, 'translation_length': 46956, 'reference_length': 80523}


In [8]:
from sentence_transformers import SentenceTransformer
st_model = SentenceTransformer('all-MiniLM-L6-v2')

In [9]:
label_embeddings = st_model.encode(orig_text)
prompt_embeddings = st_model.encode(output_text)

In [10]:
label_embeddings.shape

(2560, 384)

In [11]:
prompt_embeddings.shape

(2560, 384)

In [12]:
import numpy as np
from numpy import dot
from numpy.linalg import norm


def cosine_similarity(a, b):
    return dot(a, b) / (norm(a) * norm(b))

scores = []
for i in range(prompt_embeddings.shape[1]):
    scores.append(cosine_similarity(prompt_embeddings[i], label_embeddings[i]))
scores = np.array(scores)
print(scores.mean())

0.5209617
