In [1]:
import os
import sys

# The project-local modules imported below are resolved via the parent directory.
sys.path.append("..")

import clip
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import ViltProcessor, ViltForImageAndTextRetrieval

from model_zoo import get_model
from dataset_zoo import VG_Relation, VG_Attribution


In [2]:
# Both the ViLT processor and the retrieval head are loaded from the same checkpoint.
VILT_CHECKPOINT = "dandelin/vilt-b32-finetuned-coco"
vilt_processor = ViltProcessor.from_pretrained(VILT_CHECKPOINT)
vilt_model = ViltForImageAndTextRetrieval.from_pretrained(VILT_CHECKPOINT)

In [3]:
# Data root directory holding the VG-Relation and VG-Attribution images
# (a ~1GB zip, a subset of GQA). The dataset cells below pass download=False,
# so the data must already be present under this directory.
# Set the VG_DATA_ROOT environment variable to point at your own copy instead of
# editing this cell; the fallback is the path this notebook was originally run with.
root_dir = os.environ.get(
    "VG_DATA_ROOT",
    "C:/Users/ewang/OneDrive/Desktop/Fall 2023/CompVLMs/vision-language-models-are-bows/data2",
)


In [4]:
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = get_model(model_name="openai-clip:ViT-B/32", device=device, root_dir=root_dir)

In [135]:
def _as_unit_option(embeddings):
    """Turn a B x D embedding tensor into a row-normalised numpy array of shape B x 1 x D."""
    features = embeddings.cpu().numpy()  # B x D
    features = features / np.linalg.norm(features, axis=1, keepdims=True)  # unit-length rows
    return np.expand_dims(features, axis=1)  # B x 1 x D


@torch.no_grad()
def get_retrieval_scores_batched(joint_loader, clip_model=None):
    """Computes the scores for each image_option / caption_option pair in the joint loader.

    Every image option and every caption option is embedded, each embedding is scaled to
    unit length, and the score of a pair is the dot product of the two unit vectors.

    Args:
        joint_loader (DataLoader): batches have "image_options" and "caption_options" fields.
            "image_options" is a list of batched image tensors (one entry per image option),
            and "caption_options" is a list of caption batches (one entry per caption option).
        clip_model: optional object exposing ``.model`` (with ``encode_image`` and
            ``encode_text``) and ``.device``. When omitted, the notebook-level ``model``
            is used, which is what the original implementation always did.

    Returns:
        all_scores: A numpy array containing the scores of the shape NxKxL,
        where N is the number of test cases, K is the number of image options per the test case,
        and L is the number of caption options per the test case.

    Raises:
        ValueError: if the loader yields no batches (nothing to concatenate).
    """
    # Debugging leftover: callers may still inspect this notebook-level name, so it is
    # reset exactly as before. Nothing in this function fills it.
    global caption_ex
    caption_ex = []

    if clip_model is None:
        clip_model = model
    encoder = clip_model.model
    device = clip_model.device

    scores = []
    tqdm_loader = tqdm(joint_loader)
    tqdm_loader.set_description("Computing retrieval scores")
    for batch in tqdm_loader:
        image_options = [
            _as_unit_option(encoder.encode_image(i_option.to(device)))
            for i_option in batch["image_options"]
        ]
        # clip.tokenize accepts a list of strings, so each caption option is tokenized in one call.
        caption_options = [
            _as_unit_option(encoder.encode_text(clip.tokenize(list(c_option)).to(device)))
            for c_option in batch["caption_options"]
        ]

        image_options = np.concatenate(image_options, axis=1)  # B x K x D
        caption_options = np.concatenate(caption_options, axis=1)  # B x L x D
        batch_scores = np.einsum("nkd,nld->nkl", image_options, caption_options)  # B x K x L
        scores.append(batch_scores)

    all_scores = np.concatenate(scores, axis=0)  # N x K x L
    print(all_scores.shape)
    return all_scores

In [136]:
# Build the VG-Relation dataset from the local data root (no download is attempted)
# and wrap it in a sequential, unshuffled loader.
vgr_dataset = VG_Relation(
    image_preprocess=preprocess,
    download=False,
    root_dir=root_dir,
)
vgr_loader = DataLoader(
    vgr_dataset,
    batch_size=16,
    shuffle=False,
)

# Score every test case with the scorer defined in this notebook.
vgr_scores = get_retrieval_scores_batched(vgr_loader)


Computing retrieval scores: 100%|██████████| 1497/1497 [04:44<00:00,  5.25it/s]

(23937, 1, 2)





In [131]:
import torch
import matplotlib.pyplot as plt

# Your tensor 'image_tensor' should be of shape (3, 224, 224) if it's an RGB image
# image_tensor = torch.randn(3, 224, 224)  # Example tensor, replace with your own

def show_tensor_image(image_tensor):
    """Display a channel-first image tensor with matplotlib.

    Args:
        image_tensor (torch.Tensor): image laid out as C x H x W, e.g. (3, 224, 224)
            for an RGB image. It may live on the GPU and may require grad.
    """
    # detach() drops any autograd history (numpy() refuses tensors that require grad);
    # cpu() is a no-op for tensors that are already on the CPU.
    image_tensor = image_tensor.detach().cpu()

    # Convert to NumPy array after transposing the dimensions to (H x W x C)
    image_numpy = image_tensor.permute(1, 2, 0).numpy()

    # Display the image
    plt.imshow(image_numpy)
    plt.axis('off')  # Hide the axes
    plt.show()

In [None]:
# Show the image of test case 15 straight from the dataset; the notebook-level
# `image_ex` is never assigned anywhere, so it cannot be relied on here.
# NOTE(review): assumes dataset items are indexable and keep a list of C x H x W tensors
# under "image_options", mirroring the batches the scorer consumes — confirm.
show_tensor_image(vgr_dataset[15]["image_options"][0])

In [None]:
# Display whether the scores came back as a tuple rather than a single array.
isinstance(vgr_scores, tuple)

In [6]:
# Evaluate the macro accuracy
# evaluate_scores converts the raw scores into records; the frame built from them is
# read through its `Relation` and `Accuracy` columns below.
vgr_records = vgr_dataset.evaluate_scores(vgr_scores)
# Relations that are dropped before averaging.
# NOTE(review): the name suggests these are considered symmetric relations — confirm the
# selection criterion against the dataset's documentation.
symmetric = ['adjusting', 'attached to', 'between', 'bigger than', 'biting', 'boarding', 'brushing', 'chewing', 'cleaning', 'climbing', 'close to', 'coming from', 'coming out of', 'contain', 'crossing', 'dragging', 'draped over', 'drinking', 'drinking from', 'driving', 'driving down', 'driving on', 'eating from', 'eating in', 'enclosing', 'exiting', 'facing', 'filled with', 'floating in', 'floating on', 'flying', 'flying above', 'flying in', 'flying over', 'flying through', 'full of', 'going down', 'going into', 'going through', 'grazing in', 'growing in', 'growing on', 'guiding', 'hanging from', 'hanging in', 'hanging off', 'hanging over', 'higher than', 'holding onto', 'hugging', 'in between', 'jumping off', 'jumping on', 'jumping over', 'kept in', 'larger than', 'leading', 'leaning over', 'leaving', 'licking', 'longer than', 'looking in', 'looking into', 'looking out', 'looking over', 'looking through', 'lying next to', 'lying on top of', 'making', 'mixed with', 'mounted on', 'moving', 'on the back of', 'on the edge of', 'on the front of', 'on the other side of', 'opening', 'painted on', 'parked at', 'parked beside', 'parked by', 'parked in', 'parked in front of', 'parked near', 'parked next to', 'perched on', 'petting', 'piled on', 'playing', 'playing in', 'playing on', 'playing with', 'pouring', 'reaching for', 'reading', 'reflected on', 'riding on', 'running in', 'running on', 'running through', 'seen through', 'sitting behind', 'sitting beside', 'sitting by', 'sitting in front of', 'sitting near', 'sitting next to', 'sitting under', 'skiing down', 'skiing on', 'sleeping in', 'sleeping on', 'smiling at', 'sniffing', 'splashing', 'sprinkled on', 'stacked on', 'standing against', 'standing around', 'standing behind', 'standing beside', 'standing in front of', 'standing near', 'standing next to', 'staring at', 'stuck in', 'surrounding', 'swimming in', 'swinging', 'talking to', 'topped with', 'touching', 'traveling down', 'traveling on', 'tying', 'typing on', 
'underneath', 'wading in', 'waiting for', 'walking across', 'walking by', 'walking down', 'walking next to', 'walking through', 'working in', 'working on', 'worn on', 'wrapped around', 'wrapped in', 'by', 'of', 'near', 'next to', 'with', 'beside', 'on the side of', 'around']
df = pd.DataFrame(vgr_records)
# Keep only the rows whose Relation is not in the exclusion list above.
df = df[~df.Relation.isin(symmetric)]
# Unweighted mean of the Accuracy column over the remaining rows.
print(f"VG-Relation Macro Accuracy: {df.Accuracy.mean()}")

VG-Relation Macro Accuracy: 0.5947169407014137


In [8]:
# Build the VG-Attribution dataset from the local data root (no download is attempted)
# and wrap it in a sequential, unshuffled loader.
vga_dataset = VG_Attribution(
    image_preprocess=preprocess,
    download=False,
    root_dir=root_dir,
)
vga_loader = DataLoader(
    vga_dataset,
    batch_size=16,
    shuffle=False,
)

# Score every test case with the model wrapper's own batched scorer.
vga_scores = model.get_retrieval_scores_batched(vga_loader)


Computing retrieval scores: 100%|██████████| 1797/1797 [05:33<00:00,  5.38it/s]


In [9]:
# Turn the VG-Attribution scores into evaluation records, tabulate them,
# and report the unweighted mean of the Accuracy column.
vga_records = vga_dataset.evaluate_scores(vga_scores)
df = pd.DataFrame(data=vga_records)
print(f"VG-Attribution Macro Accuracy: {df.Accuracy.mean()}")

VG-Attribution Macro Accuracy: 0.6284264294250497
