In [1]:
from PIL import Image
import requests
import pandas as pd
import os
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor, CLIPModel
import torch
import torch.nn as nn
import torchvision.transforms as transforms


  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [2]:
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


In [3]:
# Define data transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


In [20]:
class LookingWhileListeningDataset(Dataset):
    def __init__(self, image_folder, pair_csv, transform=None, text_transform=None, k=4):
        self.image_folder = image_folder
        self.pairs = pd.read_csv(pair_csv)
        self.transform = transform if transform is not None else transforms.ToTensor()
        self.text_transform = text_transform
        self.k = k

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        row = self.pairs.iloc[idx]
        images = []
        text = row['text1']
        if self.text_transform:
            text = self.text_transform(text)
        # Only one text for all k images
        for i in range(1, self.k + 1):  # Adjust index to start at 1
            image_filename = row[f'image{i}']  # Adjusted to directly use i for indexing
            image_path = os.path.join(self.image_folder, image_filename)
            image = Image.open(image_path)
            if self.transform:
                image = self.transform(image)  # This will convert the PIL image to a tensor
            images.append(image)
        return images, text




In [21]:
class MultimodalModel(nn.Module):
    def __init__(self, encoder, text_encoder, fusion_module, k=4):
        super(MultimodalModel, self).__init__()
        self.encoder = encoder  # For images
        self.text_encoder = text_encoder  # For text
        self.fusion_module = fusion_module
        self.k = k

    def forward(self, images, text):
        # Encode each image using the image encoder
        image_encodings = [self.encoder(image) for image in images]
        # Encode the text only once since there's only one text for all images
        text_encoding = self.text_encoder(text) if self.text_encoder else None
        # If text_encoder is None, replicate None k times for the loop below
        text_encodings = [text_encoding] * self.k if text_encoding is not None else [None] * self.k
        # Fuse each image encoding with the text encoding
        fused_encodings = [self.fusion_module(image_enc, text_enc) for image_enc, text_enc in zip(image_encodings, text_encodings)]
        return fused_encodings

    
    
    

In [22]:

image_folder = "C:\\Users\\sunny\\Desktop\\research\\cogeval\\sample_items\\trog-ex\\"

pair_csv = "C:\\Users\\sunny\\Desktop\\research\\cogeval\\sample_items\\trog-ex\\trog-manifest.csv"






In [23]:
dataset = LookingWhileListeningDataset(
    image_folder= image_folder,
    pair_csv= pair_csv,
    transform=None, 
    text_transform=None,
    k=4
)

In [24]:
def collator(batch):
    images = [image for data in batch for image in data[0]]
    # Since there is only one text per set of images, we don't need to flatten a list of texts
    texts = [data[1] for data in batch]
    return images, texts

In [25]:
data_loader = DataLoader(dataset, batch_size=32, shuffle=False, collate_fn=collator)


In [26]:
def calculate_softmax_probs(data_loader, processor, model):
    model.eval()  
    softmax_probs = []

    with torch.no_grad():
        for images, text in data_loader:  # Notice how we're expecting a single text here
            # Repeat the single text k times (k = number of images per set)
            texts = [text] * len(images)
            # Process the images and texts through the CLIP processor
            inputs = processor(text=texts, images=images, return_tensors="pt", padding=True)
            outputs = model(**inputs)
            logits_per_image = outputs.logits_per_image  # Image-text similarity score
            probs = logits_per_image.softmax(dim=1)  # Softmax to get probabilities
            softmax_probs.extend(probs)
    
    return torch.stack(softmax_probs)


In [27]:
softmax_probs = calculate_softmax_probs(data_loader, processor, model)

ValueError: Unable to infer channel dimension format

In [18]:
softmax_probs

tensor([[0.0134, 0.0098, 0.0021, 0.0070, 0.0070, 0.0047, 0.0021, 0.0043, 0.0014,
         0.0091, 0.0115, 0.0070, 0.0098, 0.0021, 0.0022, 0.0071, 0.0115, 0.0115,
         0.0014, 0.0115, 0.0021, 0.0134, 0.0021, 0.0069, 0.0070, 0.0147, 0.0098,
         0.0036, 0.0021, 0.0043, 0.0014, 0.0091, 0.0115, 0.0070, 0.0098, 0.0134,
         0.0021, 0.0069, 0.0070, 0.0147, 0.0098, 0.0036, 0.0021, 0.0043, 0.0014,
         0.0091, 0.0115, 0.0070, 0.0098, 0.0021, 0.0022, 0.0071, 0.0115, 0.0115,
         0.0014, 0.0115, 0.0021, 0.0134, 0.0098, 0.0021, 0.0070, 0.0070, 0.0047,
         0.0021, 0.0043, 0.0014, 0.0091, 0.0115, 0.0070, 0.0098, 0.0117, 0.0012,
         0.0014, 0.0021, 0.0117, 0.0134, 0.0070, 0.0070, 0.0014, 0.0091, 0.0021,
         0.0043, 0.0014, 0.0091, 0.0115, 0.0070, 0.0098, 0.0021, 0.0012, 0.0147,
         0.0036, 0.0115, 0.0021, 0.0117, 0.0012, 0.0014, 0.0021, 0.0115, 0.0012,
         0.0070, 0.0091, 0.0117, 0.0014, 0.0091, 0.0021, 0.0043, 0.0014, 0.0091,
         0.0115, 0.0070, 0.0

In [19]:
os.makedirs("evals/grammar-winoground", exist_ok=True)

In [21]:
import numpy as np
np.save("evals/grammar-winoground/clip.npy", softmax_probs)