In [11]:
from PIL import Image
import requests
import pandas as pd
import os
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor, CLIPModel
import torch
import torch.nn as nn
#import torchvision.transforms as transforms


In [12]:
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


In [50]:
# Define data transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


In [64]:
class LookingWhileListeningDataset(Dataset):
    def __init__(self, image_folder, pair_csv, transform=None, text_transform=None, k=2):
        self.image_folder = image_folder
        self.pairs = pd.read_csv(pair_csv)
        # If no transform is provided, use ToTensor as default
        self.transform = transform if transform is not None else transforms.ToTensor()
        self.text_transform = text_transform
        self.k = k
    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self, idx):
        row = self.pairs.iloc[idx]
        images, texts = [], []
        for i in range(self.k):
            image_filename = row[f'image{i+1}']
            image_path = os.path.join(self.image_folder, image_filename)
            image = Image.open(image_path)
            image = self.transform(image)  # This will convert the PIL image to a tensor
            images.append(image)

            text = row[f'text{i+1}']
            if self.text_transform:
               text = self.text_transform(text)
            texts.append(text)
        return images, texts




In [65]:
class MultimodalModel(nn.Module):
    def __init__(self, encoder, text_encoder, fusion_module, k=2):
        super(MultimodalModel, self).__init__()
        self.encoder = encoder  # For images
        self.text_encoder = text_encoder  # For text
        self.fusion_module = fusion_module
        self.k = k

    def forward(self, images, texts):
        image_encodings = [self.encoder(image) for image in images]
        if self.text_encoder:
            text_encodings = [self.text_encoder(text) for text in texts]
        else:
            text_encodings = [None] * self.k
        fused_encodings = [self.fusion_module(image_enc, text_enc) for image_enc, text_enc in zip(image_encodings, text_encodings)]
        return fused_encodings

In [66]:

image_folder = "C:\\Users\\sunny\\Desktop\\research\\cogeval\\tablet_images\\"

pair_csv = "C:\\Users\\sunny\\Desktop\\research\\cogeval\\manifest.csv"




In [67]:
dataset = LookingWhileListeningDataset(
    image_folder= image_folder,
    pair_csv= pair_csv,
    transform=None, 
    text_transform=None,
    k=2
)

In [68]:
def collator(batch):
    images = [item for data in batch for item in data[0]]
    texts = [item for data in batch for item in data[1]]
    return images, texts

In [69]:
data_loader = DataLoader(dataset, batch_size=32, shuffle=False, collate_fn=collator)


In [70]:
def calculate_softmax_probs(data_loader, processor, model):
    model.eval()  
    softmax_probs = []

    with torch.no_grad():  
        for batch in data_loader:
            images, texts = batch
            # Flatten the list of texts if they are in tuples for each image
            texts = [text for text_pair in texts for text in text_pair]
            # Process the images and texts through the CLIP processor
            # Convert the list of PIL Images to a list and then to a batch of tensors
            inputs = processor(text=texts, images=images, return_tensors="pt", padding=True)
            outputs = model(**inputs)
            logits_per_image = outputs.logits_per_image  # Image-text similarity score
            probs = logits_per_image.softmax(dim=1)  # Softmax to get label probabilities
            softmax_probs.extend(probs)
    
    # Convert the list of softmax probabilities to a tensor
    return torch.stack(softmax_probs)

In [71]:
softmax_probs = calculate_softmax_probs(data_loader, processor, model)

In [72]:
softmax_probs

tensor([[0.0137, 0.0037, 0.0037,  ..., 0.0074, 0.0089, 0.0087],
        [0.0141, 0.0039, 0.0039,  ..., 0.0076, 0.0082, 0.0095],
        [0.0167, 0.0042, 0.0042,  ..., 0.0059, 0.0065, 0.0115],
        ...,
        [0.0137, 0.0039, 0.0039,  ..., 0.0076, 0.0088, 0.0090],
        [0.0137, 0.0037, 0.0037,  ..., 0.0074, 0.0090, 0.0087],
        [0.0137, 0.0038, 0.0038,  ..., 0.0074, 0.0090, 0.0086]])