In [25]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
import json
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from transformers import CLIPModel

# Load the CLIP model
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# Load the embedding dictionary and golden truth JSON
with open("/content/drive/MyDrive/golden_truth/golden_truth_idiomatic.json", "r") as gt_file:
    golden_truth = json.load(gt_file)

# Load the embedding dictionary
embedding_dict_path = "/content/drive/MyDrive/clip_train_idiom_embeddings.pt"
embedding_dict = torch.load(embedding_dict_path)

print(type(embedding_dict))

<class 'list'>


  embedding_dict = torch.load(embedding_dict_path)


In [41]:
print(type(embedding_dict[0]))  # Print the first element

<class 'dict'>


In [43]:
# Inspect the structure of the first dictionary in the list
for i, compound_data in enumerate(embedding_dict[:1]):  # Print details of the first compound
    print(f"Compound {i + 1}: {compound_data['compound_name']}")
    print(f"Text Embedding Shape: {compound_data['text_embedding'].shape}")
    print(f"Number of Images: {len(compound_data['images'])}")
    for img in compound_data['images']:
        print(f"  Image ID: {img['image_id']}, Image Embedding Shape: {img['image_embedding'].shape}")


Compound 1: elbow grease
Text Embedding Shape: torch.Size([1, 512])
Number of Images: 5
  Image ID: 74852536462.png, Image Embedding Shape: torch.Size([1, 512])
  Image ID: 53378381715.png, Image Embedding Shape: torch.Size([1, 512])
  Image ID: 39938261459.png, Image Embedding Shape: torch.Size([1, 512])
  Image ID: 54879908369.png, Image Embedding Shape: torch.Size([1, 512])
  Image ID: 35234427395.png, Image Embedding Shape: torch.Size([1, 512])


In [52]:
data = embedding_dict['compound_name']

KeyError: 'compound_name'

In [51]:
class RankingDataset(Dataset):
    def __init__(self, embedding_dict, golden_truth):
        """
        :param embedding_dict: Precomputed embeddings for text and images
        :param golden_truth: JSON dictionary with the correct ranking for each compound
        """
        self.embedding_dict = embedding_dict
        self.golden_truth = golden_truth
        self.compounds = list(golden_truth.keys())

    def __len__(self):
        return len(self.compounds)

    def __getitem__(self, idx):
        compound = self.compounds[idx]
        print(f"\nProcessing Compound: {compound}")

        # Check if compound exists in the embedding dictionary
        if compound not in self.embedding_dict:
            print(f"Compound '{compound}' not found in embedding dictionary. Skipping.")
            return None  # Skip missing entries

        # Access text embedding and image embeddings
        data = self.embedding_dict[compound]
        text_embedding = data["text_embedding"]  # Shape: (1, 512)
        print(f"Text Embedding Shape: {text_embedding.shape}")

        images = data["images"]
        print(f"Number of Images: {len(images)}")
        image_embeddings = torch.stack([img["image_embedding"] for img in images])  # Shape: (num_images, 512)
        print(f"Image Embeddings Shape: {image_embeddings.shape}")

        image_ids = [img["image_id"] for img in images]
        print(f"Image IDs: {image_ids}")

        # Map golden truth rank to indices
        correct_order = self.golden_truth.get(compound, [])
        print(f"Correct Order from Golden Truth: {correct_order}")

        correct_indices = [
            image_ids.index(img_id) for img_id in correct_order if img_id in image_ids
        ]
        print(f"Correct Indices: {correct_indices}")

        return text_embedding.squeeze(0), image_embeddings, correct_indices

dataset = RankingDataset(embedding_dict, golden_truth)

# Iterate over the dataset and print the debug output for the first 2 compounds
for idx in range(2):
    result = dataset[idx]
    if result:
        text_embedding, image_embeddings, correct_indices = result
        print(f"Processed Result for Compound {idx + 1}:")
        print(f"  Text Embedding Shape: {text_embedding.shape}")
        print(f"  Image Embeddings Shape: {image_embeddings.shape}")
        print(f"  Correct Indices: {correct_indices}")




Processing Compound: elbow grease


IndexError: too many indices for tensor of dimension 2

In [46]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

# Define the CLIP Adapter
class CLIPAdapter(nn.Module):
    def __init__(self, embedding_dim):
        super(CLIPAdapter, self).__init__()
        self.adapter = nn.Sequential(
            nn.Linear(embedding_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 1)  # Predict a single score per image embedding
        )

    def forward(self, text_embedding, image_embeddings):
        # Normalize embeddings
        text_embedding = text_embedding / text_embedding.norm(dim=-1, keepdim=True)
        image_embeddings = image_embeddings / image_embeddings.norm(dim=-1, keepdim=True)

        # Compute dot product similarity scores
        similarity_scores = torch.matmul(image_embeddings, text_embedding.unsqueeze(-1)).squeeze(-1)
        # Refine scores using the adapter
        refined_scores = self.adapter(similarity_scores.unsqueeze(-1)).squeeze(-1)
        return refined_scores


# Dataset class
class RankingDataset(Dataset):
    def __init__(self, embedding_dict, golden_truth):
        """
        :param embedding_dict: Precomputed embeddings for text and images
        :param golden_truth: JSON dictionary with the correct ranking for each compound
        """
        self.embedding_dict = embedding_dict
        self.golden_truth = golden_truth
        self.compounds = list(golden_truth.keys())

    def __len__(self):
        return len(self.compounds)

    def __getitem__(self, idx):
        compound = self.compounds[idx]

        # Check if compound exists in the embedding dictionary
        if compound not in self.embedding_dict:
            return None  # Skip missing entries

        # Access text embedding and image embeddings
        data = self.embedding_dict[compound]
        text_embedding = data["text_embedding"]  # Shape: (1, 512)
        images = data["images"]
        image_embeddings = torch.stack([img["image_embedding"] for img in images])  # Shape: (num_images, 512)
        image_ids = [img["image_id"] for img in images]

        # Map golden truth rank to indices
        correct_order = self.golden_truth.get(compound, [])
        correct_indices = [image_ids.index(img_id) for img_id in correct_order if img_id in image_ids]

        return text_embedding.squeeze(0), image_embeddings, correct_indices


# Custom collate function
def collate_fn(batch):
    batch = [b for b in batch if b is not None]  # Filter out None entries
    if not batch:
        return None  # Return None if the batch is empty
    return torch.utils.data.default_collate(batch)


# Loss function
def listnet_loss(pred_scores, true_indices):
    """
    Compute the ListNet loss for predicted scores and true rankings.
    """
    num_images = pred_scores.size(0)
    true_distribution = torch.zeros(num_images, dtype=torch.float32, device=pred_scores.device)
    for i, idx in enumerate(true_indices):
        true_distribution[idx] = len(true_indices) - i  # Higher rank gets higher weight
    true_distribution = true_distribution / true_distribution.sum()  # Normalize to probabilities
    pred_prob = torch.softmax(pred_scores, dim=0)  # Predicted probabilities
    loss = -torch.sum(true_distribution * torch.log(pred_prob + 1e-9))  # Cross-entropy loss
    return loss


# Training function
def train_model(model, dataloader, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            if batch is None:  # Skip empty batches
                continue

            text_embedding, image_embeddings, true_indices = batch
            text_embedding = text_embedding.squeeze(0)  # Shape: (512,)
            image_embeddings = image_embeddings.squeeze(0)  # Shape: (num_images, 512)

            optimizer.zero_grad()
            pred_scores = model(text_embedding, image_embeddings)
            loss = listnet_loss(pred_scores, true_indices)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")


# Initialize model, optimizer, and dataloader
embedding_dim = 512  # Dimension of the embeddings
clip_adapter = CLIPAdapter(embedding_dim)
optimizer = optim.Adam(clip_adapter.parameters(), lr=1e-4)

# Example: Load dataset and golden truth
# embedding_dict = torch.load("/path/to/embedding_dict.pt")
# golden_truth = json.load(open("/path/to/golden_truth.json"))

dataset = RankingDataset(embedding_dict, golden_truth)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)

# Train the model
train_model(clip_adapter, dataloader, optimizer, epochs=10)


IndexError: too many indices for tensor of dimension 2

In [45]:

# Define the CLIP Adapter
class CLIPAdapter(nn.Module):
    def __init__(self, embedding_dim):
        super(CLIPAdapter, self).__init__()
        self.adapter = nn.Sequential(
            nn.Linear(embedding_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 1)  # Predict a single score per image embedding
        )

    def forward(self, text_embedding, image_embeddings):
        # Normalize embeddings
        text_embedding = text_embedding / text_embedding.norm(dim=-1, keepdim=True)
        image_embeddings = image_embeddings / image_embeddings.norm(dim=-1, keepdim=True)

        # Compute dot product similarity scores
        similarity_scores = torch.matmul(image_embeddings, text_embedding.unsqueeze(-1)).squeeze(-1)
        # Refine scores using the adapter
        refined_scores = self.adapter(similarity_scores.unsqueeze(-1)).squeeze(-1)
        return refined_scores

# Dataset class
class RankingDataset(Dataset):
    def __init__(self, embedding_dict, golden_truth):
        """
        :param embedding_dict: Precomputed embeddings for text and images
        :param golden_truth: JSON dictionary with the correct ranking for each compound
        """
        self.embedding_dict = embedding_dict
        self.golden_truth = golden_truth
        self.compounds = list(golden_truth.keys())

    def __len__(self):
        return len(self.compounds)

    def __getitem__(self, idx):
        compound = self.compounds[idx]

        # Check if compound exists in the embedding dictionary
        if compound not in self.embedding_dict:
            print(f"Warning: Missing embedding for compound '{compound}', skipping.")
            return None  # Skip missing entries

        # Access text embedding and image embeddings
        data = self.embedding_dict[compound]
        text_embedding = data["text_embedding"]  # Shape: (1, 512)
        images = data["images"]
        image_embeddings = torch.stack([img["image_embedding"] for img in images])  # Shape: (num_images, 512)
        image_ids = [img["image_id"] for img in images]

        # Map golden truth rank to indices
        correct_order = self.golden_truth.get(compound, [])
        correct_indices = [image_ids.index(img_id) for img_id in correct_order if img_id in image_ids]

        return text_embedding.squeeze(0), image_embeddings, correct_indices


# Loss function
def listnet_loss(pred_scores, true_indices):
    """
    Compute the ListNet loss for predicted scores and true rankings.
    """
    num_images = pred_scores.size(0)
    true_distribution = torch.zeros(num_images, dtype=torch.float32)
    for i, idx in enumerate(true_indices):
        true_distribution[idx] = len(true_indices) - i  # Higher rank gets higher weight
    true_distribution = true_distribution / true_distribution.sum()  # Normalize to probabilities
    pred_prob = torch.softmax(pred_scores, dim=0)  # Predicted probabilities
    loss = -torch.sum(true_distribution * torch.log(pred_prob + 1e-9))  # Cross-entropy loss
    return loss

def train_model(model, dataloader, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            if batch is None:  # Skip empty batches
                continue

            text_embedding, image_embeddings, true_indices = batch
            text_embedding = text_embedding.squeeze(0)  # Shape: (512,)
            image_embeddings = image_embeddings.squeeze(0)  # Shape: (num_images, 512)

            optimizer.zero_grad()
            pred_scores = model(text_embedding, image_embeddings)
            loss = listnet_loss(pred_scores, true_indices)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")


IndexError: too many indices for tensor of dimension 2

In [None]:
# Create dataset and dataloader
dataset = RankingDataset(embedding_dict, golden_truth)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Initialize CLIP Adapter model, optimizer
clip_adapter = CLIPAdapter(embedding_dim=512)
optimizer = optim.Adam(clip_adapter.parameters(), lr=0.001)

# Train the model
train_model(clip_adapter, dataloader, optimizer)