In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from dataset import CLIPDataset
from model import CLIP
from tokenizer import CLIPTextTokenizer
from loss import InfoNCECriterion
from utils import timeit

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import importlib
import model, dataset, tokenizer, loss, utils

# NOTE: this cell only imports the project modules; they are reloaded in a later cell via importlib.reload(...).



In [16]:
class Trainer:
    """Minimal training loop for a contrastive image/text model.

    Each batch is run through ``model(images, token_ids)`` and the returned
    logits are scored with cross-entropy against ``arange(batch_size)``, i.e.
    the i-th image is treated as matching the i-th caption of the batch.

    Args:
        model: Module called as ``model(images, token_ids)``; moved to ``device``.
        criterion: Accepted for interface compatibility but currently NOT used;
            the loss is always ``F.cross_entropy`` (see ``__init__``).
        train_dataloader: Iterable yielding ``(images, token_ids)`` batches.
        test_dataloader: Stored on the instance; not used by any method here.
        optimizer: Optimizer already bound to ``model``'s parameters.
        device: Device the model and every batch are moved to.
        tokenizer: Stored on the instance; not used by any method here.
        num_epochs: Number of passes over ``train_dataloader`` made by ``run``.
    """

    def __init__(self, model, criterion, train_dataloader,test_dataloader, optimizer, device, tokenizer, num_epochs):
        self.device = device
        self.model = model.to(self.device)
        # NOTE(review): the `criterion` argument is deliberately ignored for now and
        # plain cross-entropy over the logits is used instead. Confirm whether the
        # passed-in criterion should be restored before relying on it.
        self.criterion = F.cross_entropy
        self.optimizer = optimizer
        self.num_epochs = num_epochs

        self.tokenizer = tokenizer

        self.train_dataloader = train_dataloader
        self.test_dataloader = test_dataloader

    def train_step(self, images, token_ids):
        """Run one optimisation step on a single batch and return its loss as a float."""
        images = images.to(self.device)
        token_ids = token_ids.to(self.device)

        # assumes logits is a (B, B) image-vs-caption score matrix — TODO confirm in model.py
        logits = self.model(images, token_ids)
        # Row i's correct class is column i: matching pairs sit on the diagonal.
        target = torch.arange(logits.size(0), device=self.device)
        loss = self.criterion(logits, target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    @timeit
    def train_epoch(self, epoch):
        """Train for one pass over ``train_dataloader``.

        Logs the current batch loss every 10 steps.

        Returns:
            The mean batch loss over the epoch, or ``nan`` when the dataloader
            yielded no batches (previously this case raised UnboundLocalError,
            and a non-empty epoch reported only its last batch's loss).
        """
        self.model.train()
        loss_sum = 0.0
        step = 0
        for step, (images, token_ids) in enumerate(self.train_dataloader, start=1):
            loss = self.train_step(images, token_ids)
            loss_sum += loss
            if step % 10 == 0:
                print(f"Epoch [{epoch+1}/{self.num_epochs}], Step [{step}/{len(self.train_dataloader)}], Loss: {loss:.4f}")
        if step == 0:
            return float("nan")
        return loss_sum / step

    @timeit
    def run(self):
        """Train for ``num_epochs`` epochs, printing the mean loss of each epoch."""
        for epoch in range(self.num_epochs):
            loss = self.train_epoch(epoch)
            print(f"Epoch [{epoch+1}/{self.num_epochs}], Loss: {loss:.4f}")



In [None]:
# Tokenizer instance (configured with context_length=25) used by both datasets.
tokenizer_ = CLIPTextTokenizer(context_length=25)

# Pass the *instance's* tokenize function. The bare name `tokenizer` refers to the
# imported project module (and is rebound elsewhere in this notebook), so using it
# here only worked by accident of kernel state.
# NOTE(review): assumes CLIPTextTokenizer exposes `tokenize_text` — confirm in tokenizer.py.
train_dataloader = torch.utils.data.DataLoader(
    CLIPDataset(image_dir='../Images', captions_filepath='../captions.txt', tokenizer=tokenizer_.tokenize_text),
    batch_size=32, shuffle=True, num_workers=4
)
# NOTE(review): this loader reads the same image directory and captions file as the
# training loader, so it is not a held-out split; it only differs in shuffle=False.
test_dataloader = torch.utils.data.DataLoader(
    CLIPDataset(image_dir='../Images', captions_filepath='../captions.txt', tokenizer=tokenizer_.tokenize_text),
    batch_size=32, shuffle=False, num_workers=4
)

Using tokenizer: gpt2 with vocab size: 50257


In [25]:
import importlib

import torch
import torch.nn as nn
import torch.nn.functional as F

import model, dataset, tokenizer, loss, utils

# Re-execute each project module so on-disk edits are picked up without
# restarting the kernel. Order matches the import order above.
for _project_module in (model, dataset, tokenizer, loss, utils):
    importlib.reload(_project_module)
del _project_module

# Re-bind the names used by the rest of the notebook to the freshly reloaded
# definitions; these imports must come after the reloads.
from dataset import CLIPDataset
from model import CLIP
from tokenizer import CLIPTextTokenizer
from loss import InfoNCECriterion
from utils import timeit


In [26]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Fresh tokenizer instance; its vocabulary size sizes the model below.
tokenizer_ = CLIPTextTokenizer(context_length=25)

# Model, optimizer and criterion.
# NOTE: `model` now refers to the CLIP instance, no longer to the `model` module.
model = CLIP(
    vocab_size=tokenizer_.n_vocab,
    image_dim=192,
    caption_dim=512,
    embedding_dim=512,
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = InfoNCECriterion()

# Wire everything into the trainer (dataloaders come from an earlier cell) and train.
trainer = Trainer(
    model=model,
    criterion=criterion,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    optimizer=optimizer,
    device=device,
    tokenizer=tokenizer_,
    num_epochs=10,
)
print("Starting training...")
trainer.run()

Using device: cuda
Using tokenizer: gpt2 with vocab size: 50257
Starting training...
Epoch [1/10], Step [10/253], Loss: 5.1160
Epoch [1/10], Step [20/253], Loss: 4.4140
Epoch [1/10], Step [30/253], Loss: 3.9029
Epoch [1/10], Step [40/253], Loss: 3.5686
Epoch [1/10], Step [50/253], Loss: 3.7846
Epoch [1/10], Step [60/253], Loss: 3.7231
Epoch [1/10], Step [70/253], Loss: 3.6017
Epoch [1/10], Step [80/253], Loss: 3.7355
Epoch [1/10], Step [90/253], Loss: 3.6202
Epoch [1/10], Step [100/253], Loss: 3.6485
Epoch [1/10], Step [110/253], Loss: 3.5051
Epoch [1/10], Step [120/253], Loss: 3.7207
Epoch [1/10], Step [130/253], Loss: 3.6085
Epoch [1/10], Step [140/253], Loss: 3.7181
Epoch [1/10], Step [150/253], Loss: 3.5126
Epoch [1/10], Step [160/253], Loss: 3.4836
Epoch [1/10], Step [170/253], Loss: 3.5998
Epoch [1/10], Step [180/253], Loss: 3.5560
Epoch [1/10], Step [190/253], Loss: 3.5891
Epoch [1/10], Step [200/253], Loss: 3.5263
Epoch [1/10], Step [210/253], Loss: 3.4621
Epoch [1/10], Step [2

In [29]:
# Continue training on the same `trainer` object (same model and optimizer).
# run() loops over range(num_epochs) again, so the epoch counter in the log
# restarts at 1 even though this is additional training.
trainer.run()

Epoch [1/10], Step [10/253], Loss: 0.6999
Epoch [1/10], Step [20/253], Loss: 1.0488
Epoch [1/10], Step [30/253], Loss: 1.2342
Epoch [1/10], Step [40/253], Loss: 1.6791
Epoch [1/10], Step [50/253], Loss: 1.0836
Epoch [1/10], Step [60/253], Loss: 1.1651
Epoch [1/10], Step [70/253], Loss: 1.3230
Epoch [1/10], Step [80/253], Loss: 1.2022
Epoch [1/10], Step [90/253], Loss: 1.7410
Epoch [1/10], Step [100/253], Loss: 0.8990
Epoch [1/10], Step [110/253], Loss: 1.2030
Epoch [1/10], Step [120/253], Loss: 1.0674
Epoch [1/10], Step [130/253], Loss: 1.1343
Epoch [1/10], Step [140/253], Loss: 1.2541
Epoch [1/10], Step [150/253], Loss: 1.2865
Epoch [1/10], Step [160/253], Loss: 1.3873
Epoch [1/10], Step [170/253], Loss: 1.2811
Epoch [1/10], Step [180/253], Loss: 1.2079
Epoch [1/10], Step [190/253], Loss: 0.9278
Epoch [1/10], Step [200/253], Loss: 1.1761
Epoch [1/10], Step [210/253], Loss: 1.1748
Epoch [1/10], Step [220/253], Loss: 1.2018
Epoch [1/10], Step [230/253], Loss: 1.2082
Epoch [1/10], Step [

In [30]:
# Save the trained weights (state_dict only).
# Go through `trainer.model` rather than the global `model`: the global name is
# rebound to the `model` *module* whenever the import/reload cell is re-run,
# whereas `trainer.model` always refers to the network that was trained.
CHECKPOINT_PATH = 'clip_model.pth'
torch.save(trainer.model.state_dict(), CHECKPOINT_PATH)
print(f"Model saved to '{CHECKPOINT_PATH}'")

Model saved to 'clip_model.pth'


In [39]:
# Explicit iterator over the test loader so later cells can pull one batch at a
# time with next(test_iter); re-run this cell to start again from the first batch.
test_iter = iter(test_dataloader)


In [42]:
import matplotlib.pyplot as plt
import tiktoken
import torch.nn.functional as F

# Step 1: Pull the next batch from the test iterator.
images, token_ids = next(test_iter)

# Step 2: Select one image and up to 5 captions from the same batch.
# Batches are aligned (image i goes with caption i), so token_ids[0] is the
# caption that belongs to images[0].
image = images[0].unsqueeze(0)      # Shape: (1, C, H, W)
captions = token_ids[:5]            # Shape: (N, T) with N <= 5
num_captions = captions.size(0)

# Step 3: Shuffle the captions, remembering where the true caption ends up so
# the result below can actually be judged.
perm = torch.randperm(num_captions)
captions = captions[perm]
true_idx = perm.tolist().index(0)

# Step 4: Repeat the image once per caption and move everything to the device.
image_batch = image.repeat(num_captions, 1, 1, 1).to(trainer.device)
captions = captions.to(trainer.device)

# Step 5: Run inference. Every row of `logits` is identical because every row
# of `image_batch` is the same image.
trainer.model.eval()
with torch.no_grad():
    logits = trainer.model(image_batch, captions)  # Shape: (N, N)

# Step 6: Report logits and the softmax over captions for the query image.
print(f"Logits (image vs {num_captions} captions):")
print(logits)

scores = F.softmax(logits[0], dim=0)  # Similarity of image to the N captions
print("Similarity scores:", scores)
pred_idx = int(scores.argmax())
print(f"Predicted caption: {pred_idx + 1} | ground-truth caption: {true_idx + 1}")

# NOTE(review): matplotlib warns that it clips this image to the valid range,
# which suggests the tensors are normalised; de-normalise with the dataset's
# statistics for a faithful preview.
plt.imshow(images[0].permute(1, 2, 0).cpu())
plt.axis('off')

# Decode with a dedicated name: rebinding `tokenizer` here would shadow the
# project's `tokenizer` module for every cell executed afterwards.
gpt2_encoding = tiktoken.get_encoding("gpt2")
captions_text = [gpt2_encoding.decode(caption.tolist()) for caption in captions]

print("Captions:")
for i, caption in enumerate(captions_text):
    print(f"Caption {i+1}: {caption}")

plt.show()




Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).


Logits (image vs 5 captions):
tensor([[31.8644, 39.7345, 52.7754, 43.4383, 71.9336],
        [31.8644, 39.7345, 52.7754, 43.4383, 71.9336],
        [31.8644, 39.7345, 52.7754, 43.4383, 71.9336],
        [31.8644, 39.7345, 52.7754, 43.4383, 71.9336],
        [31.8644, 39.7345, 52.7754, 43.4383, 71.9336]], device='cuda:0')
Similarity scores: tensor([3.9644e-18, 1.0378e-14, 4.7831e-09, 4.2138e-13, 1.0000e+00],
       device='cuda:0')
Captions:
Caption 1: perorsA female softball player making a pitch . fielded!!!!!!!!!!!!!!
Caption 2: perorsA woman in a dress is crossing a suspended bridge . fielded!!!!!!!!!!!!
Caption 3: perors"A girl dressed in a red top  fielded!!!!!!!!!!!!!!
Caption 4: perorsThe two children make a funny pose in front of some bushes . fielded!!!!!!!!!!
Caption 5: perorsa woman at a desk signing paperwork in front of another fielded!!!!!!!!!!!!


KeyboardInterrupt: 