In [1]:
import os
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from torch import nn, optim
from torchvision import transforms






  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Define the dataset class
class ImageCaptionDataset(Dataset):
    def __init__(self, csv_file, img_dir, processor):
        self.data = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.processor = processor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, self.data.iloc[idx, 0])
        caption = self.data.iloc[idx, 1]
        image = Image.open(img_name).convert("RGB")
        
        # Use processor to handle padding dynamically
        inputs = self.processor(
            images=image,
            text=caption,
            return_tensors="pt",
            padding="max_length",  # Ensure padding to a maximum length
            max_length=50,         # Define a maximum caption length
            truncation=True        # Truncate captions longer than max_length
        )
        return {
            "pixel_values": inputs["pixel_values"].squeeze(0),  # Remove batch dimension
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
        }


In [9]:
# Initialize processor and model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")


In [10]:

# Define paths
csv_file = "tnj/tnj.csv"
img_dir = "tnj/"


In [11]:
# Prepare the dataset and dataloader
dataset = ImageCaptionDataset(csv_file, img_dir, processor)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)


In [12]:
# Define training components
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss()

In [14]:
# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        pixel_values = batch["pixel_values"].to(device)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Forward pass with labels and attention mask
        outputs = model(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,  # Include the attention mask
            labels=input_ids  # Labels are the same as input_ids for caption generation
        )

        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(dataloader)}")


Epoch 1, Loss: 9.117932891845703
Epoch 2, Loss: 6.902939319610596
Epoch 3, Loss: 6.058586692810058
Epoch 4, Loss: 5.434732723236084
Epoch 5, Loss: 4.908096408843994


In [15]:
# Save the fine-tuned model
model.save_pretrained("fine_tuned_blip")
processor.save_pretrained("fine_tuned_blip")

print("Model fine-tuned and saved successfully!")


Model fine-tuned and saved successfully!
