<a href="https://www.kaggle.com/code/gabrielfcarvalho/blip-finetune?scriptVersionId=254380641" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# BLIP Fine-Tuning for Image Captioning on CarDD Dataset

# 📌 **Notebook Overview**
# This notebook demonstrates fine-tuning the BLIP model for generating car descriptions based on images.
# It follows these steps:
# 1️⃣ **Install Dependencies**
# 2️⃣ **Load Pretrained BLIP Model and Processor**
# 3️⃣ **Test Pretrained Model on a Sample Image**
# 4️⃣ **Prepare Dataset for Fine-Tuning** (extract metadata and annotations, build captions)
# 5️⃣ **Fine-Tune BLIP Model**
# 6️⃣ **Generate Captions with the Trained Model** and evaluate them (BLEU, METEOR, ROUGE)

# 📌 1️⃣ Install Required Libraries

In [None]:
!pip install torch torchvision transformers timm datasets -q

# 📌 2️⃣ Load Pretrained BLIP Model and Processor

In [None]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Pretrained captioning checkpoint: the processor prepares the model inputs
# and decodes generated ids back to text, the model generates the caption ids.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
model = model.to(device)

# 📌 3️⃣ Test Pretrained Model on a Sample Image

In [None]:
from PIL import Image

# Open one test image and force it to three RGB channels.
image_path = "/kaggle/input/blip-for-captioning-car-damage/CarDD_COCO/CarDD_COCO/test2017/000012.jpg"
image = Image.open(image_path).convert("RGB")

# Turn the image into PyTorch tensors and move them to the selected device.
inputs = processor(images=image, return_tensors="pt")
inputs = inputs.to(device)

# Caption the image with the (not yet fine-tuned) model and decode the first
# generated sequence to plain text.
outputs = model.generate(**inputs)
caption = processor.decode(outputs[0], skip_special_tokens=True)

# Show the image next to the caption it received.
display(image)
print("Generated Caption:", caption)

# 📌 4️⃣ Prepare Dataset for Fine-Tuning

In [None]:
import json
import pandas as pd
from tqdm import tqdm
from collections import Counter

# Load the per-image metadata spreadsheet into a DataFrame.
# get_image_metadata() looks rows up in it by the "file_name" column and reads
# the "color", "shooting angle" and "complete or partial " columns.
excel_file = "/kaggle/input/blip-for-captioning-car-damage/CarDD_COCO/CarDD_COCO/annotations/image_info.xlsx"
image_info = pd.read_excel(excel_file)

# Function to extract metadata from the Excel file
def get_image_metadata(image_name, metadata=None):
    """Look up the color, shooting angle and framing recorded for one image.

    Args:
        image_name: Value to match against the "file_name" column.
        metadata: Optional DataFrame to search. Defaults to the module-level
            ``image_info`` table, which keeps existing one-argument calls
            working unchanged.

    Returns:
        A ``(color, shooting_angle, complete_partial)`` tuple taken from the
        first matching row. A missing row or an empty cell yields the
        fallbacks ``"unknown"``, ``"N/A"`` and ``"unknown"`` respectively.
    """
    table = image_info if metadata is None else metadata
    matches = table[table["file_name"] == image_name]
    if matches.empty:
        return "unknown", "N/A", "unknown"

    def first_value_or(column, fallback):
        # Only the first matching row is consulted; empty cells use the fallback.
        value = matches[column].values[0]
        return fallback if pd.isna(value) else value

    return (
        first_value_or("color", "unknown"),
        first_value_or("shooting angle", "N/A"),
        # The trailing space is part of the column name used by this lookup.
        first_value_or("complete or partial ", "unknown"),
    )

# Function to generate captions from JSON annotations
def generate_captions(annotation_file, image_folder, output_csv):
    """Build one templated caption per image and save them all to a CSV file.

    The annotation file is read as JSON with "images", "annotations" and
    "categories" lists. For every image the caption combines the metadata
    returned by ``get_image_metadata`` with the counts of its annotated
    categories.

    Args:
        annotation_file: Path of the JSON annotation file.
        image_folder: Folder prefix written in front of every file name in the
            "file_path" column.
        output_csv: Destination CSV with "file_path" and "caption" columns.

    Raises:
        KeyError: If an annotation of a listed image references a category id
            that is absent from "categories".
    """
    with open(annotation_file, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Index categories and annotations once, instead of rescanning both lists
    # for every image.
    category_names = {}
    for cat in data["categories"]:
        # setdefault keeps the first entry when an id is listed more than once.
        category_names.setdefault(cat["id"], cat["name"])

    annotations_by_image = {}
    for ann in data["annotations"]:
        annotations_by_image.setdefault(ann["image_id"], []).append(ann)

    captions = []
    for image in tqdm(data["images"]):
        img_id = image["id"]
        img_file = image["file_name"]

        # Retrieve metadata
        color, shooting_angle, complete_partial = get_image_metadata(img_file)

        # Count the damage categories annotated on this image (in first-seen order).
        category_counts = Counter(
            category_names[ann["category_id"]]
            for ann in annotations_by_image.get(img_id, [])
        )

        categories = []
        for category, count in category_counts.items():
            if count > 1:
                # Naive plural: names ending in "h" take "es", all others "s".
                suffix = "es" if category[-1] == "h" else "s"
                categories.append(f"{count} {category}{suffix}")
            else:
                categories.append(category)

        # Construct caption
        # NOTE(review): get_image_metadata() falls back to "unknown" for a
        # missing color / framing, so the "N/A" checks on those two fields do
        # not filter its fallbacks and captions can read "A unknown car, shown
        # unknown". Behavior kept as-is; confirm the intended sentinel.
        description_parts = []
        if color != "N/A":
            description_parts.append(f"A {color} car")
        if complete_partial != "N/A":
            description_parts.append(f"shown {complete_partial.lower()}")
        if shooting_angle != "N/A":
            description_parts.append(f"from the {shooting_angle.lower()} angle")
        if categories:
            if len(categories) > 1:
                categories_string = f"{', '.join(categories[:-1])} and {categories[-1]}"
            else:
                categories_string = categories[0]
            description_parts.append(f"displaying {categories_string}")

        caption = ", ".join(description_parts) + "."
        captions.append({"file_path": f"{image_folder}/{img_file}", "caption": caption})

    # Save captions
    pd.DataFrame(captions).to_csv(output_csv, index=False)
    print(f"Saved captions to {output_csv}")

# Generate captions for the train, val, and test splits.
# Every path is derived from the split name, so the annotation file, the image
# folder and the output CSV of a split cannot get out of sync (the val split
# previously received its annotation JSON path as the image folder).
cardd_root = "/kaggle/input/blip-for-captioning-car-damage/CarDD_COCO/CarDD_COCO"
for split in ("train", "val", "test"):
    generate_captions(
        f"{cardd_root}/annotations/instances_{split}2017.json",
        f"{cardd_root}/{split}2017",
        f"{split}_captions.csv",
    )

In [None]:
import os
from datasets import Dataset

# Helper that turns a captions CSV into a Dataset of image paths and captions
def load_dataset(csv_file, image_folder):
    """Read a captions CSV and return a ``Dataset`` with "image" and "text" columns.

    Only the base name of each "file_path" entry is kept; it is re-rooted
    under ``image_folder``. The "text" column holds the "caption" values.
    """
    frame = pd.read_csv(csv_file)
    rebased_paths = [
        os.path.join(image_folder, os.path.basename(original_path))
        for original_path in frame["file_path"]
    ]
    caption_texts = frame["caption"].tolist()
    return Dataset.from_dict({"image": rebased_paths, "text": caption_texts})

def transform_fn(batch):
    """Load the images of a batch on the fly and convert them to pixel tensors.

    The images go through the same ``processor`` call that the captioning
    cells use at inference time, so training sees the same image preprocessing
    as generation instead of a separate resize-only pipeline.

    Args:
        batch: Dict with "image" (list of image paths) and "text" (list of
            captions), e.g. {"image": [path1, path2, ...], "text": [txt1, ...]}.

    Returns:
        Dict with "pixel_values" (a list holding one tensor per image) and the
        untouched "text" list; tokenization is left to the data collator.
    """
    images = []
    for img_path in batch["image"]:
        # convert() returns an independent, fully loaded copy, so the file
        # handle can be released right away.
        with Image.open(img_path) as img:
            images.append(img.convert("RGB"))

    if not images:
        return {"pixel_values": [], "text": batch["text"]}

    # NOTE(review): the output size and normalization come from the
    # checkpoint's processor configuration; collate_fn documents
    # [B, 3, 384, 384] batches - confirm against the installed version.
    pixel_values = processor(images=images, return_tensors="pt")["pixel_values"]

    return {
        # One tensor per example, so collate_fn can stack them again.
        "pixel_values": list(pixel_values),
        "text": batch["text"]
    }

# Prepare datasets
train_dataset = load_dataset("/kaggle/working/train_captions.csv", "/kaggle/input/blip-for-captioning-car-damage/CarDD_COCO/CarDD_COCO/train2017")
# The validation split pairs the val captions with the val image folder
# (it previously read the test captions, whose files are not in val2017).
val_dataset = load_dataset("/kaggle/working/val_captions.csv", "/kaggle/input/blip-for-captioning-car-damage/CarDD_COCO/CarDD_COCO/val2017")

# Images are loaded and preprocessed lazily, whenever training examples are accessed.
train_dataset.set_transform(transform_fn)

# 📌 5️⃣ Fine-Tuning BLIP Model

In [None]:
def collate_fn(examples):
    """Assemble a list of dataset examples into one training batch.

    Args:
        examples: List of dicts with "pixel_values" (one image tensor) and
            "text" (one caption string).

    Returns:
        Dict of tensors with "pixel_values", "input_ids", "attention_mask" and
        "labels". Labels equal the input ids, except that padded positions are
        set to -100 so they are excluded from the language-modeling loss.
    """
    pixel_values = torch.stack([ex["pixel_values"] for ex in examples], dim=0)
    texts = [ex["text"] for ex in examples]

    # Tokenize the captions, padding to the longest caption in the batch.
    encoding = processor.tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]

    # Padding is only there to make the batch rectangular; -100 is the label
    # value the loss ignores, so padded positions no longer influence training.
    labels = input_ids.masked_fill(attention_mask == 0, -100)

    return {
        "pixel_values":   pixel_values,      # shape [B, 3, 384, 384]
        "input_ids":      input_ids,         # shape [B, seq_len]
        "attention_mask": attention_mask,    # shape [B, seq_len]
        "labels":         labels             # shape [B, seq_len], -100 on padding
    }

In [None]:
from transformers import BlipForConditionalGeneration, get_scheduler  # drop AdamW here
from torch.optim import AdamW  # use PyTorch's AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from torchvision import transforms

# Start fine-tuning from a fresh copy of the pretrained checkpoint.
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# Batches of 4 shuffled examples, assembled by collate_fn.
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

# The epoch count is defined once so the scheduler length and the training
# loop cannot drift apart.
num_epochs = 3

# Optimizer and linear learning-rate schedule without warmup
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training Loop
model.train()
# The context manager closes the progress bar even if training is interrupted.
with tqdm(total=num_training_steps) as progress_bar:
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            pixel_values = batch["pixel_values"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(pixel_values=pixel_values, input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()

            # Apply the update, advance the schedule, then clear the
            # gradients for the next batch.
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            # Surface the running loss so a diverging run is visible early.
            progress_bar.set_postfix(epoch=epoch + 1, loss=f"{loss.item():.4f}")
            progress_bar.update(1)

# Save the fine-tuned model together with its processor
model.save_pretrained("blip-fine-tuned-final")
processor.save_pretrained("blip-fine-tuned-final")

# 📌 6️⃣ Generate Captions with Fine-Tuned Model

In [None]:
import os
import torch
import matplotlib.pyplot as plt
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Reload the processor and the model from the directory the training cell saved.
processor = BlipProcessor.from_pretrained("/kaggle/working/blip-fine-tuned-final")
model = BlipForConditionalGeneration.from_pretrained(
    "/kaggle/working/blip-fine-tuned-final"
)
model = model.to(device)

# Folder holding the test images to caption
test_images_folder = "/kaggle/input/blip-for-captioning-car-damage/CarDD_COCO/CarDD_COCO/test2017"

# Function to generate a caption for a single image
def generate_caption(image_path, max_new_tokens=50):
    """Caption one image file with the currently loaded model.

    Args:
        image_path: Path of the image to caption.
        max_new_tokens: Upper bound on the number of generated tokens. It is
            passed explicitly because the library's default generation length
            is short compared with the multi-clause captions this notebook
            trains on. NOTE(review): confirm the default for the installed
            transformers version.

    Returns:
        The decoded caption string, with special tokens removed.
    """
    # convert() returns an independent copy, so the file can be closed at once.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return processor.decode(outputs[0], skip_special_tokens=True)

# Function to display multiple test images with their generated captions
def display_test_results(test_folder, num_samples=6):
    """Caption the first ``num_samples`` entries of a folder and plot them.

    Args:
        test_folder: Directory to list; every selected entry is opened as an
            image.
        num_samples: Maximum number of entries, taken in sorted name order.
    """
    image_files = sorted(os.listdir(test_folder))[:num_samples]  # Pick first N images
    image_paths = [os.path.join(test_folder, name) for name in image_files]
    if not image_paths:
        print(f"No images found in {test_folder}")
        return

    captions = [generate_caption(path) for path in image_paths]

    # One row per image, so the grid always matches the number of samples.
    # squeeze=False keeps `axes` an array even when there is a single image;
    # the figure keeps its 15x10 size up to six rows and grows beyond that.
    row_count = len(image_paths)
    _fig, axes = plt.subplots(
        row_count, 1, figsize=(15, 10 * max(row_count, 6) / 6), squeeze=False
    )

    for ax, path, caption in zip(axes.flatten(), image_paths, captions):
        with Image.open(path) as img:
            ax.imshow(img)
        ax.set_title(f"Caption: {caption}", fontsize=10)
        ax.axis("off")

    plt.tight_layout()
    plt.show()

# Caption and plot the first six images of the test folder
display_test_results(test_folder=test_images_folder, num_samples=6)


In [None]:
!pip install evaluate rouge_score

In [None]:
import os
import torch
import matplotlib.pyplot as plt
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import pandas as pd
import evaluate
from nltk.translate.bleu_score import sentence_bleu

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Reload the fine-tuned processor and model saved by the training cell.
processor = BlipProcessor.from_pretrained("/kaggle/working/blip-fine-tuned-final")
model = BlipForConditionalGeneration.from_pretrained(
    "/kaggle/working/blip-fine-tuned-final"
)
model = model.to(device)

# Ground-truth captions CSV and the folder of test images
test_captions_file = "/kaggle/working/test_captions.csv"
test_images_folder = "/kaggle/input/blip-for-captioning-car-damage/CarDD_COCO/CarDD_COCO/test2017"

# Test captions table; the cells below read its "file_path" and "caption" columns.
df_test = pd.read_csv(test_captions_file)


def generate_caption(image_path, max_new_tokens=50):
    """Caption one image file with the currently loaded model.

    Args:
        image_path: Path of the image to caption.
        max_new_tokens: Upper bound on the number of generated tokens. It is
            passed explicitly because the library's default generation length
            is short compared with the multi-clause reference captions used
            for evaluation. NOTE(review): confirm the default for the
            installed transformers version.

    Returns:
        The decoded caption string, with special tokens removed.
    """
    # convert() returns an independent copy, so the file can be closed at once.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return processor.decode(outputs[0], skip_special_tokens=True)

# Evaluate on the first N rows of the test table
num_samples = 6  # Number of test samples to evaluate
sample_data = df_test.iloc[:num_samples].copy()

# Caption every selected image, in row order.
generated_captions = [
    generate_caption(sample_path) for sample_path in sample_data["file_path"]
]

# Keep the model output next to the ground truth in the same DataFrame
sample_data["generated_caption"] = generated_captions

def display_caption_comparison(df):
    """Plot every image of ``df`` with its ground-truth and generated caption.

    Args:
        df: DataFrame with "file_path", "caption" and "generated_caption"
            columns; one subplot row is drawn per DataFrame row.
    """
    if df.empty:
        print("No samples to display")
        return

    # One row per sample. squeeze=False keeps `axes` an array even for a single
    # sample; the figure keeps its 15x10 size up to six rows and grows beyond.
    row_count = len(df)
    _fig, axes = plt.subplots(
        row_count, 1, figsize=(15, 10 * max(row_count, 6) / 6), squeeze=False
    )

    # Pair axes with rows by position, so a DataFrame whose index does not
    # start at 0 (e.g. a later slice of the test set) still plots correctly.
    for ax, (_, row) in zip(axes.flatten(), df.iterrows()):
        with Image.open(row["file_path"]) as img:
            ax.imshow(img)
        ax.set_title(f"- Ground Truth: {row['caption']}\n- Generated: {row['generated_caption']}", fontsize=10)
        ax.axis("off")

    plt.tight_layout()
    plt.show()

# Plot each sampled image with its ground-truth and generated caption
display_caption_comparison(sample_data)

# Load the three text-similarity metrics
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")
rouge = evaluate.load("rouge")

# Both lists hold raw caption strings; each prediction gets a single-item
# list of references.
references = [[truth] for truth in sample_data["caption"].tolist()]
predictions = sample_data["generated_caption"].tolist()

# Score the predictions against the references with each metric
bleu_result = bleu.compute(predictions=predictions, references=references)
bleu_score = bleu_result["bleu"]

meteor_result = meteor.compute(predictions=predictions, references=references)
meteor_score = meteor_result["meteor"]

rouge_score = rouge.compute(predictions=predictions, references=references)

# Report the scores; the ROUGE result is printed as returned
print("\n📊 Evaluation Metrics:")
print(f"- BLEU Score: {bleu_score:.4f}")
print(f"- METEOR Score: {meteor_score:.4f}")
print(f"- ROUGE Score: {rouge_score}")