# 📚 Image Captioning Challenge 📚

## 0-Imports



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch
from tqdm import tqdm
from numpy import cov, trace, iscomplexobj
from scipy.linalg import sqrtm
import numpy as np

## 1- Data Reading

In [None]:
# Mount Google Drive at /content/drive so the zipped dataset stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

# Shell magic: extract the dataset archive into /content/dataset/.
!unzip "/content/drive/MyDrive/caption_dataset/caption_dataset.zip" -d "/content/dataset/"

# Load the CSV tables extracted into /content/dataset/.
train_df = pd.read_csv("/content/dataset/train.csv")
test_df = pd.read_csv("/content/dataset/test.csv")
sample_df = pd.read_csv("/content/dataset/sample_submission.csv")

# Report the (rows, columns) of both splits and preview the first training rows.
print("Train set shape:", train_df.shape)
print("Test set shape:", test_df.shape)
train_df.head()


In [None]:
# Word count of every caption, using whitespace tokenisation.
train_df['caption_length'] = train_df['caption'].map(lambda text: len(text.split()))

# Histogram (30 bins) with a KDE overlay of the word counts.
plt.figure(figsize=(10, 5))
sns.histplot(train_df['caption_length'], bins=30, kde=True)
plt.gca().set(
    title="Caption Length Distribution",
    xlabel="Number of Words",
    ylabel="Frequency",
)
plt.show()

# Headline numbers: distinct captions and mean words per caption.
print(f"Unique captions: {train_df['caption'].nunique()!s}")
print(f"Average caption length: {train_df['caption_length'].mean()!s}")


In [None]:
def show_random_examples(df, n=5, img_dir="/content/dataset/train/train"):
    """Show ``n`` randomly drawn images from ``df`` side by side with their captions.

    Args:
        df: DataFrame with an ``image_id`` column (file stem of the JPEG) and a
            ``caption`` column (string).
        n: Number of panels to draw. Each panel samples one row independently,
            so the same row may appear more than once.
        img_dir: Directory containing ``<image_id>.jpg`` files. Defaults to the
            training image folder used by this notebook.

    A panel whose image file does not exist is reported on stdout and left blank.
    """
    # squeeze=False always returns a 2-D array of Axes, so n=1 works as well
    # (with the default, plt.subplots(1, 1) returns a bare Axes that cannot be indexed).
    _, axes = plt.subplots(1, n, figsize=(15, 5), squeeze=False)
    for ax in axes[0]:
        # Hide the frame up front so a missing image leaves a clean empty slot
        # instead of an empty plot with ticks.
        ax.axis("off")
        row = df.sample(1).iloc[0]
        img_path = os.path.join(img_dir, f"{row['image_id']}.jpg")
        if not os.path.exists(img_path):
            print(f"Image not found: {img_path}")
            continue
        # Context manager closes the file handle once the pixels are converted.
        with Image.open(img_path) as img:
            ax.imshow(img.convert("RGB"))
        # Title: the first eight words of the caption, one word per line.
        ax.set_title("\n".join(row['caption'].split()[:8]))
    plt.tight_layout()
    plt.show()

# Draw two figures, each with five randomly sampled examples from train_df.
show_random_examples(train_df, 5)
show_random_examples(train_df, 5)

In [None]:
# Summary statistics of the training captions: distinct count and mean word count.
print(f"Unique captions: {train_df['caption'].nunique()!s}")
print(f"Average caption length: {train_df['caption_length'].mean()!s}")


## 2- Data Preprocessing and Modelling

In [None]:
# Device that input tensors are moved to before generation.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Pre-trained BLIP-2 (OPT-2.7B) processor.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")

# Wrap every (whitespace-trimmed) training caption in explicit start/end markers.
train_df['caption_proc'] = train_df['caption'].map(
    lambda text: f"<start> {text.strip()} <end>"
)

# Matching BLIP-2 model weights.
blip_model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    torch_dtype=torch.float16,  # float16 to save VRAM
    device_map="auto",  # distribute the weights across CPU/GPU
)

In [None]:
# Caption one randomly chosen test image as a quick sanity check of the pipeline.
sample_row = test_df.sample(1).iloc[0]
sample_img_id = str(sample_row['image_id'])
if not sample_img_id.endswith(".jpg"):
    sample_img_id += ".jpg"
img_path = f"/content/dataset/test/test/{sample_img_id}"

image = Image.open(img_path).convert("RGB")
# BatchFeature.to() moves every tensor to the device but casts only the
# floating-point ones to float16; a blanket per-tensor dtype cast would also
# convert any integer tensors the processor might return.
inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)

with torch.no_grad():
    # Greedy decoding (do_sample=False), at most 30 new tokens.
    generated_ids = blip_model.generate(**inputs, max_new_tokens=30, do_sample=False)
# Strip surrounding whitespace from the decoded text, as the official BLIP-2
# usage example does, so the caption is clean for display.
caption = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=True).strip()

plt.figure(figsize=(6,6))
plt.imshow(image)
plt.axis("off")
plt.title(f"ID: {sample_img_id}\nCaption creation example from test set:\n\n{caption}", fontsize=10)
plt.tight_layout()
plt.show()

## 3-Creating Captions For submission.csv

In [None]:
# Generate one caption per test image and collect the rows of submission.csv.
submission = []
# Iterate the id column directly: DataFrame.iterrows() builds a Series per row
# and does not preserve dtypes, so an integer id could be turned into a float
# and yield a file name such as "123.0.jpg".
for raw_id in tqdm(test_df["image_id"], total=len(test_df)):
    img_id = str(raw_id)
    if not img_id.endswith(".jpg"):
        img_id += ".jpg"
    img_path = f"/content/dataset/test/test/{img_id}"
    try:
        image = Image.open(img_path).convert("RGB")
    except Exception:
        # Best effort: a missing or unreadable image is reported and skipped.
        # NOTE(review): a skipped image means submission.csv has fewer rows than
        # test.csv — confirm the competition accepts that.
        print(f"Image not found or corrupted: {img_path}")
        continue
    # BatchFeature.to() casts only floating-point tensors to float16, unlike a
    # blanket per-tensor dtype cast.
    inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
    with torch.no_grad():
        # Greedy decoding (do_sample=False), at most 30 new tokens.
        generated_ids = blip_model.generate(**inputs, max_new_tokens=30, do_sample=False)
    # Strip surrounding whitespace so no stray newline ends up inside a CSV field.
    caption = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=True).strip()
    # NOTE(review): image_id is written with the ".jpg" suffix — verify this
    # matches the id format of sample_submission.csv.
    submission.append({"image_id": img_id, "caption": caption})

submission_df = pd.DataFrame(submission)
submission_df.to_csv("submission.csv", index=False)
submission_df.head()

## 4- Calculate FGD Score

In [None]:
!pip install -q sentence-transformers
from sentence_transformers import SentenceTransformer
gte_model = SentenceTransformer("thenlper/gte-small")

gt_df = pd.read_csv("/content/dataset/train.csv")
pred_df = pd.read_csv("submission.csv")
N = min(len(gt_df), len(pred_df))
gt_captions = gt_df["caption"].values[:N]
pred_captions = pred_df["caption"].values[:N]

gt_embed = gte_model.encode(list(gt_captions), batch_size=64, show_progress_bar=True)
pred_embed = gte_model.encode(list(pred_captions), batch_size=64, show_progress_bar=True)


In [None]:
# Adapted from the reference function on the datathon main page (same result,
# vectorised and independent of the embedding width).
def calculate_fgd(solution_embed: np.ndarray, submission_embed: np.ndarray) -> float:
    """Return the mean per-sample Fréchet distance between paired embeddings.

    The reference implementation builds, for every pair, two "distributions"
    that each consist of the same embedding stacked twice. The covariance of
    two identical rows is exactly zero, so the covariance terms of the Fréchet
    distance vanish and each per-sample distance equals the squared Euclidean
    distance between the two embeddings. That quantity is computed directly
    here, which avoids one matrix square root per sample and removes the
    hard-coded embedding width of 384.

    Args:
        solution_embed: Reference embeddings, shape ``(n_samples, dim)``.
        submission_embed: Predicted embeddings, shape ``(m_samples, dim)``.
            Row ``i`` is compared with row ``i`` of ``solution_embed``.

    Returns:
        The mean of the per-pair distances as a Python float. If the inputs
        differ in length, only the first ``min(n_samples, m_samples)`` pairs are
        used (as with ``zip``). ``nan`` is returned when there are no pairs.

    Raises:
        ValueError: If an input is not 2-D or the embedding widths differ.
    """
    sol = np.asarray(solution_embed)
    sub = np.asarray(submission_embed)
    if sol.size == 0 or sub.size == 0:
        return float("nan")
    if sol.ndim != 2 or sub.ndim != 2:
        raise ValueError(
            f"expected 2-D embedding arrays, got shapes {sol.shape} and {sub.shape}"
        )
    if sol.shape[1] != sub.shape[1]:
        raise ValueError(
            f"embedding widths differ: {sol.shape[1]} vs {sub.shape[1]}"
        )
    # Pair rows positionally and ignore any unmatched tail.
    n_pairs = min(len(sol), len(sub))
    diff = sol[:n_pairs] - sub[:n_pairs]
    per_sample = np.sum(diff ** 2.0, axis=1)
    # Average in float64 regardless of the embedding dtype.
    return float(np.mean(per_sample, dtype=np.float64))

In [None]:
# Score the predicted-caption embeddings against the reference embeddings and report it.
fgd_score = calculate_fgd(solution_embed=gt_embed, submission_embed=pred_embed)
print(f"\nFinal FGD Score: {fgd_score:.4f}")
