In [1]:
"""
This metric implements the mean Fréchet GTE Distance (FGD) score for text embeddings using the GTE-small model.
The metric measures the similarity between ground truth and predicted text captions by comparing their embedding distributions.

The score is calculated by:
1. Converting text to embeddings using GTE-small
2. Computing mean and covariance statistics of the embeddings
3. Calculating the FGD score between the distributions
"""

import pandas as pd
import pandas.api.types
import numpy as np
from numpy import cov, trace, iscomplexobj
from scipy.linalg import sqrtm
from typing import List
from sentence_transformers import SentenceTransformer

def calculate_fgd(solution_embed: np.ndarray, submission_embed: np.ndarray) -> float:
    '''
    solution_embed: Embedding of the ground truth from GTE-small.
    submission_embed: Embedding of the predicted caption from GTE-small.
    '''
    fgd_list = []
    for _idx, (sol_emb_sample, sub_emb_sample) in enumerate(zip(solution_embed, submission_embed)):
        sol_emb_sample_rshaped, sub_emb_sample_rshaped = sol_emb_sample.reshape((1,384)), sub_emb_sample.reshape((1,384))
        e1 = np.concatenate([sol_emb_sample_rshaped, sol_emb_sample_rshaped])
        e2 = np.concatenate([sub_emb_sample_rshaped, sub_emb_sample_rshaped])
        """Calculate Fréchet GTE Distance between two embedding distributions"""
        # Calculate mean and covariance statistics
        mu1, sigma1 = e1.mean(axis=0), cov(e1, rowvar=False)
        mu2, sigma2 = e2.mean(axis=0), cov(e2, rowvar=False)

        # Calculate sum squared difference between means
        ssdiff = np.sum((mu1 - mu2)**2.0)

        # Calculate sqrt of product between cov
        covmean = sqrtm(sigma1.dot(sigma2))

        # Check and correct imaginary numbers from sqrt
        if iscomplexobj(covmean):
            covmean = covmean.real

        # Calculate score
        fgd = ssdiff + trace(sigma1 + sigma2 - 2.0 * covmean)
        fgd_list.append(fgd)
        if _idx % 100 == 0:
            print(f"Processed {_idx} samples", end="\r")
    return float(np.mean(fgd_list))


In [2]:
# Cell 1: Optimized Imports
import os
import torch
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW, lr_scheduler
from torchvision import transforms
from transformers import BlipProcessor, BlipForConditionalGeneration
from tqdm import tqdm

# Memory optimization
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision('medium')

In [3]:
# Cell 2: Enhanced Data Loading with Streaming
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
!test -d "/content/dataset" || unzip -q "/content/drive/MyDrive/caption_dataset/caption_dataset.zip" -d "/content/dataset/"

DATASET_PATH = "/content/dataset/"
TRAIN_IMAGE_DIR = os.path.join(DATASET_PATH, "train", "train")
TEST_IMAGE_DIR = os.path.join(DATASET_PATH, "test", "test")
TRAIN_CSV = os.path.join(DATASET_PATH, "train.csv")
TEST_CSV = os.path.join(DATASET_PATH, "test.csv")

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)
print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")


Mounted at /content/drive
Train shape: (21367, 2), Test shape: (3771, 1)


In [10]:
# Cell 3: Corrected Data Pipeline
class OptimizedDataset(Dataset):
    def __init__(self, df, image_dir, processor):
        self.df = df
        self.image_dir = image_dir
        self.processor = processor
        self.transform = transforms.Compose([
            transforms.Resize((384, 384)),  # BLIP's native size
            transforms.ToTensor(),
        ])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_id = self.df.iloc[idx]['image_id']
        caption = self.df.iloc[idx]['caption']
        img_path = os.path.join(self.image_dir, f"{img_id}.jpg")

        try:
            image = Image.open(img_path).convert('RGB')
            image = self.transform(image)
        except:
            image = torch.zeros(3, 384, 384)

        inputs = self.processor(
            images=image,
            text=caption,
            return_tensors="pt",
            padding='max_length',
            max_length=32,
            truncation=True,
            do_rescale=False  # Critical fix for image scaling
        )
        inputs['labels'] = inputs['input_ids'].clone()  # Essential for loss calculation
        return {k: v.squeeze(0) for k, v in inputs.items()}

In [None]:
# Cell 4: Optimized Model Setup
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

# Freeze entire model first
for param in model.parameters():
    param.requires_grad = False

# Correctly access decoder layers
decoder_layers = model.text_decoder.bert.encoder.layer  # Updated path

# Unfreeze last 2 decoder layers
for layer in decoder_layers[-2:]:
    for param in layer.parameters():
        param.requires_grad = True

# Unfreeze cross-attention layers
for param in model.text_decoder.cls.parameters():
    param.requires_grad = True

model = model.to('cuda', memory_format=torch.channels_last)

In [12]:
# Cell 5: Streaming DataLoader
def chunked_loader(df, chunk_size=5000):
    for i in range(0, len(df), chunk_size):
        chunk = df.iloc[i:i+chunk_size]
        dataset = OptimizedDataset(chunk, TRAIN_IMAGE_DIR, processor)
        yield DataLoader(
            dataset,
            batch_size=8,
            shuffle=True,
            pin_memory=True,
            num_workers=2
        )


In [None]:
# Cell 6: Training with Memory Optimization
scaler = torch.cuda.amp.GradScaler()  # Original syntax for current versions
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=3e-5, weight_decay=0.01)
scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=3, eta_min=1e-6)

gradient_accumulation = 4
best_score = float('inf')

for epoch in range(8):
    model.train()
    epoch_loss = 0
    train_df = pd.read_csv(TRAIN_CSV).sample(frac=1.0)

    for loader in chunked_loader(train_df):
        for batch_idx, batch in enumerate(tqdm(loader)):
            batch = {k: v.to('cuda', non_blocking=True) for k, v in batch.items()}

            with torch.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(**batch)
                loss = outputs.loss / gradient_accumulation

            scaler.scale(loss).backward()

            if (batch_idx + 1) % gradient_accumulation == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad(set_to_none=True)

            epoch_loss += loss.item() * gradient_accumulation

    scheduler.step()
    print(f"Epoch {epoch+1} Loss: {epoch_loss/len(train_df):.4f}")

    if epoch_loss < best_score:
        torch.save(model.state_dict(), f"blip_epoch_{epoch+1}.pt")
        best_score = epoch_loss

  scaler = torch.cuda.amp.GradScaler()  # Original syntax for current versions
100%|██████████| 625/625 [00:50<00:00, 12.42it/s]
100%|██████████| 625/625 [00:50<00:00, 12.32it/s]
100%|██████████| 625/625 [00:50<00:00, 12.37it/s]
100%|██████████| 625/625 [00:50<00:00, 12.35it/s]
100%|██████████| 171/171 [00:13<00:00, 12.23it/s]


Epoch 1 Loss: 0.2475


100%|██████████| 625/625 [00:49<00:00, 12.53it/s]
100%|██████████| 625/625 [00:49<00:00, 12.56it/s]
100%|██████████| 625/625 [00:49<00:00, 12.56it/s]
100%|██████████| 625/625 [00:49<00:00, 12.64it/s]
100%|██████████| 171/171 [00:13<00:00, 12.30it/s]


Epoch 2 Loss: 0.1732


100%|██████████| 625/625 [00:49<00:00, 12.64it/s]
100%|██████████| 625/625 [00:49<00:00, 12.58it/s]
100%|██████████| 625/625 [00:50<00:00, 12.32it/s]
100%|██████████| 625/625 [00:50<00:00, 12.48it/s]
100%|██████████| 171/171 [00:13<00:00, 12.32it/s]


Epoch 3 Loss: 0.1574


100%|██████████| 625/625 [00:49<00:00, 12.71it/s]
100%|██████████| 625/625 [00:49<00:00, 12.67it/s]
100%|██████████| 625/625 [00:49<00:00, 12.65it/s]
100%|██████████| 625/625 [00:49<00:00, 12.60it/s]
100%|██████████| 171/171 [00:13<00:00, 12.46it/s]


Epoch 4 Loss: 0.1517


100%|██████████| 625/625 [00:49<00:00, 12.70it/s]
100%|██████████| 625/625 [00:49<00:00, 12.63it/s]
100%|██████████| 625/625 [00:50<00:00, 12.47it/s]
100%|██████████| 625/625 [00:49<00:00, 12.59it/s]
100%|██████████| 171/171 [00:13<00:00, 12.46it/s]


Epoch 5 Loss: 0.1515


100%|██████████| 625/625 [00:49<00:00, 12.65it/s]
100%|██████████| 625/625 [00:49<00:00, 12.64it/s]
100%|██████████| 625/625 [00:50<00:00, 12.49it/s]
100%|██████████| 625/625 [00:49<00:00, 12.57it/s]
100%|██████████| 171/171 [00:13<00:00, 12.42it/s]


Epoch 6 Loss: 0.1482


100%|██████████| 625/625 [00:49<00:00, 12.72it/s]
100%|██████████| 625/625 [00:49<00:00, 12.67it/s]
100%|██████████| 625/625 [00:49<00:00, 12.55it/s]
100%|██████████| 625/625 [00:49<00:00, 12.53it/s]
100%|██████████| 171/171 [00:13<00:00, 12.58it/s]


Epoch 7 Loss: 0.1383


100%|██████████| 625/625 [00:49<00:00, 12.75it/s]
100%|██████████| 625/625 [00:50<00:00, 12.49it/s]
100%|██████████| 625/625 [00:49<00:00, 12.55it/s]
100%|██████████| 625/625 [00:49<00:00, 12.53it/s]
100%|██████████| 171/171 [00:13<00:00, 12.45it/s]


Epoch 8 Loss: 0.1248


In [19]:
# Cell 7: Optimized Inference with Batching
from torch.utils.data import DataLoader

class TestDataset(Dataset):
    def __init__(self, df, image_dir, transform):
        self.df = df
        self.image_dir = image_dir
        self.transform = transform
        self.image_ids = df['image_id'].tolist()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_id = self.image_ids[idx]
        img_path = os.path.join(self.image_dir, f"{img_id}.jpg")
        try:
            image = Image.open(img_path).convert('RGB')
            image = self.transform(image)
        except:
            image = torch.zeros(3, 384, 384)
        return image, img_id

# Batch processing setup
test_transform = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.ToTensor()
])

test_dataset = TestDataset(test_df, TEST_IMAGE_DIR, test_transform)
test_loader = DataLoader(
    test_dataset,
    batch_size=32,  # Increased batch size
    shuffle=False,
    num_workers=4,
    pin_memory=True
)

model.load_state_dict(torch.load("blip_epoch_8.pt"))
model.eval()
predictions = []

with torch.inference_mode(), torch.cuda.amp.autocast():
    for images, img_ids in tqdm(test_loader):
        images = images.to('cuda', non_blocking=True)
        inputs = processor(images=images, return_tensors="pt", do_rescale=False).to('cuda')

        # Faster generation config
        outputs = model.generate(
            **inputs,
            max_length=32,
            num_beams=3,  # Reduced from 5
            early_stopping=True,
            temperature=0.85,
            repetition_penalty=1.2,
            num_return_sequences=1,
            do_sample=False  # Faster than sampling
        )

        # Batch decoding
        captions = processor.batch_decode(outputs, skip_special_tokens=True)
        predictions.extend([{
            'image_id': img_id,
            'caption': caption
        } for img_id, caption in zip(img_ids, captions)])

pd.DataFrame(predictions).to_csv("fast_submission.csv", index=False)
print(f"Processed {len(predictions)} images in {len(test_loader)} batches")

  with torch.inference_mode(), torch.cuda.amp.autocast():
100%|██████████| 118/118 [02:58<00:00,  1.51s/it]


Processed 3771 images in 118 batches


In [27]:
# Cell 8: Training-based FGD Calculation (For Development Only)
import warnings
from tqdm import tqdm

# Warning about methodology
warnings.warn(
    "\n⚠️ Using training captions for FGD evaluation is not valid for final metrics! "
    "This only measures how similar generated captions are to training texts, not actual test performance.",
    UserWarning
)

# Load data with memory optimization
gt_df = pd.read_csv(TRAIN_CSV, dtype={'caption': 'string'}).dropna(subset=['caption'])
sub_df = pd.read_csv("fast_submission.csv", dtype={'caption': 'string'}).dropna(subset=['caption'])

# Align by index (since image_ids don't match)
N = min(len(gt_df), len(sub_df))
gt_captions = gt_df['caption'].iloc[:N].tolist()
pred_captions = sub_df['caption'].iloc[:N].tolist()

# Initialize model on GPU
gte_model = SentenceTransformer("thenlper/gte-small", device='cuda')

# Batch encoding with memory management
def batch_encode(model, texts, batch_size=256):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding", unit="batch"):
        batch = texts[i:i+batch_size]
        embeddings.append(model.encode(batch, convert_to_numpy=True))
    return np.concatenate(embeddings)

# Encode with mixed precision
with torch.cuda.amp.autocast():
    gt_emb = batch_encode(gte_model, gt_captions)
    pred_emb = batch_encode(gte_model, pred_captions)

# Calculate FGD
fgd_score = calculate_fgd(gt_emb, pred_emb)
print(f"\nDevelopment FGD Score (Training-based): {fgd_score:.5f}")

# Cleanup
del gte_model, gt_emb, pred_emb
torch.cuda.empty_cache()

⚠️ Using training captions for FGD evaluation is not valid for final metrics! This only measures how similar generated captions are to training texts, not actual test performance.
  with torch.cuda.amp.autocast():
Encoding: 100%|██████████| 15/15 [00:01<00:00,  8.55batch/s]
Encoding: 100%|██████████| 15/15 [00:01<00:00,  9.20batch/s]



Development FGD Score (Training-based): 0.44469
