In [3]:
pip install nltk pycocoevalcap

Note: you may need to restart the kernel to use updated packages.


In [4]:
import random
import os
from PIL import Image
import matplotlib.pyplot as plt
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, BertTokenizer
import torch
import nltk
from nltk.translate.bleu_score import corpus_bleu
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.meteor.meteor import Meteor

In [5]:
# Initialize NLTK
nltk.download('punkt')

# Load pre-trained model and processor
feature_extractor = ViTImageProcessor.from_pretrained("atasoglu/vit-bert-flickr8k")
tokenizer = BertTokenizer.from_pretrained("atasoglu/vit-bert-flickr8k")
model = VisionEncoderDecoderModel.from_pretrained("atasoglu/vit-bert-flickr8k")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_featur

In [6]:
# Function to load captions
def load_captions(captions_file):
    image_captions = {}
    with open(captions_file, 'r') as file:
        for line in file:
            parts = line.strip().split('#', 2)
            if len(parts) != 3:
                continue
            img_id, _, caption = parts
            if img_id not in image_captions:
                image_captions[img_id] = []
            image_captions[img_id].append(caption)
    return image_captions

# Function to generate a caption for a single image
def generate_caption(image_path, max_new_tokens=50):
    img = Image.open(image_path)
    pixel_values = feature_extractor(images=[img], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)
    output_ids = model.generate(pixel_values, max_new_tokens=max_new_tokens)
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return preds[0]

In [11]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# Evaluation function
def evaluate_captions(generated_captions, reference_captions):
    # Ensure references are structured as a list of lists, where each sublist contains tokenized reference captions
    refs = [[ref.split() for ref in refs_for_image] for refs_for_image in reference_captions]
    hyps = [gen_caption.split() for gen_caption in generated_captions]

    # Ensure the number of hypotheses matches the number of reference sets
    assert len(hyps) == len(refs), "The number of hypotheses and reference sets should be the same."

    # Apply smoothing function to BLEU score
    smoothing_function = SmoothingFunction().method4
    bleu_score = corpus_bleu(refs, hyps, smoothing_function=smoothing_function)

    # METEOR Score
    meteor = Meteor()
    meteor_score, _ = meteor.compute_score({i: [" ".join(ref) for ref in refs[i]] for i in range(len(refs))}, {i: [" ".join(hyps[i])] for i in range(len(hyps))})

    # ROUGE Score
    rouge = Rouge()
    rouge_score, _ = rouge.compute_score({i: [" ".join(ref) for ref in refs[i]] for i in range(len(refs))}, {i: [" ".join(hyps[i])] for i in range(len(hyps))})

    # CIDEr Score
    cider = Cider()
    cider_score, _ = cider.compute_score({i: [" ".join(ref) for ref in refs[i]] for i in range(len(refs))}, {i: [" ".join(hyps[i])] for i in range(len(hyps))})

    return bleu_score, meteor_score, rouge_score, cider_score


In [12]:
  
# Path to your dataset
image_folder = r'C:\Users\User\Workbooks\Dissertation\Flickr8k_Dataset\Flicker8k_Dataset'
captions_file = r'C:\Users\User\Workbooks\Dissertation\Flickr8k_text\Flickr8k.token.txt'

In [13]:
# Load captions
image_captions = load_captions(captions_file)

# Select 5 random images
image_files = list(image_captions.keys())
random_image_files = random.sample(image_files, 5)

# Generate captions for the selected images
image_paths = [os.path.join(image_folder, img_file) for img_file in random_image_files]
generated_captions = [generate_caption(image_path) for image_path in image_paths]

# Reference captions for evaluation (list of lists, where each list corresponds to an image)
reference_captions = [image_captions[img_file] for img_file in random_image_files]

# Evaluate captions
bleu_score, meteor_score, rouge_score, cider_score = evaluate_captions(generated_captions, reference_captions)


In [None]:
print(f"BLEU Score: {bleu_score}")
print(f"METEOR Score: {meteor_score}")
print(f"ROUGE Score: {rouge_score}")
print(f"CIDEr Score: {cider_score}")