## Load Food Dataset

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json


# Data Directory: 
food_images_directory = '/shared/data/painting/'
food_annotation_file_path = '/shared/data/painting/paintings.json'


# Method 1: Using pandas.read_json directly
df = pd.read_json(food_annotation_file_path)

df.head()


Unnamed: 0,id,file_path,captions
0,46323,/shared/data/painting/picts/46323.jpg,"[women are bent over a river 's edge , doing laundry, it is wash day at the edge of the river for these women, women are washing clothes by the river, women in rustic clothing are lined up along the bank of a river, people are kneeling near a shore]"
1,63879,/shared/data/painting/picts/63879.jpg,"[a group of people is swimming in the ocean, a lot of people in the water are playing and one person with clothes on, a group of people goes for a swim in the sea, the perfect boys are bathing in the sea stock photo, a group of men in the ocean is enjoying a swim]"
2,13922,/shared/data/painting/picts/13922.jpg,"[two people are sitting on a beach with their dog next to them, these two people are enjoying watching the sunset on the beach, a woman is sitting on a bench and someone is sitting on the ground, a woman and a man are sitting on the beach, some individuals are sitting in front of a body of water]"
3,80041,/shared/data/painting/picts/80041.jpg,"[an adult and a child are sitting at the table talking, a man and a woman were playing cards at a table, the woman sitting calmly plays cards with the man, a man and a woman sit opposite each other at the table, a woman hands a man a piece of paper across the table]"
4,6692,/shared/data/painting/picts/6692.jpg,"[a man and a woman are looking at a river in a forest, a man and woman posing in a field by a river, two people out in a wooded area near the water, painters are painting beside a stream, an afternoon in the countryside with a man , a woman , and an easel]"


In [2]:
df.shape

(200, 3)

## Generate Multiple Choice Question

In [3]:
# Generate random choice in [A, B, C, D]
import random

def generate_random_choice():
    return random.choice(['A', 'B', 'C', 'D'])

In [4]:
df['multiple_choice_solution'] = df.apply(lambda x: generate_random_choice(), axis=1)

In [5]:
df.head()

Unnamed: 0,id,file_path,captions,multiple_choice_solution
0,46323,/shared/data/painting/picts/46323.jpg,"[women are bent over a river 's edge , doing laundry, it is wash day at the edge of the river for these women, women are washing clothes by the river, women in rustic clothing are lined up along the bank of a river, people are kneeling near a shore]",C
1,63879,/shared/data/painting/picts/63879.jpg,"[a group of people is swimming in the ocean, a lot of people in the water are playing and one person with clothes on, a group of people goes for a swim in the sea, the perfect boys are bathing in the sea stock photo, a group of men in the ocean is enjoying a swim]",C
2,13922,/shared/data/painting/picts/13922.jpg,"[two people are sitting on a beach with their dog next to them, these two people are enjoying watching the sunset on the beach, a woman is sitting on a bench and someone is sitting on the ground, a woman and a man are sitting on the beach, some individuals are sitting in front of a body of water]",C
3,80041,/shared/data/painting/picts/80041.jpg,"[an adult and a child are sitting at the table talking, a man and a woman were playing cards at a table, the woman sitting calmly plays cards with the man, a man and a woman sit opposite each other at the table, a woman hands a man a piece of paper across the table]",B
4,6692,/shared/data/painting/picts/6692.jpg,"[a man and a woman are looking at a river in a forest, a man and woman posing in a field by a river, two people out in a wooded area near the water, painters are painting beside a stream, an afternoon in the countryside with a man , a woman , and an easel]",D


In [None]:
# Generate multiple choice questions

In [6]:
import os
from openai import OpenAI
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),  
)


def generate_multiple_choice_question(reference_caption, correct_choice): 
    # Define the prompt to generate inferior choices
    prompt = f"""
    The ground truth caption is:
    "{reference_caption}"

    Generate three inferior captions that include either inaccurate details, or are non-fluent with syntactic errors. 
    Format the result as a multiple-choice question. 
    Question title should be "Which of the following captions best describes the painting?".
    The correct choice should be placed at choice "{correct_choice}". 
    Ensure the incorrect choices are realistic but clearly wrong.
    Do not generate special symbols such as '*'.
    """

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.7,
        max_tokens=200,
    )

    # Extract the generated multiple-choice question
    question = response.choices[0].message.content    
    
    return question

In [8]:
df['multiple_choice_question'] = df.apply(lambda x: generate_multiple_choice_question(x['captions'][0], x['multiple_choice_solution']), axis=1)

In [12]:
import json

# Load the original dataset
with open("/shared/data/painting/paintings_with_MCQ.json", "r") as f:
    data = json.load(f)

# Process the dataset to keep only the first caption
processed_data = []
for item in data:
    processed_data.append({
        "id": item["id"],
        "file_path": item["file_path"],
        "caption": item["captions"][0],  # Retain only the first caption
        "multiple_choice_solution": item["multiple_choice_solution"],
        "multiple_choice_question": item["multiple_choice_question"]
    })

# Save the processed dataset to a new JSON file
with open("/shared/data/painting/painting_with_MCQ.json", "w") as f:
    json.dump(processed_data, f, indent=4)

print("Processed dataset saved to '/shared/data/painting/painting_with_MCQ.json'.")


Processed dataset saved to '/shared/data/painting/painting_with_MCQ.json'.


In [13]:
import pandas as pd

# Load the JSON file into a pandas DataFrame
df = pd.read_json("/shared/data/painting/painting_with_MCQ.json")


df.head()

Unnamed: 0,id,file_path,caption,multiple_choice_solution,multiple_choice_question
0,46323,/shared/data/painting/picts/46323.jpg,"women are bent over a river 's edge , doing laundry",C,"Which of the following captions best describes the painting?\n\nA) Women are sitting on a boat in the river, fishing.\n\nB) Women are bending over near a pond's shore, cooking.\n\nC) Women are bent over a river's edge, doing laundry.\n\nD) Woman are bending over river edge, working in the garden."
1,63879,/shared/data/painting/picts/63879.jpg,a group of people is swimming in the ocean,C,Which of the following captions best describes the painting?\n\nA) A group of cats is playing in the sand.\n\nB) A single person is jogging on the beach.\n\nC) A group of people is swimming in the ocean.\n\nD) A crowd of people sitting by the pool in the park.
2,13922,/shared/data/painting/picts/13922.jpg,two people are sitting on a beach with their dog next to them,C,Which of the following captions best describes the painting?\n\nA) One person is standing on a mountain with a cat nearby.\n\nB) Two people sitting under a tree with their dog flying above them.\n\nC) Two people are sitting on a beach with their dog next to them.\n\nD) Three people are running in the park with their dog chasing a ball.
3,80041,/shared/data/painting/picts/80041.jpg,an adult and a child are sitting at the table talking,B,Which of the following captions best describes the painting?\n\nA) A child is sitting alone at the table watching TV.\n\nB) An adult and a child are sitting at the table talking.\n\nC) Two adults are standing near the table while cooking.\n\nD) A dog and a child are sitting at the table eating.
4,6692,/shared/data/painting/picts/6692.jpg,a man and a woman are looking at a river in a forest,D,Which of the following captions best describes the painting?\n\nA) A woman and a child are staring at a lake in a desert.\n\nB) A man and a girl are sitting on a boat in the ocean.\n\nC) Two people are having a picnic on the beach.\n\nD) A man and a woman are looking at a river in a forest.


In [9]:
# Save the annotation with multiple choice question to output file

# Convert DataFrame to a list of dictionaries
list_of_dicts = df.to_dict(orient="records")

# Save the list of dictionaries to a JSON file
output_file = "/shared/data/painting/paintings_with_MCQ.json"
with open(output_file, "w") as file:
    json.dump(list_of_dicts, file, indent=4)

print(f"DataFrame saved as a list of dictionaries in {output_file}")

DataFrame saved as a list of dictionaries in /shared/data/painting/paintings_with_MCQ.json


## Perform Multiple Choice Selection

In [5]:
# Load annotation with multiple choice question data file
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json


# Data Directory: 
#food_images_directory = '/shared/data/food_data/food_images/'
food_annotation_file_path = '/shared/data/painting/painting_with_MCQ.json'


# Method 1: Using pandas.read_json directly
df = pd.read_json(food_annotation_file_path)

df.head()

Unnamed: 0,id,file_path,caption,multiple_choice_solution,multiple_choice_question
0,46323,/shared/data/painting/picts/46323.jpg,"women are bent over a river 's edge , doing laundry",C,"Which of the following captions best describes the painting?\n\nA) Women are sitting on a boat in the river, fishing.\n\nB) Women are bending over near a pond's shore, cooking.\n\nC) Women are bent over a river's edge, doing laundry.\n\nD) Woman are bending over river edge, working in the garden."
1,63879,/shared/data/painting/picts/63879.jpg,a group of people is swimming in the ocean,C,Which of the following captions best describes the painting?\n\nA) A group of cats is playing in the sand.\n\nB) A single person is jogging on the beach.\n\nC) A group of people is swimming in the ocean.\n\nD) A crowd of people sitting by the pool in the park.
2,13922,/shared/data/painting/picts/13922.jpg,two people are sitting on a beach with their dog next to them,C,Which of the following captions best describes the painting?\n\nA) One person is standing on a mountain with a cat nearby.\n\nB) Two people sitting under a tree with their dog flying above them.\n\nC) Two people are sitting on a beach with their dog next to them.\n\nD) Three people are running in the park with their dog chasing a ball.
3,80041,/shared/data/painting/picts/80041.jpg,an adult and a child are sitting at the table talking,B,Which of the following captions best describes the painting?\n\nA) A child is sitting alone at the table watching TV.\n\nB) An adult and a child are sitting at the table talking.\n\nC) Two adults are standing near the table while cooking.\n\nD) A dog and a child are sitting at the table eating.
4,6692,/shared/data/painting/picts/6692.jpg,a man and a woman are looking at a river in a forest,D,Which of the following captions best describes the painting?\n\nA) A woman and a child are staring at a lake in a desert.\n\nB) A man and a girl are sitting on a boat in the ocean.\n\nC) Two people are having a picnic on the beach.\n\nD) A man and a woman are looking at a river in a forest.


### Use llava model

In [2]:
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
import torch
from PIL import Image
import requests 

In [3]:
# Load model from local directory 
model_path = '/shared/model/llava-v1.6-mistral-7b-hf'

processor = LlavaNextProcessor.from_pretrained(model_path)

model = LlavaNextForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True, load_in_4bit=True) 
#model.to("cuda:0")

Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
# helper function: 
def perform_multiple_choice_task_llava(image_path, question):
    # image = Image.open(img_url)
    image = Image.open(image_path)
    
    conversation = [
        {

          "role": "user",
          "content": [
              {"type": "text", "text": question + "\nOnly return the correct choice with a single letter."},
              {"type": "image"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda:0")

    # autoregressively complete prompt
    output = model.generate(**inputs, max_new_tokens=150)
    output = processor.decode(output[0], skip_special_tokens=True)

    # mcq_question = output.split('[/INST]')[0].split('[INST] ')[1].strip()
    mcq_answer = output.split('[/INST]')[1].strip()
    return mcq_answer

In [7]:
df['multiple_choice_prediction'] = df.apply(lambda x: perform_multiple_choice_task_llava(x['file_path'], x['multiple_choice_question']), axis=1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pa

In [8]:
df.head(1)

Unnamed: 0,id,file_path,caption,multiple_choice_solution,multiple_choice_question,multiple_choice_prediction
0,46323,/shared/data/painting/picts/46323.jpg,"women are bent over a river 's edge , doing laundry",C,"Which of the following captions best describes the painting?\n\nA) Women are sitting on a boat in the river, fishing.\n\nB) Women are bending over near a pond's shore, cooking.\n\nC) Women are bent over a river's edge, doing laundry.\n\nD) Woman are bending over river edge, working in the garden.",C


In [9]:
df[['multiple_choice_solution', 'multiple_choice_prediction']]

Unnamed: 0,multiple_choice_solution,multiple_choice_prediction
0,C,C
1,C,C
2,C,C
3,B,B
4,D,D
...,...,...
195,C,C
196,D,D
197,C,C
198,C,C


In [10]:
# Save the MCQ result

# Convert DataFrame to a list of dictionaries
list_of_dicts = df.to_dict(orient="records")

# Save the list of dictionaries to a JSON file
output_file = "/shared/data/painting/painting_with_MCQ_result.json"

with open(output_file, "w") as file:
    json.dump(list_of_dicts, file, indent=4)

print(f"DataFrame saved as a list of dictionaries in {output_file}")

DataFrame saved as a list of dictionaries in /shared/data/painting/painting_with_MCQ_result.json


## Perform Image Captioning Task

In [11]:
# Load annotation with multiple choice question data file
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json


# Data Directory: 
#food_images_directory = '/shared/data/food_data/food_images/'
food_annotation_file_path = '/shared/data/painting/painting_with_MCQ.json'


# Method 1: Using pandas.read_json directly
df = pd.read_json(food_annotation_file_path)

df.head(1)

Unnamed: 0,id,file_path,caption,multiple_choice_solution,multiple_choice_question
0,46323,/shared/data/painting/picts/46323.jpg,"women are bent over a river 's edge , doing laundry",C,"Which of the following captions best describes the painting?\n\nA) Women are sitting on a boat in the river, fishing.\n\nB) Women are bending over near a pond's shore, cooking.\n\nC) Women are bent over a river's edge, doing laundry.\n\nD) Woman are bending over river edge, working in the garden."


In [12]:
# helper function: 
def perform_image_captioning_task_llava(img_path):
    image = Image.open(img_path)
    
    conversation = [
        {

          "role": "user",
          "content": [
              {"type": "text", "text": "Generate a caption for this image."},
              {"type": "image"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda:0")

    # autoregressively complete prompt
    output = model.generate(**inputs, max_new_tokens=150)
    output = processor.decode(output[0], skip_special_tokens=True)

    caption = output.split('[/INST]')[1].strip()
    print(caption)
    return caption

In [13]:
df['predicted_caption'] = df.apply(lambda x: perform_image_captioning_task_llava(x['file_path']), axis=1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene gathering by the water's edge."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene beach scene, where the sea meets the sky, and life unfolds in the golden light of the setting sun."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene moment of reflection by the sea."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"An intimate moment captured in art: a man and a child share a meal at a table, their expressions reflecting the warmth of their connection."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene moment in the countryside, where art and nature intertwine."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of tranquility: a mother and child share a quiet moment in the comfort of their home."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Two ballerinas in a dance studio, captured in a moment of grace and elegance."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene afternoon by the pond, where life's simple pleasures are captured in a moment of tranquility."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A dance of harmony: A classical music performance captured in a painting."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of tranquility: A woman in a historical dress, washing her hands at a vintage sink."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene rural scene with cows and a farmer's hut, captured in the impressionist style."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"The Art of Bathing: A Study in Impressionism"


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Two young minds immersed in the world of books."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of tranquility: A woman finds solace in the beauty of nature, her feet resting on the soft grass as she savors the serenity of the landscape."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A day in the life of a rural farmer, carrying the day's harvest on her head."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A lively gathering of musicians in a quaint village, under a dramatic sky."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection, captured in the soft glow of a candlelit room."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene afternoon by the pond, where the girl and the swans share a moment of tranquility."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene afternoon in the garden, captured in the style of Fauvism."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene rural scene captured in the style of post-impressionism."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene moment in a garden, where the beauty of nature and the elegance of human grace intertwine."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A day at the beach: A lively scene of seaside leisure, with colorful umbrellas, a bustling boardwalk, and the sound of the ocean waves."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A young man, lost in thought, holds a book and a pair of shoes, perhaps contemplating the journey ahead."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A lively beach scene with people and boats, captured in the impressionist style."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A dance of life: the rhythm of movement and the harmony of nature."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A Ballet Dancer in a Dreamy Landscape"


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene moment on the water, with the tranquility of the sea and the calmness of the sky."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Two warriors in a dance of honor and valor."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Reflections of a moment of quiet contemplation."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A tender moment captured in black and white."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Contemplation by the Sea: A Moment of Reflection"


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A vibrant abstract still life with a hint of a figure, inviting the viewer to explore the interplay of colors and shapes."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A gathering of friends in a quaint village, sharing stories and laughter by the light of the moon."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of tranquility amidst the dance of life."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet companionship captured in the stillness of a room."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene pastoral scene with horses and a mountainous landscape."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of tranquility: A young girl immersed in the simple joy of a cup of tea in a serene garden setting."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of elegance and innocence captured in a painting."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of camaraderie amidst the harshness of nature."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection in a cozy living room."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A group of jockeys and their horses, captured in the midst of a race, under a dramatic sky."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene moment on the water, as two men share a quiet conversation in their boat."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene gathering by the sea, where the horizon blends with the sky."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene moment of two jockeys and their horses, galloping through a field under the warm glow of the setting sun."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Two figures in a field, captured in the impressionist style."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection in a cozy setting."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection in the mirror."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene afternoon in the garden, where the flowers bloom and the tea is served."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection, as the world outside the window unfolds."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection in the heart of a bustling city."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A solitary figure in a landscape, captured in the impressionist style."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A surreal portrait of a woman, her face blending with the surrounding abstract elements, evoking a dreamlike atmosphere."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A surreal encounter between a painter and his muse in a dreamlike setting."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A whimsical scene of childhood innocence and imagination, captured in the heart of a cozy room."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene day at the lake, with the tranquility of nature and the joy of leisure."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene gathering in a picturesque landscape, where the sky meets the earth in a dance of colors."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet conversation in a bustling Victorian pub."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene landscape with a castle-like building, under a sky filled with clouds."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of camaraderie and conversation in a cozy setting."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A lively gathering in a park, where the colors of nature and the attire of the people blend harmoniously."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A romantic dance in a park, where the colors of nature blend with the elegance of the couple's attire."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection in the heart of the city."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A lively scene at the beach, where the umbrellas provide a colorful canopy for the people to enjoy the seaside atmosphere."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"An artistic portrayal of a man and his feline companion sharing a quiet moment at a table."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A lively gathering in a park, where the colors of nature and the attire of the people blend harmoniously."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection in an urban setting."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene moment of friendship and relaxation in the park."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A vibrant gathering of figures, their forms intertwined in a dance of color and emotion."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene day in a small town, where life moves at the pace of the river."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A solitary figure in a serene landscape, 1935."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Three women in colorful attire, each with a unique expression, standing together in a vibrant and abstract setting."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection in a bustling cityscape."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene day at the harbor, where life unfolds on the water's edge."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene gathering of equestrians in a pastoral landscape, under the soft glow of a setting sun."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of tranquility: two young girls share a story in the heart of nature."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A surreal encounter between two figures, each with a distinct expression and attire, set against a backdrop of a chair and a book."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A young boy in a pink shirt, immersed in the joy of nature, reaches out to a vibrant yellow flower in a field of blooms."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Two figures in a room, their forms intertwined with the vibrant hues of the painting."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of camaraderie in a bygone era."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A rugged path through a mountainous landscape, where the elements of nature have left their mark."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A tender moment captured in a sketch, where two figures share a kiss, their love transcending the boundaries of their forms."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene moment in a garden, where the beauty of nature and the elegance of a woman's attire blend harmoniously."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A young girl in a white dress, lost in thought, with a serene backdrop of a sleeping figure."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Elegance in Motion: A Dance of Art and Expression"


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene moment of leisure by the sea."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of elegance and intrigue in a vintage setting."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A Gathering of Family and Friends in a Rustic Courtyard"


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection in an artist's studio."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A tender moment captured in art: a mother's love and care for her child."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A young girl stands contemplatively in a serene garden, with the warm glow of a setting sun casting long shadows and bathing the scene in a soft, golden light."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of tranquility in the wilderness, where man and nature coexist in harmony."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A congregation of faith, gathered under the watchful gaze of the church."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection in a traditional setting."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A woman in a blue dress, hanging laundry in a garden setting."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A whimsical encounter in a dreamlike landscape."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Elegance in Motion: A Ballet Dancer in a Studio"


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene village scene, where life unfolds amidst the tranquility of nature."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Contemplation and elegance: a moment of quiet reflection between two women in a luxurious setting."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A Mother and Child in the Countryside, with a flock of birds in the foreground."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A quiet moment shared between two individuals, captured in the timeless art of impressionism."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A group of children enjoying a day of fun in the water, with a sense of camaraderie and adventure."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene day at the beach, where the sun casts long shadows and the sand is a canvas for the day's adventures."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene beach scene with a boat and a person, captured in the impressionist style."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A tender moment captured in the impressionist style."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene moment of tranquility and companionship in the embrace of nature."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet concentration as the woman weaves a tapestry of life's stories."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene moment of a woman immersed in the beauty of music, surrounded by the delicate elegance of flowers."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A Rainy Day in Paris: A Couple Strolls Amidst the City's Charm"


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A Gathering of Equestrian Gentlemen in a Field of Dreams"


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene moment of two women sharing a moment of tranquility in a garden, captured in the impressionist style."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A dance of elegance and grace."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene moment of solitude in the embrace of nature."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Autumn's Embrace: A Group of People Strolls Through a Park, Amidst the Falling Leaves and the Fading Light of Day."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A day in the life of Paris, captured in the sketch of a group of people strolling along the Seine."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A dance of life and artistry, captured in a single, breathtaking moment."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A woman in a traditional red dress, carrying a woven basket filled with purple flowers, walking down a pathway lined with trees and a wooden bridge."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A tender moment captured in art, two figures entwined in a serene embrace."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Two figures in a dimly lit room, their forms shrouded in mystery, as they stand in front of a window, their gazes lost in the darkness beyond."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Two figures in a dance of harmony, set against a backdrop of nature's embrace."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A celebration of life and artistry, as seen through the eyes of a master."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection in the soft glow of the setting sun."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A group of cowboys riding through the desert, with the sun casting long shadows and painting the sky with hues of orange and pink."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection in a cozy parlor."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of tranquility: A woman in a historical dress, immersed in a book, finds solace in the serene garden setting."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene day in the park, where the horse-drawn carriages add a touch of elegance to the scene."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Serenity at the Shore: A Lady's Solitude"


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A dance of color and form, where two figures become one with the art."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection, captured in the timeless art of impressionism."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene moment of domesticity, as two women share a quiet moment in the tranquil beauty of nature."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"The solitude of the artist's studio: a moment of quiet reflection."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Two young girls in a garden, sharing a moment of tranquility and imagination."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection in the heart of a bustling city."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A Gathering of Artists in a Studio, Captured in a Realistic Style"


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A dance of innocence and youth, captured in the golden hues of impressionism."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene path through a lush forest, inviting the viewer to explore the tranquil beauty of nature."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A dance of elegance and grace, captured in the art of impressionism."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection in an impressionist setting."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A lively gathering in a grand room, where art and society intersect."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene afternoon in the park, where the world seems to pause and let the beauty of nature and artistry unfold."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A whimsical scene of a musician and a muse, captured in the style of cubism."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection, as the artist captures the essence of her subject."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of tranquility: A young woman in a white dress, engrossed in her reflection in the mirror, as she prepares for the day."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection in a cozy living room."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A silent symphony of life and death."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A solitary figure in the shadows, lost in thought amidst the chaos of the world."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of elegance at the seaside."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene moment in the park, where the trees and the people come together in harmony."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A quiet moment in a vibrant room."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene moment in the garden, where two individuals share a moment of tranquility amidst the lush greenery."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A family's journey through the park, captured in the style of impressionism."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A tender moment between two individuals, captured in the warmth of a wooden cabin."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A dance of abstract forms, where color and shape intertwine in a harmonious ballet."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A Gathering of Rural Life: A Painting Capturing the Spirit of Community"


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Elegance in Motion: A Ballerina's Grace"


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A lively market scene with a dog and a cat, under a cloudy sky."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A lively gathering at a bar, with a woman in a striking dress as the center of attention."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A solitary figure in a room bathed in the soft glow of a window, lost in the quiet beauty of the night."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A young girl immersed in the art of weaving, her mind and hands in harmony with the loom."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"The Art of the Female Form: A Study in Expression and Form"


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A solitary figure in a vast landscape, the artist captures a moment of quiet reflection under the open sky."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Embrace of the moment: A dance of connection and freedom."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A lively scene of a horse-drawn carriage parade, with spectators enjoying the festivities under a clear sky."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A gathering of elegance and tradition in a serene garden setting."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Contemplation of the Seasons: A Moment of Reflection"


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of serenity captured in the art of impressionism."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of tranquility in the countryside, as a young girl gathers flowers and a boy watches from afar."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A tender moment captured in the impressionist style."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of tranquility: a man and his dog share a peaceful slumber."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A dance of life: the joy of movement and the beauty of nature."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection in the company of nature."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"An evening of quiet companionship."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene winter scene with people enjoying a day out in the snow."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A tender moment captured in art, where a mother and child share a moment of love and innocence."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection in an artist's studio."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection as the world outside waits."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of camaraderie and shared joy in a field of dreams."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A group of adventurers on horseback, crossing a river in search of treasure."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"An intimate moment captured in the style of post-impressionism."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene afternoon in the park, where the trees whisper stories and the grass is a carpet of dreams."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A tranquil moment in nature, where the river whispers stories of the past."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Elegance in Motion: A Ballerina's Grace"


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A tender moment captured in art: a mother's love and a child's innocence."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A quiet moment of reflection in a cozy parlor."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene day by the water, with a woman in a white dress and a boat in the background."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection in the comfort of a cozy room."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of shared joy and curiosity as the children immerse themselves in the world of the book."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene moment in nature, where two figures find solace in the tranquil setting of a riverbank."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A whimsical dance of color and form."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A tender moment captured in art: a mother's love and care for her child."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection amidst the chaos of the day."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of tranquility in the countryside, where the past meets the present."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of quiet reflection in a cozy kitchen."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A family gathering around a table laden with food and drink, sharing a moment of togetherness in a serene garden setting."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene day by the river, with a steamboat and a group of people enjoying the view."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of rural tranquility: A farmer tending to his hay bale, with the watchful eyes of his chickens nearby."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A lively gathering by the water, where the boats are as much a part of the scene as the people."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene moment on the water, where two individuals share a quiet conversation."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A serene day by the water, where life's simple pleasures are found in the company of friends and the rhythmic sound of waves."
"Two figures in a boat, with the sea as their backdrop, painted in an impressionistic style."


In [15]:
df[['caption', 'predicted_caption']]

Unnamed: 0,caption,predicted_caption
0,"women are bent over a river 's edge , doing laundry","""A serene gathering by the water's edge."""
1,a group of people is swimming in the ocean,"""A serene beach scene, where the sea meets the sky, and life unfolds in the golden light of the setting sun."""
2,two people are sitting on a beach with their dog next to them,"""A serene moment of reflection by the sea."""
3,an adult and a child are sitting at the table talking,"""An intimate moment captured in art: a man and a child share a meal at a table, their expressions reflecting the warmth of their connection."""
4,a man and a woman are looking at a river in a forest,"""A serene moment in the countryside, where art and nature intertwine."""
...,...,...
195,a man bails hay while chicken eats around him,"""A moment of rural tranquility: A farmer tending to his hay bale, with the watchful eyes of his chickens nearby."""
196,people are walking around and standing on a boat,"""A lively gathering by the water, where the boats are as much a part of the scene as the people."""
197,two fishermen in a small boat out fishing together,"""A serene moment on the water, where two individuals share a quiet conversation."""
198,women at the riverfront are washing their clothes,"""A serene day by the water, where life's simple pleasures are found in the company of friends and the rhythmic sound of waves."""


In [16]:
# Save the image_captioning result

# Convert DataFrame to a list of dictionaries
list_of_dicts = df.to_dict(orient="records")

# Save the list of dictionaries to a JSON file
output_file = "/shared/data/painting/painting_with_image_captioning_result.json"

with open(output_file, "w") as file:
    json.dump(list_of_dicts, file, indent=4)

print(f"DataFrame saved as a list of dictionaries in {output_file}")

DataFrame saved as a list of dictionaries in /shared/data/painting/painting_with_image_captioning_result.json


## Evaluate results

In [17]:
# import libraries
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice

### Multiple Choice Accuracy

In [19]:
# Load annotation with multiple choice question result data file
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json


# Data Directory: 
food_annotation_file_path = '/shared/data/painting/painting_with_MCQ_result.json'


# Method 1: Using pandas.read_json directly
df = pd.read_json(food_annotation_file_path)

df.head(1)

Unnamed: 0,id,file_path,caption,multiple_choice_solution,multiple_choice_question,multiple_choice_prediction
0,46323,/shared/data/painting/picts/46323.jpg,"women are bent over a river 's edge , doing laundry",C,"Which of the following captions best describes the painting?\n\nA) Women are sitting on a boat in the river, fishing.\n\nB) Women are bending over near a pond's shore, cooking.\n\nC) Women are bent over a river's edge, doing laundry.\n\nD) Woman are bending over river edge, working in the garden.",C


In [20]:
def calculate_multiple_choice_question_accuracy(df):
    # Calculate accuracy
    accuracy = (df["multiple_choice_solution"] == df["multiple_choice_prediction"]).mean()

    print(f"Prediction Accuracy: {accuracy * 100:.2f}%") 
    return accuracy

In [21]:
calculate_multiple_choice_question_accuracy(df)

Prediction Accuracy: 75.00%


0.75

### Caption Quality


In [23]:
# Load annotation with multiple choice question result data file
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json


# Data Directory: 
food_annotation_file_path = '/shared/data/painting/painting_with_image_captioning_result.json'


# Method 1: Using pandas.read_json directly
df = pd.read_json(food_annotation_file_path)

df.head(1)

Unnamed: 0,id,file_path,caption,multiple_choice_solution,multiple_choice_question,predicted_caption
0,46323,/shared/data/painting/picts/46323.jpg,"women are bent over a river 's edge , doing laundry",C,"Which of the following captions best describes the painting?\n\nA) Women are sitting on a boat in the river, fishing.\n\nB) Women are bending over near a pond's shore, cooking.\n\nC) Women are bent over a river's edge, doing laundry.\n\nD) Woman are bending over river edge, working in the garden.","""A serene gathering by the water's edge."""


In [None]:
# BLEU

In [26]:
from nltk.translate.bleu_score import sentence_bleu


# BLEU Evaluation (Average across multiple references)
def evaluate_bleu(df):
    bleu_scores = []
    
    for _, row in df.iterrows():
        references = row['caption']  # List of reference captions
        candidate = row['predicted_caption']  # Predicted caption

        # Tokenize the candidate and reference captions
        tokenized_references = [ref.strip('"').split() for ref in references]  # List of tokenized references
        tokenized_candidate = candidate.strip('"').split()  # Tokenized candidate

        # Compute BLEU for all references
        row_bleu_scores = [
            sentence_bleu([ref], tokenized_candidate) for ref in tokenized_references
        ]
        
        # Average across references
        bleu_scores.append(sum(row_bleu_scores) / len(row_bleu_scores))
    
    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    print(f"Average BLEU: {avg_bleu:.4f}")
    return bleu_scores

In [27]:
evaluate_bleu(df)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU: 0.0000


[0.0,
 0.0,
 0.0,
 1.2298083538063708e-232,
 0.0,
 4.655181264783869e-233,
 8.562165723306571e-233,
 4.0173765965413875e-233,
 1.063959715593678e-232,
 8.972141065609099e-233,
 9.494692261260039e-233,
 0.0,
 0.0,
 0.0,
 8.07140754817601e-233,
 2.231279780268054e-233,
 5.5402248797866055e-233,
 9.699611962820646e-233,
 0.0,
 0.0,
 1.1056055503861688e-232,
 1.0736158929804562e-232,
 7.4795097680039515e-233,
 0.0,
 0.0,
 1.3440488685343995e-232,
 0.0,
 5.843528830692392e-233,
 1.2620177169337084e-232,
 0.0,
 0.0,
 1.3195397097297673e-232,
 2.6802558797240456e-233,
 0.0,
 3.3753204770558055e-233,
 1.0244914152188952e-232,
 6.47081973040559e-233,
 8.53742846015746e-233,
 0.0,
 1.1383237946876614e-232,
 3.076467618465861e-233,
 7.385805362616657e-233,
 0.0,
 2.0757241739270874e-233,
 1.1383237946876614e-232,
 7.888763921434729e-233,
 0.0,
 0.0,
 0.0,
 4.833792288129302e-233,
 1.2731966955639581e-232,
 4.655181264783869e-233,
 5.535290224126518e-233,
 3.6436639788908844e-233,
 0.0,
 7.3707036

In [None]:
# METEOR

In [30]:
import nltk
nltk.download('wordnet')
from nltk.translate.meteor_score import meteor_score

# METEOR Evaluation (Average across multiple references)
def evaluate_meteor(df):
    meteor_scores = []
    
    for _, row in df.iterrows():
        references = row['caption']  # List of reference captions
        candidate = row['predicted_caption'].strip('"')  # Predicted caption (raw string)

        # Compute METEOR for all references
        row_meteor_scores = [
            meteor_score([[ref]], [candidate]) for ref in references
        ]
        
        # Average across references
        meteor_scores.append(sum(row_meteor_scores) / len(row_meteor_scores))
    
    avg_meteor = sum(meteor_scores) / len(meteor_scores)
    print(f"Average METEOR: {avg_meteor:.4f}")
    return meteor_scores


[nltk_data] Downloading package wordnet to /home/Cassie-
[nltk_data]     nlp/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
evaluate_meteor(df)

Average METEOR: 0.0000


[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [None]:
# ROUGE

In [33]:
# ROUGE Evaluation (Average across multiple references)
def evaluate_rouge(df):
    rouge1_scores, rouge2_scores, rougeL_scores = [], [], []
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    for _, row in df.iterrows():
        references = row['caption']  # List of reference captions
        candidate = row['predicted_caption']  # Predicted caption
        
        # Compute ROUGE scores for all references
        row_rouge1_scores, row_rouge2_scores, row_rougeL_scores = [], [], []
        for ref in references:
            scores = scorer.score(ref, candidate.strip('"'))
            row_rouge1_scores.append(scores['rouge1'].fmeasure)
            row_rouge2_scores.append(scores['rouge2'].fmeasure)
            row_rougeL_scores.append(scores['rougeL'].fmeasure)
        
        # Average across references
        rouge1_scores.append(sum(row_rouge1_scores) / len(row_rouge1_scores))
        rouge2_scores.append(sum(row_rouge2_scores) / len(row_rouge2_scores))
        rougeL_scores.append(sum(row_rougeL_scores) / len(row_rougeL_scores))
    
    avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
    avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
    avg_rougeL = sum(rougeL_scores) / len(rougeL_scores)
    
    print(f"Average ROUGE-1: {avg_rouge1:.4f}")
    print(f"Average ROUGE-2: {avg_rouge2:.4f}")
    print(f"Average ROUGE-L: {avg_rougeL:.4f}")
    return rouge1_scores, rouge2_scores, rougeL_scores

In [34]:
evaluate_rouge(df)

Average ROUGE-1: 0.0131
Average ROUGE-2: 0.0000
Average ROUGE-L: 0.0131


([0.016771488469601675,
  0.004329004329004328,
  0.01092896174863388,
  0.011611030478955007,
  0.028846153846153855,
  0.00554016620498615,
  0.012121212121212121,
  0.009426551453260015,
  0.016722408026755852,
  0.01111111111111111,
  0.021116138763197588,
  0.03333333333333334,
  0.0,
  0.005172413793103447,
  0.01406469760900141,
  0.003322259136212625,
  0.00784313725490196,
  0.012012012012012012,
  0.009324009324009324,
  0.010416666666666668,
  0.013157894736842105,
  0.01043478260869565,
  0.008571428571428572,
  0.011904761904761906,
  0.009852216748768475,
  0.03,
  0.0053523639607493305,
  0.01111111111111111,
  0.028169014084507043,
  0.016771488469601675,
  0.01754385964912281,
  0.013640238704177327,
  0.003189792663476874,
  0.019607843137254898,
  0.005305039787798409,
  0.01818181818181818,
  0.006688963210702343,
  0.01515151515151515,
  0.016,
  0.020202020202020204,
  0.003661327231121281,
  0.009538950715421303,
  0.01360544217687075,
  0.002217294900221729,
  0

In [None]:
# CIDEr and SPICE (unchanged, since they handle multiple references internally)

In [35]:
def evaluate_cider(df):
    ref_dict = {str(idx): row['caption'] for idx, row in df.iterrows()}
    cand_dict = {str(idx): [row['predicted_caption']] for idx, row in df.iterrows()}
    
    cider_scorer = Cider()
    score, _ = cider_scorer.compute_score(ref_dict, cand_dict)
    print(f"Average CIDEr: {score:.4f}")
    return score

In [36]:
evaluate_cider(df)

AssertionError: 

In [71]:
def evaluate_spice(df):
    ref_dict = {str(idx): row['reference_caption'] for idx, row in df.iterrows()}
    cand_dict = {str(idx): [row['predicted_caption']] for idx, row in df.iterrows()}
    
    spice_scorer = Spice()
    score, _ = spice_scorer.compute_score(ref_dict, cand_dict)
    print(f"Average SPICE: {score:.4f}")
    return score

In [73]:
evaluate_spice(df)

Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.9 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.8 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [1.0 sec].
Loading classif

SPICE evaluation took: 12.85 s
Average SPICE: 0.2341


0.23411371237458198