## Load Food Dataset

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json

In [2]:
# Data Directory: 
food_images_directory = '/shared/data/food_data/food_images/'
food_annotation_file_path = '/shared/data/food_data/food_annotation_modified.json'


# Method 1: Using pandas.read_json directly
df = pd.read_json(food_annotation_file_path)

df.head(1)

Unnamed: 0,id,img_url,reference_caption
0,1,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0094.jpg,"[On the wooden chopping board, there is a cut up grilled and cooked pork rib.]"


In [3]:
df.shape

(100, 3)

## Generate Multiple Choice Question

In [4]:
# Generate random choice in [A, B, C, D]
import random

def generate_random_choice():
    return random.choice(['A', 'B', 'C', 'D'])

In [5]:
df['multiple_choice_solution'] = df.apply(lambda x: generate_random_choice(), axis=1)

In [6]:
df.head()

Unnamed: 0,id,img_url,reference_caption,multiple_choice_solution
0,1,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0094.jpg,"[On the wooden chopping board, there is a cut up grilled and cooked pork rib.]",A
1,2,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0096.jpg,[Many grilled pork ribs are arranged in a curiously shaped wooden cutlery.],D
2,3,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0091.jpg,[Two grilled brown-red pork ribs on an oval white dinner plate.],A
3,4,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0093.jpg,[Three pork ribs drizzled with a rich sauce and served on a white round dinner plate.],A
4,5,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0084.jpg,"[Raw, long, fresh pork ribs next to yellow, good potatoes.]",A


In [7]:
import os
from openai import OpenAI
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),  
)


def generate_multiple_choice_question(reference_caption, correct_choice, level='medium'): 
    # Define the prompt to generate inferior choices
    if level == 'easy':
        level_message = "The distractors are obviously incorrect but still loosely related to the context."
    elif level == 'medium':
        level_message = "The distractors are somewhat related to the context but contain inaccuracies or non-fluent language."
    elif level == 'hard':
        level_message = "The distractors are closely related to the context but may confuse someone without careful observation."


    prompt = f"""
    Given the ground truth caption below:
    "{reference_caption}"
    Generate three plausible but incorrect distractors.
    "{level_message}"
    Format the result as a multiple-choice question. 
    Question title should be "Which of the following captions best describes the painting?".
    The correct choice should be placed at choice "{correct_choice}". 
    Do not generate special symbols such as '*'.
    """

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.7,
        max_tokens=200,
    )

    # Extract the generated multiple-choice question
    question = response.choices[0].message.content    
    
    return question

In [8]:
# generate hard question 
df['multiple_choice_question_hard'] = df.apply(lambda x: generate_multiple_choice_question(x['reference_caption'][0], x['multiple_choice_solution'], level='hard'), axis=1)

In [9]:
# generate medium question
df['multiple_choice_question_medium'] = df.apply(lambda x: generate_multiple_choice_question(x['reference_caption'][0], x['multiple_choice_solution'], level='medium'), axis=1)

In [10]:
# generate easy question
df['multiple_choice_question_easy'] = df.apply(lambda x: generate_multiple_choice_question(x['reference_caption'][0], x['multiple_choice_solution'], level='easy'), axis=1)

In [12]:
df.head(1)

Unnamed: 0,id,img_url,reference_caption,multiple_choice_solution,multiple_choice_question_hard,multiple_choice_question_medium,multiple_choice_question_easy
0,1,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0094.jpg,"[On the wooden chopping board, there is a cut up grilled and cooked pork rib.]",A,"Question: Which of the following captions best describes the painting?\n\nA. On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB. On the wooden chopping board, there is a whole, uncooked pork rib.\n\nC. On the wooden chopping board, there is a cut up grilled and cooked chicken breast.\n\nD. On the wooden chopping board, there is a sliced loaf of bread.","Which of the following captions best describes the painting?\n\nA. On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB. On the wooden table, there is a whole raw pork rib with sauce.\n\nC. On the wooden chopping board, a raw fish is being sliced in half.\n\nD. On the chopping board, there is a grilled chicken leg with vegetables.","Question: Which of the following captions best describes the painting?\n\nA. On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB. On the wooden chopping board, there is a whole raw fish ready to be cooked.\n\nC. On the wooden chopping board, there is a freshly baked loaf of bread.\n\nD. On the wooden chopping board, there is a colorful assortment of fresh fruits."


In [15]:
df.iloc[0].multiple_choice_question_hard

'Question: Which of the following captions best describes the painting?\n\nA. On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB. On the wooden chopping board, there is a whole, uncooked pork rib.\n\nC. On the wooden chopping board, there is a cut up grilled and cooked chicken breast.\n\nD. On the wooden chopping board, there is a sliced loaf of bread.'

In [None]:
# Save the annotation with multiple choice question to output file

# Convert DataFrame to a list of dictionaries
list_of_dicts = df.to_dict(orient="records")

# Save the list of dictionaries to a JSON file
output_file = "/shared/data/upking/upking_annotation_with_MCQ_3_difficulies.json"
with open(output_file, "w") as file:
    json.dump(list_of_dicts, file, indent=4)

print(f"DataFrame saved as a list of dictionaries in {output_file}")

DataFrame saved as a list of dictionaries in /shared/data/food_data/food_annotation_with_MCQ_3_difficulies.json


## Perform Multiple Choice Selection

In [None]:
# Load annotation with multiple choice question data file
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json


# Data Directory: 
#food_images_directory = '/shared/data/food_data/food_images/'
food_annotation_file_path = '/shared/data/upking/upking_annotation_with_MCQ.json'


# Method 1: Using pandas.read_json directly
df = pd.read_json(food_annotation_file_path)

df.head(1)

Unnamed: 0,id,img_url,reference_caption,multiple_choice_solution,multiple_choice_question_hard,multiple_choice_question_medium,multiple_choice_question_easy
0,1,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0094.jpg,"[On the wooden chopping board, there is a cut up grilled and cooked pork rib.]",A,"Question: Which of the following captions best describes the painting?\n\nA. On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB. On the wooden chopping board, there is a whole, uncooked pork rib.\n\nC. On the wooden chopping board, there is a cut up grilled and cooked chicken breast.\n\nD. On the wooden chopping board, there is a sliced loaf of bread.","Which of the following captions best describes the painting?\n\nA. On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB. On the wooden table, there is a whole raw pork rib with sauce.\n\nC. On the wooden chopping board, a raw fish is being sliced in half.\n\nD. On the chopping board, there is a grilled chicken leg with vegetables.","Question: Which of the following captions best describes the painting?\n\nA. On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB. On the wooden chopping board, there is a whole raw fish ready to be cooked.\n\nC. On the wooden chopping board, there is a freshly baked loaf of bread.\n\nD. On the wooden chopping board, there is a colorful assortment of fresh fruits."


### Use llava model

In [3]:
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
import torch
import skimage.io as io
import requests 

In [4]:
# Load model from local directory 
model_path = '/shared/model/llava-v1.6-mistral-7b-hf'

processor = LlavaNextProcessor.from_pretrained(model_path)

model = LlavaNextForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True, load_in_4bit=True) 
#model.to("cuda:0")

Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
# helper function: 
def perform_multiple_choice_task_llava(img_url, question):
    image = io.imread(img_url)
    
    conversation = [
        {

          "role": "user",
          "content": [
              {"type": "text", "text": question + "\nOnly return the correct choice with a single letter."},
              {"type": "image"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda:0")

    # autoregressively complete prompt
    output = model.generate(**inputs, max_new_tokens=150)
    output = processor.decode(output[0], skip_special_tokens=True)

    # mcq_question = output.split('[/INST]')[0].split('[INST] ')[1].strip()
    mcq_answer = output.split('[/INST]')[1].strip()
    return mcq_answer

In [6]:
df['multiple_choice_prediction_easy'] = df.apply(lambda x: perform_multiple_choice_task_llava(x['img_url'], x['multiple_choice_question_easy']), axis=1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pa

In [7]:
df['multiple_choice_prediction_medium'] = df.apply(lambda x: perform_multiple_choice_task_llava(x['img_url'], x['multiple_choice_question_medium']), axis=1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

In [8]:
df['multiple_choice_prediction_hard'] = df.apply(lambda x: perform_multiple_choice_task_llava(x['img_url'], x['multiple_choice_question_hard']), axis=1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

In [9]:
df.head(1)

Unnamed: 0,id,img_url,reference_caption,multiple_choice_solution,multiple_choice_question_hard,multiple_choice_question_medium,multiple_choice_question_easy,multiple_choice_prediction_easy,multiple_choice_prediction_medium,multiple_choice_prediction_hard
0,1,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0094.jpg,"[On the wooden chopping board, there is a cut up grilled and cooked pork rib.]",A,"Question: Which of the following captions best describes the painting?\n\nA. On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB. On the wooden chopping board, there is a whole, uncooked pork rib.\n\nC. On the wooden chopping board, there is a cut up grilled and cooked chicken breast.\n\nD. On the wooden chopping board, there is a sliced loaf of bread.","Which of the following captions best describes the painting?\n\nA. On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB. On the wooden table, there is a whole raw pork rib with sauce.\n\nC. On the wooden chopping board, a raw fish is being sliced in half.\n\nD. On the chopping board, there is a grilled chicken leg with vegetables.","Question: Which of the following captions best describes the painting?\n\nA. On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB. On the wooden chopping board, there is a whole raw fish ready to be cooked.\n\nC. On the wooden chopping board, there is a freshly baked loaf of bread.\n\nD. On the wooden chopping board, there is a colorful assortment of fresh fruits.",A,A,A


In [10]:
# Save the MCQ result

# Convert DataFrame to a list of dictionaries
list_of_dicts = df.to_dict(orient="records")

# Save the list of dictionaries to a JSON file
output_file = "/shared/data/upking/upking_annotation_with_MCQ_result.json"
with open(output_file, "w") as file:
    json.dump(list_of_dicts, file, indent=4)

print(f"DataFrame saved as a list of dictionaries in {output_file}")

DataFrame saved as a list of dictionaries in /shared/data/upking/upking_annotation_with_MCQ_result.json


In [11]:
def calculate_multiple_choice_question_accuracy(df):
    # Calculate accuracy
    accuracy_easy = (df["multiple_choice_solution"] == df["multiple_choice_prediction_easy"]).mean()
    accuracy_medium = (df["multiple_choice_solution"] == df["multiple_choice_prediction_medium"]).mean()
    accuracy_hard = (df["multiple_choice_solution"] == df["multiple_choice_prediction_hard"]).mean()

    print(f"Prediction Accuracy Easy: {accuracy_easy * 100:.2f}%") 
    print(f"Prediction Accuracy Medium: {accuracy_medium * 100:.2f}%") 
    print(f"Prediction Accuracy Hard: {accuracy_hard * 100:.2f}%") 
    return accuracy_easy, accuracy_medium, accuracy_hard

In [12]:
calculate_multiple_choice_question_accuracy(df)

Prediction Accuracy Easy: 94.00%
Prediction Accuracy Medium: 71.00%
Prediction Accuracy Hard: 63.00%


(0.94, 0.71, 0.63)

### Use Phi

In [13]:
from PIL import Image 
import requests 
from transformers import AutoModelForCausalLM 
from transformers import AutoProcessor 

model_id = "/shared/model/Phi-3.5-vision-instruct" 

# Note: set _attn_implementation='eager' if you don't have flash_attn installed
model = AutoModelForCausalLM.from_pretrained(
  model_id, 
  device_map="cuda", 
  trust_remote_code=True, 
  torch_dtype="auto", 
  _attn_implementation='eager'    
)

# for best performance, use num_crops=4 for multi-frame, num_crops=16 for single-frame.
processor = AutoProcessor.from_pretrained(model_id, 
  trust_remote_code=True, 
  num_crops=4
) 

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [5]:
# helper function: 
def perform_multiple_choice_task_Phi(img_url, question):
    image = Image.open(img_url)

    images = []
    images.append(image)

    messages = [
        {"role": "user", "content": "<|image_1|>\n" + question + "\nOnly return the correct choice with a single letter."},
    ]

    prompt = processor.tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
    )

    inputs = processor(prompt, images, return_tensors="pt").to("cuda:0") 

    generation_args = { 
        "max_new_tokens": 10, 
        "temperature": 0.0, 
        "do_sample": False, 
    } 

    generate_ids = model.generate(**inputs, 
    eos_token_id=processor.tokenizer.eos_token_id, 
    **generation_args
    )

    # remove input tokens 
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(generate_ids, 
    skip_special_tokens=True, 
    clean_up_tokenization_spaces=False)[0] 

    return response


In [6]:
df['multiple_choice_prediction_easy_Phi'] = df.apply(lambda x: perform_multiple_choice_task_Phi(x['img_url'], x['multiple_choice_question_easy']), axis=1)

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48


In [None]:
df['multiple_choice_prediction_medium_Phi'] = df.apply(lambda x: perform_multiple_choice_task_Phi(x['img_url'], x['multiple_choice_question_medium']), axis=1)

In [None]:
df['multiple_choice_prediction_hard_Phi'] = df.apply(lambda x: perform_multiple_choice_task_Phi(x['img_url'], x['multiple_choice_question_hard']), axis=1)

In [None]:
# # Save the prediction results to output file
# # Convert DataFrame to a list of dictionaries
# list_of_dicts = df.to_dict(orient="records")

# # Save the list of dictionaries to a JSON file
# output_file = "llava_prediction_result_food_image.json"
# with open(output_file, "w") as file:
#     json.dump(list_of_dicts, file, indent=4)

# print(f"DataFrame saved as a list of dictionaries in {output_file}")

In [7]:
# Evaluate the performance of the model
def calculate_multiple_choice_question_accuracy_Phi(df):
    # Calculate accuracy
    accuracy_easy = (df["multiple_choice_solution"] == df["multiple_choice_prediction_easy_Phi"]).mean()
    accuracy_medium = (df["multiple_choice_solution"] == df["multiple_choice_prediction_medium_Phi"]).mean()
    accuracy_hard = (df["multiple_choice_solution"] == df["multiple_choice_prediction_hard_Phi"]).mean()

    print(f"Prediction Accuracy Easy: {accuracy_easy * 100:.2f}%") 
    print(f"Prediction Accuracy Medium: {accuracy_medium * 100:.2f}%") 
    print(f"Prediction Accuracy Hard: {accuracy_hard * 100:.2f}%") 
    return accuracy_easy, accuracy_medium, accuracy_hard

In [8]:
calculate_multiple_choice_question_accuracy_Phi(df)

Prediction Accuracy: 87.00%


0.87

## ~~Perform Image Captioning Task~~

In [29]:
# Load annotation with multiple choice question data file
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json


# Data Directory: 
#food_images_directory = '/shared/data/food_data/food_images/'
food_annotation_file_path = '/shared/data/food_data/food_annotation_with_MCQ.json'


# Method 1: Using pandas.read_json directly
df = pd.read_json(food_annotation_file_path)

df.head(1)

Unnamed: 0,id,img_url,reference_caption,multiple_choice_solution,multiple_choice_question
0,1,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0094.jpg,"[On the wooden chopping board, there is a cut up grilled and cooked pork rib.]",A,"Question: Which of the following captions best describes the painting?\n\nA) On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB) On the wooden cutting board, there is a raw fish and some vegetables.\n\nC) The wooden chopping board having a sliced beef steak on it.\n\nD) On the wooden table, there is a whole roasted chicken with side of greens."


In [30]:
# helper function: 
def perform_image_captioning_task_llava(img_url):
    image = Image.open(img_url)
    
    conversation = [
        {

          "role": "user",
          "content": [
              {"type": "text", "text": "Generate a caption for this image."},
              {"type": "image"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda:0")

    # autoregressively complete prompt
    output = model.generate(**inputs, max_new_tokens=150)
    output = processor.decode(output[0], skip_special_tokens=True)

    caption = output.split('[/INST]')[1].strip()
    print(caption)
    return caption

In [31]:
df['predicted_caption'] = df.apply(lambda x: perform_image_captioning_task_llava(x['img_url']), axis=1)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Savoring the moment before the feast begins."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Savoring the flavors of a succulent barbecue ribs feast."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Savoring the smoky, savory flavor of a perfectly grilled rib."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Savoring the flavors of a succulent barbecue ribs meal."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A hearty meal of succulent ribs, crispy potatoes, and a refreshing can of soda."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Savoring the taste of a succulent BBQ rib, garnished with fresh herbs and a tangy sauce, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Savoring the flavors of a succulent barbecue rib."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Savoring the moment before the feast begins."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Savoring the taste of a delicious, crispy snack."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A hearty meal of succulent meat and creamy beans, served on a rustic wooden table."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful treat, a slice of cake with a dusting of powdered sugar, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful plate of crispy, golden-brown fried dumplings, garnished with a sprinkle of sugar and served with a refreshing slice of orange."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful assortment of freshly baked pastries, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful breakfast spread featuring a stack of golden pancakes, a glass of refreshing milk, and a side of fluffy donuts."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful assortment of sweet treats, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful trio of golden brown dumplings, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A tantalizing treat, a sugar-coated doughnut, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful assortment of powdered sugar-covered pastries, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful treat: powdered sugar-covered pastries on a plate, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful assortment of freshly baked pastries, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delicious seafood stir fry with noodles and a variety of ingredients."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A hearty bowl of shrimp noodle soup, ready to warm up a chilly day!"


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A hearty bowl of shrimp noodle soup, garnished with a soft-boiled egg and fresh herbs, ready to warm the soul."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A hearty bowl of seafood noodle soup, ready to warm the soul."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A hearty seafood dish, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A hearty bowl of noodles, seafood, and greens, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delicious and colorful meal of noodles, seafood, and a side of soup, ready to be enjoyed."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A hearty bowl of shrimp noodle soup, ready to warm the soul on a chilly day."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A hearty bowl of noodles, topped with succulent meat and garnished with a sprinkle of sesame seeds."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A hearty bowl of shrimp and noodle soup, filled with vibrant colors and flavors."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A hearty meal of sausage and beans, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A hearty meal of sausage and beans, ready to be enjoyed."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A hearty meal of sausage and beans, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A hearty meal of sausages and greens, served in a rustic cast iron skillet."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A hearty meal of sausages and beans, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A hearty meal of sausage and pasta, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A hearty meal of sausage, potatoes, and a side of fresh vegetables."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Preparing a hearty meal with freshly chopped ginger root and a sharp knife."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A hearty meal of sausages and pasta, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A hearty meal of sausages and onions, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful culinary experience awaits with these skewered delights, garnished with a touch of elegance."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful culinary scene featuring freshly made dough balls, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A colorful culinary adventure: Five mini pizzas, each with its own unique toppings, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A vibrant red dish filled with freshly sliced white radishes, ready to be enjoyed."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful stack of crispy, golden-brown pancakes, topped with a dollop of creamy peanut butter, served on a charming blue and white plate."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful culinary experience featuring a trio of golden-brown, crispy tofu squares, garnished with a sprig of fresh green herbs, served on a pristine white plate."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A moment of culinary artistry: freshly grated parmesan cheese being added to a dish."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful snack of apple slices and peanut butter on a bed of crunchy rice cakes."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A slice of sushi with a vibrant green seaweed topping, resting on a textured surface."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful dish of avocado toast topped with a sprinkle of sprouts and a drizzle of creamy sauce, served on a rustic wooden table."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful sushi meal, complete with a refreshing drink and a side of spices."
"A delightful culinary experience awaits with these two mouth-watering sushi rolls, each meticulously crafted and ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A colorful and appetizing bento box, ready to be enjoyed."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful plate of sushi rolls, garnished with a sprinkle of sesame seeds and a drizzle of soy sauce, ready to be savored."
"A delightful bite of sushi rice, topped with a slice of seaweed, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delicious and healthy meal, ready to be enjoyed."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful snack of sushi rice balls, topped with a vibrant orange filling, served on a white plate with a side of soy sauce."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful sushi dish, served on a black plate with a touch of orange, ready to be enjoyed."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful plate of sushi rolls, garnished with sesame seeds and green onions, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A culinary delight: Japanese onigiri, meticulously crafted with a sprinkle of spice."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful medley of fresh fruits and a flaky pastry, perfect for a summer afternoon."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A slice of heaven: a delicious piece of pie on a gold plate, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A slice of heaven: A creamy cheesecake with a golden crust, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A slice of heaven: a delicious dessert with a hint of citrus."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A stack of golden, flaky waffles topped with a dollop of whipped cream and a slice of lemon, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A slice of heaven: A perfectly baked pie with a flaky crust, topped with a scoop of vanilla ice cream, all ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful dessert scene featuring a slice of lemon meringue pie, a scoop of ice cream, and a sprinkle of powdered sugar, all served on a charming blue plate with a yellow checkered napkin."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A slice of heaven: A tantalizing dessert with a golden crust and a fluffy meringue topping."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A slice of heaven: A delicious, creamy slice of lemon meringue pie."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A slice of heaven: A delicious dessert served on a fine china plate."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delicious meal of sliced chicken with a side of flavorful sauce, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful dish of tender, succulent chicken pieces, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful meal of succulent chicken in a vibrant sauce, served on a traditional blue and white plate, accompanied by a refreshing side of soup."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delicious and hearty meal of chicken and broccoli in a savory sauce, garnished with a sprig of parsley and a purple flower, served on a bamboo placemat."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful dish of crispy, golden-brown fried chicken, served on a pristine white plate, garnished with a sprig of parsley and a vibrant purple flower, ready to be savored."
"A tantalizing dish of succulent chicken, garnished with a vibrant red tomato and a sprig of fresh parsley, served on a pristine white plate."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful dish of roasted chicken, garnished with a sprig of parsley and a purple flower, served on a white plate."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A tantalizing dish of succulent pork belly, served with a side of vibrant green onions, all presented on a rustic wooden table."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful culinary experience featuring succulent chicken and fresh herbs, served on a pristine white plate."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful meal of succulent chicken, fresh carrots, and aromatic parsley, all served on a pristine white plate."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A vibrant and healthy Mediterranean-style salad, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful Mediterranean feast, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A vibrant and appetizing meal, ready to be savored!"


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A vibrant and healthy salad, ready to be enjoyed."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful spread of appetizers, ready to be savored."
"A vibrant salad of fresh tomatoes, crisp lettuce, and creamy cheese, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A vibrant medley of fresh vegetables and crumbled feta cheese, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A vibrant and healthy salad, bursting with fresh vegetables and a sprinkle of feta cheese."
"A vibrant medley of fresh vegetables and juicy tomatoes, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A vibrant salad, bursting with colors and flavors."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A delightful dessert moment captured on a floral-patterned plate."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Homemade pie cooling on the stove, ready to be served."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A cozy meal of comfort food, featuring a hearty casserole and a side of fresh greens."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A tantalizing view of a pecan pie, freshly baked and ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A slice of delicious pie, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A hearty meal of a classic shepherd's pie, topped with a golden crust and served with a side of creamy baked beans."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A slice of pie, ready to be savored."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A freshly baked pie, ready to be enjoyed."


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"A golden-brown pie crust, ready for a delicious filling."
"A delicious, homemade pizza, freshly baked and ready to be enjoyed."


In [32]:
df[['reference_caption', 'predicted_caption']]

Unnamed: 0,reference_caption,predicted_caption
0,"[On the wooden chopping board, there is a cut up grilled and cooked pork rib.]","""Savoring the moment before the feast begins."""
1,[Many grilled pork ribs are arranged in a curiously shaped wooden cutlery.],"""Savoring the flavors of a succulent barbecue ribs feast."""
2,[Two grilled brown-red pork ribs on an oval white dinner plate.],"""Savoring the smoky, savory flavor of a perfectly grilled rib."""
3,[Three pork ribs drizzled with a rich sauce and served on a white round dinner plate.],"""Savoring the flavors of a succulent barbecue ribs meal."""
4,"[Raw, long, fresh pork ribs next to yellow, good potatoes.]","""A hearty meal of succulent ribs, crispy potatoes, and a refreshing can of soda."""
...,...,...
95,[A golden bean pie and some pork fried soybeans on a white round plate],"""A hearty meal of a classic shepherd's pie, topped with a golden crust and served with a side of creamy baked beans."""
96,"[A plate of bean pie with a yellow edge and orange skin, and a napkin, a fork, and a knife next to it.]","""A slice of pie, ready to be savored."""
97,[A dark brown bean pie with a golden edge on a tray placed on a stainless steel net.],"""A freshly baked pie, ready to be enjoyed."""
98,[a bean pie with scratches on the crispy surface which is caramel color fading from the edge to the center.],"""A golden-brown pie crust, ready for a delicious filling."""


In [33]:
# Save the image_captioning result

# Convert DataFrame to a list of dictionaries
list_of_dicts = df.to_dict(orient="records")

# Save the list of dictionaries to a JSON file
output_file = "/shared/data/food_data/food_annotation_with_image_captioning_result.json"

with open(output_file, "w") as file:
    json.dump(list_of_dicts, file, indent=4)

print(f"DataFrame saved as a list of dictionaries in {output_file}")

DataFrame saved as a list of dictionaries in /shared/data/food_data/food_annotation_with_image_captioning_result.json


## Evaluate results

In [31]:
# import libraries
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice

### Multiple Choice Accuracy

In [24]:
# Load annotation with multiple choice question result data file
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json


# Data Directory: 
food_annotation_file_path = '/shared/data/food_data/food_annotation_with_MCQ_result.json'


# Method 1: Using pandas.read_json directly
df = pd.read_json(food_annotation_file_path)

df.head(1)

Unnamed: 0,id,img_url,reference_caption,multiple_choice_solution,multiple_choice_question,multiple_choice_prediction
0,1,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0094.jpg,"[On the wooden chopping board, there is a cut up grilled and cooked pork rib.]",A,"Question: Which of the following captions best describes the painting?\n\nA) On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB) On the wooden cutting board, there is a raw fish and some vegetables.\n\nC) The wooden chopping board having a sliced beef steak on it.\n\nD) On the wooden table, there is a whole roasted chicken with side of greens.",A


In [27]:
def calculate_multiple_choice_question_accuracy(df):
    # Calculate accuracy
    accuracy_easy = (df["multiple_choice_solution"] == df["multiple_choice_prediction_easy"]).mean()
    accuracy_medium = (df["multiple_choice_solution"] == df["multiple_choice_prediction_medium"]).mean()
    accuracy_hard = (df["multiple_choice_solution"] == df["multiple_choice_prediction_hard"]).mean()

    print(f"Prediction Accuracy Easy: {accuracy_easy * 100:.2f}%") 
    print(f"Prediction Accuracy Medium: {accuracy_medium * 100:.2f}%") 
    print(f"Prediction Accuracy Hard: {accuracy_hard * 100:.2f}%") 
    return accuracy_easy, accuracy_medium, accuracy_hard

In [28]:
calculate_multiple_choice_question_accuracy(df)

Prediction Accuracy Easy: 94.00%
Prediction Accuracy Medium: 71.00%
Prediction Accuracy Hard: 63.00%


(0.94, 0.71, 0.63)

### Caption Quality


In [34]:
# Load annotation with multiple choice question result data file
import pandas as pd
pd.set_option('display.max_colwidth', None)
import json


# Data Directory: 
food_annotation_file_path = '/shared/data/food_data/food_annotation_with_image_captioning_result.json'


# Method 1: Using pandas.read_json directly
df = pd.read_json(food_annotation_file_path)

df.head(1)

Unnamed: 0,id,img_url,reference_caption,multiple_choice_solution,multiple_choice_question,predicted_caption
0,1,/shared/data/food_data/food_images/Pork_ribs/Pork_ribs_0094.jpg,"[On the wooden chopping board, there is a cut up grilled and cooked pork rib.]",A,"Question: Which of the following captions best describes the painting?\n\nA) On the wooden chopping board, there is a cut up grilled and cooked pork rib.\n\nB) On the wooden cutting board, there is a raw fish and some vegetables.\n\nC) The wooden chopping board having a sliced beef steak on it.\n\nD) On the wooden table, there is a whole roasted chicken with side of greens.","""Savoring the moment before the feast begins."""


In [None]:
# BLEU

In [35]:
from nltk.translate.bleu_score import sentence_bleu


# BLEU Evaluation (Average across multiple references)
def evaluate_bleu(df):
    bleu_scores = []
    
    for _, row in df.iterrows():
        references = row['reference_caption']  # List of reference captions
        candidate = row['predicted_caption']  # Predicted caption

        # Tokenize the candidate and reference captions
        tokenized_references = [ref.strip('"').split() for ref in references]  # List of tokenized references
        tokenized_candidate = candidate.strip('"').split()  # Tokenized candidate

        # Compute BLEU for all references
        row_bleu_scores = [
            sentence_bleu([ref], tokenized_candidate) for ref in tokenized_references
        ]
        
        # Average across references
        bleu_scores.append(sum(row_bleu_scores) / len(row_bleu_scores))
    
    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    print(f"Average BLEU: {avg_bleu:.4f}")
    return bleu_scores

In [36]:
evaluate_bleu(df)

Average BLEU: 0.0010


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


[3.571883312829026e-232,
 8.962731118674859e-232,
 9.269981669466712e-232,
 5.746727420065187e-232,
 0.0,
 1.1337861261109773e-231,
 0.0,
 0.0,
 6.085166479973199e-232,
 1.0662520804273401e-231,
 1.268852357850863e-231,
 5.035580399326033e-155,
 0.0,
 2.6616657200018397e-78,
 6.695492068419091e-232,
 0.0,
 0.0,
 5.748666558742058e-232,
 0.0,
 5.294085324363193e-232,
 1.0709749285266912e-231,
 4.491405484477833e-232,
 1.0244914152188952e-231,
 9.065607048138757e-232,
 2.738929881729733e-232,
 6.085166479973199e-232,
 1.8423401430089077e-155,
 4.685504009359912e-155,
 1.3416480207402436e-231,
 8.672642734733089e-232,
 9.134374972545899e-232,
 1.1896457329133973e-231,
 1.0003688322288243e-231,
 5.347469696085671e-155,
 1.1896457329133973e-231,
 0.0,
 1.0709749285266912e-231,
 5.554837769749797e-155,
 0.0,
 1.0003688322288243e-231,
 2.5573972968570177e-155,
 4.801280454758726e-232,
 5.586496301804011e-232,
 1.154647032204335e-231,
 4.447844384793538e-155,
 9.594503055152632e-232,
 1.120040

In [None]:
# METEOR

In [65]:
import nltk
nltk.download('wordnet')
from nltk.translate.meteor_score import meteor_score

# METEOR Evaluation (Average across multiple references)
def evaluate_meteor(df):
    meteor_scores = []
    
    for _, row in df.iterrows():
        references = row['reference_caption']  # List of reference captions
        candidate = row['predicted_caption'].strip('"')  # Predicted caption (raw string)

        # Compute METEOR for all references
        row_meteor_scores = [
            meteor_score([[ref]], [candidate]) for ref in references
        ]
        
        # Average across references
        meteor_scores.append(sum(row_meteor_scores) / len(row_meteor_scores))
    
    avg_meteor = sum(meteor_scores) / len(meteor_scores)
    print(f"Average METEOR: {avg_meteor:.4f}")
    return meteor_scores


[nltk_data] Downloading package wordnet to /home/过隙de白驹/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [66]:
evaluate_meteor(df)

Average METEOR: 0.0000


[0.0, 0.0, 0.0]

In [None]:
# ROUGE

In [67]:
# ROUGE Evaluation (Average across multiple references)
def evaluate_rouge(df):
    rouge1_scores, rouge2_scores, rougeL_scores = [], [], []
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    for _, row in df.iterrows():
        references = row['reference_caption']  # List of reference captions
        candidate = row['predicted_caption']  # Predicted caption
        
        # Compute ROUGE scores for all references
        row_rouge1_scores, row_rouge2_scores, row_rougeL_scores = [], [], []
        for ref in references:
            scores = scorer.score(ref, candidate.strip('"'))
            row_rouge1_scores.append(scores['rouge1'].fmeasure)
            row_rouge2_scores.append(scores['rouge2'].fmeasure)
            row_rougeL_scores.append(scores['rougeL'].fmeasure)
        
        # Average across references
        rouge1_scores.append(sum(row_rouge1_scores) / len(row_rouge1_scores))
        rouge2_scores.append(sum(row_rouge2_scores) / len(row_rouge2_scores))
        rougeL_scores.append(sum(row_rougeL_scores) / len(row_rougeL_scores))
    
    avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
    avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
    avg_rougeL = sum(rougeL_scores) / len(rougeL_scores)
    
    print(f"Average ROUGE-1: {avg_rouge1:.4f}")
    print(f"Average ROUGE-2: {avg_rouge2:.4f}")
    print(f"Average ROUGE-L: {avg_rougeL:.4f}")
    return rouge1_scores, rouge2_scores, rougeL_scores

In [68]:
evaluate_rouge(df)

Average ROUGE-1: 0.4267
Average ROUGE-2: 0.1000
Average ROUGE-L: 0.3457


([0.3157894736842105, 0.5714285714285714, 0.3928571428571428],
 [0.0, 0.3, 0.0],
 [0.2631578947368421, 0.5238095238095238, 0.25])

In [None]:
# CIDEr and SPICE (unchanged, since they handle multiple references internally)

In [69]:
def evaluate_cider(df):
    ref_dict = {str(idx): row['reference_caption'] for idx, row in df.iterrows()}
    cand_dict = {str(idx): [row['predicted_caption']] for idx, row in df.iterrows()}
    
    cider_scorer = Cider()
    score, _ = cider_scorer.compute_score(ref_dict, cand_dict)
    print(f"Average CIDEr: {score:.4f}")
    return score

In [70]:
evaluate_cider(df)

Average CIDEr: 0.7787


0.7786648935891062

In [71]:
def evaluate_spice(df):
    ref_dict = {str(idx): row['reference_caption'] for idx, row in df.iterrows()}
    cand_dict = {str(idx): [row['predicted_caption']] for idx, row in df.iterrows()}
    
    spice_scorer = Spice()
    score, _ = spice_scorer.compute_score(ref_dict, cand_dict)
    print(f"Average SPICE: {score:.4f}")
    return score

In [73]:
evaluate_spice(df)

Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.9 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.8 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [1.0 sec].
Loading classif

SPICE evaluation took: 12.85 s
Average SPICE: 0.2341


0.23411371237458198