# Experiment 3: Semantic Fidelity Comparison

__Objective:__ Compute the correlation between human judgement and traditional metrics (e.g. BLEU and METEOR,etc.) compare it to the correlation between human judgement and embedding-based metrics (e.g., BERTScore, CLIPScore, etc.). Optional: If time permits, compute correlation between human judgement and the visual attention map alignment, then compare against traditional and embedding-based metrics.

__Research Question(s):__ Can embedding-based metrics or visual attention maps help quantify caption fidelity beyond BLEU and METEOR scores?

## Setup and Imports

In [1]:
import logging
# Configure logging
logging.basicConfig(
    level=logging.INFO, # Set the minimum logging level (e.g., INFO, DEBUG, WARNING, ERROR, CRITICAL)
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y:%m:%d %H:%M"
)
# Get a logger instance for this notebook
logger = logging.getLogger(__name__) 

logger.info("Notebook execution started.")

2025:07:29 13:28 - INFO - Notebook execution started.


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import pandas as pd
from evaluation_sheet_reader import read_human_evaluation_sheets
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from vtt.evaluation.metrics import (
    compute_meteor_scores,
    compute_bertscore,
)
from tqdm.notebook import tqdm

## Load Human Evaluation Data

In [4]:
# Define the Directory Containing Evaluation Spreadsheets
evaluation_data_directory = './human_evaluation_sheets'

# Check if the directory exists
if not os.path.exists(evaluation_data_directory):
    logger.warning(f"The specified evaluation data directory does not exist: {evaluation_data_directory}")
    logger.warning("Please create the directory and place your evaluation Excel files inside, or update the path.")

# Load the Data into a DataFrame
logger.info(f"Loading evaluation data from: {os.path.abspath(evaluation_data_directory)}")
evaluation_df = read_human_evaluation_sheets(evaluation_data_directory)

2025:07:29 13:28 - INFO - Loading evaluation data from: /mnt/c/grad_school/northeastern/ie7374/GenAI_Project/experiments/experiment3/human_evaluation_sheets


Scanning directory: ./human_evaluation_sheets for Excel files...
Successfully read: human_evaluation_sheet.xlsx


In [5]:
evaluation_df.info()
evaluation_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Image Filename             100 non-null    object 
 1   Generated Caption (Model)  100 non-null    object 
 2   Adequacy (1-5)             0 non-null      float64
 3   Fluency (1-5)              0 non-null      float64
 4   Overall Quality (1-5)      0 non-null      float64
 5   Comments                   0 non-null      float64
 6   Generation Method          100 non-null    object 
 7   Ground Truth Caption       100 non-null    object 
 8   source_file                100 non-null    object 
dtypes: float64(4), object(5)
memory usage: 7.2+ KB


Unnamed: 0,Image Filename,Generated Caption (Model),Adequacy (1-5),Fluency (1-5),Overall Quality (1-5),Comments,Generation Method,Ground Truth Caption,source_file
0,862054277_34b5a6f401.jpg,a man in a wetsuit is riding a dirt bike,,,,,Greedy,a young girl in a pink swimsuit,human_evaluation_sheet.xlsx
1,3181701312_70a379ab6e.jpg,a man in a blue shirt is standing in front of ...,,,,,Beam Search,a man covered with a blanket is asleep on the ...,human_evaluation_sheet.xlsx
2,624742559_ff467d8ebc.jpg,a woman is sitting on a bench reading a newspaper,,,,,Beam Search,a little boy is climbing on the monkey bars at...,human_evaluation_sheet.xlsx
3,1119463452_69d4eecd08.jpg,a little girl in a pink shirt is swinging on a...,,,,,Greedy,smiling little girl swimming in outdoor pool,human_evaluation_sheet.xlsx
4,1332722096_1e3de8ae70.jpg,a woman in a blue shirt is walking with a dog,,,,,Greedy,two girls and a woman walking on the sidewalk,human_evaluation_sheet.xlsx


In [6]:
evaluation_df.tail()

Unnamed: 0,Image Filename,Generated Caption (Model),Adequacy (1-5),Fluency (1-5),Overall Quality (1-5),Comments,Generation Method,Ground Truth Caption,source_file
95,161669933_3e7d8c7e2c.jpg,a motorcycle racer is riding a motorcycle,,,,,Greedy,a competitive motorcycle racer prepares to mak...,human_evaluation_sheet.xlsx
96,2709275718_73fcf08c23.jpg,a man in a black shirt and a black hat with a ...,,,,,Greedy,a man in a short mohawk and beard,human_evaluation_sheet.xlsx
97,3320680380_b0d38b3b4a.jpg,a man in a black wetsuit is riding a bike on a...,,,,,Greedy,the ski instructor is teaching the little girl...,human_evaluation_sheet.xlsx
98,3315250232_83e24a2d51.jpg,two dogs running in a field,,,,,Beam Search,a little girl runs across the grass toward the...,human_evaluation_sheet.xlsx
99,3381038951_225bb163af.jpg,a man in a wetsuit is surfing a wave,,,,,Greedy,a person is pointing at a geyser,human_evaluation_sheet.xlsx


## Compute Automatic Metrics

In [7]:
# Drop rows where captions are missing
df = evaluation_df.dropna(subset=["Ground Truth Caption", "Generated Caption (Model)"]).copy()

# Prepare BLEU smoothing function
smooth = SmoothingFunction().method1

# Lists to collect scores
bleu1s, bleu2s, bleu3s, bleu4s = [], [], [], []
meteors = []
bert_p_list, bert_r_list, bert_f1_list = [], [], []

# Loop over each row and compute metrics
for idx, row in tqdm(df.iterrows(), desc="Computing Metrics ..."):
    reference = row["Ground Truth Caption"]
    candidate = row["Generated Caption (Model)"]

    # Tokenize references and candidates
    ref_tokens = [reference.split()]
    cand_tokens = candidate.split()

    # Compute BLEU scores
    bleu1s.append(sentence_bleu(ref_tokens, cand_tokens, weights=(1, 0, 0, 0), smoothing_function=smooth))
    bleu2s.append(sentence_bleu(ref_tokens, cand_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth))
    bleu3s.append(sentence_bleu(ref_tokens, cand_tokens, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smooth))
    bleu4s.append(sentence_bleu(ref_tokens, cand_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth))

    # METEOR
    meteors.append(compute_meteor_scores([[reference]], [candidate]))

    # BERTScore: get P, R, F1
    bert_scores = compute_bertscore([reference], [candidate])
    bert_p_list.append(bert_scores["BERTScore_P"])
    bert_r_list.append(bert_scores["BERTScore_R"])
    bert_f1_list.append(bert_scores["BERTScore_F1"])

# Add metrics to DataFrame
df["BLEU-1"] = bleu1s
df["BLEU-2"] = bleu2s
df["BLEU-3"] = bleu3s
df["BLEU-4"] = bleu4s
df["METEOR"] = meteors
df["BERTScore_P"] = bert_p_list
df["BERTScore_R"] = bert_r_list
df["BERTScore_F1"] = bert_f1_list

Computing Metrics ...: 0it [00:00, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

In [8]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Image Filename             100 non-null    object 
 1   Generated Caption (Model)  100 non-null    object 
 2   Adequacy (1-5)             0 non-null      float64
 3   Fluency (1-5)              0 non-null      float64
 4   Overall Quality (1-5)      0 non-null      float64
 5   Comments                   0 non-null      float64
 6   Generation Method          100 non-null    object 
 7   Ground Truth Caption       100 non-null    object 
 8   source_file                100 non-null    object 
 9   BLEU-1                     100 non-null    float64
 10  BLEU-2                     100 non-null    float64
 11  BLEU-3                     100 non-null    float64
 12  BLEU-4                     100 non-null    float64
 13  METEOR                     100 non-null    float64


Unnamed: 0,Image Filename,Generated Caption (Model),Adequacy (1-5),Fluency (1-5),Overall Quality (1-5),Comments,Generation Method,Ground Truth Caption,source_file,BLEU-1,BLEU-2,BLEU-3,BLEU-4,METEOR,BERTScore_P,BERTScore_R,BERTScore_F1
0,862054277_34b5a6f401.jpg,a man in a wetsuit is riding a dirt bike,,,,,Greedy,a young girl in a pink swimsuit,human_evaluation_sheet.xlsx,0.3,0.182574,0.076653,0.049394,0.205479,0.900994,0.915368,0.908124
1,3181701312_70a379ab6e.jpg,a man in a blue shirt is standing in front of ...,,,,,Beam Search,a man covered with a blanket is asleep on the ...,human_evaluation_sheet.xlsx,0.307692,0.160128,0.063283,0.039074,0.178571,0.918207,0.918805,0.918506
2,624742559_ff467d8ebc.jpg,a woman is sitting on a bench reading a newspaper,,,,,Beam Search,a little boy is climbing on the monkey bars at...,human_evaluation_sheet.xlsx,0.327492,0.054582,0.032278,0.024437,0.169492,0.916734,0.915468,0.9161
3,1119463452_69d4eecd08.jpg,a little girl in a pink shirt is swinging on a...,,,,,Greedy,smiling little girl swimming in outdoor pool,human_evaluation_sheet.xlsx,0.25,0.150756,0.062757,0.039864,0.340741,0.89076,0.901141,0.89592
4,1332722096_1e3de8ae70.jpg,a woman in a blue shirt is walking with a dog,,,,,Greedy,two girls and a woman walking on the sidewalk,human_evaluation_sheet.xlsx,0.272727,0.165145,0.069007,0.044116,0.163043,0.897347,0.93303,0.914841


In [9]:
df.tail()

Unnamed: 0,Image Filename,Generated Caption (Model),Adequacy (1-5),Fluency (1-5),Overall Quality (1-5),Comments,Generation Method,Ground Truth Caption,source_file,BLEU-1,BLEU-2,BLEU-3,BLEU-4,METEOR,BERTScore_P,BERTScore_R,BERTScore_F1
95,161669933_3e7d8c7e2c.jpg,a motorcycle racer is riding a motorcycle,,,,,Greedy,a competitive motorcycle racer prepares to mak...,human_evaluation_sheet.xlsx,0.210217,0.11353,0.046564,0.03056,0.150376,0.938845,0.891691,0.914661
96,2709275718_73fcf08c23.jpg,a man in a black shirt and a black hat with a ...,,,,,Greedy,a man in a short mohawk and beard,human_evaluation_sheet.xlsx,0.375,0.273861,0.223814,0.169436,0.484533,0.900429,0.924097,0.91211
97,3320680380_b0d38b3b4a.jpg,a man in a black wetsuit is riding a bike on a...,,,,,Greedy,the ski instructor is teaching the little girl...,human_evaluation_sheet.xlsx,0.099953,0.023194,0.015215,0.011885,0.054054,0.877757,0.85166,0.864512
98,3315250232_83e24a2d51.jpg,two dogs running in a field,,,,,Beam Search,a little girl runs across the grass toward the...,human_evaluation_sheet.xlsx,0.08557,0.029642,0.023139,0.02096,0.104167,0.933959,0.910832,0.922251
99,3381038951_225bb163af.jpg,a man in a wetsuit is surfing a wave,,,,,Greedy,a person is pointing at a geyser,human_evaluation_sheet.xlsx,0.333333,0.06455,0.040332,0.03156,0.208333,0.900608,0.899955,0.900281


In [10]:
# Optional: save
# df.to_csv("human_eval_with_automated_metrics.csv", index=False)

## Compute Pearson Correlations and Generate Correlation Matrix

In [11]:
# Columns to include in correlation
correlation_columns = [
    "Adequacy (1-5)",
    "Fluency (1-5)",
    "Overall Quality (1-5)",
    "BLEU-1",
    "BLEU-2",
    "BLEU-3",
    "BLEU-4",
    "METEOR",
    "BERTScore_P",
    "BERTScore_R",
    "BERTScore_F1",
]

# Ensure you're only using rows with complete human ratings and metric scores
df_clean = df.dropna(subset=correlation_columns)

# Compute Pearson correlation matrix
pearson_corr = df_clean[correlation_columns].corr(method="pearson")

pearson_corr

Unnamed: 0,Adequacy (1-5),Fluency (1-5),Overall Quality (1-5),BLEU-1,BLEU-2,BLEU-3,BLEU-4,METEOR,BERTScore_P,BERTScore_R,BERTScore_F1
Adequacy (1-5),,,,,,,,,,,
Fluency (1-5),,,,,,,,,,,
Overall Quality (1-5),,,,,,,,,,,
BLEU-1,,,,,,,,,,,
BLEU-2,,,,,,,,,,,
BLEU-3,,,,,,,,,,,
BLEU-4,,,,,,,,,,,
METEOR,,,,,,,,,,,
BERTScore_P,,,,,,,,,,,
BERTScore_R,,,,,,,,,,,


## Sandbox