In [None]:
import pandas as pd
import numpy as np
# For NLP metrics
from evaluation.benchmark import calculate_nlp_metrics, calculate_clip_scores
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Names of the vision-language models to evaluate; each name is also the
# sub-directory under output/<data_type>/caption/ that holds its captions.
VLMs = ["smolvlm2", "gemma3n", "blip2", "llama_vision", "mistral", "gemma", "qwen", "llava"]
# Dataset variant; selects the data/<data_type>/ and output/<data_type>/ trees.
data_type = "3d" # either "real" or "3d"
# Directory passed to calculate_clip_scores together with the captions.
segmentation_dir = f"output/{data_type}/sam2_tracking/"

# Caption variant to evaluate; "with_masks" also turns on masked=True for
# the CLIP score computation further down.
mask_dir = "with_masks" # either "with_masks" or "without_masks"

In [None]:
# Read the ground-truth table (semicolon-separated, first row is the header)
df = pd.read_csv(f"data/{data_type}/ground_truth.csv", header=0, sep=";")
idx = df['object_id'].tolist()
captions = df['caption'].tolist()

# Pair every object id with its reference caption; a repeated id keeps the
# caption of its last row.
gt_captions = dict(zip(idx, captions))

In [None]:
# Load the non-aggregated captions of every VLM into a nested dictionary:
#   all_captions[VLM][frame_idx][object_idx] -> caption
all_captions = {}
for VLM in VLMs:
    captions_dir = f"output/{data_type}/caption/{VLM}/{mask_dir}"
    caption_files = glob.glob(os.path.join(captions_dir, "all_captions.csv"))
    if not caption_files:
        raise FileNotFoundError(f"No caption files found in {captions_dir} for {VLM}")

    captions = {}
    loaded_any = False
    # Parse every matched file inside the loop. Previously the parsing code sat
    # after the loop, so only the last matched file was ever read.
    for file_path in caption_files:
        try:
            # Read CSV with delimiter as semicolon
            df = pd.read_csv(file_path, sep=';', header=0)
            # Collect this file's rows separately so a file that fails half-way
            # does not leave partial data behind.
            file_captions = {}
            for frame_idx, object_idx, caption in zip(df['frame_idx'], df['object_idx'], df['caption']):
                file_captions.setdefault(frame_idx, {})[object_idx] = caption
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"Error loading {file_path}: {e}")
            continue

        for frame_idx, frame_captions in file_captions.items():
            captions.setdefault(frame_idx, {}).update(frame_captions)
        loaded_any = True

    # A VLM is only registered when at least one of its files could be read,
    # so a failed load never shows up as an empty caption set.
    if loaded_any:
        all_captions[VLM] = captions


In [None]:
# Keep, for every VLM, the caption each object received in frame 0.
cap_frame0 = {}

for VLM, captions_dict in all_captions.items():
    first_frame_captions = {}
    for frame, cap_dict in captions_dict.items():
        for obj_idx, caption in cap_dict.items():
            if frame == 0:
                first_frame_captions[obj_idx] = caption
            elif obj_idx not in first_frame_captions:
                # Object seen outside frame 0 first: register it with an empty
                # dict placeholder until (and unless) a frame-0 caption shows up.
                first_frame_captions[obj_idx] = {}
    cap_frame0[VLM] = first_frame_captions

cap_frame0

In [None]:
# Score the ground-truth captions with CLIP (masked crops when evaluating the
# "with_masks" variant)
clip_scores_gt = calculate_clip_scores(
    segmentation_dir,
    gt_captions,
    masked=(mask_dir == "with_masks"),
)

# Show every reference caption next to its CLIP score
print("Ground Truth Captions and CLIP Scores:")
for obj_id in gt_captions:
    caption = gt_captions[obj_id]
    print(f"Object ID: {obj_id}, Caption: {caption}, CLIP Score: {clip_scores_gt[obj_id]}")

# From here on only the score values are needed, not the object ids
clip_scores_gt = list(clip_scores_gt.values())

In [None]:
# CLIP scores of every VLM's frame-0 captions, stored as plain lists
clip_scores_vlm = {}
for VLM in VLMs:
    scores = calculate_clip_scores(
        segmentation_dir,
        cap_frame0[VLM],
        masked=(mask_dir == "with_masks"),
    )
    clip_scores_vlm[VLM] = list(scores.values())

# Raw scores: ground truth first, then one list per VLM
print("Ground Truth CLIP Scores:")
print(clip_scores_gt)

for VLM in VLMs:
    print(f"{VLM} CLIP Scores:\n{clip_scores_vlm[VLM]}")

# Mean and standard deviation for the ground truth and for each VLM
mean_gt, std_gt = np.mean(clip_scores_gt), np.std(clip_scores_gt)
mean_vlm = {name: np.mean(clip_scores_vlm[name]) for name in VLMs}
std_vlm = {name: np.std(clip_scores_vlm[name]) for name in VLMs}

# Summary lines in "mean ± std" form
print(f"Ground Truth CLIP Score: {mean_gt:.2f} ± {std_gt:.2f}")
for VLM in VLMs:
    print(f"{VLM} CLIP Score: {mean_vlm[VLM]:.2f} ± {std_vlm[VLM]:.2f}")



In [None]:
# Bar plot of the average CLIP score (± std) of the ground truth and each VLM.

# The last step of this cell replaces the per-object score lists in
# clip_scores_vlm with summary tuples. Running the cell a second time would
# therefore plot statistics of statistics, so fail loudly instead.
if any(isinstance(vlm_scores, tuple) for vlm_scores in clip_scores_vlm.values()):
    raise RuntimeError(
        "clip_scores_vlm already holds (mean, std, min, max) tuples; "
        "re-run the CLIP score cell before re-running this plot."
    )

# Font sizes must be configured BEFORE the axes are created: rcParams are read
# when the artists are built, so updating them afterwards leaves the tick
# labels of an already existing figure untouched.
plt.rcParams.update({'font.size': 14, 'axes.labelsize': 16, 'axes.titlesize': 18, 'xtick.labelsize': 14, 'ytick.labelsize': 14, 'legend.fontsize': 13})

# Create a single figure for the bar plot
fig, ax = plt.subplots(figsize=(10, 6))

# Calculate average CLIP score for ground truth and each VLM
avg_gt_score = np.mean(clip_scores_gt)
avg_vlm_scores = {vlm: np.mean(scores).item() for vlm, scores in clip_scores_vlm.items()}

# One entry per bar: ground truth first, then the VLMs. Every list is built in
# the order of VLMs so bars, error bars and colours always line up.
models = ['Ground Truth'] + VLMs
avg_scores = [avg_gt_score] + [avg_vlm_scores[vlm] for vlm in VLMs]
std_scores = [np.std(clip_scores_gt)] + [np.std(clip_scores_vlm[vlm]) for vlm in VLMs]
min_scores = [np.min(clip_scores_gt)] + [np.min(clip_scores_vlm[vlm]) for vlm in VLMs]
max_scores = [np.max(clip_scores_gt)] + [np.max(clip_scores_vlm[vlm]) for vlm in VLMs]

# Custom colors with gray for ground truth and colorful for VLMs
colors = [(0.5, 0.5, 0.5, 1.0)] + [plt.cm.tab10(i) for i in range(len(VLMs))]

# Create the bar plot
ax.bar(models, avg_scores, yerr=std_scores, capsize=5, color=colors)
ax.set_ylabel('CLIP Score', fontsize=12)
ax.set_ylim([.2, .38])
ax.tick_params(axis='x')
# Dashed reference line at the ground-truth average
ax.axhline(y=avg_gt_score, color='black', linestyle='--', alpha=0.7, label='GT Average')
ax.grid(axis='y', linestyle='--', alpha=0.3)
ax.legend()


plt.tight_layout()
plt.show()


# Collapse each VLM's score list into (mean, std, min, max); the summary table
# built later reads these tuples by position.
for VLM in VLMs:
    vlm_scores = clip_scores_vlm[VLM]
    clip_scores_vlm[VLM] = (np.mean(vlm_scores), np.std(vlm_scores), np.min(vlm_scores), np.max(vlm_scores))


In [None]:
# Calculate NLP metrics for all VLMs compared to ground truth

all_nlp_metrics = {}  # VLM -> copy of the raw result of calculate_nlp_metrics
nlp_metrics = {}      # VLM -> metric name -> (mean, std, min, max)

for VLM in VLMs:
    # Extract captions for this VLM (using frame 0)
    vlm_captions = dict(cap_frame0[VLM])

    # Calculate NLP metrics between this VLM's captions and ground truth
    raw_metrics = calculate_nlp_metrics(gt_captions, vlm_captions)
    all_nlp_metrics[VLM] = raw_metrics.copy()

    # Calculate statistics for each metric
    metrics = {}
    for metric_name, values in raw_metrics.items():
        values_list = list(values)
        metrics[metric_name] = (
            np.mean(values_list),  # mean
            np.std(values_list),   # std
            np.min(values_list),   # min
            np.max(values_list)    # max
        )

    nlp_metrics[VLM] = metrics

# Table column prefix -> key of that metric in nlp_metrics
nlp_columns = {'CIDEr': 'cider', 'BERT': 'bert', 'ROUGE': 'rouge_l', 'GPT': 'gpt'}
# Column suffix for each position of a (mean, std, min, max) tuple
stat_suffixes = ('', '_std', '_min', '_max')

# Collect one record per VLM and build the DataFrame once, rather than growing
# it with pd.concat inside the loop.
records = []
for VLM, metrics in nlp_metrics.items():
    record = {'VLM': VLM}
    clip_stats = clip_scores_vlm[VLM]
    for position, suffix in enumerate(stat_suffixes):
        record[f'CLIP{suffix}'] = float(clip_stats[position])
    for column, metric_key in nlp_columns.items():
        metric_stats = metrics[metric_key]
        for position, suffix in enumerate(stat_suffixes):
            record[f'{column}{suffix}'] = float(metric_stats[position])
    records.append(record)

# Same column layout as before: the means first, then std/min/max per metric.
column_order = ['VLM', 'CLIP', 'CIDEr', 'BERT', 'ROUGE', 'GPT'] + [
    f'{column}{suffix}'
    for column in ('CLIP', 'CIDEr', 'BERT', 'ROUGE', 'GPT')
    for suffix in stat_suffixes[1:]
]
metrics_df = pd.DataFrame(records, columns=column_order)


In [None]:
# Save the summary table to output/<data_type>/first_frame_eval.csv without
# the index column. No `sep` is given, so pandas' default separator applies.
metrics_df.to_csv(f'output/{data_type}/first_frame_eval.csv', index=False)

In [None]:
# Define the desired VLM order for plotting
vlm_order = ['smolvlm2', 'gemma3n', 'blip2', 'llama_vision', 'mistral', 'gemma', 'qwen', 'llava']

metrics_to_plot = ['ROUGE', 'CIDEr', 'BERT', 'GPT']
# Per-metric y-axis limits; None keeps Matplotlib's automatic limits and
# metrics that are not listed fall back to default_ylim.
y_ranges = {
    'CIDEr': (0, 6),
    'GPT': None
}
default_ylim = (0, 1)

# Reorder metrics_df according to vlm_order. The result is the same for every
# metric, so it is computed once outside the loop.
plot_df = metrics_df.set_index('VLM').loc[vlm_order].reset_index()
vlm_names = plot_df['VLM']

# One bar chart (mean ± std) per metric
for metric in metrics_to_plot:
    means = plot_df[metric]
    # Missing std column -> no error bars
    stds = plot_df.get(f"{metric}_std", pd.Series([0]*len(plot_df)))

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(vlm_names, means, yerr=stds, capsize=5, color=plt.cm.tab10.colors[:len(vlm_names)])
    # The categorical bar plot already labels the ticks with the VLM names, so
    # only the font size is adjusted (set_xticklabels without fixed ticks is
    # discouraged by Matplotlib).
    ax.tick_params(axis='x', labelsize=12)
    ax.set_ylabel(f'{metric} Score', fontsize=14)
    ax.set_xlabel('VLM', fontsize=14)
    ax.set_title(f'{metric} by VLM', fontsize=16)
    ylim = y_ranges.get(metric, default_ylim)
    if ylim is not None:
        ax.set_ylim(ylim)
    ax.grid(axis='y', linestyle='--', alpha=0.3)
    plt.tight_layout()
    plt.show()


In [None]:
# Replot the metrics using seaborn swarmplot overlaid on a bar chart for each metric

# The reordered summary table does not depend on the metric being plotted.
plot_df = metrics_df.set_index('VLM').loc[vlm_order].reset_index()
vlm_names = plot_df['VLM']

for metric in metrics_to_plot:
    means = plot_df[metric]
    stds = plot_df.get(f"{metric}_std", pd.Series([0]*len(plot_df)))

    # Raw per-object values of this metric, one sequence per VLM in plot order
    metric_key = metric.lower() if metric != "ROUGE" else "rouge_l"
    per_vlm_values = [all_nlp_metrics[vlm][metric_key] for vlm in vlm_order]

    # Prepare data for swarmplot. Each VLM name is repeated once per value
    # that VLM actually has, so the label column always matches the value
    # column (no fixed number of objects is assumed).
    swarm_data = pd.DataFrame({
        'VLM': np.repeat(vlm_names.values, [len(values) for values in per_vlm_values]),
        metric: np.concatenate(per_vlm_values)
    })

    fig, ax = plt.subplots(figsize=(10, 5))
    # Bar chart (mean ± std)
    ax.bar(vlm_names, means, yerr=stds, capsize=5, color=plt.cm.tab10.colors[:len(vlm_order)], label='Mean ± Std')
    # Swarmplot of the individual values on top of the bars
    sns.swarmplot(data=swarm_data, x='VLM', y=metric, ax=ax, palette=plt.cm.tab10.colors[:len(vlm_order)], size=10, edgecolor='k', linewidth=0.5, alpha=0.5)
    ax.set_ylabel(f'{metric} Score', fontsize=14)
    ax.set_xlabel('VLM', fontsize=14)
    ax.set_title(f'{metric} by VLM', fontsize=16)
    # None in y_ranges means: keep the automatic limits
    ylim = y_ranges.get(metric, default_ylim)
    if ylim is not None:
        ax.set_ylim(ylim)
    ax.grid(axis='y', linestyle='--', alpha=0.3)
    plt.tight_layout()
    plt.show()

In [None]:
# One boxplot figure per metric, with one box per VLM
for metric in metrics_to_plot:
    plot_df = metrics_df.set_index('VLM').loc[vlm_order].reset_index()
    vlm_names = plot_df['VLM']

    # ROUGE is stored under "rouge_l"; every other metric under its lower-case name
    metric_key = "rouge_l" if metric == "ROUGE" else metric.lower()
    box_data = [all_nlp_metrics[vlm][metric_key] for vlm in vlm_order]

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.boxplot(
        box_data,
        labels=vlm_names,
        patch_artist=True,
        boxprops={'facecolor': 'lightblue', 'color': 'blue'},
        medianprops={'color': 'red'},
    )
    ax.set_title(f'{metric} Boxplot by VLM', fontsize=16)
    ax.set_xlabel('VLM', fontsize=14)
    ax.set_ylabel(f'{metric} Score', fontsize=14)

    # Unlisted metrics use the default range; a None entry keeps the automatic limits
    ylim = y_ranges.get(metric, default_ylim)
    if ylim is not None:
        ax.set_ylim(ylim)

    ax.grid(axis='y', linestyle='--', alpha=0.3)
    plt.tight_layout()
    plt.show()

In [None]:
# Show examples of low and high score captions for each metric and VLM
def _print_caption_examples(heading, ranked_pairs, vlm, cap_frame0, gt_captions):
    """Print a heading followed by one entry per (object index, score) pair."""
    print(heading)
    for obj_idx, score in ranked_pairs:
        gt_caption = gt_captions[obj_idx]
        vlm_caption = cap_frame0[vlm][obj_idx]
        print(f"  Object {obj_idx} (Score: {score:.3f}):")
        print(f"    Ground Truth: {gt_caption}")
        print(f"    {vlm}: {vlm_caption}")
        print()


def show_metric_examples(all_nlp_metrics, cap_frame0, gt_captions, n_examples=2, vlms=None, metrics=None):
    """
    Show examples of low and high scoring captions for each metric and VLM

    Args:
        all_nlp_metrics: mapping VLM -> metric name -> sequence of scores. The
            position of a score in the sequence is used as the object index.
        cap_frame0: mapping VLM -> object index -> caption produced by the VLM.
        gt_captions: mapping object index -> ground-truth caption.
        n_examples: number of lowest and of highest scoring examples to print
            per metric and VLM (capped at the number of available scores).
        vlms: VLM names to report, in order. Defaults to every VLM present in
            ``all_nlp_metrics``, so the function no longer depends on a
            notebook-level variable.
        metrics: metric names to report. Defaults to the full metric list.

    Raises:
        KeyError: if a requested VLM, metric or object index is missing from
            the corresponding mapping.
    """
    if metrics is None:
        metrics_to_analyze = ['bleu', 'rouge_l', 'meteor', 'cider', 'spice', 'bert', 'bleurt', 'bart', 'gpt']
    else:
        metrics_to_analyze = list(metrics)
    vlm_names = list(all_nlp_metrics) if vlms is None else list(vlms)

    for metric in metrics_to_analyze:
        print(f"\n{'='*80}")
        print(f"EXAMPLES FOR {metric.upper()}")
        print('='*80)

        for vlm in vlm_names:
            print(f"\n{vlm.upper()}:")
            print("-" * 40)

            # Pair every score with its position and order the pairs by score,
            # lowest first (ties keep their original order).
            metric_values = all_nlp_metrics[vlm][metric]
            sorted_pairs = sorted(enumerate(metric_values), key=lambda pair: pair[1])

            # Never ask for more examples than there are scores (or fewer than zero)
            count = max(0, min(n_examples, len(sorted_pairs)))

            _print_caption_examples(
                f"LOWEST {metric.upper()} scores:",
                sorted_pairs[:count], vlm, cap_frame0, gt_captions,
            )
            # Reversed order: best score first
            _print_caption_examples(
                f"HIGHEST {metric.upper()} scores:",
                sorted_pairs[::-1][:count], vlm, cap_frame0, gt_captions,
            )

# Run the analysis: print the two lowest- and the two highest-scoring
# captions for every metric and VLM
show_metric_examples(all_nlp_metrics, cap_frame0, gt_captions, n_examples=2)



In [None]:
# Collect, per metric and VLM, the two lowest and two highest scoring captions
examples_data = []

metrics_to_analyze = ['rouge_l', 'cider', 'bert', 'gpt']

for metric in metrics_to_analyze:
    for vlm in VLMs:
        # Pair every score with its position (used as the object index) and
        # order the pairs from lowest to highest score
        metric_values = all_nlp_metrics[vlm][metric]
        obj_indices = list(range(len(metric_values)))
        sorted_pairs = sorted(zip(obj_indices, metric_values), key=lambda pair: pair[1])

        n_picks = min(2, len(sorted_pairs))
        selections = (
            ('LOWEST', sorted_pairs[:n_picks]),          # worst first
            ('HIGHEST', sorted_pairs[::-1][:n_picks]),   # best first
        )

        for score_type, picks in selections:
            for obj_idx, score in picks:
                gt_caption = gt_captions[obj_idx]
                vlm_caption = cap_frame0[vlm][obj_idx]
                examples_data.append({
                    'Metric': metric.upper(),
                    'VLM': vlm,
                    'Score_Type': score_type,
                    'Object_ID': obj_idx,
                    'Score': score,
                    'Ground_Truth': gt_caption,
                    'VLM_Caption': vlm_caption
                })

# Turn the collected rows into a table
examples_df = pd.DataFrame(examples_data)

# Write the table as a semicolon-separated CSV
examples_output_file = f"output/{data_type}/metric_examples_{mask_dir}.csv"
examples_df.to_csv(examples_output_file, index=False, sep=';')
print(f"Metric examples saved to {examples_output_file}")

# Short summary of what the table contains
print(f"\nExamples table contains {len(examples_df)} rows")
analyzed_metrics = ', '.join(examples_df['Metric'].unique())
print(f"Metrics analyzed: {analyzed_metrics}")
analyzed_vlms = ', '.join(examples_df['VLM'].unique())
print(f"VLMs analyzed: {analyzed_vlms}")