In [226]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
from collections import defaultdict


In [None]:
# function to clean and split categories.
def clean_and_split(value):
    """
    Clean and split a comma-separated value into a list of stripped elements.

    Parameters
    ----------
    value : str, None or NaN
        Raw cell content, e.g. ``"plot, character ,theme"``. Other scalar
        values are converted with ``str()`` before splitting.

    Returns
    -------
    list of str
        The whitespace-stripped, non-empty items in their original order.
        An empty list is returned for NaN/None and for values that contain
        no non-blank item (``""``, ``"  "``, ``","``).
    """
    # Missing cells arrive as float NaN (or None). Running `',' in value` on
    # them raises TypeError, so return before any string handling happens.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return []
    stripped_items = (item.strip() for item in str(value).split(','))
    # Drop blanks produced by leading/trailing/doubled commas or empty input.
    return [item for item in stripped_items if item]

In [227]:
# Directory that receives the JSON result files written at the end of the
# notebook; replace the placeholder with your own path.
OUTPUT_DIR = './your_output_directory'

In [None]:
# load the claims csv file.
df_claims = pd.read_csv('./claims.csv')

# Report the rows with a missing category *before* cleaning: once the column
# holds lists, `isna()` is False for every row and this check finds nothing.
nan_indices = df_claims[df_claims['category'].isna()].index.tolist()
print(nan_indices)

# clean the granularity and category values.
df_claims['granularity'] = df_claims['granularity'].apply(lambda x: str(x).strip() if pd.notna(x) else x)
# Missing categories become an empty list so every cell has the same type and
# the later `explode('category')` treats them uniformly.
df_claims['category'] = df_claims['category'].apply(
    lambda x: clean_and_split(x) if pd.notna(x) else []
)


In [None]:
def convert_to_df(processed_results):
    """
    Flatten nested model outputs into one row per (movie, claim pair).

    Parameters
    ----------
    processed_results : dict
        Mapping ``movie_id -> {claim_pair_id -> {'Claim_1': ..., 'Claim_2': ...}}``.
        ``claim_pair_id`` must be convertible with ``int()``; each claim value
        is either ``None`` or an object with a ``.lower()`` method (a string).

    Returns
    -------
    pandas.DataFrame
        Columns ``movie_id``, ``claim_id``, ``prediction_true_claim`` (taken
        from ``Claim_1``) and ``prediction_false_claim`` (taken from
        ``Claim_2``). Predictions are lower-cased, ``None`` is kept as is, and
        rows are sorted by ``movie_id`` then ``claim_id``. The index labels
        still reflect the insertion order of the input. An empty input yields
        an empty frame with the same four columns.

    Raises
    ------
    KeyError
        If a claim pair lacks ``'Claim_1'`` or ``'Claim_2'``.
    ValueError
        If a claim pair id cannot be converted to ``int``.
    """
    columns = ['movie_id', 'claim_id', 'prediction_true_claim', 'prediction_false_claim']

    def _lowered(prediction):
        # None marks a prediction that could not be parsed; keep it untouched.
        return None if prediction is None else prediction.lower()

    rows = [
        {
            'movie_id': movie_id,
            'claim_id': int(claim_pair_id),
            'prediction_true_claim': _lowered(claims['Claim_1']),
            'prediction_false_claim': _lowered(claims['Claim_2']),
        }
        for movie_id, movie_data in processed_results.items()
        for claim_pair_id, claims in movie_data.items()
    ]

    # Passing `columns` keeps the schema when there are no rows; without it an
    # empty input gives a column-less frame and sort_values raises KeyError.
    df = pd.DataFrame(rows, columns=columns)
    # The index is intentionally not reset: callers may align on it.
    return df.sort_values(['movie_id', 'claim_id'])

In [None]:
# given a dict of output directories, we will load the results from each directory
# we also define a list of modalities to read from!
# modalities=["video_and_transcripts","video_only","transcripts_only","synopsis_only","statement_only","synopsis_w_movie_title"]
modalities=["video_and_transcripts"]
model_names = ["gemini-2.5-pro", "gpt-4o","qwen-2.5-72b-instruct","internvl3-78b-instruct","llava-video-72b-qwen2"]
# model_names = ["gemini-2.5-pro" ]
# Maps model name -> modality -> path of the parsed-outputs JSON file.
# Add an entry for every model listed in `model_names` that should be evaluated.
parsed_outputs_directories = {
    "gemini-2.5-pro": {
        "synopsis_only": "./gemini-2.5-pro-preview-03-25/synopsis_only/explanation_free/last-occurrence_parsed_outputs.json",
        "statement_only": "./gemini-2.5-pro-preview-03-25/statement_only/explanation_free/last-occurrence_parsed_outputs.json",
        "synopsis_w_movie_title": "./gemini-2.5-pro-preview-03-25/synopsis_w_movie_title/explanation_free/last-occurrence_parsed_outputs.json",
        "transcripts_only": "./gemini-2.5-pro-preview-03-25/transcripts_only/explanation_free/last-occurrence_parsed_outputs.json",
        "video_only": "./gemini-2.5-pro-preview-03-25/video_only/explanation_free/last-occurrence_parsed_outputs.json",
        "video_and_transcripts": "./gemini-2.5-pro-preview-03-25/video_and_transcripts/explanation_free/last-occurrence_parsed_outputs.json"
    },
    
}

all_dfs = []
for model_name in model_names:
    for modality in modalities:
        # Models or modalities without a configured file are skipped with a
        # notice instead of aborting the whole run with a KeyError.
        parsed_outputs_dir = parsed_outputs_directories.get(model_name, {}).get(modality)
        if parsed_outputs_dir is None:
            print(f"Skipping {model_name} {modality}: no parsed outputs path configured")
            continue
        print(f"Processing {model_name} {modality}")
        # Load the parsed outputs. json.load needs an open file object, not a path.
        with open(parsed_outputs_dir, 'r', encoding='utf-8') as f:
            model_outputs_json = json.load(f)
        parsed_outputs = convert_to_df(model_outputs_json)
        # add a modality column to the parsed outputs
        parsed_outputs['modality'] = modality
        parsed_outputs['model'] = model_name
        # concatenate the parsed outputs with the merged dataframe
        # NOTE(review): axis=1 aligns on index labels, so predictions are paired
        # with claims by their position in the JSON, not by (movie_id, claim_id).
        # Confirm the JSON order matches claims.csv, or switch to a key-based merge.
        parsed_outputs_df = pd.concat([df_claims, parsed_outputs],axis=1)
        # Keep only the first of any columns present in both frames.
        parsed_outputs_df = parsed_outputs_df.loc[:, ~parsed_outputs_df.columns.duplicated()]
        all_dfs.append(parsed_outputs_df)

if not all_dfs:
    raise ValueError(
        "No parsed outputs were loaded; add entries to parsed_outputs_directories "
        "for the requested models and modalities."
    )

# Concatenate all dataframes
final_df = pd.concat(all_dfs, axis=0, ignore_index=True)

# Sort the final dataframe by movie_id, claim_id
final_df = final_df.sort_values(['movie_id', 'claim_id'])


#Current size of the dataframe is:
print("Number of rows in the dataframe:", final_df.shape[0])



The dataframe of predicted results is assumed to hold the predictions for the true and false claims in the `prediction_true_claim` and `prediction_false_claim` columns, respectively. We define the following functions to compute performance.

In [None]:
def compute_performance_metrics(df):
    """
    Compute accuracy metrics over a frame of claim-pair predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per claim pair with the columns ``prediction_true_claim`` and
        ``prediction_false_claim``. A value may be a boolean or its string
        form; strings are compared case-insensitively after stripping
        (presumably ``'true'`` / ``'false'`` -- confirm against the parser
        output). Anything else, including ``None``/NaN, counts as incorrect.

    Returns
    -------
    dict
        ``true_claim_accuracy``: share of true claims predicted true.
        ``false_claim_accuracy``: share of false claims predicted false.
        ``pairwise_accuracy``: share of pairs with both predictions correct.
        ``accuracy``: share of correct predictions over all ``2 * n`` claims.
        ``total_pairs``: number of rows ``n``.
        All accuracies are NaN when ``df`` has no rows.

    Raises
    ------
    KeyError
        If either prediction column is missing.
    """
    def _matches(value, label):
        # Predictions are stored as lower-cased strings, which never equal the
        # Python booleans, so strings are compared against the label's text.
        if isinstance(value, str):
            return value.strip().lower() == str(label).lower()
        if value is None or value is pd.NA:
            return False
        return bool(value == label)

    true_hits = [_matches(value, True) for value in df['prediction_true_claim']]
    false_hits = [_matches(value, False) for value in df['prediction_false_claim']]

    total_pairs = len(df)
    if total_pairs == 0:
        # Accuracy over zero pairs is undefined; avoid dividing by zero.
        undefined = float('nan')
        return {'true_claim_accuracy': undefined, 'false_claim_accuracy': undefined,
                'pairwise_accuracy': undefined, 'accuracy': undefined, 'total_pairs': 0}

    correct_true_claim_predictions = sum(true_hits)
    correct_false_claim_predictions = sum(false_hits)
    # A pair is correct only when both of its claims are classified correctly.
    pairwise_correct = sum(t and f for t, f in zip(true_hits, false_hits))

    return {
        'true_claim_accuracy': correct_true_claim_predictions / total_pairs,
        'false_claim_accuracy': correct_false_claim_predictions / total_pairs,
        'pairwise_accuracy': pairwise_correct / total_pairs,
        'accuracy': (correct_true_claim_predictions + correct_false_claim_predictions) / (total_pairs * 2),
        'total_pairs': total_pairs,
    }


def compute_performance_per_movie(df):
    """
    Compute the performance metrics separately for every movie.

    Rows are grouped by the ``movie_id`` column and each group is passed to
    ``compute_performance_metrics``.

    Returns a ``defaultdict(dict)`` mapping each movie id to its metrics dict.
    """
    per_movie_metrics = defaultdict(dict)
    per_movie_metrics.update(
        (movie_id, compute_performance_metrics(movie_frame))
        for movie_id, movie_frame in df.groupby('movie_id')
    )
    return per_movie_metrics



Collect results per granularity and per category

In [None]:
# Nested mapping: model -> modality -> granularity -> metrics dict.
results_per_granularity = {}
granularity_groups = final_df.groupby(['model','modality','granularity'])
for (model,modality,granularity), group_df in granularity_groups:
    results = compute_performance_metrics(group_df)
    # setdefault creates the intermediate levels the first time a key is seen.
    per_modality = results_per_granularity.setdefault(model, {}).setdefault(modality, {})
    per_modality[granularity] = results


In [None]:
# One row per (claim, category): a claim tagged with several categories is
# counted towards each of them.
exploded_df = final_df.explode('category')
exploded_df = exploded_df.reset_index()

# Nested mapping: model -> modality -> category -> metrics dict.
results_per_category = {}
# The group keys must use the lower-case column names ('model', 'modality')
# that were assigned when the per-model frames were built; the capitalised
# names do not exist and raise KeyError.
for (model,modality,category), group_df in exploded_df.groupby(['model','modality','category']):
    results = compute_performance_metrics(group_df)
    results_per_category.setdefault(model, {}).setdefault(modality, {})[category] = results

In [None]:
def _write_json(file_name, payload):
    """Write `payload` as 4-space-indented JSON to `file_name` inside OUTPUT_DIR."""
    target_path = os.path.join(OUTPUT_DIR, file_name)
    with open(target_path, 'w') as handle:
        json.dump(payload, handle, indent=4)


# Create the output directory if needed, then write one file per breakdown.
os.makedirs(OUTPUT_DIR, exist_ok=True)
_write_json('./results_per_category.json', results_per_category)
_write_json('./results_per_granularity.json', results_per_granularity)