In [32]:
## Script for loading the aggregated results from the various repetitions and LLM models
import pandas as pd
import time


## Claude 4.5: aggregated statistics file from the main revision folder
claude_results_4_5 = pd.read_csv(
    "../revision_11_25/results/aggregated_statistics.csv"
)

## OpenAI gpt-4o: aggregated statistics file from the OpenAI revision folder
gpt4o_results = pd.read_csv(
    "../revision_11_25_OpenAI/results/openai_gpt-4o_aggregated_statistics.csv"
)

## OpenAI "gpt-5": note that the file read here is the gpt-5-mini export
gpt5_results = pd.read_csv(
    "../revision_11_25_OpenAI/results/openai_gpt-5-mini_aggregated_statistics.csv"
)


In [34]:
## Load the results for Gemini 2.0; unlike the other models, the repetitions
## are stored in separate CSV files (one file per run).
import glob
import random

## Seed the global RNG so the same files are selected on every run.
random.seed(42)

## Sorted so the population order (and therefore the seeded sample) is stable.
files = sorted(glob.glob("../revision_11_25_Gemini/results/gemini_gemini-2.0-flash-exp_aggregated_statistics_*.csv"))

print(files)

## Select 10 files at random. Fail with an explicit message when fewer than 10
## files are available instead of random.sample's generic
## "Sample larger than population" error.
if len(files) < 10:
    raise ValueError(
        f"Expected at least 10 Gemini result files, found {len(files)}"
    )
selected_files = random.sample(files, 10)

## Load a single file to see the structure.
sample_df = pd.read_csv(selected_files[0])
sample_df.head()

## create an aggregated dataframe



['../revision_11_25_Gemini/results\\gemini_gemini-2.0-flash-exp_aggregated_statistics_20251107.csv', '../revision_11_25_Gemini/results\\gemini_gemini-2.0-flash-exp_aggregated_statistics_20251107_202856.csv', '../revision_11_25_Gemini/results\\gemini_gemini-2.0-flash-exp_aggregated_statistics_20251107_204438.csv', '../revision_11_25_Gemini/results\\gemini_gemini-2.0-flash-exp_aggregated_statistics_20251107_210032.csv', '../revision_11_25_Gemini/results\\gemini_gemini-2.0-flash-exp_aggregated_statistics_20251108_132256.csv', '../revision_11_25_Gemini/results\\gemini_gemini-2.0-flash-exp_aggregated_statistics_20251108_135045.csv', '../revision_11_25_Gemini/results\\gemini_gemini-2.0-flash-exp_aggregated_statistics_20251108_184803.csv', '../revision_11_25_Gemini/results\\gemini_gemini-2.0-flash-exp_aggregated_statistics_20251108_204129.csv', '../revision_11_25_Gemini/results\\gemini_gemini-2.0-flash-exp_aggregated_statistics_20251109_234026.csv', '../revision_11_25_Gemini/results\\gemini_g

Unnamed: 0,question_id,correct_answer,has_image,num_repetitions,real_correct_count,real_accuracy,real_ci_lower,real_ci_upper,fake_correct_count,fake_accuracy,fake_ci_lower,fake_ci_upper
0,IT0006,C,True,1,0,0.0,0.0,0.0,0,0.0,0.0,0.0
1,IT0007,C,True,1,1,1.0,1.0,1.0,1,1.0,1.0,1.0
2,IT0031,A,True,1,1,1.0,1.0,1.0,1,1.0,1.0,1.0
3,IT0032,B,True,1,1,1.0,1.0,1.0,1,1.0,1.0,1.0
4,IT0053,C,True,1,1,1.0,1.0,1.0,1,1.0,1.0,1.0


In [35]:
## Combine the selected Gemini files into one frame by summing the
## per-question counters, starting from the first selected file.
gemini_results = sample_df.copy()
count_columns = ["num_repetitions", "real_correct_count", "fake_correct_count"]

for file in selected_files[1:]:
    df = pd.read_csv(file)

    # Align on question_id instead of row position: a plain `+=` between
    # Series matches rows by index label (here the default RangeIndex), so a
    # file with a different row order or row count would silently add counts
    # of different questions together or introduce NaN.
    counts = (
        df.set_index("question_id")[count_columns]
        .reindex(gemini_results["question_id"])
    )
    if len(df) != len(gemini_results) or counts.isna().any().any():
        raise ValueError(
            f"{file} does not contain the same questions as {selected_files[0]}"
        )

    gemini_results[count_columns] = (
        gemini_results[count_columns] + counts.to_numpy()
    )

# The accuracy columns copied from the first file describe a single file only;
# recompute them from the summed counters so they match the aggregated data.
gemini_results["real_accuracy"] = (
    gemini_results["real_correct_count"] / gemini_results["num_repetitions"]
)
gemini_results["fake_accuracy"] = (
    gemini_results["fake_correct_count"] / gemini_results["num_repetitions"]
)
# NOTE(review): the *_ci_lower / *_ci_upper columns are still the values of the
# first selected file; how they were produced is not visible here, so they are
# left as-is and should not be used from this aggregated frame.
    

In [36]:
## calculate the overall results for the 4 models, including a confidence interval on the accuracy
import numpy as np
from scipy import stats
import pandas as pd

def calculate_accuracy_ci(df, correct_col, total_col=None, confidence=0.95):
    """
    Calculate the mean per-question accuracy and a t-based confidence interval.

    Each row of ``df`` is treated as one question. The per-question accuracy
    is ``correct / total_attempts``; the reported accuracy is the mean over
    questions and the interval is a Student-t interval around that mean,
    using the standard error of the per-question accuracies.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing the data (one row per question)
    correct_col : str
        Column name containing number of correct answers per question
    total_col : str, optional
        Column name containing total attempts per question. All rows must
        hold the same value. If None, 10 attempts per question are assumed
        (the value is NOT inferred from the data).
    confidence : float, default=0.95
        Confidence level (e.g., 0.95 for 95% CI)

    Returns:
    --------
    dict with keys: 'accuracy', 'ci_lower', 'ci_upper', 'se', 'n_questions'

    With a single question the standard error and the interval bounds are
    NaN (the spread cannot be estimated). When all questions have exactly the
    same accuracy the standard error is 0 and the interval collapses to the
    mean.

    Raises:
    -------
    ValueError
        If ``df`` has no rows, if ``total_col`` holds differing values, or if
        the number of attempts is not positive.
    """

    correct_per_question = df[correct_col].values
    n_questions = len(correct_per_question)
    if n_questions == 0:
        raise ValueError("Cannot compute accuracy: the dataframe has no rows")

    # Determine total attempts per question
    if total_col is not None:
        total_per_question = df[total_col].values
        # The per-question accuracies are only comparable when every question
        # was attempted the same number of times.
        if len(np.unique(total_per_question)) != 1:
            raise ValueError("This function assumes same number of attempts per question")
        total_attempts = total_per_question[0]
    else:
        # No column given: fall back to the fixed default of 10 attempts
        total_attempts = 10

    if total_attempts <= 0:
        raise ValueError("Number of attempts per question must be positive")

    # Calculate accuracy for each question
    accuracies = correct_per_question / total_attempts

    # Overall mean accuracy
    mean_accuracy = accuracies.mean()

    if n_questions < 2:
        # A sample standard deviation needs at least two observations.
        se = float("nan")
        ci = (float("nan"), float("nan"))
    else:
        # Standard error of the mean
        se = accuracies.std(ddof=1) / np.sqrt(n_questions)
        if se > 0:
            # Confidence interval using t-distribution
            ci = stats.t.interval(
                confidence, df=n_questions - 1, loc=mean_accuracy, scale=se
            )
        else:
            # scipy returns NaN bounds for scale == 0; with no spread between
            # questions the interval degenerates to the mean itself.
            ci = (mean_accuracy, mean_accuracy)

    # Print results
    print(f"Number of questions: {n_questions}")
    print(f"Total attempts: {n_questions * total_attempts}")
    print(f"Accuracy: {mean_accuracy:.1%}")
    print(f"{confidence*100:.0f}% CI: [{ci[0]:.1%}, {ci[1]:.1%}]")
    print(f"Standard Error: {se:.4f}")

    return {
        'accuracy': mean_accuracy,
        'ci_lower': ci[0],
        'ci_upper': ci[1],
        'se': se,
        'n_questions': n_questions
    }

# Example usage (column names as found in the aggregated statistics files):
# result = calculate_accuracy_ci(df, correct_col='real_correct_count', total_col='num_repetitions')
# or if all questions have 10 attempts:
# result = calculate_accuracy_ci(df, correct_col='real_correct_count')

In [37]:
## Accuracy summaries on the real images, one call per model
## (Claude 4.5, GPT-4o, GPT-5, Gemini 2.0 -- in that order).
(
    claude_4_5_summary_real,
    openai_gpt4o_summary_real,
    openai_gpt5_summary_real,
    gemini_2_0_summary_real,
) = [
    calculate_accuracy_ci(model_frame, correct_col='real_correct_count')
    for model_frame in (claude_results_4_5, gpt4o_results, gpt5_results, gemini_results)
]
print("Claude 4.5 Summary:", claude_4_5_summary_real)
print("OpenAI GPT-4o Summary:", openai_gpt4o_summary_real)
print("OpenAI GPT-5 Summary:", openai_gpt5_summary_real)
print("Gemini 2.0 Summary:", gemini_2_0_summary_real)

## Same summaries for the fake images
(
    claude_4_5_summary_fake,
    openai_gpt4o_summary_fake,
    openai_gpt5_summary_fake,
    gemini_2_0_summary_fake,
) = [
    calculate_accuracy_ci(model_frame, correct_col='fake_correct_count')
    for model_frame in (claude_results_4_5, gpt4o_results, gpt5_results, gemini_results)
]
print("Claude 4.5 Summary (Fake):", claude_4_5_summary_fake)
print("OpenAI GPT-4o Summary (Fake):", openai_gpt4o_summary_fake)
print("OpenAI GPT-5 Summary (Fake):", openai_gpt5_summary_fake)
print("Gemini 2.0 Summary (Fake):", gemini_2_0_summary_fake)

Number of questions: 60
Total attempts: 600
Accuracy: 82.8%
95% CI: [73.7%, 91.9%]
Standard Error: 0.0455
Number of questions: 60
Total attempts: 600
Accuracy: 83.2%
95% CI: [74.6%, 91.7%]
Standard Error: 0.0426
Number of questions: 60
Total attempts: 600
Accuracy: 88.0%
95% CI: [81.3%, 94.7%]
Standard Error: 0.0333
Number of questions: 60
Total attempts: 600
Accuracy: 83.7%
95% CI: [74.3%, 93.0%]
Standard Error: 0.0466
Claude 4.5 Summary: {'accuracy': 0.8283333333333333, 'ci_lower': 0.7373306132413898, 'ci_upper': 0.9193360534252767, 'se': 0.04547872580604715, 'n_questions': 60}
OpenAI GPT-4o Summary: {'accuracy': 0.8316666666666667, 'ci_lower': 0.7463872321008066, 'ci_upper': 0.9169461012325267, 'se': 0.04261850654130992, 'n_questions': 60}
OpenAI GPT-5 Summary: {'accuracy': 0.88, 'ci_lower': 0.813277547812063, 'ci_upper': 0.946722452187937, 'se': 0.03334463085385204, 'n_questions': 60}
Gemini 2.0 Summary: {'accuracy': 0.8366666666666666, 'ci_lower': 0.7434687479464059, 'ci_upper': 0

In [38]:
import pandas as pd

def create_summary_table_from_dict(results_dict):
    """
    Build a display-ready summary table, one row per model.

    Parameters:
    -----------
    results_dict : dict
        Maps a model name to a result mapping with the keys 'accuracy',
        'ci_lower', 'ci_upper', 'se' and 'n_questions'

    Returns:
    --------
    pandas.DataFrame
        Table with percentage-formatted string columns, ordered from the
        highest to the lowest accuracy
    """

    # One pre-formatted record per model, in the dictionary's own order.
    rows = [
        {
            'Model': label,
            'Accuracy': f"{summary['accuracy']:.1%}",
            '95% CI': f"[{summary['ci_lower']:.1%}, {summary['ci_upper']:.1%}]",
            'CI Width': f"{summary['ci_upper'] - summary['ci_lower']:.1%}",
            'Std Error': f"{summary['se']:.2%}",
            'N Questions': summary['n_questions'],
        }
        for label, summary in results_dict.items()
    ]

    # The displayed accuracy is a string, so ordering uses the raw numeric
    # value through a temporary helper column that is dropped afterwards.
    numeric_accuracy = [summary['accuracy'] for summary in results_dict.values()]

    return (
        pd.DataFrame(rows)
        .assign(_sort_key=numeric_accuracy)
        .sort_values('_sort_key', ascending=False)
        .reset_index(drop=True)
        .drop(columns='_sort_key')
    )


# Usage with both fake and real:
# Per-image-type lookups: model name -> summary dict
fake_results = {
    'Claude 4.5': claude_4_5_summary_fake,
    'GPT-4o': openai_gpt4o_summary_fake,
    'GPT-5': openai_gpt5_summary_fake,
    'Gemini 2.0': gemini_2_0_summary_fake
}

real_results = {
    'Claude 4.5': claude_4_5_summary_real,
    'GPT-4o': openai_gpt4o_summary_real,
    'GPT-5': openai_gpt5_summary_real,
    'Gemini 2.0': gemini_2_0_summary_real
}

# Combined lookup with the image type appended to each model name
# (fake entries first, then real ones)
all_results = {
    'Claude 4.5 (Fake)': claude_4_5_summary_fake,
    'GPT-4o (Fake)': openai_gpt4o_summary_fake,
    'GPT-5 (Fake)': openai_gpt5_summary_fake,
    'Gemini 2.0 (Fake)': gemini_2_0_summary_fake,
    'Claude 4.5 (Real)': claude_4_5_summary_real,
    'GPT-4o (Real)': openai_gpt4o_summary_real,
    'GPT-5 (Real)': openai_gpt5_summary_real,
    'Gemini 2.0 (Real)': gemini_2_0_summary_real
}

# Build the three tables first, then print them in a fixed order
summary_table = create_summary_table_from_dict(all_results)
fake_table = create_summary_table_from_dict(fake_results)
real_table = create_summary_table_from_dict(real_results)

print("\n=== Combined Summary Table ===", summary_table.to_string(index=False), sep="\n")

print("\n=== Fake Results ===", fake_table.to_string(index=False), sep="\n")

print("\n=== Real Results ===", real_table.to_string(index=False), sep="\n")

# Save both to CSV
# fake_table.to_csv('fake_results_summary.csv', index=False)
# real_table.to_csv('real_results_summary.csv', index=False)
# summary_table.to_csv('all_results_summary.csv', index=False)


=== Combined Summary Table ===
            Model Accuracy         95% CI CI Width Std Error  N Questions
     GPT-5 (Real)    88.0% [81.3%, 94.7%]    13.3%     3.33%           60
Gemini 2.0 (Real)    83.7% [74.3%, 93.0%]    18.6%     4.66%           60
    GPT-4o (Real)    83.2% [74.6%, 91.7%]    17.1%     4.26%           60
Claude 4.5 (Real)    82.8% [73.7%, 91.9%]    18.2%     4.55%           60
Gemini 2.0 (Fake)    81.3% [71.7%, 91.0%]    19.3%     4.82%           60
     GPT-5 (Fake)    79.5% [69.7%, 89.3%]    19.6%     4.89%           60
Claude 4.5 (Fake)    77.2% [66.6%, 87.7%]    21.1%     5.26%           60
    GPT-4o (Fake)    55.3% [44.1%, 66.6%]    22.5%     5.62%           60

=== Fake Results ===
     Model Accuracy         95% CI CI Width Std Error  N Questions
Gemini 2.0    81.3% [71.7%, 91.0%]    19.3%     4.82%           60
     GPT-5    79.5% [69.7%, 89.3%]    19.6%     4.89%           60
Claude 4.5    77.2% [66.6%, 87.7%]    21.1%     5.26%           60
    GPT-4o  

In [22]:
# Display the frame loaded from the gpt-5-mini aggregated statistics file.
gpt5_results

Unnamed: 0,question_id,correct_answer,has_image,num_repetitions,real_correct_count,real_accuracy,real_ci_lower,real_ci_upper,fake_correct_count,fake_accuracy,fake_ci_lower,fake_ci_upper
0,IT0006,C,True,10,2,0.2,0.0,0.5,0,0.0,0.0,0.0
1,IT0007,C,True,10,4,0.4,0.1,0.7,4,0.4,0.1,0.7
2,IT0031,A,True,10,10,1.0,1.0,1.0,10,1.0,1.0,1.0
3,IT0032,B,True,10,9,0.9,0.7,1.0,10,1.0,1.0,1.0
4,IT0053,C,True,10,6,0.6,0.3,0.9,0,0.0,0.0,0.0
5,IT0054,A,True,10,10,1.0,1.0,1.0,10,1.0,1.0,1.0
6,IT0063,C,True,10,1,0.1,0.0,0.3,9,0.9,0.7,1.0
7,IT0064,D,True,10,2,0.2,0.0,0.5,5,0.5,0.2,0.8
8,IT0065,A,True,10,6,0.6,0.3,0.9,9,0.9,0.7,1.0
9,IT0095,C,True,10,6,0.6,0.3,0.9,4,0.4,0.1,0.7
