In [2]:
import os
import csv
import regex as re

In [3]:
# File paths
# Semicolon-delimited CSV; column 0 is the object id and columns 1-3 are
# copied through when the annotated file is restructured.
original_answer_file = "qwen_responses_task1.csv"
# Semicolon-delimited CSV holding the human ratings, keyed by the object id
# in column 0.
human_annotated_file = "qwen_annotated.csv"
# Text file parsed line by line by processOutput; matching lines look like
# "<object id>: Score: <0|0.5|1>".
llm_accuracy_file = "qwen_LLM_Acc_evaluation.txt"
# NOTE(review): not read by any of the visible cells; presumably the MQM
# evaluation output - confirm.
mqm_file = "qwenMQM_evaluation.txt"

In [19]:
# Fix structure of annotated file
#
# Rebuilds every annotated row as
#   [object id, original column 1, original column 2, original column 3, rating]
# and writes the result to "tmp.csv".
data_dict = {}

with open(original_answer_file, "r", encoding="utf-8") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

    for row in csv_reader:
        object_id = row[0]
        content = [object_id, row[1], row[2], row[3]]

        data_dict[object_id] = content

with open(human_annotated_file, "r", encoding="utf-8") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    with open("tmp.csv", "w", newline="", encoding="utf-8") as output_file:
        csv_writer = csv.writer(output_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

        for row in csv_reader:
            # The rating sits in a different column depending on the row length.
            if len(row) == 6:
                rating_index = -2
            elif len(row) == 5:
                rating_index = -1
            else:
                # Without this branch an unexpected row would silently reuse the
                # index chosen for the previous row (or hit a NameError on the
                # very first row).
                raise ValueError(
                    f"Unexpected number of columns ({len(row)}) in annotated row: {row}"
                )
            object_id = row[0]

            if object_id not in data_dict:
                # Same exception type as the plain lookup, but with context.
                raise KeyError(
                    f"Annotated object id {object_id!r} not found in {original_answer_file}"
                )

            data_dict[object_id].append(row[rating_index])
            csv_writer.writerow(data_dict[object_id])

In [28]:
# Read scores and print average accuracy
#
# Fills human_scores (object id -> score taken from the last column) and
# accumulates separate sums/counts for ids containing "TAB" and "FIG".
human_scores = {}
figure_score = 0
table_score = 0
total_score = 0  # never updated in this cell; kept so the name stays defined
total_figures = 0
total_tables = 0

with open(human_annotated_file, "r", encoding="utf-8") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

    for row in csv_reader:
        object_id = row[0]
        try:
            score = float(row[-1])
        except ValueError as error:
            # Report the offending row in the exception itself instead of
            # raising an empty ValueError.
            raise ValueError(f"Could not parse a numeric score from row: {row}") from error

        if "TAB" in object_id:
            table_score += score
            total_tables += 1
        elif "FIG" in object_id:
            figure_score += score
            total_figures += 1

        # Ids matching neither "TAB" nor "FIG" are stored here but do not
        # contribute to any of the averages below.
        human_scores[object_id] = score

# The labels carry their own padding so the printed columns line up.
for average_label, score_sum, row_count in (
    ("total score: ", figure_score + table_score, total_figures + total_tables),
    ("figure score:", figure_score, total_figures),
    ("table score: ", table_score, total_tables),
):
    if row_count:
        print(f"Average human {average_label} {(score_sum / row_count):.4f}")
    else:
        # Avoid a ZeroDivisionError when a category has no rows.
        print(f"Average human {average_label} n/a (no rows)")

Average human total score:  0.6438
Average human figure score: 0.5976
Average human table score:  0.6953


In [33]:
# Obtain LLM-Acc Scores
def processOutput(output):
    """Parse one evaluation line of the form "<object id>: Score: <value>".

    Args:
        output: A single line of text.

    Returns:
        (object_id, score) where score is a float built from the first of
        "0.5", "0" or "1" that matches after the last ": Score: " reachable by
        the greedy id group; (None, None) when the line does not match.
    """
    parsed = re.match(r"^(.*): Score: (0\.5|0|1)", output)
    if parsed is None:
        return None, None

    identifier, raw_score = parsed.groups()
    return identifier, float(raw_score)

# object id -> LLM-Acc score, filled from the lines that processOutput can parse.
llm_accuracy_scores = {}

with open(llm_accuracy_file, mode="r", encoding="utf-8") as input_file:
    for line in input_file:
        object_id, score = processOutput(line)
        # Skip lines that did not parse (None) or parsed to an empty id.
        if not object_id:
            continue
        # A repeated id overwrites the earlier score.
        llm_accuracy_scores[object_id] = score

print(f"{len(llm_accuracy_scores)} scores retrieved.")

1684 scores retrieved.


In [37]:
# Calculate Pearson Correlation
from scipy.stats import pearsonr

def calculate_pearson(dict1, dict2):
    """Print the Pearson correlation and p-value between two score dictionaries.

    Scores are paired by key: the value for a key in dict1 is compared with
    the value for the same key in dict2.

    Args:
        dict1: Mapping of key -> numeric score.
        dict2: Mapping of key -> numeric score with exactly the same keys.

    Raises:
        ValueError: If the two dictionaries do not have the same key set
            (both sizes are printed first).
    """
    if dict1.keys() != dict2.keys():
        print(f"{len(dict1)}, {len(dict2)}")
        raise ValueError("The dictionaries must have the same keys.")

    # Key-view equality ignores insertion order, so both score lists must be
    # built from one shared key sequence; iterating each dictionary separately
    # would pair up scores of different keys whenever the orders differ.
    keys = list(dict1)
    scores1 = [dict1[key] for key in keys]
    scores2 = [dict2[key] for key in keys]

    correlation, p_value = pearsonr(scores1, scores2)
    print(f"Pearson Correlation: {correlation:.4f}")
    print(f"P-value: {p_value}")
    
def remove_scores(human_scores, llm_scores):
    """Return the human scores restricted to keys that also appear in llm_scores.

    Args:
        human_scores: Mapping of key -> score to filter.
        llm_scores: Container whose membership decides which keys are kept.

    Returns:
        A new dict, in human_scores iteration order; neither input is modified.
    """
    return {
        object_id: rating
        for object_id, rating in human_scores.items()
        if object_id in llm_scores
    }
    
# Keep only the human scores whose id also has an LLM-Acc score, then correlate.
# The filtering is one-directional: an id present only in llm_accuracy_scores
# leaves the key sets unequal and makes calculate_pearson raise ValueError.
human_llm_accuracy_scores = remove_scores(human_scores, llm_accuracy_scores)
calculate_pearson(human_llm_accuracy_scores, llm_accuracy_scores)

Pearson Correlation: 0.7387
P-value: 1.9182521950784916e-290


In [41]:
# Calculate percentage of scores:
def calculate_score_distribution(data_dict, categories):
    """Print how often each category occurs among the values of data_dict.

    For every category one line is printed with its count, the total number
    of entries and the count as a fraction of the total (two decimals).

    Args:
        data_dict: Mapping of object id -> score.
        categories: Score values to report, in print order.

    Raises:
        KeyError: If data_dict contains a score that is not in categories.
    """
    score_distribution = dict.fromkeys(categories, 0)

    for score in data_dict.values():
        score_distribution[score] += 1

    total_length = len(data_dict)

    for c in categories:
        count = score_distribution[c]
        # An empty input reports a share of 0.00 instead of dividing by zero.
        share = count / total_length if total_length else 0.0
        print(f"Score: {c} -> {count} / {total_length} -> {share:.2f}")

# Report the share of each score value (0, 0.5, 1) for both raters.
# human_scores is the unfiltered dictionary, not human_llm_accuracy_scores, so
# the two totals can differ.
print("Distribution for human scores:")
calculate_score_distribution(human_scores, [0, 0.5, 1])

print("\nDistribution for LLM-Accuracy scores:")
calculate_score_distribution(llm_accuracy_scores, [0, 0.5, 1])

Distribution for human scores:
Score: 0 -> 520 / 1798 -> 0.29
Score: 0.5 -> 241 / 1798 -> 0.13
Score: 1 -> 1037 / 1798 -> 0.58

Distribution for LLM-Accuracy scores:
Score: 0 -> 448 / 1684 -> 0.27
Score: 0.5 -> 499 / 1684 -> 0.30
Score: 1 -> 737 / 1684 -> 0.44
