In [1]:
import os
import csv
import regex as re

In [2]:
# File paths
# All paths are bare file names, so they resolve against the kernel's working directory.
original_answer_file = "qwen_responses_task1.csv"  # not referenced by the cells visible in this section
human_annotated_file = "qwen_annotated.csv"  # ';'-delimited; first column = object id, last column = human score
subset_annotated_file = "subset_evaluation.csv"  # ';'-delimited; column 0 = category, column 1 = object id, last column = score
llm_accuracy_file = "qwen_LLM_Acc_evaluation.txt"  # parsed line by line, expected shape "<object id>: Score: <0|0.5|1>"
mqm_file = "qwenMQM_evaluation.txt"  # not referenced by the cells visible in this section
evaluation_item_list = "human_evaluation_id.txt"  # one object id per line

In [7]:
# Read scores and print average accuracy
human_scores = {}
def process_scores(item_list=False, annotated_file=None, item_list_file=None):
    """Read the human-annotated scores and print their averages.

    Three averages are printed: over all FIG and TAB rows together, over FIG
    rows only and over TAB rows only. A row counts as a table when its object
    id contains "TAB", otherwise as a figure when it contains "FIG". Rows that
    contain neither are still returned but do not enter any average.

    Args:
        item_list: When true, only rows whose object id is listed in the
            evaluation item list are considered.
        annotated_file: Path of the ';'-delimited annotation CSV (first column
            is the object id, last column the score). Defaults to the
            notebook-level ``human_annotated_file``.
        item_list_file: Path of a text file with one object id per line. Only
            read when ``item_list`` is true. Defaults to the notebook-level
            ``evaluation_item_list``.

    Returns:
        dict mapping every accepted object id to its score as a float.

    Raises:
        ValueError: If the last column of a row cannot be parsed as a float.
    """
    if annotated_file is None:
        annotated_file = human_annotated_file
    if item_list and item_list_file is None:
        item_list_file = evaluation_item_list

    def _average(score_sum, count):
        # An empty group has no average; report "n/a" instead of dividing by zero.
        return f"{score_sum / count:.4f}" if count else "n/a"

    h_scores = {}
    figure_score = 0
    table_score = 0
    total_figures = 0
    total_tables = 0

    item_set = set()
    if item_list:
        with open(item_list_file, "r", encoding="utf-8") as txt_file:
            item_set = {line.rstrip("\n") for line in txt_file}

    # newline="" lets the csv reader do its own newline handling, which matters
    # for quoted fields that contain line breaks.
    with open(annotated_file, "r", encoding="utf-8", newline="") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

        for row in csv_reader:
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to score.
                continue

            object_id = row[0]
            if item_list and object_id not in item_set:
                continue

            try:
                score = float(row[-1])
            except ValueError as exc:
                # Keep the offending row and the original cause in the error.
                raise ValueError(f"Cannot parse the score in row: {row}") from exc

            if "TAB" in object_id:
                table_score += score
                total_tables += 1
            elif "FIG" in object_id:
                figure_score += score
                total_figures += 1

            h_scores[object_id] = score

    print(f"Average human total score:  {_average(figure_score + table_score, total_figures + total_tables)}")
    print(f"Average human figure score: {_average(figure_score, total_figures)}")
    print(f"Average human table score:  {_average(table_score, total_tables)}")

    return h_scores

# Averages over every row of the human annotation file.
print("Overall human evaluation:")
human_scores_overall = process_scores(item_list=False)

# Averages restricted to the object ids listed in `evaluation_item_list`.
print("\nSubset human evaluation:")
human_scores_subset = process_scores(item_list=True)

Overall human evaluation:
Average human total score:  0.6466
Average human figure score: 0.6013
Average human table score:  0.6971

Subset human evaluation:
Average human total score:  0.7037
Average human figure score: 0.6700
Average human table score:  0.7235


In [8]:
# Obtain LLM-Acc Scores
def processOutput(output):
    """Extract ``(object_id, score)`` from one line of LLM-Acc output.

    A line of the form ``<id>: Score: <0|0.5|1>`` yields the id and the score
    as a float. A line without such a score whose text before the first colon
    contains "FIG" or "TAB" is attributed to that id with a score of 0.
    Anything else yields ``(None, None)``.
    """
    parsed = re.match(r"^(.*): Score: (0\.5|0|1)", output)
    if parsed is not None:
        object_id, raw_score = parsed.groups()
        return object_id, float(raw_score)

    # No explicit score: fall back to whatever precedes the first colon.
    prefix, colon, _rest = output.partition(":")
    names_an_item = any(tag in prefix for tag in ("FIG", "TAB"))
    if colon and names_an_item:
        return prefix, 0

    return None, None

# Maps object id -> LLM-Acc score parsed from `llm_accuracy_file`.
llm_accuracy_scores = {}

with open(llm_accuracy_file, "r", encoding="utf-8") as input_file:
    for line in input_file:
        object_id, score = processOutput(line)
        # processOutput returns (None, None) for lines it cannot attribute to an item; skip those.
        if object_id:
            # A repeated object id overwrites the score stored earlier.
            llm_accuracy_scores[object_id] = score

print(f"{len(llm_accuracy_scores)} scores retrieved.")

1798 scores retrieved.


In [9]:
# Calculate Pearson Correlation
from scipy.stats import pearsonr

def calculate_pearson(dict1, dict2):
    """Print the Pearson correlation between two score dictionaries.

    Scores are paired by key: the value of ``dict1[k]`` is compared with
    ``dict2[k]`` for every key ``k``.

    Args:
        dict1: Mapping from item id to numeric score.
        dict2: Mapping from item id to numeric score, with the same keys.

    Raises:
        ValueError: If the two dictionaries do not have the same key set.
    """
    if dict1.keys() != dict2.keys():
        print(f"{len(dict1)}, {len(dict2)}")
        raise ValueError("The dictionaries must have the same keys.")

    # Walk ONE key sequence for both dicts. Equal key sets do not imply equal
    # insertion order, and iterating each dict separately would pair scores
    # that belong to different items.
    shared_keys = list(dict1)
    scores1 = [dict1[key] for key in shared_keys]
    scores2 = [dict2[key] for key in shared_keys]

    correlation, p_value = pearsonr(scores1, scores2)
    print(f"Pearson Correlation: {correlation:.4f}")
    print(f"P-value: {p_value}")
    
# Correlate the human scores with the LLM-Acc scores; raises ValueError if the two dicts do not cover the same object ids.
calculate_pearson(human_scores_overall, llm_accuracy_scores)

Pearson Correlation: 0.6973
P-value: 4.754039201337712e-262


In [10]:
# Calculate percentage of scores:
def calculate_score_distribution(data_dict, categories):
    """Print how many entries of ``data_dict`` fall into each score category.

    One line is printed per category, in the order given, showing the count,
    the total number of entries and the fraction with two decimals. For an
    empty ``data_dict`` every fraction is reported as 0.00.

    Args:
        data_dict: Mapping from item id to score.
        categories: Re-iterable collection of the admissible score values.

    Raises:
        KeyError: If a score in ``data_dict`` is not one of ``categories``.
    """
    score_distribution = {c: 0 for c in categories}

    for object_id, score in data_dict.items():
        if score not in score_distribution:
            # Same exception type as a plain dict lookup, but naming the culprit.
            raise KeyError(
                f"Score {score!r} of {object_id!r} is not one of the categories {list(categories)}"
            )
        score_distribution[score] += 1

    total_length = len(data_dict)
    for c in categories:
        count = score_distribution[c]
        # Guard the empty case so the report is printed instead of dividing by zero.
        share = count / total_length if total_length else 0.0
        print(f"Score: {c} -> {count} / {total_length} -> {share:.2f}")

# Both calls use the three score levels 0, 0.5 and 1 as categories; a score outside them raises KeyError.
print("Distribution for human scores:")
calculate_score_distribution(human_scores_overall, [0, 0.5, 1])

print("\nDistribution for LLM-Accuracy scores:")
calculate_score_distribution(llm_accuracy_scores, [0, 0.5, 1])

Distribution for human scores:
Score: 0 -> 516 / 1798 -> 0.29
Score: 0.5 -> 239 / 1798 -> 0.13
Score: 1 -> 1043 / 1798 -> 0.58

Distribution for LLM-Accuracy scores:
Score: 0 -> 562 / 1798 -> 0.31
Score: 0.5 -> 499 / 1798 -> 0.28
Score: 1 -> 737 / 1798 -> 0.41


In [17]:
def process_scores_category(category_name, annotated_file=None):
    """Print the average scores of one category of the subset evaluation file.

    Only rows whose first column equals ``category_name`` are used. The object
    id is taken from the second column and the score from the last column. A
    row counts as a table when its object id contains "TAB", otherwise as a
    figure when it contains "FIG"; rows matching neither are ignored. A group
    without any rows is reported as "n/a".

    Args:
        category_name: Value of the first CSV column to select.
        annotated_file: Path of the ';'-delimited CSV to read. Defaults to the
            notebook-level ``subset_annotated_file``.

    Raises:
        ValueError: If the last column of a selected row is not a float.
    """
    if annotated_file is None:
        annotated_file = subset_annotated_file

    def _average(score_sum, count):
        # An empty group has no average; report "n/a" instead of dividing by zero.
        return f"{score_sum / count:.4f}" if count else "n/a"

    figure_score = 0
    table_score = 0
    total_figures = 0
    total_tables = 0

    # newline="" lets the csv reader do its own newline handling, which matters
    # for quoted fields that contain line breaks.
    with open(annotated_file, "r", newline="") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

        for row in csv_reader:
            # csv.reader yields [] for blank lines; skip them and foreign categories.
            if not row or row[0] != category_name:
                continue

            try:
                score = float(row[-1])
            except ValueError as exc:
                # Keep the offending row and the original cause in the error.
                raise ValueError(f"Cannot parse the score in row: {row}") from exc

            object_id = row[1]
            if "TAB" in object_id:
                table_score += score
                total_tables += 1
            elif "FIG" in object_id:
                figure_score += score
                total_figures += 1

    print(f"Average total score:  {_average(figure_score + table_score, total_figures + total_tables)}")
    print(f"Average figure score: {_average(figure_score, total_figures)}")
    print(f"Average table score:  {_average(table_score, total_tables)}")
        
def get_average_scores_complete():
    """Print the average scores of every category in the subset evaluation file.

    The categories are the distinct values of the first column of
    ``subset_annotated_file``. They are reported in the order in which they
    first appear in the file, each via ``process_scores_category``.
    """
    with open(subset_annotated_file, "r", newline="") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # report order is the same on every run. A set of strings iterates in
        # a hash-dependent order that can change between kernel restarts.
        # Blank lines come back from csv.reader as [] and are skipped.
        categories = list(dict.fromkeys(row[0] for row in csv_reader if row))

    for cat in categories:
        print(f"Results for {cat}:")
        print("#########################")
        process_scores_category(cat)
        print()
        
# Print the per-category averages for every category found in `subset_annotated_file`.
get_average_scores_complete()

Results for qwen:
#########################
Average total score:  0.6963
Average figure score: 0.6800
Average table score:  0.7059

Results for glm:
#########################
Average total score:  0.5444
Average figure score: 0.4700
Average table score:  0.5882

Results for minicpm:
#########################
Average total score:  0.5593
Average figure score: 0.5600
Average table score:  0.5588

Results for llava:
#########################
Average total score:  0.4519
Average figure score: 0.4700
Average table score:  0.4412

Results for internvl:
#########################
Average total score:  0.6370
Average figure score: 0.5600
Average table score:  0.6824

Results for gpt4:
#########################
Average total score:  0.6852
Average figure score: 0.7000
Average table score:  0.6765

Results for paligemma:
#########################
Average total score:  0.4333
Average figure score: 0.4300
Average table score:  0.4353

Results for ovis:
#########################
Average total score: