In [1]:
import os
import csv
import regex as re

### Human evaluation of Qwen output following LLM-Accuracy

In [2]:
# Read scores and print average accuracy
evaluation_item_list = "human_evaluation_id.txt"

def process_scores(file_path, object_index=0, item_list=False):
    """Read annotated scores from a CSV file and print their averages.

    Rows are split on ``;`` with ``|`` as the quote character. The object id is
    taken from column ``object_index`` and the score from the last column.
    Ids containing "TAB" count towards the table average, ids containing "FIG"
    (and not "TAB") towards the figure average. Any other id is printed and
    still returned, but is left out of all averages.

    Args:
        file_path: Path of the annotation CSV file.
        object_index: Column index of the object id within each row.
        item_list: When true, only rows whose object id appears (one id per
            line) in the file named by ``evaluation_item_list`` are used.

    Returns:
        dict mapping each object id to its score as a float.

    Raises:
        ValueError: If the last column of a row cannot be parsed as a float.
    """
    def format_average(score_sum, count):
        # A category without any rows has no average; say so instead of
        # failing with ZeroDivisionError.
        return f"{(score_sum / count):.4f}" if count else "n/a"

    h_scores = {}
    figure_score = 0
    table_score = 0
    total_figures = 0
    total_tables = 0

    item_set = set()
    if item_list:
        with open(evaluation_item_list, "r", encoding="utf-8") as txt_file:
            item_set = {line.rstrip("\n") for line in txt_file}

    # newline="" lets the csv module do its own newline handling.
    with open(file_path, "r", encoding="utf-8", newline="") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

        for row in csv_reader:
            # csv.reader yields an empty list for a blank line.
            if not row:
                continue

            object_id = row[object_index]
            if item_list and object_id not in item_set:
                continue

            try:
                score = float(row[-1])
            except ValueError as exc:
                raise ValueError(f"Last column is not a numeric score in row: {row}") from exc

            if "TAB" in object_id:
                table_score += score
                total_tables += 1
            elif "FIG" in object_id:
                figure_score += score
                total_figures += 1
            else:
                # Neither a table nor a figure: surface the id for inspection.
                print(object_id)

            h_scores[object_id] = score

    print(f"Average human total score:  {format_average(figure_score + table_score, total_figures + total_tables)}")
    print(f"Average human figure score: {format_average(figure_score, total_figures)}")
    print(f"Average human table score:  {format_average(table_score, total_tables)}")

    return h_scores

In [3]:
# CSV file with the first annotator's scores for the Qwen outputs.
human_annotated_file = "qwen_annotated.csv"

# Every annotated object.
print("Overall human evaluation:")
human_scores_overall = process_scores(file_path=human_annotated_file, item_list=False)

# Only the objects listed in the evaluation item list.
print("\nSubset human evaluation:")
human_scores_subset = process_scores(file_path=human_annotated_file, item_list=True)

Overall human evaluation:
Average human total score:  0.6466
Average human figure score: 0.6013
Average human table score:  0.6971

Subset human evaluation:
Average human total score:  0.7037
Average human figure score: 0.6700
Average human table score:  0.7235


### Correlation between LLM-generated scores and human-annotated scores

In [4]:
# Obtain LLM-Acc Scores
def processOutput_LLM(output, regex_expression):
    """Extract an (object id, score) pair from one line of evaluator output.

    ``regex_expression`` is matched at the start of ``output``. Capture group 1
    is returned as the object id and capture group 2 is converted to a float.

    Returns:
        ``(object_id, score)`` on a match, otherwise ``(None, None)``.
    """
    found = re.match(regex_expression, output)
    if found is None:
        return None, None

    object_id, raw_score = found.group(1, 2)
    return object_id, float(raw_score)

def read_llm_acc_file(file_path, regex_expression=r"^(.*): Score: (0\.5|0|1)"):
    """Collect LLM-Accuracy scores from an evaluation text file.

    Every line is handed to ``processOutput_LLM`` together with
    ``regex_expression``. Lines that yield no (non-empty) object id are
    skipped; a repeated object id keeps its last score. The number of
    collected scores is printed.

    Returns:
        dict mapping object id to its float score.
    """
    with open(file_path, "r", encoding="utf-8") as input_file:
        parsed_lines = (processOutput_LLM(line, regex_expression) for line in input_file)
        score_dict = {object_id: score for object_id, score in parsed_lines if object_id}

    print(f"{len(score_dict)} scores retrieved.")
    return score_dict

# LLM-Accuracy evaluation output, parsed with the reader's default pattern.
llm_accuracy_file = "qwen_LLM_Acc_evaluation.txt"
llm_accuracy_scores = read_llm_acc_file(file_path=llm_accuracy_file)

1798 scores retrieved.


In [5]:
# Obtain VQA-MQM Scores
# Error-type label -> description. Only the labels are used for scoring.
error_types = {
    "Critical Value Error": "All values given in the response are significantly different.",
    "Critical Factual Error": "The response presents statements that are fundamentally different from the reference.",
    "Major Completeness Error": "The response misses relevant information from the reference.",
    "Major Value Error": "One of multiple values is significantly different, affecting correctness.",
    "Major Factual Error": "The response presents statements that partially contradict the reference.",
    "Minor Value Error": "A value in the response deviates slightly from the reference but remains within an acceptable range (less than 10% deviation).",
    "Minor Factual Error": "The response presents statements that slightly deviate from the reference.",
    "Minor Completeness Error": "The response misses little information from the reference."
}

def calculate_score(output):
    """Turn an evaluator's error listing into a VQA-MQM score.

    The score starts at 1.0. Every literal occurrence of an error-type label
    from ``error_types`` in ``output`` is counted: any "Critical" error gives
    0.0 immediately, each "Major" error subtracts 0.5 and each other (minor)
    error subtracts 0.25. The result never drops below 0.0.

    Args:
        output: Text containing the error labels reported for one object.

    Returns:
        The score as a float (always a float, also on the zero paths).
    """
    current_score = 1.0
    for e_type in error_types:
        # The labels are plain text, not patterns, so count them literally.
        matches = output.count(e_type)
        if matches == 0:
            continue

        if "Critical" in e_type:
            return 0.0

        penalty_value = 0.5 if "Major" in e_type else 0.25
        current_score -= penalty_value * matches

        if current_score <= 0:
            return 0.0

    return current_score

# Object id at the start of a header line, e.g. "<digits>.<digits>_FIG_<digits>:".
object_format_regex = r"(\d+\.\d+_(FIG|TAB)_\d+):"
def read_mqm_file(file_path):
    """Read a VQA-MQM evaluation file and score every object in it.

    A line containing "List of errors:" starts a new entry and must begin with
    an object id matching ``object_format_regex``. All following lines up to
    the next such header form the entry's text, which is scored with
    ``calculate_score``. Text before the first header is ignored. The number
    of collected scores is printed.

    Args:
        file_path: Path of the evaluation text file.

    Returns:
        dict mapping object id to its score. Empty when the file contains no
        header line.

    Raises:
        ValueError: If a header line does not start with an object id.
    """
    score_dict = {}
    current_obj = None
    current_lines = []

    with open(file_path, "r", encoding="utf-8") as eval_file:
        for line in eval_file:
            if "List of errors:" not in line:
                current_lines.append(line)
                continue

            # A new header closes the entry collected so far.
            if current_obj is not None:
                score_dict[current_obj] = calculate_score("".join(current_lines))

            obj_match = re.match(object_format_regex, line)
            if obj_match is None:
                raise ValueError(f"Header line does not start with an object id: {line!r}")
            current_obj = obj_match.group(1)
            current_lines = []

    # Close the last entry; without any header there is nothing to score.
    if current_obj is not None:
        score_dict[current_obj] = calculate_score("".join(current_lines))

    print(f"{len(score_dict)} scores retrieved.")
    return score_dict

# VQA-MQM evaluation output for the Qwen answers.
mqm_file = "qwenMQM_evaluation.txt"
mqm_scores = read_mqm_file(file_path=mqm_file)

1798 scores retrieved.


In [6]:
# Calculate Pearson Correlation
from scipy.stats import pearsonr

def calculate_pearson(dict1, dict2):
    """Print the Pearson correlation between two score dicts with equal keys.

    The values are paired by key, with the keys taken in sorted order, and
    passed to ``scipy.stats.pearsonr``. The correlation (four decimals) and
    the p-value are printed; nothing is returned.

    Raises:
        ValueError: If the dicts differ in size or a key of ``dict1`` is
            missing from ``dict2``. The mismatch is printed first.
    """
    size1, size2 = len(dict1), len(dict2)
    if size1 != size2:
        print(f"Different dictionary lengths: {size1}, {size2}.")
        raise ValueError("The dictionaries must have the same keys.")

    for object_id in dict1:
        if object_id in dict2:
            continue
        print(f"Key mismatch for {object_id}")
        raise ValueError("The dictionaries must have the same keys.")

    # Same size plus containment means both dicts hold the same key set.
    ordered_ids = sorted(dict1)
    scores1 = list(map(dict1.__getitem__, ordered_ids))
    scores2 = list(map(dict2.__getitem__, ordered_ids))

    correlation, p_value = pearsonr(scores1, scores2)
    print(f"Pearson Correlation: {correlation:.4f}")
    print(f"P-value: {p_value}")
    
# Compare the human scores with each LLM-based metric in turn.
for heading, metric_scores in (
    ("Correlation between human scores and LLM-Accuracy:", llm_accuracy_scores),
    ("\nCorrelation between human scores and VQA-MQM:", mqm_scores),
):
    print(heading)
    calculate_pearson(human_scores_overall, metric_scores)

Correlation between human scores and LLM-Accuracy:
Pearson Correlation: 0.7131
P-value: 2.320306921941283e-279

Correlation between human scores and VQA-MQM:
Pearson Correlation: 0.7128
P-value: 5.182640156914712e-279


### Distribution of human scores and LLM scores

In [7]:
# Calculate percentage of scores:
def calculate_score_distribution(data_dict, categories):
    """Print how often each score category occurs among the values of a dict.

    For every entry of ``categories`` (in the given order) one line is printed
    with the category, its count, the total number of items and the share of
    the total with two decimals. For an empty ``data_dict`` every share is
    reported as 0.00 instead of dividing by zero.

    Args:
        data_dict: Mapping from object id to score.
        categories: The score values to report.

    Raises:
        KeyError: If a score in ``data_dict`` is not one of ``categories``.
    """
    score_distribution = {c: 0 for c in categories}
    total_length = len(data_dict)

    for object_id, score in data_dict.items():
        if score not in score_distribution:
            # Same exception type as a plain dict lookup, but naming the culprit.
            raise KeyError(
                f"Score {score!r} of {object_id!r} is not one of the categories {list(categories)}."
            )
        score_distribution[score] += 1

    for c in categories:
        count = score_distribution[c]
        share = count / total_length if total_length > 0 else 0
        print(f"Score: {c} -> {count} / {total_length} -> {share:.2f}")

# One report per score source; VQA-MQM additionally uses quarter steps.
for heading, score_source, score_categories in (
    ("Distribution for human scores:", human_scores_overall, [0, 0.5, 1]),
    ("\nDistribution for LLM-Accuracy scores:", llm_accuracy_scores, [0, 0.5, 1]),
    ("\nDistribution for VQA-MQM scores:", mqm_scores, [0, 0.25, 0.5, 0.75, 1]),
):
    print(heading)
    calculate_score_distribution(score_source, score_categories)

Distribution for human scores:
Score: 0 -> 516 / 1798 -> 0.29
Score: 0.5 -> 239 / 1798 -> 0.13
Score: 1 -> 1043 / 1798 -> 0.58

Distribution for LLM-Accuracy scores:
Score: 0 -> 418 / 1798 -> 0.23
Score: 0.5 -> 679 / 1798 -> 0.38
Score: 1 -> 701 / 1798 -> 0.39

Distribution for VQA-MQM scores:
Score: 0 -> 365 / 1798 -> 0.20
Score: 0.25 -> 25 / 1798 -> 0.01
Score: 0.5 -> 425 / 1798 -> 0.24
Score: 0.75 -> 371 / 1798 -> 0.21
Score: 1 -> 612 / 1798 -> 0.34


### Correlation between scores of different human workers

In [8]:
# Second annotator's file: the object id sits in column 1, and only the
# objects from the evaluation item list are read.
human2_scores_file = "qwen_human2_annotated.csv"
human2_scores_subset = process_scores(
    file_path=human2_scores_file,
    object_index=1,
    item_list=True,
)

print("\nCorrelation between scores of different human workers:")
calculate_pearson(dict1=human_scores_subset, dict2=human2_scores_subset)

Average human total score:  0.6259
Average human figure score: 0.6500
Average human table score:  0.6118

Correlation between scores of different human workers:
Pearson Correlation: 0.8140
P-value: 3.529107893734909e-33


### Correlation between human scores and traditional metrics

In [9]:
def read_metric_scores(file_path):
    """Read per-object metric percentages from a text file.

    Each non-blank line must look like ``<object id>: <number>%``. The number
    may be a decimal or a whole number and is divided by 100 before being
    stored. Blank lines are skipped.

    Args:
        file_path: Path of the metric file (read as UTF-8, like the other
            evaluation files in this notebook).

    Returns:
        dict mapping object id to the metric value as a fraction.

    Raises:
        ValueError: If a non-blank line does not match the expected format.
    """
    metric_dict = {}
    # Accept "12.34%" as well as whole-number percentages such as "100%".
    output_expression = r"^(.*): (\d+(?:\.\d+)?)%"

    with open(file_path, "r", encoding="utf-8") as metric_file:
        for line in metric_file:
            if not line.strip():
                continue

            regex_matches = re.match(output_expression, line)
            if regex_matches is None:
                raise ValueError(f"Error for line: {line}")

            object_id, raw_score = regex_matches.group(1, 2)
            metric_dict[object_id] = float(raw_score) / 100

    return metric_dict

In [10]:
# Load the per-object values of each traditional metric.
bleu_scores = read_metric_scores(file_path="bleu.txt")
rouge_scores = read_metric_scores(file_path="rouge.txt")
meteor_scores = read_metric_scores(file_path="meteor.txt")
bert_scores = read_metric_scores(file_path="bertscore.txt")

# Correlate each metric with the human scores, in the order loaded above.
for heading, metric_scores in (
    ("\nCorrelation between human scores and bleu scores:", bleu_scores),
    ("\nCorrelation between human scores and rouge scores:", rouge_scores),
    ("\nCorrelation between human scores and meteor scores:", meteor_scores),
    ("\nCorrelation between human scores and bertscores:", bert_scores),
):
    print(heading)
    calculate_pearson(human_scores_overall, metric_scores)


Correlation between human scores and bleu scores:
Pearson Correlation: 0.4413
P-value: 1.384000158573103e-86

Correlation between human scores and rouge scores:
Pearson Correlation: 0.6265
P-value: 1.3012031952210603e-196

Correlation between human scores and meteor scores:
Pearson Correlation: 0.4596
P-value: 1.1224004235452366e-94

Correlation between human scores and bertscores:
Pearson Correlation: 0.4972
P-value: 6.867061364694283e-113


### Results when using GPT-4o for evaluation

In [11]:
# Process the GPT-4o evaluation with LLM-Accuracy
from collections import defaultdict

gpt_accuracy_file = "gpt_llm_acc_evaluation.txt"
# Group 1 is the object kind (FIG or TAB), group 2 the score.
regex_expresson = r"\d+\.\d+_(FIG|TAB)_\d+: (0\.5|1|0)"

# Number of items and summed scores per object kind.
count_dict = defaultdict(int)
score_dict = defaultdict(int)

with open(gpt_accuracy_file, "r", encoding="utf-8") as eval_file:
    for line in eval_file:
        match = re.match(regex_expresson, line)
        if match is None:
            # Every line of the file has to carry a score.
            raise ValueError

        category, value = match.group(1, 2)
        score_dict[category] += float(value)
        count_dict[category] += 1

# Add the combined row (scores first, then counts).
for totals in (score_dict, count_dict):
    totals["Total"] = totals["FIG"] + totals["TAB"]

print("Qwen Results with LLM-Accuracy GPT-4o Evaluation:")
print("=" * 30)
for category, total_count in count_dict.items():
    match_count = score_dict[category]
    partion = 0 if total_count <= 0 else match_count / total_count
    print(f"{category:<7}: {match_count:<5} / {total_count:<5} -> {partion:.2%}")

Qwen Results with LLM-Accuracy GPT-4o Evaluation:
FIG    : 538.5 / 948   -> 56.80%
TAB    : 556.0 / 850   -> 65.41%
Total  : 1094.5 / 1798  -> 60.87%


In [12]:
# Process the GPT-4o evaluation with VQA-MQM

# Error-type label -> description. Only the labels are used for scoring.
# NOTE: this cell repeats the definitions from the earlier VQA-MQM cell; from
# here on the later definitions are the ones in effect, so keep both in sync.
error_types = {
    "Critical Value Error": "All values given in the response are significantly different.",
    "Critical Factual Error": "The response presents statements that are fundamentally different from the reference.",
    "Major Completeness Error": "The response misses relevant information from the reference.",
    "Major Value Error": "One of multiple values is significantly different, affecting correctness.",
    "Major Factual Error": "The response presents statements that partially contradict the reference.",
    "Minor Value Error": "A value in the response deviates slightly from the reference but remains within an acceptable range (less than 10% deviation).",
    "Minor Factual Error": "The response presents statements that slightly deviate from the reference.",
    "Minor Completeness Error": "The response misses little information from the reference."
}

def calculate_score(output):
    """Turn an evaluator's error listing into a VQA-MQM score.

    The score starts at 1.0. Every literal occurrence of an error-type label
    from ``error_types`` in ``output`` is counted: any "Critical" error gives
    0.0 immediately, each "Major" error subtracts 0.5 and each other (minor)
    error subtracts 0.25. The result never drops below 0.0.

    Args:
        output: Text containing the error labels reported for one object.

    Returns:
        The score as a float (always a float, also on the zero paths).
    """
    current_score = 1.0
    for e_type in error_types:
        # The labels are plain text, not patterns, so count them literally.
        matches = output.count(e_type)
        if matches == 0:
            continue

        if "Critical" in e_type:
            return 0.0

        penalty_value = 0.5 if "Major" in e_type else 0.25
        current_score -= penalty_value * matches

        if current_score <= 0:
            return 0.0

    return current_score

gpt_mqm_file = "gpt_mqm_evaluation.txt"
# Number of items and summed scores per object kind ("FIG" / "TAB").
count_dict = defaultdict(int)
score_dict = defaultdict(int)

with open(gpt_mqm_file, "r", encoding="utf-8") as eval_file:
    current_type = None
    current_text = ""
    for line in eval_file:
        is_header = "List of errors:" in line

        # Diagnostic: show body lines that mention a table id, together with
        # the text collected so far for the current entry.
        if "_TAB_" in line and not is_header:
            print(line)
            print(current_text)

        if not is_header:
            current_text += line
            continue

        # A new header closes the entry collected so far.
        if current_type is not None:
            score = calculate_score(current_text)
            score_dict[current_type] += score
            count_dict[current_type] += 1

        # The kind is taken from the header line by substring; "TAB" is
        # checked first.
        if "TAB" in line:
            current_type = "TAB"
        elif "FIG" in line:
            current_type = "FIG"
        else:
            raise ValueError(f"Header line names neither a TAB nor a FIG object: {line!r}")
        current_text = ""

# Close the last entry. Without any header there is nothing to score, and a
# ``None`` category would break the formatted summary below.
if current_type is not None:
    score = calculate_score(current_text)
    score_dict[current_type] += score
    count_dict[current_type] += 1

score_dict["Total"] = score_dict["FIG"] + score_dict["TAB"]
count_dict["Total"] = count_dict["FIG"] + count_dict["TAB"]

print("Qwen Results with VQA-MQM GPT-4o Evaluation:")
print("=" * 30)
for category in count_dict:
    match_count = score_dict[category]
    total_count = count_dict[category]
    partion = match_count / total_count if total_count > 0 else 0
    print(f"{category:<7}: {match_count:<5} / {total_count:<5} -> {partion:.2%}")

Qwen Results with VQA-MQM GPT-4o Evaluation:
FIG    : 419.0 / 948   -> 44.20%
TAB    : 558.0 / 850   -> 65.65%
Total  : 977.0 / 1798  -> 54.34%


### Correlation of metrics when used with different LLMs

In [13]:
# Evaluation files produced with Qwen and with GPT as the judging LLM.
qwen_llm_accuracy_file = "qwen_LLM_Acc_evaluation.txt"
qwen_mqm_file = "qwenMQM_evaluation.txt"
gpt_llm_accuracy_file = "gpt_llm_acc_evaluation.txt"
gpt_mqm_file = "gpt_mqm_evaluation.txt"

qwen_llm_scores = read_llm_acc_file(file_path=qwen_llm_accuracy_file)
qwen_mqm_scores = read_mqm_file(file_path=qwen_mqm_file)

# The GPT file is parsed with a pattern that has no "Score: " prefix.
gpt_llm_scores = read_llm_acc_file(
    file_path=gpt_llm_accuracy_file,
    regex_expression=r"^(.*): (0\.5|0|1)",
)
gpt_mqm_scores = read_mqm_file(file_path=gpt_mqm_file)

1798 scores retrieved.
1798 scores retrieved.
1798 scores retrieved.
1798 scores retrieved.


In [14]:
# Agreement between the two judging LLMs, once per metric.
for heading, qwen_scores, gpt_scores in (
    ("LLM-Accuracy correlation between scores of different LLMs:", qwen_llm_scores, gpt_llm_scores),
    ("\nVQA-MQM correlation between scores of different LLMs:", qwen_mqm_scores, gpt_mqm_scores),
):
    print(heading)
    calculate_pearson(qwen_scores, gpt_scores)

LLM-Accuracy correlation between scores of different LLMs:
Pearson Correlation: 0.8522
P-value: 0.0

VQA-MQM correlation between scores of different LLMs:
Pearson Correlation: 0.8560
P-value: 0.0


### Distribution differences

In [15]:
# LLM-Accuracy score distribution per judging LLM.
for heading, llm_scores in (
    ("Distribution of Qwen LLM-Acc scores:", qwen_llm_scores),
    ("\nDistribution of GPT LLM-Accuracy scores:", gpt_llm_scores),
):
    print(heading)
    calculate_score_distribution(llm_scores, [0, 0.5, 1])

Distribution of Qwen LLM-Acc scores:
Score: 0 -> 418 / 1798 -> 0.23
Score: 0.5 -> 679 / 1798 -> 0.38
Score: 1 -> 701 / 1798 -> 0.39

Distribution of GPT LLM-Accuracy scores:
Score: 0 -> 509 / 1798 -> 0.28
Score: 0.5 -> 389 / 1798 -> 0.22
Score: 1 -> 900 / 1798 -> 0.50


In [16]:
# VQA-MQM score distribution per judging LLM (quarter-step categories).
for heading, llm_scores in (
    ("Distribution of Qwen VQA-MQM scores:", qwen_mqm_scores),
    ("\nDistribution of GPT VQA-MQM scores:", gpt_mqm_scores),
):
    print(heading)
    calculate_score_distribution(llm_scores, [0, 0.25, 0.5, 0.75, 1])

Distribution of Qwen VQA-MQM scores:
Score: 0 -> 365 / 1798 -> 0.20
Score: 0.25 -> 25 / 1798 -> 0.01
Score: 0.5 -> 425 / 1798 -> 0.24
Score: 0.75 -> 371 / 1798 -> 0.21
Score: 1 -> 612 / 1798 -> 0.34

Distribution of GPT VQA-MQM scores:
Score: 0 -> 524 / 1798 -> 0.29
Score: 0.25 -> 76 / 1798 -> 0.04
Score: 0.5 -> 340 / 1798 -> 0.19
Score: 0.75 -> 280 / 1798 -> 0.16
Score: 1 -> 578 / 1798 -> 0.32
