In [1]:
import pandas as pd
from scipy.stats import pearsonr

In [2]:
def compute_correlations(scores_path, doc_metrics_path):
    df_scores = pd.read_csv(scores_path)
    df_doc_metrics = pd.read_csv(doc_metrics_path, sep=" ")
    
    score_columns = [
        "raw_score", "bleu_score", "meteor_score", "chrf_score", "monolingual_bert_score", "multilingual_bert_score"
    ]
    
    d_seg_scores = {}
    for col in score_columns:
        d_seg_scores[col] = {}
    
    # for each segment, store the maximum score computed for all references
    for _, row in df_scores.iterrows():
        for col in score_columns:
            score = row[col]
            segment = (row.sysid, row.docid, row.segid)
            old = d_seg_scores[col].get(segment, 0)
            new = max(old, score)
            d_seg_scores[col][segment] = score
            
    all_segments = list(d_seg_scores["raw_score"].keys())
    print("Segment correlations:")
    all_raw_scores = [d_seg_scores["raw_score"][segment] for segment in all_segments]
    for col in score_columns:
        all_col_scores = [d_seg_scores[col][segment] for segment in all_segments]
        corr, _ = pearsonr(all_raw_scores, all_col_scores)
        print(f"\t{col}: {corr}")
        
    d_doc_acumulator = {}
    for col in score_columns:
        d_doc_acumulator[col] = {}
        
    # for each document, add the scores computed for each segment
    for segment in all_segments:
        for col in score_columns:
            sysid, docid, _ = segment
            document = (sysid, docid)
            old = d_doc_acumulator[col].get(document, 0)
            new = old + d_seg_scores[col][segment]
            d_doc_acumulator[col][document] = new
        
    # divide accumulated scores for each document by the number of segments in the document
    for _, row in df_doc_metrics.iterrows():
        document = (row.SYS, row.SEGID)
        if document not in d_doc_acumulator["raw_score"]:
            continue
        d_doc_acumulator["raw_score"][document] = row.N * row["RAW.SCR"]
        for col in score_columns:
            d_doc_acumulator[col][document] /= row.N
            
    all_documents = list(d_doc_acumulator["raw_score"].keys())
    print("Document correlations:")
    all_raw_scores = [d_doc_acumulator["raw_score"][document] for document in all_documents]
    for col in score_columns:
        all_col_scores = [d_doc_acumulator[col][document] for document in all_documents]
        corr, _ = pearsonr(all_raw_scores, all_col_scores)
        print(f"\t{col}: {corr}")

## German-English

In [3]:
compute_correlations("data/en-test-scores.csv", "data/metrics-ad-doc-scores-de-en.csv")

Segment correlations:
	raw_score: 1.0
	bleu_score: 0.46319160375197915
	meteor_score: 0.6416436857747883
	chrf_score: 0.6425058781474778
	monolingual_bert_score: 0.7120993136320412
	multilingual_bert_score: 0.6630953396480903
Document correlations:
	raw_score: 0.9999999999999999
	bleu_score: 0.6894479572807775
	meteor_score: 0.7837446893852951
	chrf_score: 0.7699703682531935
	monolingual_bert_score: 0.7946188830855865
	multilingual_bert_score: 0.7706698141993786


## English-German

In [4]:
compute_correlations("data/de-test-scores.csv", "data/metrics-ad-doc-scores-en-de.csv")

Segment correlations:
	raw_score: 0.9999999999999997
	bleu_score: 0.28843973792229216
	meteor_score: 0.3061951962832597
	chrf_score: 0.3944730948395032
	monolingual_bert_score: 0.538809381700918
	multilingual_bert_score: 0.36947858448621407
Document correlations:
	raw_score: 1.0
	bleu_score: 0.3993291815922161
	meteor_score: 0.4314120932300453
	chrf_score: 0.5216788942053152
	monolingual_bert_score: 0.68094289522945
	multilingual_bert_score: 0.5157034823355668
