In [78]:
# enable automatic reloading of imported modules before each cell is executed
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Evaluation

In [79]:
import os
import json
from src.utils.metrics import get_pearson_r, get_spearman_r, get_kendall_tau, get_t_test

## WMT18 Benchmark

In [80]:
# Language pairs evaluated in this section: every non-English language is
# paired with English in both directions, the into-English pairs first.
_wmt18_non_english = ("cs", "de", "et", "fi", "ru", "tr", "zh")
lang_pairs = [f"{code}-en" for code in _wmt18_non_english]
lang_pairs += [f"en-{code}" for code in _wmt18_non_english]

In [81]:
# Result directories of the two systems whose scores are compared below.
# Judging by the folder names these are the IDF-weighted and the uniformly
# weighted EMD runs (TODO confirm).
model_path_1, model_path_2 = (
    "../archive/results_emd_idf_20230119/wmt18/",
    "../archive/results_emd_uniform_20230118/wmt18/",
)

In [82]:
def get_model_scores(lang_pair, model_path):
    """Read the system scores stored for one language pair.

    Args:
        lang_pair: Language pair identifier used in the file name, e.g. "cs-en".
        model_path: Directory containing the `scores.<lang_pair>.json` files.

    Returns:
        The `system_score` value of every entry in the JSON file, in file order.
    """
    scores_file = os.path.join(model_path, f"scores.{lang_pair}.json")
    with open(scores_file, "r", encoding="utf8") as handle:
        entries = json.load(handle)
    return [entry["system_score"] for entry in entries]

In [83]:
def calculate_t_test(lang_pair):
    """Compare the two configured systems on one language pair.

    Loads the system scores stored under `model_path_1` and `model_path_2`
    for `lang_pair` and passes them, in that order, to `get_t_test`.

    Args:
        lang_pair: Language pair identifier, e.g. "cs-en".

    Returns:
        Whatever `get_t_test` returns for the two score lists.
    """
    return get_t_test(
        get_model_scores(lang_pair, model_path_1),
        get_model_scores(lang_pair, model_path_2),
    )

In [84]:
# Run the significance test for every language pair. A failure for one pair
# (for instance when its scores cannot be read) is reported together with the
# pair and the reason, and the loop continues with the next pair.
for lang_pair in lang_pairs:
    try:
        print(lang_pair, calculate_t_test(lang_pair))
    except Exception as error:  # not a bare except: interrupting the cell must still work
        print(f"{lang_pair} Error, unable to compare scores: {error!r}")

cs-en Ttest_indResult(statistic=19.956666645232744, pvalue=1.0918613945700628e-87)
de-en Ttest_indResult(statistic=29.155633739202027, pvalue=9.791157357385664e-186)
et-en Ttest_indResult(statistic=28.32481411960971, pvalue=5.973909479501825e-175)
fi-en Ttest_indResult(statistic=34.77894454532147, pvalue=3.431230205311238e-260)
ru-en Ttest_indResult(statistic=24.52664848466707, pvalue=1.7414777289110105e-131)
tr-en Ttest_indResult(statistic=23.100246231581046, pvalue=6.31005660365584e-117)
zh-en Ttest_indResult(statistic=31.834582191045445, pvalue=1.886804436914362e-220)
en-cs Ttest_indResult(statistic=13.547417659873181, pvalue=1.394053540207454e-41)
en-de Ttest_indResult(statistic=14.685160394286475, pvalue=1.3056401502013133e-48)
Error, unable to compare scores
en-fi Ttest_indResult(statistic=17.8491726665699, pvalue=1.2400036414985596e-70)
en-ru Ttest_indResult(statistic=20.667966141251274, pvalue=2.3286719128672358e-94)
en-tr Ttest_indResult(statistic=8.072521166479888, pvalue=8.0

In [85]:
from src.data.WMT20 import WMT20

In [86]:
# show the language pairs exposed by the WMT20 dataset class
WMT20.supported_languages

['cs-en',
 'de-en',
 'iu-en',
 'ja-en',
 'km-en',
 'pl-en',
 'ps-en',
 'ru-en',
 'ta-en',
 'zh-en',
 'en-cs',
 'en-de',
 'en-iu',
 'en-ja',
 'en-km',
 'en-pl',
 'en-ps',
 'en-ru',
 'en-ta',
 'en-zh',
 'de-fr',
 'fr-de']

In [91]:
# Build the WMT20 data object for the first supported language pair and fetch
# its test dataloader. `data` keeps whatever `setup()` returns (presumably the
# WMT20 object itself - TODO confirm).
data = WMT20(WMT20.supported_languages[0], batch_size=16).setup()
dataloader = data.test_dataloader()

In [92]:
# size of the test dataloader (presumably the number of batches - TODO confirm)
len(dataloader)

471

In [113]:
from evaluate import load
# Load the metric implementations through `evaluate.load`.
# NOTE(review): BERTScore and COMET are loaded here even though their entries
# in the `models` list are commented out.
model_bleu = load("bleu")
model_meteor = load("meteor")
model_rouge = load("rouge")
model_bertscore = load("bertscore")
model_comet = load("comet")

[nltk_data] Downloading package wordnet to /home/erikn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
wmt20-comet-da is already in cache.
Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Encoder model frozen.


In [None]:
def _bleu_precisions(predictions, references, order):
    """Score each prediction/reference pair with one entry of BLEU's `precisions`.

    Shared implementation of the four `get_bleu_<n>_score` functions, which
    differ only in the entry they read.

    Args:
        predictions: Iterable of system outputs.
        references: Iterable of references, aligned with `predictions`.
        order: 1-based position in the `precisions` list returned by
            `model_bleu.compute` (1 selects the first entry, 4 the fourth).

    Returns:
        A list with one value per pair; 0 for pairs on which the metric raised.
    """

    def _score_pair(pred, ref):
        try:
            results = model_bleu.compute(predictions=[pred], references=[[ref]])
        except Exception:
            # Best effort: a pair the metric cannot handle scores 0 instead of
            # aborting the whole batch. Catching `Exception` (not a bare
            # `except`) lets KeyboardInterrupt/SystemExit propagate.
            return 0
        return results["precisions"][order - 1]

    return [_score_pair(pred, ref) for pred, ref in zip(predictions, references)]


def get_bleu_1_score(predictions, references, sources=None, lang=None):
    """Return the first `precisions` entry of BLEU for every pair.

    NOTE(review): this is an entry of the metric's `precisions` list, not its
    combined `bleu` value - confirm that this is what "BLEU-1" should mean.

    `sources` and `lang` are accepted so that all scorers can be called with
    the same keyword arguments; they are not used.
    """
    return _bleu_precisions(predictions, references, order=1)


def get_bleu_2_score(predictions, references, sources=None, lang=None):
    """Return the second `precisions` entry of BLEU for every pair.

    `sources` and `lang` are accepted for a uniform scorer signature and ignored.
    """
    return _bleu_precisions(predictions, references, order=2)


def get_bleu_3_score(predictions, references, sources=None, lang=None):
    """Return the third `precisions` entry of BLEU for every pair.

    `sources` and `lang` are accepted for a uniform scorer signature and ignored.
    """
    return _bleu_precisions(predictions, references, order=3)


def get_bleu_4_score(predictions, references, sources=None, lang=None):
    """Return the fourth `precisions` entry of BLEU for every pair.

    `sources` and `lang` are accepted for a uniform scorer signature and ignored.
    """
    return _bleu_precisions(predictions, references, order=4)


def get_meteor_score(predictions, references, sources=None, lang=None):
    """Compute one METEOR score per prediction/reference pair.

    The `evaluate` METEOR metric reports a single averaged `meteor` value for
    everything passed to one `compute` call, whereas the evaluation loop
    indexes every scorer's result per example. Each pair is therefore scored
    in a call of its own, as the BLEU and ROUGE-L scorers do.

    Args:
        predictions: Iterable of system outputs.
        references: Iterable of references, aligned with `predictions`.
        sources: Unused; accepted for a uniform scorer signature.
        lang: Unused; accepted for a uniform scorer signature.

    Returns:
        A list of floats, one METEOR score per pair.
    """

    def _score_pair(pred, ref):
        results = model_meteor.compute(predictions=[pred], references=[ref])
        # float() turns a numpy scalar into a plain Python float
        return float(results["meteor"])

    return [_score_pair(pred, ref) for pred, ref in zip(predictions, references)]


def get_rougel_score(predictions, references, sources=None, lang=None):
    """Compute one ROUGE-L score per prediction/reference pair.

    Args:
        predictions: Iterable of system outputs.
        references: Iterable of references, aligned with `predictions`.
        sources: Unused; accepted for a uniform scorer signature.
        lang: Unused; accepted for a uniform scorer signature.

    Returns:
        A list with the `rougeL` value of every pair; 0 for pairs on which the
        metric raised.
    """

    def _score_pair(pred, ref):
        try:
            results = model_rouge.compute(predictions=[pred], references=[ref])
        except Exception:
            # Best effort: a failing pair scores 0. Catching `Exception` (not
            # a bare `except`) lets KeyboardInterrupt/SystemExit propagate.
            return 0
        return results["rougeL"]

    return [_score_pair(pred, ref) for pred, ref in zip(predictions, references)]


def get_bertscore_score(predictions, references, sources=None, lang="en"):
    """Return the `f1` entry of BERTScore for the given batch.

    Args:
        predictions: System outputs.
        references: References, aligned with `predictions`.
        sources: Unused; accepted for a uniform scorer signature.
        lang: Language code forwarded to `model_bertscore.compute`.

    Returns:
        The `f1` value of the result dictionary, unchanged.
    """
    return model_bertscore.compute(
        predictions=predictions,
        references=references,
        lang=lang,
    )["f1"]


def get_comet_score(predictions, references, sources, lang=None):
    """Return the `scores` entry of COMET for the given batch.

    Args:
        predictions: System outputs.
        references: References, aligned with `predictions`.
        sources: Source segments, aligned with `predictions`; forwarded to the
            metric.
        lang: Unused; accepted for a uniform scorer signature.

    Returns:
        The `scores` value of the result dictionary, unchanged.
    """
    comet_output = model_comet.compute(
        predictions=predictions,
        references=references,
        sources=sources,
        gpus=1,  # fixed to a single GPU
    )
    return comet_output["scores"]


# Registry of the metrics to run: each entry pairs the id under which a score
# is stored with the function that computes it.
_metric_table = (
    ("BLEU-1", get_bleu_1_score),
    ("BLEU-2", get_bleu_2_score),
    ("BLEU-3", get_bleu_3_score),
    ("BLEU-4", get_bleu_4_score),
    ("METEOR", get_meteor_score),
    ("ROUGE-L", get_rougel_score),
    # ("BERTScore", get_bertscore_score),
    # ("COMET", get_comet_score),
)
models = [{"id": metric_id, "model": scorer} for metric_id, scorer in _metric_table]

In [137]:
# Score the test dataloader of the first supported WMT20 language pair with
# every metric registered in `models`.
lang_pair = WMT20.supported_languages[0]
dataset = WMT20(lang_pair, batch_size=16)
dataloader = dataset.setup().test_dataloader()
# second half of the pair (e.g. "en" for "cs-en"); handed to the scorers as `lang`
language = lang_pair.split("-")[1]
# one flat record per example: metric scores, `model_id` and `human_score`
scores = []

# NOTE(review): `data` is one batch here and rebinds the name that an earlier
# cell uses for the WMT20 object.
for data in dataloader:
    src_model_ids = data["model_id"]
    # metric id -> scores returned by that metric for the current batch
    system_scores = {}
    for model in models:
        model_scores = model["model"](
            predictions=data["system"],
            references=data["reference"],
            sources=data["source"],
            lang=language,
        )
        system_scores[model["id"]] = model_scores

    # regroup the per-metric results into one record per example; every
    # metric result is indexed by the example position
    for idx in range(len(data["score"])):
        system_score = { key: vals[idx] for key, vals in system_scores.items() }

        scores.append(
            {
                **system_score,
                "model_id": src_model_ids[idx],
                "human_score": data["score"][idx].item(),
            }
        )
    # NOTE(review): the loop stops after the first batch, so `scores` covers a
    # single batch only - looks like a debugging leftover; confirm before a
    # full run.
    break

Using default tokenizer.
Using default tokenizer.
Using default tokenizer.
Using default tokenizer.
Using default tokenizer.
Using default tokenizer.
Using default tokenizer.
Using default tokenizer.
Using default tokenizer.
Using default tokenizer.
Using default tokenizer.
Using default tokenizer.
Using default tokenizer.
Using default tokenizer.
Using default tokenizer.
Using default tokenizer.


AttributeError: module 'bert_score' has no attribute 'utils'

In [133]:
# inspect the per-metric scores collected for the most recently processed batch
system_scores

{'BLEU-1': [0.7857142857142857,
  0.75,
  0.7479674796747967,
  0.7819548872180451,
  0.6574074074074074,
  0.6923076923076923,
  0.691358024691358,
  0.5384615384615384,
  0.6764705882352942,
  0.696969696969697,
  0.725,
  0.8148148148148148,
  0.6363636363636364,
  0.6153846153846154,
  0.5483870967741935,
  0.6891891891891891],
 'BLEU-2': [0.38461538461538464,
  0.5333333333333333,
  0.4262295081967213,
  0.4696969696969697,
  0.3364485981308411,
  0.5324675324675324,
  0.3416149068322981,
  0.25,
  0.417910447761194,
  0.3469387755102041,
  0.4430379746835443,
  0.5283018867924528,
  0.32653061224489793,
  0.4166666666666667,
  0.21311475409836064,
  0.3835616438356164],
 'BLEU-3': [0.25,
  0.35714285714285715,
  0.256198347107438,
  0.31297709923664124,
  0.20754716981132076,
  0.4342105263157895,
  0.1875,
  0.09090909090909091,
  0.25757575757575757,
  0.18556701030927836,
  0.28205128205128205,
  0.28846153846153844,
  0.16494845360824742,
  0.36363636363636365,
  0.1,
  0.222

In [None]:
# inspect the flattened per-example records
scores