# Reproduction study results

In [1]:
# Fetch the geneval repository into ./geneval: the `geneval.*` imports and the
# `geneval/reproduction/results/...` paths used below resolve against this checkout.
!git clone https://github.com/drehero/geneval.git

Cloning into 'geneval'...
remote: Enumerating objects: 629, done.[K
remote: Counting objects: 100% (279/279), done.[K
remote: Compressing objects: 100% (190/190), done.[K
remote: Total 629 (delta 126), reused 224 (delta 79), pack-reused 350[K
Receiving objects: 100% (629/629), 68.51 MiB | 8.17 MiB/s, done.
Resolving deltas: 100% (270/270), done.
Checking out files: 100% (219/219), done.


In [2]:
import pathlib

import pandas as pd
from scipy.stats import pearsonr

from geneval.geneval.utils import kendall_score
from geneval.reproduction.configs import *

In [3]:
# Metric configurations used for the WMT17 and WMT18 sections (the WMT16 and
# WMT19 sections rebind `configs` to a subset). The order here is the row
# order of the result tables.
configs = [
    bertscore_config, bleurt_config, comet_config, frugalscore_config,
    bartscore_config, moverscore_config, baryscore_config,
]

In [4]:
# Language pairs per WMT campaign, keyed by the two-digit year that is also
# used to build the `wmt{year}` results directory. Every pair translates into
# English, so only the source-language codes are listed explicitly.
lang_pairs = {
    wmt_year: [f"{source}-en" for source in sources]
    for wmt_year, sources in {
        16: ["cs", "de", "fi", "ru"],
        17: ["cs", "de", "fi", "lv", "ru", "tr", "zh"],
        18: ["cs", "de", "et", "fi", "ru", "tr", "zh"],
        19: ["de", "fi", "gu", "kk", "lt", "ru"],
    }.items()
}

## WMT17

In [5]:
# WMT17: pick the campaign year, locate its result directory inside the
# cloned repository, and start from an empty result collection.
year = 17
path = pathlib.Path(f"geneval/reproduction/results/wmt{year}")
all_results = dict()  # metric name -> {language pair -> score}

In [6]:
# For every metric, correlate its scores with the human scores (Pearson r)
# separately for each language pair of the selected year.
for config in configs:
    # Result files are named "<lang_pair>[-<variant>].csv". The variant is the
    # last "/"-separated component of the model type (or, failing that, of the
    # config name). It does not depend on the language pair, so it is worked
    # out once per metric instead of once per file.
    if "model_type" in config.compute_args:
        suffix = f"-{config.compute_args['model_type'].split('/')[-1]}"
    elif "config_name" in config.load_args:
        suffix = f"-{config.load_args['config_name'].split('/')[-1]}"
    else:
        suffix = ""

    results = {}
    for lang_pair in lang_pairs[year]:
        # Kept under its own name so the summary table's `df` is not clobbered
        # by whichever CSV happened to be read last.
        scores = pd.read_csv(path / config.metric_name / f"{lang_pair}{suffix}.csv")
        # pearsonr returns (statistic, p-value); only the coefficient is kept.
        results[lang_pair] = pearsonr(scores["metric_score"], scores["human_score"])[0]
    all_results[config.metric_name] = results

In [7]:
# One row per metric, one column per language pair, plus the row-wise mean.
# NOTE: .abs() is applied before averaging, so the sign of each score is
# discarded in the table.
df = (
    pd.DataFrame(all_results)
    .T
    .abs()
    .assign(avg=lambda table: table.mean(axis=1))
    .round(3)
)
df

Unnamed: 0,cs-en,de-en,fi-en,lv-en,ru-en,tr-en,zh-en,avg
bertscore,0.643,0.665,0.809,0.686,0.698,0.727,0.703,0.705
bleurt,0.758,0.793,0.876,0.834,0.819,0.839,0.824,0.82
comet,0.83,0.795,0.875,0.834,0.828,0.845,0.823,0.833
frugalscore,0.549,0.558,0.725,0.577,0.606,0.605,0.606,0.604
bartscore,0.266,0.381,0.254,0.379,0.432,0.22,0.444,0.34
moverscore,0.602,0.627,0.769,0.645,0.664,0.727,0.665,0.671
baryscore,0.636,0.65,0.818,0.686,0.692,0.741,0.709,0.705


## WMT18

In [8]:
# WMT18: pick the campaign year, locate its result directory inside the
# cloned repository, and start from an empty result collection.
year = 18
path = pathlib.Path(f"geneval/reproduction/results/wmt{year}")
all_results = dict()  # metric name -> {language pair -> score}

In [9]:
# For every metric, score each language pair of the selected year with
# `kendall_score` (imported from geneval.geneval.utils), which is given the
# `score_better` and `score_worse` columns as two plain lists.
for config in configs:
    # Result files are named "<lang_pair>[-<variant>].csv". The variant is the
    # last "/"-separated component of the model type (or, failing that, of the
    # config name). It does not depend on the language pair, so it is worked
    # out once per metric instead of once per file.
    if "model_type" in config.compute_args:
        suffix = f"-{config.compute_args['model_type'].split('/')[-1]}"
    elif "config_name" in config.load_args:
        suffix = f"-{config.load_args['config_name'].split('/')[-1]}"
    else:
        suffix = ""

    results = {}
    for lang_pair in lang_pairs[year]:
        # Kept under its own name so the summary table's `df` is not clobbered
        # by whichever CSV happened to be read last.
        scores = pd.read_csv(path / config.metric_name / f"{lang_pair}{suffix}.csv")
        results[lang_pair] = kendall_score(
            scores["score_better"].to_list(),
            scores["score_worse"].to_list(),
        )
    all_results[config.metric_name] = results

In [10]:
# One row per metric, one column per language pair, plus the row-wise mean.
# NOTE: .abs() is applied before averaging, so the sign of each score is
# discarded in the table.
df = (
    pd.DataFrame(all_results)
    .T
    .abs()
    .assign(avg=lambda table: table.mean(axis=1))
    .round(3)
)
df

Unnamed: 0,cs-en,de-en,et-en,fi-en,ru-en,tr-en,zh-en,avg
bertscore,0.373,0.532,0.385,0.286,0.342,0.268,0.244,0.347
bleurt,0.482,0.597,0.462,0.376,0.396,0.378,0.334,0.432
comet,0.499,0.613,0.475,0.397,0.425,0.405,0.356,0.453
frugalscore,0.297,0.469,0.336,0.219,0.281,0.219,0.198,0.289
bartscore,0.169,0.379,0.242,0.183,0.267,0.136,0.184,0.223
moverscore,0.359,0.52,0.381,0.285,0.316,0.254,0.245,0.337
baryscore,0.357,0.52,0.38,0.286,0.324,0.261,0.251,0.34


## WMT16

In [11]:
# WMT16: only BERTScore and BaryScore are evaluated for this year, so
# `configs` is narrowed before the result directory and the empty result
# collection are set up.
year = 16
configs = [
    bertscore_config,
    baryscore_config,
]
path = pathlib.Path(f"geneval/reproduction/results/wmt{year}")
all_results = dict()  # metric name -> {language pair -> score}

In [12]:
# For every metric, correlate its scores with the human scores (Pearson r)
# separately for each language pair of the selected year.
for config in configs:
    # Result files are named "<lang_pair>[-<variant>].csv". The variant is the
    # last "/"-separated component of the model type (or, failing that, of the
    # config name). It does not depend on the language pair, so it is worked
    # out once per metric instead of once per file.
    if "model_type" in config.compute_args:
        suffix = f"-{config.compute_args['model_type'].split('/')[-1]}"
    elif "config_name" in config.load_args:
        suffix = f"-{config.load_args['config_name'].split('/')[-1]}"
    else:
        suffix = ""

    results = {}
    for lang_pair in lang_pairs[year]:
        # Kept under its own name so the summary table's `df` is not clobbered
        # by whichever CSV happened to be read last.
        scores = pd.read_csv(path / config.metric_name / f"{lang_pair}{suffix}.csv")
        # pearsonr returns (statistic, p-value); only the coefficient is kept.
        results[lang_pair] = pearsonr(scores["metric_score"], scores["human_score"])[0]
    all_results[config.metric_name] = results

In [13]:
# One row per metric, one column per language pair, plus the row-wise mean.
# NOTE: .abs() is applied before averaging, so the sign of each score is
# discarded in the table.
df = (
    pd.DataFrame(all_results)
    .T
    .abs()
    .assign(avg=lambda table: table.mean(axis=1))
    .round(3)
)
df

Unnamed: 0,cs-en,de-en,fi-en,ru-en,avg
bertscore,0.755,0.729,0.753,0.746,0.746
baryscore,0.751,0.73,0.768,0.73,0.745


## WMT19

In [14]:
# WMT19: four metrics are evaluated for this year, so `configs` is narrowed
# before the result directory and the empty result collection are set up.
year = 19
configs = [
    bertscore_config,
    comet_config,
    frugalscore_config,
    bartscore_config,
]
path = pathlib.Path(f"geneval/reproduction/results/wmt{year}")
all_results = dict()  # metric name -> {language pair -> score}

In [17]:
# For every metric, score each language pair of the selected year with
# `kendall_score` (imported from geneval.geneval.utils), which is given the
# `score_better` and `score_worse` columns as two plain lists.
for config in configs:
    # Result files are named "<lang_pair>[-<variant>].csv". The variant is the
    # last "/"-separated component of the model type (or, failing that, of the
    # config name). It does not depend on the language pair, so it is worked
    # out once per metric instead of once per file.
    if "model_type" in config.compute_args:
        suffix = f"-{config.compute_args['model_type'].split('/')[-1]}"
    elif "config_name" in config.load_args:
        suffix = f"-{config.load_args['config_name'].split('/')[-1]}"
    else:
        suffix = ""

    results = {}
    for lang_pair in lang_pairs[year]:
        # Kept under its own name so the summary table's `df` is not clobbered
        # by whichever CSV happened to be read last.
        scores = pd.read_csv(path / config.metric_name / f"{lang_pair}{suffix}.csv")
        results[lang_pair] = kendall_score(
            scores["score_better"].to_list(),
            scores["score_worse"].to_list(),
        )
    all_results[config.metric_name] = results

In [20]:
# One row per metric, one column per language pair, plus the row-wise mean.
# NOTE: .abs() is applied before averaging, so the sign of each score is
# discarded in the table.
df = (
    pd.DataFrame(all_results)
    .T
    .abs()
    .assign(avg=lambda table: table.mean(axis=1))
    .round(3)
)
df

Unnamed: 0,de-en,fi-en,gu-en,kk-en,lt-en,ru-en,avg
bertscore,0.144,0.348,0.282,0.356,0.364,0.199,0.282
comet,0.257,0.421,0.364,0.419,0.439,0.288,0.365
frugalscore,0.102,0.29,0.245,0.321,0.307,0.163,0.238
bartscore,0.075,0.194,0.212,0.084,0.217,0.097,0.147
