# WMT17

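* Language pairs translated into English (cs, de, fi, lv, ru, tr, zh → en)
* Segment-level data
* Metric scores computed for every segment and saved as CSV
* Pearson correlation between metric scores and human scores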

In [1]:
# install dependencies

!pip install datasets

# bertscore
#!pip install bert_score

# bleurt
#!pip install git+https://github.com/google-research/bleurt.git

# comet
#!pip install unbabel-comet

# frugalscore, bartscore, moverscore
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 31.2 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 71.8 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting fsspec[http]>=2021.11.1
  Downloading fsspec-2022.7.1-py3-none-any.whl (141 kB)
[K     |████████████████████████████████| 141 kB 69.5 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 15.6 MB/s 
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 74.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYA

In [33]:
# Fetch the geneval repository into ./geneval; the WMT17 loader import and the
# local bartscore/moverscore metric scripts referenced by the configs are read
# from this checkout.
!git clone https://github.com/drehero/geneval

Cloning into 'geneval'...
remote: Enumerating objects: 384, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 384 (delta 15), reused 28 (delta 12), pack-reused 350[K
Receiving objects: 100% (384/384), 41.86 MiB | 10.98 MiB/s, done.
Resolving deltas: 100% (159/159), done.
Checking out files: 100% (126/126), done.


In [3]:
from google.colab import drive

# Google Drive is attached here; the results directory lives underneath it.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [4]:
import pathlib

import datasets
import numpy as np
import pandas as pd
from scipy.stats import pearsonr

from geneval.geneval.data.wmt import WMT17

In [5]:
# Root directory for the results on the mounted Google Drive; every metric
# writes its CSV files into a sub-folder named after the metric.
# (Plain string literal: the former f-string had no placeholders.)
out_path = pathlib.Path("/content/drive/MyDrive/results/wmt17/")

# Language pairs that are scored, all translating into English.
lang_pairs = ["cs-en", "de-en", "fi-en", "lv-en", "ru-en", "tr-en", "zh-en"]

In [6]:
class bertscore_config:
    """Settings for running the `bertscore` metric."""

    # Name of the results sub-folder and path handed to the metric loader.
    metric_name = "bertscore"
    metric_path = "bertscore"

    # Inputs passed to the metric besides the translations.
    uses_reference = True
    uses_source = False

    # Key under which the per-segment scores are read from the metric output.
    score_name = "f1"

    # Extra keyword arguments for loading and for computing the metric.
    load_args = dict()
    compute_args = dict(
        model_type="bert-base-uncased",
        lang="en",
    )

In [7]:
class bleurt_config:
    """Settings for running the `bleurt` metric."""

    # Name of the results sub-folder and path handed to the metric loader.
    metric_name = "bleurt"
    metric_path = "bleurt"

    # Inputs passed to the metric besides the translations.
    uses_reference = True
    uses_source = False

    # Key under which the per-segment scores are read from the metric output.
    score_name = "scores"

    # The checkpoint is chosen when the metric is loaded, not when it is computed.
    load_args = dict(config_name="bleurt-base-512")
    compute_args = dict()

In [8]:
class comet_config:
    """Settings for running the `comet` metric."""

    # Name of the results sub-folder and path handed to the metric loader.
    metric_name = "comet"
    metric_path = "comet"

    # This metric is given both the references and the sources.
    uses_reference = True
    uses_source = True

    # Key under which the per-segment scores are read from the metric output.
    score_name = "scores"

    # Extra keyword arguments for loading and for computing the metric.
    load_args = dict()
    compute_args = dict(progress_bar=True)

In [9]:
class frugalscore_config:
    """Settings for running the `frugalscore` metric."""

    # Name of the results sub-folder and path handed to the metric loader.
    metric_name = "frugalscore"
    metric_path = "frugalscore"

    # Inputs passed to the metric besides the translations.
    uses_reference = True
    uses_source = False

    # Key under which the per-segment scores are read from the metric output.
    score_name = "scores"

    # The checkpoint is chosen when the metric is loaded; its last path
    # component also ends up in the name of the saved CSV files.
    load_args = dict(
        config_name="moussaKam/frugalscore_tiny_bert-base_bert-score",
    )
    compute_args = dict(
        max_length=512,
        batch_size=128,
        device="gpu",
    )

In [13]:
class bartscore_config:
    """Settings for running the bartscore metric script from the geneval checkout."""

    # Name of the results sub-folder and path of the local metric script.
    metric_name = "bartscore"
    metric_path = "./geneval/geneval/metrics/bartscore/bartscore.py"

    # Only the sources are passed to this metric, not the references.
    # NOTE(review): confirm that scoring against the source (rather than the
    # reference) is intended for the to-English data used here.
    uses_reference = False
    uses_source = True

    # None: the metric output is stored as-is instead of being indexed by a key.
    score_name = None

    load_args = dict()
    compute_args = dict(
        # alternative checkpoint: "facebook/bart-base"
        model_type="facebook/bart-large-cnn",
        max_length=512,
        batch_size=128,
    )

In [30]:
class moverscore_config:
    """Settings for running the moverscore metric script from the geneval checkout."""

    # Name of the results sub-folder and path of the local metric script.
    metric_name = "moverscore"
    metric_path = "./geneval/geneval/metrics/moverscore/moverscore.py"

    # Inputs passed to the metric besides the translations.
    uses_reference = True
    uses_source = False

    # None: the metric output is stored as-is instead of being indexed by a key.
    score_name = None

    load_args = dict()
    compute_args = dict(
        model_type="bert-base-uncased",
        batch_size=128,
        n_gram=1,
    )

In [31]:
# Select the metric configuration that all following cells operate on.
config = moverscore_config

In [34]:
# Load the metric named by the selected configuration, forwarding its
# load-time options.
scorer = datasets.load_metric(
    config.metric_path,
    **config.load_args,
)

In [36]:
# Score every language pair with the selected metric and store the
# per-segment results as one CSV file per pair.

# `to_csv` does not create missing parent directories, so make sure the
# metric's output folder exists *before* the scoring starts; otherwise the
# first save would fail only after the scores have already been computed.
metric_dir = out_path / config.metric_name
metric_dir.mkdir(parents=True, exist_ok=True)

for lang_pair in lang_pairs:
    # load data
    wmt = WMT17(lang_pair)

    # compute score
    # Work on a copy so the class-level dict of the config is never modified.
    args = config.compute_args.copy()
    if config.uses_reference:
        args["references"] = wmt.references
    if config.uses_source:
        args["sources"] = wmt.sources

    scores = scorer.compute(
        predictions=wmt.translations,
        **args
    )

    # save
    df = pd.DataFrame({
        "translation": wmt.translations,
        "reference": wmt.references,
        "source": wmt.sources,
        "human_score": wmt.scores,
        # score_name is None when the metric output is stored as-is
        "metric_score": scores[config.score_name] if config.score_name is not None else scores
    })
    # The file name carries the model/checkpoint name (last path component)
    # when one is configured, so runs with different models do not overwrite
    # each other.
    if "model_type" in args:
        fn = f"{lang_pair}-{args['model_type'].split('/')[-1]}.csv"
    elif "config_name" in config.load_args:
        fn = f"{lang_pair}-{config.load_args['config_name'].split('/')[-1]}.csv"
    else:
        fn = f"{lang_pair}.csv"
    df.to_csv(metric_dir / fn, index=False)

In [37]:
# Read the saved per-segment scores back from disk and correlate the metric
# scores with the human scores, one Pearson coefficient per language pair.
results = {}
for lang_pair in lang_pairs:
    # Rebuild the file name that was used when the scores were saved.
    stem = lang_pair
    if "model_type" in config.compute_args:
        stem += "-" + config.compute_args["model_type"].split("/")[-1]
    elif "config_name" in config.load_args:
        stem += "-" + config.load_args["config_name"].split("/")[-1]
    fn = stem + ".csv"

    df = pd.read_csv(out_path / config.metric_name / fn)

    # pearsonr returns the coefficient first; only that part is kept.
    correlation = pearsonr(df["metric_score"], df["human_score"])
    results[lang_pair] = correlation[0]

In [38]:
# Pearson correlation per language pair for the metric selected in `config`.
results

{'cs-en': 0.6021966685731961,
 'de-en': 0.627159362496482,
 'fi-en': 0.7688580216458533,
 'lv-en': 0.6446975625846387,
 'ru-en': 0.6639918272189534,
 'tr-en': 0.7273878656667742,
 'zh-en': 0.6647567195778533}