# WMT17

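* Translation direction: into English (`*-en` language pairs)
* Segment-level data
* Metric scores: BERTScore precision, recall and F1 per segment
* Evaluation: Pearson correlation between metric F1 and human scores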

In [1]:
!pip install datasets
!pip install bert_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 7.3 MB/s 
[?25hCollecting fsspec[http]>=2021.11.1
  Downloading fsspec-2022.7.1-py3-none-any.whl (141 kB)
[K     |████████████████████████████████| 141 kB 70.7 MB/s 
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 65.9 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 14.1 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 62.2 MB/s 
Collecting pyyaml>=5.1
  Downloading

In [2]:
# Fetch the geneval repository into ./geneval; the WMT17 loader imported later
# in this notebook (geneval.geneval.data.wmt) comes from this checkout.
!git clone https://github.com/drehero/geneval

Cloning into 'geneval'...
remote: Enumerating objects: 229, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 229 (delta 10), reused 20 (delta 7), pack-reused 205[K
Receiving objects: 100% (229/229), 203.77 KiB | 8.49 MiB/s, done.
Resolving deltas: 100% (84/84), done.


In [3]:
# Mount Google Drive at /content/drive (requires the google.colab runtime);
# config.out_path points to a folder below this mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
import pathlib

import datasets
import numpy as np
import pandas as pd
from scipy.stats import pearsonr

from geneval.geneval.data.wmt import WMT17

## BERTScore


In [5]:
class config:
    """Settings shared by the BERTScore scoring and correlation cells."""

    # `metric_name` names the results sub-directory under `out_path`;
    # `metric_path` is the identifier handed to `datasets.load_metric`.
    # Both are the same string for this metric.
    metric_name = metric_path = "bertscore"

    # Passed to the metric as `model_type` and embedded in result file names.
    model_type = "bert-base-uncased"

    # Root folder for result files (on the mounted Drive).
    out_path = pathlib.Path("/content/drive/MyDrive/results/wmt17")

    # Passed to the metric as `lang`.
    target_lang = "en"

    # Language pairs to evaluate, one result file per pair.
    lang_pairs = [
        "cs-en",
        "de-en",
        "fi-en",
        "lv-en",
        "ru-en",
        "tr-en",
        "zh-en",
    ]

In [6]:
# Load the metric identified by config.metric_path; its `compute` method is
# called once per language pair in the scoring loop.
scorer = datasets.load_metric(config.metric_path)

Downloading builder script:   0%|          | 0.00/2.92k [00:00<?, ?B/s]

In [7]:
# Score every language pair with the loaded metric and write one CSV of
# per-segment results per pair to <out_path>/<metric_name>/<pair>-<model>.csv.
for lang_pair in config.lang_pairs:
    # load data
    wmt = WMT17(lang_pair)
    # compute score
    scores = scorer.compute(
        predictions=wmt.translations,
        references=wmt.references,
        lang=config.target_lang,
        model_type=config.model_type
    )
    # save
    df = pd.DataFrame({
        "translation": wmt.translations,
        "references": wmt.references,
        "source": wmt.sources,
        "human": wmt.scores,
        "metric_precision": scores["precision"],
        "metric_recall": scores["recall"],
        "metric_f1": scores["f1"],
    })
    fn = f"{lang_pair}-{config.model_type}.csv"
    out_file = config.out_path / config.metric_name / fn
    # DataFrame.to_csv does not create missing directories, so make sure the
    # target folder exists; otherwise the write fails on a fresh Drive after
    # the (expensive) scoring has already run. Using the file's own parent
    # also covers a model_type that contains a path separator.
    out_file.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_file, index=False)

Downloading data:   0%|          | 0.00/528M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]

In [9]:
# Read each language pair's saved scores back from disk and record the Pearson
# correlation coefficient between the metric's F1 and the human scores.
results = {}
for lang_pair in config.lang_pairs:
    fn = f"{lang_pair}-{config.model_type}.csv"
    csv_path = config.out_path / config.metric_name / fn
    df = pd.read_csv(csv_path)
    metric_f1, human = df["metric_f1"], df["human"]
    # pearsonr returns (coefficient, p-value); only the coefficient is kept.
    corr = pearsonr(metric_f1, human)[0]
    results[lang_pair] = corr

In [10]:
# Display the Pearson correlation (metric F1 vs. human score) per language pair.
results

{'cs-en': 0.6432113791324501,
 'de-en': 0.6649739645604085,
 'fi-en': 0.8091542301807512,
 'lv-en': 0.6861870972050228,
 'ru-en': 0.6984675117977112,
 'tr-en': 0.7271820800105004,
 'zh-en': 0.7033989523217288}