# WMT18

* To English
* Segment level data
* Relative ranking (better translation vs. worse translation)
* Kendall score

In [1]:
!pip install datasets
!pip install bert_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 21.1 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 67.3 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 13.9 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 66.3 MB/s 
[?25hCollecting fsspec[http]>=2021.11.1
  Downloading fsspec-2022.7.1-py3-none-any.whl (141 kB)
[K     |████████████████████████████████| 141 kB 67.5 MB/s 
Collecting pyyaml>=5.1
  Downloadin

In [2]:
# Fetch the geneval repository into the working directory; the import cell
# below loads the WMT18 loader and `kendall_score` from `geneval.geneval`.
!git clone https://github.com/drehero/geneval

Cloning into 'geneval'...
remote: Enumerating objects: 235, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 235 (delta 11), reused 25 (delta 8), pack-reused 205[K
Receiving objects: 100% (235/235), 204.54 KiB | 15.73 MiB/s, done.
Resolving deltas: 100% (85/85), done.


In [3]:
# Mount Google Drive at /content/drive (Colab only); `config.out_path`
# points at a folder below this mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
import pathlib

import datasets
import numpy as np
import pandas as pd

from geneval.geneval.data.wmt import WMT18
from geneval.geneval.utils import kendall_score

## BERTScore

In [5]:
class config:
    """Settings for the BERTScore run over the WMT18 to-English language pairs."""

    # `metric_path` is what gets handed to the metric loader, while
    # `metric_name` is used as the results sub-folder name.
    metric_name = "bertscore"
    metric_path = "bertscore"

    # Passed to the scorer as `model_type` and embedded in result file names.
    model_type = "bert-base-uncased"

    # Root folder under which the per-language-pair CSVs are written.
    out_path = pathlib.Path("/content/drive/MyDrive/results/wmt18")

    # Passed to the scorer as `lang`.
    target_lang = "en"

    # Language pairs to evaluate, one result file per pair.
    lang_pairs = [
        "cs-en",
        "de-en",
        "et-en",
        "fi-en",
        "ru-en",
        "tr-en",
        "zh-en",
    ]

In [6]:
# Load the metric identified by `config.metric_path`; the resulting `scorer`
# is used by the scoring loop below via `scorer.compute(...)`.
# NOTE(review): `datasets.load_metric` appears to be deprecated in newer
# `datasets` releases in favour of the `evaluate` package -- confirm before
# upgrading `datasets`.
scorer = datasets.load_metric(config.metric_path)

Downloading builder script:   0%|          | 0.00/2.92k [00:00<?, ?B/s]

In [7]:
def _score_against_references(translations, references):
    """Score `translations` against `references` with the configured metric.

    Both calls in the loop below share the same language and model settings,
    so they are routed through this single helper.

    Returns whatever `scorer.compute` returns; the caller reads its
    "precision", "recall" and "f1" entries.
    """
    return scorer.compute(
        predictions=translations,
        references=references,
        lang=config.target_lang,
        model_type=config.model_type,
    )


# The results folder is not created anywhere else; make sure it exists up
# front so that `to_csv` cannot fail only after the expensive scoring is done.
scores_dir = config.out_path / config.metric_name
scores_dir.mkdir(parents=True, exist_ok=True)

for lang_pair in config.lang_pairs:
    # load data
    wmt = WMT18(lang_pair, root="/tmp")

    # compute scores for the better and the worse translation of each pair
    scores_better = _score_against_references(wmt.translations_better, wmt.references)
    scores_worse = _score_against_references(wmt.translations_worse, wmt.references)

    # save scores (one CSV per language pair and model)
    # NOTE: the column name "translations_worse" (plural) is inconsistent with
    # "translation_better"; it is kept as is so that existing CSV files and
    # any consumers of them keep working.
    df = pd.DataFrame({
        "translation_better": wmt.translations_better,
        "translations_worse": wmt.translations_worse,
        "reference": wmt.references,
        "source": wmt.sources,
        "precision_better": scores_better["precision"],
        "recall_better": scores_better["recall"],
        "f1_better": scores_better["f1"],
        "precision_worse": scores_worse["precision"],
        "recall_worse": scores_worse["recall"],
        "f1_worse": scores_worse["f1"],
    })
    fn = f"{lang_pair}-{config.model_type}.csv"
    df.to_csv(scores_dir / fn, index=False)

Downloading data:   0%|          | 0.00/224M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.0M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]



In [8]:
# Read each language pair's saved scores back in and turn the F1 columns of
# the better/worse translations into one Kendall score per pair.
results = {}
scores_dir = config.out_path / config.metric_name
for lang_pair in config.lang_pairs:
    fn = f"{lang_pair}-{config.model_type}.csv"
    df = pd.read_csv(scores_dir / fn)
    results[lang_pair] = kendall_score(df["f1_better"], df["f1_worse"])

In [9]:
# Show the Kendall score obtained for each language pair.
results

{'cs-en': 0.37181996086105673,
 'de-en': 0.5323540373469047,
 'et-en': 0.3852717688334127,
 'fi-en': 0.28655419222903883,
 'ru-en': 0.3414071510957324,
 'tr-en': 0.2668621700879765,
 'zh-en': 0.2450759960428096}