# WMT18

* To English
* Segment level data
* Relative ranking (better translation vs. worse translation)
* Kendall score

In [None]:
# install dependencies

!pip -q install datasets
!pip -q install bert_score
!pip -q install git+https://github.com/google-research/bleurt.git
!pip -q install unbabel-comet
!pip -q install transformers
!pip -q install POT

[K     |████████████████████████████████| 365 kB 4.7 MB/s 
[K     |████████████████████████████████| 115 kB 68.9 MB/s 
[K     |████████████████████████████████| 101 kB 13.7 MB/s 
[K     |████████████████████████████████| 141 kB 71.9 MB/s 
[K     |████████████████████████████████| 212 kB 62.5 MB/s 
[K     |████████████████████████████████| 596 kB 53.2 MB/s 
[K     |████████████████████████████████| 127 kB 49.9 MB/s 
[K     |████████████████████████████████| 60 kB 3.3 MB/s 
[K     |████████████████████████████████| 4.7 MB 10.6 MB/s 
[K     |████████████████████████████████| 6.6 MB 54.3 MB/s 
[K     |████████████████████████████████| 352 kB 4.7 MB/s 
[K     |████████████████████████████████| 1.3 MB 63.8 MB/s 
[?25h  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 64 kB 2.0 MB/s 
[K     |████████████████████████████████| 101 kB 5.7 MB/s 
[K     |████████████████████████████████| 9.5 MB 57.3 MB/s 
[K     |█████████████████

In [None]:
# Clone the geneval repository into the working directory; later cells import
# `geneval.geneval...` from this checkout.
!git clone https://github.com/drehero/geneval

Cloning into 'geneval'...
remote: Enumerating objects: 508, done.[K
remote: Counting objects: 100% (158/158), done.[K
remote: Compressing objects: 100% (100/100), done.[K
remote: Total 508 (delta 77), reused 128 (delta 50), pack-reused 350[K
Receiving objects: 100% (508/508), 44.38 MiB | 13.83 MiB/s, done.
Resolving deltas: 100% (221/221), done.
Checking out files: 100% (176/176), done.


In [None]:
# Mount Google Drive (Colab only) at /content/drive so that files can be
# written to and read from a folder beneath it.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import pathlib

import datasets
import numpy as np
import pandas as pd

from geneval.geneval.data.wmt import WMT18
from geneval.geneval.utils import kendall_score

In [None]:
from geneval.reproducion.configs import baryscore_config as config

In [None]:
# Show every public (non-underscore) setting of the selected metric config.
public_settings = {
    name: value
    for name, value in vars(config).items()
    if not name.startswith("_")
}
for name, value in public_settings.items():
    print(name, ": ", value)

metric_name :  baryscore
metric_path :  ./geneval/geneval/metrics/baryscore/baryscore.py
uses_reference :  True
uses_source :  False
score_name :  baryscore_W
load_args :  {}
compute_args :  {'model_type': 'bert-base-uncased', 'batch_size': 128, 'last_layers': 5, 'use_idfs': True, 'sinkhorn_ref': 0.01}


In [None]:
# Root folder for the result CSVs; the cells below use one sub-folder per
# metric name beneath it.
out_path = pathlib.Path("/content/drive/MyDrive/results/wmt18")
# Language pairs to evaluate (all of them translate into English).
lang_pairs = ["cs-en", "de-en", "et-en", "fi-en", "ru-en", "tr-en", "zh-en"]

In [None]:
# Load the metric from the script path given in the config, forwarding the
# config's load arguments.
# NOTE(review): `datasets.load_metric` is deprecated in newer `datasets`
# releases in favour of the separate `evaluate` library — confirm that the
# installed version still provides it.
scorer = datasets.load_metric(config.metric_path, **config.load_args)

In [None]:
def _metric_input(wmt, i, sent):
    """Return the complete metric input for translation ``sent`` of pair ``i``.

    The result is a ``(translation, reference, source)`` tuple in which the
    reference / source slot is ``None`` when the configured metric does not
    consume it. It is used as a dictionary key, so this assumes translations,
    references and sources are hashable (e.g. strings) — TODO confirm.
    """
    return (
        sent,
        wmt.references[i] if config.uses_reference else None,
        wmt.sources[i] if config.uses_source else None,
    )


# Make sure the metric's result folder exists before anything is written;
# `DataFrame.to_csv` does not create missing parent directories.
out_dir = out_path / config.metric_name
out_dir.mkdir(parents=True, exist_ok=True)

for lang_pair in lang_pairs:
    # load data
    wmt = WMT18(lang_pair, root="/tmp")

    # prepare input
    # Every distinct (translation, reference, source) combination is scored
    # exactly once. Keying on the translation text alone would be wrong: the
    # same string can be the translation of different segments and would then
    # be scored against the reference of whichever segment came first.
    # A dict keeps insertion order and gives constant-time membership tests.
    inputs = {}
    for translations in (wmt.translations_better, wmt.translations_worse):
        for i, sent in enumerate(translations):
            inputs.setdefault(_metric_input(wmt, i, sent), None)
    inputs = list(inputs)
    sents = [inp[0] for inp in inputs]

    # compute scores
    args = config.compute_args.copy()
    if config.uses_reference:
        args["references"] = [inp[1] for inp in inputs]
    if config.uses_source:
        args["sources"] = [inp[2] for inp in inputs]

    scores = scorer.compute(predictions=sents, **args)
    input2score = dict(zip(
        inputs,
        scores[config.score_name] if config.score_name is not None else scores
    ))

    # save scores
    # NOTE: the column name "translations_worse" (plural) is kept as-is so that
    # already written files and their readers stay compatible.
    df = pd.DataFrame({
        "translation_better": wmt.translations_better,
        "translations_worse": wmt.translations_worse,
        "reference": wmt.references,
        "source": wmt.sources,
        "score_better": [
            input2score[_metric_input(wmt, i, sent)]
            for i, sent in enumerate(wmt.translations_better)
        ],
        "score_worse": [
            input2score[_metric_input(wmt, i, sent)]
            for i, sent in enumerate(wmt.translations_worse)
        ]
    })
    # file name: language pair plus, when available, the model / config name
    if "model_type" in args.keys():
        fn = f"{lang_pair}-{args['model_type'].split('/')[-1]}.csv"
    elif "config_name" in config.load_args.keys():
        fn = f"{lang_pair}-{config.load_args['config_name'].split('/')[-1]}.csv"
    else:
        fn = f"{lang_pair}.csv"
    df.to_csv(out_dir / fn, index=False)

Downloading wmt18-metrics-task-package.tgz:   0%|          | 0.00/224M [00:00<?, ?B/s]

Downloading wmt18-metrics-task-nohybrids.tgz:   0%|          | 0.00/46.0M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]

  v = b / KtransposeU
  v = b / KtransposeU
  u = 1. / nx.dot(Kp, v)
  u = 1. / nx.dot(Kp, v)
Evaluating batch: 100%|██████████| 128/128 [00:32<00:00,  3.91it/s]
Evaluating batch: 100%|██████████| 128/128 [00:38<00:00,  3.33it/s]
Evaluating batch: 100%|██████████| 128/128 [00:33<00:00,  3.77it/s]
Evaluating batch: 100%|██████████| 128/128 [00:36<00:00,  3.54it/s]
Evaluating batch: 100%|██████████| 128/128 [00:36<00:00,  3.52it/s]
Evaluating batch: 100%|██████████| 128/128 [00:33<00:00,  3.80it/s]
Evaluating batch: 100%|██████████| 128/128 [00:34<00:00,  3.68it/s]
Evaluating batch: 100%|██████████| 128/128 [00:32<00:00,  3.97it/s]
Evaluating batch: 100%|██████████| 128/128 [00:34<00:00,  3.73it/s]
Evaluating batch: 100%|██████████| 128/128 [00:41<00:00,  3.10it/s]
Evaluating batch: 100%|██████████| 128/128 [00:35<00:00,  3.64it/s]
Evaluating batch: 100%|██████████| 128/128 [00:35<00:00,  3.60it/s]
Evaluating batch: 100%|██████████| 128/128 [00:35<00:00,  3.64it/s]
Evaluating batch: 100%

In [None]:
# Read the saved per-pair scores back from disk and compute the Kendall score
# for every language pair.
results = {}
for lang_pair in lang_pairs:
    # rebuild the file name under which the scores of this pair were saved
    if "model_type" in config.compute_args:
        fn = f"{lang_pair}-{config.compute_args['model_type'].split('/')[-1]}.csv"
    elif "config_name" in config.load_args:
        fn = f"{lang_pair}-{config.load_args['config_name'].split('/')[-1]}.csv"
    else:
        fn = f"{lang_pair}.csv"

    csv_file = out_path / config.metric_name / fn
    df = pd.read_csv(csv_file)

    better_scores = df["score_better"].to_list()
    worse_scores = df["score_worse"].to_list()
    results[lang_pair] = kendall_score(better_scores, worse_scores)

In [None]:
# Kendall score per language pair.
# NOTE(review): every stored value below is negative — presumably a lower
# baryscore_W means a better translation, so the sign may need to be flipped
# before comparing against other metrics; confirm.
results

{'cs-en': -0.3573385518590998,
 'de-en': -0.5203505931038028,
 'et-en': -0.37959485904691387,
 'fi-en': -0.28578732106339466,
 'ru-en': -0.32391387927720106,
 'tr-en': -0.26146627565982405,
 'zh-en': -0.25059207962346736}