In [1]:
# install dependencies

!pip -q install datasets
!pip -q install bert_score
!pip -q install git+https://github.com/google-research/bleurt.git
!pip -q install unbabel-comet
!pip -q install transformers
!pip -q install POT

In [2]:
!git clone https://github.com/drehero/geneval

fatal: destination path 'geneval' already exists and is not an empty directory.


In [3]:
from google.colab import drive

# Mount Google Drive; the results directory configured below lives under it.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pathlib

import datasets
import numpy as np
import pandas as pd

from geneval.geneval.data.wmt import WMT19
from geneval.geneval.utils import kendall_score

In [5]:
from geneval.reproduction.configs import frugalscore_config as config

In [6]:
# Show every public (non-underscore) setting of the metric config module.
public_settings = {
    name: value
    for name, value in vars(config).items()
    if not name.startswith("_")
}
for name, value in public_settings.items():
    print(name, ": ", value)

metric_name :  frugalscore
metric_path :  frugalscore
uses_reference :  True
uses_source :  False
score_name :  scores
load_args :  {'config_name': 'moussaKam/frugalscore_tiny_bert-base_bert-score'}
compute_args :  {'max_length': 512, 'batch_size': 128, 'device': 'gpu'}


In [7]:
# Directory (on the mounted Drive) below which the per-metric score files are stored.
out_path = pathlib.Path("/content/drive/MyDrive") / "results" / "wmt19"

# Language pairs to evaluate; every pair translates into English.
lang_pairs = [f"{src_lang}-en" for src_lang in ("de", "fi", "gu", "kk", "lt", "ru")]

In [8]:
# Instantiate the metric named in the config; load_args supplies the extra
# keyword arguments for the loader (for this config, the `config_name`).
metric_load_kwargs = dict(config.load_args)
scorer = datasets.load_metric(config.metric_path, **metric_load_kwargs)

Downloading builder script:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/324 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [9]:
# Score every translation of every language pair with `scorer` and write one
# CSV per language pair containing the better/worse translations, their
# reference and source, and the two metric scores.
for lang_pair in lang_pairs:
    # load data
    wmt = WMT19(lang_pair, root="/tmp")

    # prepare input
    # Each distinct scoring input is sent to the metric only once. An input is
    # identified by the hypothesis TOGETHER WITH the reference/source the metric
    # consumes: the same hypothesis text can occur with different references,
    # and must then be scored against each of them separately.
    input_index = {}  # (hypothesis, reference, source) -> position in `sents`
    sents, refs, srcs = [], [], []
    row_positions = {"better": [], "worse": []}  # per data row: position in `sents`
    for side, translations in (
        ("better", wmt.translations_better),
        ("worse", wmt.translations_worse),
    ):
        for i, sent in enumerate(translations):
            # Fields the metric does not use are None, so they never split inputs.
            ref = wmt.references[i] if config.uses_reference else None
            src = wmt.sources[i] if config.uses_source else None
            key = (sent, ref, src)
            if key not in input_index:
                input_index[key] = len(sents)
                sents.append(sent)
                refs.append(ref)
                srcs.append(src)
            row_positions[side].append(input_index[key])

    # compute scores
    args = config.compute_args.copy()
    if config.uses_reference:
        args["references"] = refs
    if config.uses_source:
        args["sources"] = srcs

    scores = scorer.compute(predictions=sents, **args)
    # Some metrics return a mapping holding the scores under `score_name`;
    # with score_name None the return value itself is taken as the scores.
    metric_scores = list(
        scores[config.score_name] if config.score_name is not None else scores
    )
    if len(metric_scores) != len(sents):
        raise ValueError(
            f"{config.metric_name} returned {len(metric_scores)} scores "
            f"for {len(sents)} inputs ({lang_pair})"
        )

    # save scores
    # NOTE: the column name "translations_worse" (plural) is kept unchanged so
    # the files keep the same schema as previously written results.
    df = pd.DataFrame({
        "translation_better": wmt.translations_better,
        "translations_worse": wmt.translations_worse,
        "reference": wmt.references,
        "source": wmt.sources,
        "score_better": [metric_scores[pos] for pos in row_positions["better"]],
        "score_worse": [metric_scores[pos] for pos in row_positions["worse"]]
    })
    # The file name carries the model/checkpoint name when the config has one.
    if "model_type" in args:
        fn = f"{lang_pair}-{args['model_type'].split('/')[-1]}.csv"
    elif "config_name" in config.load_args:
        fn = f"{lang_pair}-{config.load_args['config_name'].split('/')[-1]}.csv"
    else:
        fn = f"{lang_pair}.csv"
    # to_csv does not create missing directories, so make sure the target exists.
    out_dir = out_path / config.metric_name
    out_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_dir / fn, index=False)

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/18 [00:00<?, ?ba/s]

In [10]:
# The score-file suffix depends only on the config, so resolve it once.
if "model_type" in config.compute_args:
    model_suffix = "-" + config.compute_args["model_type"].split("/")[-1]
elif "config_name" in config.load_args:
    model_suffix = "-" + config.load_args["config_name"].split("/")[-1]
else:
    model_suffix = ""

# Read each language pair's saved scores back and pass the better/worse score
# lists to kendall_score, collecting one result per language pair.
scores_dir = out_path / config.metric_name
results = {}
for lang_pair in lang_pairs:
    fn = lang_pair + model_suffix + ".csv"
    df = pd.read_csv(scores_dir / fn)
    better, worse = df["score_better"].to_list(), df["score_worse"].to_list()
    results[lang_pair] = kendall_score(better, worse)

In [11]:
# Display the value returned by kendall_score for each language pair.
results

{'de-en': 0.10202073449305922,
 'fi-en': 0.2895366543397868,
 'gu-en': 0.2448533068125311,
 'kk-en': 0.3209292763157895,
 'lt-en': 0.30701674137773305,
 'ru-en': 0.163354411321891}