# TransQuest reproduction


In [1]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 33.2 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 64.0 MB/s 
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 67.7 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 13.4 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting fsspec[http]>=2021.11.1
  Downloading fsspec-2022.7.1-py3-none-any.whl (141 kB)
[K     |████████████████████████████████| 141 kB 66.3 MB/s 
Collecting pyyaml>=5.1
  Down

In [2]:
!pip install transquest

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transquest
  Downloading transquest-1.1.1-py3-none-any.whl (211 kB)
[K     |████████████████████████████████| 211 kB 34.4 MB/s 
[?25hCollecting tensorboardx
  Downloading tensorboardX-2.5.1-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 66.4 MB/s 
Collecting transformers>=4.2.0
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 17.7 MB/s 
[?25hCollecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 59.4 MB/s 
[?25hCollecting onnxruntime
  Downloading onnxruntime-1.12.1-cp37-cp37m-manylinux_2_27_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 58.0 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manyl

In [None]:
# Fetch the geneval repository into ./geneval; the import cell below loads the
# WMT data loaders and the TransQuest metric from this checkout.
!git clone https://github.com/drehero/geneval

Cloning into 'geneval'...
remote: Enumerating objects: 503, done.[K
remote: Counting objects: 100% (153/153), done.[K
remote: Compressing objects: 100% (97/97), done.[K
remote: Total 503 (delta 75), reused 124 (delta 49), pack-reused 350[K
Receiving objects: 100% (503/503), 44.38 MiB | 14.12 MiB/s, done.
Resolving deltas: 100% (219/219), done.
Checking out files: 100% (176/176), done.


In [None]:
import os

import datasets
import numpy as np
import pandas as pd
import torch
from scipy.stats import pearsonr
from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQuestModel

from geneval.geneval.data.wmt import WMT17, WMT18
from geneval.geneval.metrics.TransQuest.transquest import TransQuest

def kendall_score(scores_better, scores_worse):
    """Kendall-tau-like agreement between metric scores and pairwise preferences.

    Position ``i`` of the two inputs describes one comparison. A comparison
    counts as concordant when ``scores_better[i] > scores_worse[i]`` and as
    discordant otherwise, so ties count against the metric.

    Args:
        scores_better: Scores of the items expected to score higher.
        scores_worse: Scores of the items expected to score lower; must have
            the same shape as ``scores_better``.

    Returns:
        ``(concordant - discordant) / total`` as a float in ``[-1, 1]``, or
        ``nan`` when there are no comparisons.

    Raises:
        ValueError: If the two inputs differ in shape.
    """
    better = np.asarray(scores_better)
    worse = np.asarray(scores_worse)
    # Without this check NumPy would broadcast e.g. a length-1 input against a
    # longer one and silently return a meaningless score.
    if better.shape != worse.shape:
        raise ValueError(
            "scores_better and scores_worse must have the same shape, "
            f"got {better.shape} and {worse.shape}"
        )
    total = better.size
    if total == 0:
        # No comparisons: the score is undefined; avoid dividing by zero.
        return float("nan")
    correct = int(np.count_nonzero(better > worse))
    incorrect = total - correct
    return (correct - incorrect) / total


In [None]:
# Instantiate the TransQuest metric class imported from geneval; the output of
# this cell shows the model files being downloaded when it runs.
transquest = TransQuest()

Downloading config.json:   0%|          | 0.00/721 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.09G [00:00<?, ?B/s]

Downloading sentencepiece.bpe.model:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

  f"use_multiprocessing automatically disabled as {model_type}"


## WMT18

In [None]:
# Evaluate on WMT18: for every language pair, score the `translations_better`
# and the `translations_worse` hypotheses against the references and summarise
# how often the former scores higher with kendall_score.
results = {}
for lang_pair in ["cs-en", "de-en", "et-en", "fi-en", "ru-en", "tr-en", "zh-en"]:
    print(f"Started language pair {lang_pair}...")
    # load data
    wmt = WMT18(lang_pair, root="/tmp")

    # compute scores
    # Named `pairs_*` rather than `input` so the builtin input() is not
    # shadowed for the rest of the kernel session.
    pairs_better = [list(pair) for pair in zip(wmt.translations_better, wmt.references)]
    scores_better, _ = transquest.predict(pairs_better)
    print("     ...scores_better completed,")

    pairs_worse = [list(pair) for pair in zip(wmt.translations_worse, wmt.references)]
    scores_worse, _ = transquest.predict(pairs_worse)
    print("     ...scores_worse completed,")

    ks = kendall_score(scores_better, scores_worse)
    print(f"     ...ks = {ks}")
    results[lang_pair] = ks

# Summary of the per-language-pair scores.
for key, value in results.items():
    print(f"{key}: {value}")

Started language pair cs-en...


Downloading wmt18-metrics-task-package.tgz:   0%|          | 0.00/224M [00:00<?, ?B/s]

Downloading wmt18-metrics-task-nohybrids.tgz:   0%|          | 0.00/46.0M [00:00<?, ?B/s]

  0%|          | 0/5110 [00:00<?, ?it/s]

  0%|          | 0/639 [00:00<?, ?it/s]

     ...scores_better completed,


  0%|          | 0/5110 [00:00<?, ?it/s]

  0%|          | 0/639 [00:00<?, ?it/s]

     ...scores_worse completed,
     ...ks = 0.31350293542074364
Started language pair de-en...


  0%|          | 0/77811 [00:00<?, ?it/s]

  0%|          | 0/9727 [00:00<?, ?it/s]

     ...scores_better completed,


  0%|          | 0/77811 [00:00<?, ?it/s]

  0%|          | 0/9727 [00:00<?, ?it/s]

     ...scores_worse completed,
     ...ks = 0.5088612149953091
Started language pair et-en...


  0%|          | 0/56721 [00:00<?, ?it/s]

  0%|          | 0/7091 [00:00<?, ?it/s]

     ...scores_better completed,


  0%|          | 0/56721 [00:00<?, ?it/s]

  0%|          | 0/7091 [00:00<?, ?it/s]

     ...scores_worse completed,
     ...ks = 0.3798416812115442
Started language pair fi-en...


  0%|          | 0/15648 [00:00<?, ?it/s]

  0%|          | 0/1956 [00:00<?, ?it/s]

     ...scores_better completed,


  0%|          | 0/15648 [00:00<?, ?it/s]

  0%|          | 0/1956 [00:00<?, ?it/s]

     ...scores_worse completed,
     ...ks = 0.29690695296523517
Started language pair ru-en...


  0%|          | 0/10404 [00:00<?, ?it/s]

  0%|          | 0/1301 [00:00<?, ?it/s]

     ...scores_better completed,


  0%|          | 0/10404 [00:00<?, ?it/s]

  0%|          | 0/1301 [00:00<?, ?it/s]

     ...scores_worse completed,
     ...ks = 0.3064206074586697
Started language pair tr-en...


  0%|          | 0/8525 [00:00<?, ?it/s]

  0%|          | 0/1066 [00:00<?, ?it/s]

     ...scores_better completed,


  0%|          | 0/8525 [00:00<?, ?it/s]

  0%|          | 0/1066 [00:00<?, ?it/s]

     ...scores_worse completed,
     ...ks = 0.286099706744868
Started language pair zh-en...


  0%|          | 0/33357 [00:00<?, ?it/s]

  0%|          | 0/4170 [00:00<?, ?it/s]

     ...scores_better completed,


  0%|          | 0/33357 [00:00<?, ?it/s]

  0%|          | 0/4170 [00:00<?, ?it/s]

     ...scores_worse completed,
     ...ks = 0.23536289234643404
cs-en: 0.31350293542074364
de-en: 0.5088612149953091
et-en: 0.3798416812115442
fi-en: 0.29690695296523517
ru-en: 0.3064206074586697
tr-en: 0.286099706744868
zh-en: 0.23536289234643404


## WMT17

In [None]:
# Load the multilingual direct-assessment MonoTransQuest checkpoint for the
# WMT17 runs. The former `transquest = TransQuest()` line was dropped: it was
# overwritten by the very next statement, so it was a redundant instantiation.
transquest = MonoTransQuestModel(
    "xlmroberta",
    "TransQuest/monotransquest-da-multilingual",
    num_labels=1,
    # Use the GPU when one is available rather than requiring it.
    use_cuda=torch.cuda.is_available(),
)

In [None]:
# Score every WMT17 language pair and write translations, references, sources,
# human scores and metric scores to one CSV per pair in the working directory.
lang_pairs = ["cs-en", "de-en", "fi-en", "lv-en", "ru-en", "tr-en", "zh-en"]
for lang_pair in lang_pairs:
    print(f"Started language pair {lang_pair}...", end="")
    # load data
    wmt = WMT17(lang_pair)
    # compute scores
    # Named `pairs` rather than `input` so the builtin input() is not
    # shadowed for the rest of the kernel session.
    pairs = [list(pair) for pair in zip(wmt.translations, wmt.references)]
    scores, _ = transquest.predict(pairs)
    df = pd.DataFrame({
        "translation": wmt.translations,
        "references": wmt.references,
        "source": wmt.sources,
        "human": wmt.scores,
        "metric_score": scores
    })
    # The correlation cell reads these files back by the same name.
    df.to_csv(os.path.join(os.getcwd(), lang_pair + ".csv"), index=False)
    print("completed.")

Started language pair cs-en...

  0%|          | 0/541 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

completed.
Started language pair de-en...

  0%|          | 0/534 [00:00<?, ?it/s]

  0%|          | 0/67 [00:00<?, ?it/s]

completed.
Started language pair fi-en...

  0%|          | 0/551 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

completed.
Started language pair lv-en...

  0%|          | 0/550 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

completed.
Started language pair ru-en...

  0%|          | 0/535 [00:00<?, ?it/s]

  0%|          | 0/67 [00:00<?, ?it/s]

completed.
Started language pair tr-en...

  0%|          | 0/540 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

completed.
Started language pair zh-en...

  0%|          | 0/542 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

completed.


In [None]:
# Pearson correlation between the stored metric scores and the human scores,
# computed from the per-language-pair CSV files in the working directory.
results = {}
for lang_pair in lang_pairs:
    csv_path = os.path.join(os.getcwd(), lang_pair + ".csv")
    df = pd.read_csv(csv_path)
    # pearsonr returns (statistic, p-value); only the statistic is kept.
    corr, _ = pearsonr(df["metric_score"], df["human"])
    results[lang_pair] = corr
    print(f"{lang_pair}: {corr}")

cs-en: 0.5701709408765274
de-en: 0.599961716641377
fi-en: 0.6233656958468294
lv-en: 0.6566967224619693
ru-en: 0.644420831738664
tr-en: 0.708126653291465
zh-en: 0.735795582408577
