This notebook translates German sentences to English by generating multiple hypotheses with various methods and reranking them with various scores.

# Dependencies

In [1]:
# path to the translation model and its vocabulary, in order to compute ALTI correctly. 
MODEL_DIR = '../model'
DATA_DIR = '../model/wmt18_de-en'
LASER_DIR = '../laser'
USE_GPU = True

In [2]:
import os
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16:8'
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
# this is for comet to behave

In [3]:
import torch

In [4]:
torch.use_deterministic_algorithms(False) # otherwise, comet complains
#!pip install unbabel-comet==1.1.2 --use-feature=2020-resolver
import comet

In [5]:
from fairseq.models.transformer import TransformerModel

Please install tensorboardX: pip install tensorboardX


In [6]:
from stopes.eval.alti.wrappers.transformer_wrapper import FairseqTransformerHub
from stopes.eval.alti.alti_metrics.alti_metrics_utils import compute_alti_metrics, compute_alti_nllb, get_loss

In [7]:
from stopes.modules.preprocess.laser_sentence_encoder import SentenceEncoder, spm

In [8]:
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [9]:
import gc

def cleanup():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [10]:
from tqdm.auto import tqdm, trange

In [11]:
import pandas as pd
import numpy as np
import json

In [12]:
from sacrebleu import CHRF

# Data

In [13]:
gt = pd.read_csv('../annotated_data/guerreiro2022_corpus_w_annotations.csv')

In [15]:
gt['any_mistake'] = 1 - gt.correctness
gt['any_detached'] = gt[['strong-unsupport', 'full-unsupport']].max(1)
gt['repeat_or_detached'] = gt[['repetitions', 'strong-unsupport', 'full-unsupport']].max(1)
gt['other_errors'] = gt['any_mistake']-gt['named-entities']-gt['omission']-gt['repeat_or_detached']
gt['error_class'] = gt['any_detached'] + gt['full-unsupport'] + gt['any_mistake']
gt['error_class'].value_counts()

0    2048
1    1074
2     164
3     129
Name: error_class, dtype: int64

Sample 400 source texts

In [19]:
smpl = gt.groupby('error_class').sample(100, random_state=1)

# Creating the translations

In [25]:
k = 10  # number of hypotheses; also beam size when appropriate

In [26]:
de2en = TransformerModel.from_pretrained(
    MODEL_DIR,
    checkpoint_file='checkpoint_best.pt',
    data_name_or_path=DATA_DIR,
    bpe='sentencepiece', 
    sentencepiece_model=MODEL_DIR + '/sentencepiece.joint.bpe.model'
)

loading archive file ../model
load_model_ensemble_and_task is_moe=False
Rank 0: Done reading from disk
[de] dictionary: 32032 types
[en] dictionary: 32032 types
Done loading state dict
{'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 10, 'log_format': None, 'log_file': None, 'tensorboard_logdir': None, 'wandb_project': 'mt-hallucinations', 'azureml_logging': False, 'seed': 42, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False,

In [27]:
de2en.cuda();

In [28]:
# Diverse translations of the data sample, key: list of lists of translation hypotheses.
smpl_diverse = {}

### Baseline translation

In [29]:
new_tran = [de2en.translate(t, beam=5) for t in tqdm(smpl.src)]

  0%|          | 0/400 [00:00<?, ?it/s]

In [30]:
key = 'default'
smpl_diverse[key] = [[mt] for mt in new_tran]

### Random sampling

In [31]:
key = 'sampling'
smpl_diverse[key] = []
for text in tqdm(smpl.src):
    options = []
    enc = [de2en.encode(text)]
    for _ in range(k):
        batched_hypos = de2en.generate(enc, sampling=True, beam=1)
        out_texts = [de2en.decode(h['tokens']) for h in batched_hypos[0]]
        options.append(out_texts[0])
    smpl_diverse[key].append(options)

  0%|          | 0/400 [00:00<?, ?it/s]

In [32]:
key = 'sampling_p08'
smpl_diverse[key] = []
for text in tqdm(smpl.src):
    options = []
    enc = [de2en.encode(text)]
    for _ in range(k):
        batched_hypos = de2en.generate(enc, sampling=True, sampling_topp=0.8, beam=1)
        out_texts = [de2en.decode(h['tokens']) for h in batched_hypos[0]]
        options.append(out_texts[0])
    smpl_diverse[key].append(options)

  0%|          | 0/400 [00:00<?, ?it/s]

### Beam search

In [33]:
key = 'beam_search'
smpl_diverse[key] = []
for text in tqdm(smpl.src):
    options = []
    enc = [de2en.encode(text)]
    batched_hypos = de2en.generate(enc, beam=k)
    out_texts = [de2en.decode(h['tokens']) for h in batched_hypos[0]]
    smpl_diverse[key].append(out_texts)

  0%|          | 0/400 [00:00<?, ?it/s]

In [34]:
key = 'beam_diversity_1'
smpl_diverse[key] = []
for text in tqdm(smpl.src):
    options = []
    enc = [de2en.encode(text)]
    batched_hypos = de2en.generate(enc, beam=k, diversity_rate=1.0)
    out_texts = [de2en.decode(h['tokens']) for h in batched_hypos[0]]
    smpl_diverse[key].append(out_texts)

  0%|          | 0/400 [00:00<?, ?it/s]

  final_beams = final_indices // k


In [35]:
key = 'beam_diversity_3'
smpl_diverse[key] = []
for text in tqdm(smpl.src):
    options = []
    enc = [de2en.encode(text)]
    batched_hypos = de2en.generate(enc, beam=k, diversity_rate=3.0)
    out_texts = [de2en.decode(h['tokens']) for h in batched_hypos[0]]
    smpl_diverse[key].append(out_texts)

  0%|          | 0/400 [00:00<?, ?it/s]

In [36]:
key = 'beam_diversity_10'
smpl_diverse[key] = []
for text in tqdm(smpl.src):
    options = []
    enc = [de2en.encode(text)]
    batched_hypos = de2en.generate(enc, beam=k, diversity_rate=10.0)
    out_texts = [de2en.decode(h['tokens']) for h in batched_hypos[0]]
    smpl_diverse[key].append(out_texts)

  0%|          | 0/400 [00:00<?, ?it/s]

In [37]:
key = 'beam_dbs_1'
smpl_diverse[key] = []
for text in tqdm(smpl.src):
    options = []
    enc = [de2en.encode(text)]
    batched_hypos = de2en.generate(enc, beam=k, diverse_beam_groups=k, diverse_beam_strength=1)
    out_texts = [de2en.decode(h['tokens']) for h in batched_hypos[0]]
    smpl_diverse[key].append(out_texts)

  0%|          | 0/400 [00:00<?, ?it/s]

In [38]:
key = 'beam_dbs_3'
smpl_diverse[key] = []
for text in tqdm(smpl.src):
    options = []
    enc = [de2en.encode(text)]
    batched_hypos = de2en.generate(enc, beam=k, diverse_beam_groups=k, diverse_beam_strength=3)
    out_texts = [de2en.decode(h['tokens']) for h in batched_hypos[0]]
    smpl_diverse[key].append(out_texts)

  0%|          | 0/400 [00:00<?, ?it/s]

In [39]:
key = 'beam_dbs_10'
smpl_diverse[key] = []
for text in tqdm(smpl.src):
    options = []
    enc = [de2en.encode(text)]
    batched_hypos = de2en.generate(enc, beam=k, diverse_beam_groups=k, diverse_beam_strength=10)
    out_texts = [de2en.decode(h['tokens']) for h in batched_hypos[0]]
    smpl_diverse[key].append(out_texts)

  0%|          | 0/400 [00:00<?, ?it/s]

### Dropout methods

In [40]:
for mn, m in de2en.named_modules():  # an easy way to randomize the model!
    if 'dropout' in mn:
        m.apply_during_inference = True

In [41]:
key = 'beam_dropout'
smpl_diverse[key] = []
for text in tqdm(smpl.src):
    options = []
    enc = [de2en.encode(text)]
    for _ in range(k):
        batched_hypos = de2en.generate(enc, beam=k, retain_dropout=True)
        out_texts = [de2en.decode(h['tokens']) for h in batched_hypos[0]]
        options.append(out_texts[0])
    smpl_diverse[key].append(options)

  0%|          | 0/400 [00:00<?, ?it/s]

In [42]:
key = 'greedy_dropout'
smpl_diverse[key] = []
for text in tqdm(smpl.src):
    options = []
    enc = [de2en.encode(text)]
    for _ in range(k):
        batched_hypos = de2en.generate(enc, beam=1, retain_dropout=True)
        out_texts = [de2en.decode(h['tokens']) for h in batched_hypos[0]]
        options.append(out_texts[0])
    smpl_diverse[key].append(options)

  0%|          | 0/400 [00:00<?, ?it/s]

In [43]:
for mn, m in de2en.named_modules():
    if 'dropout' in mn:
        m.apply_during_inference = False

# Scoring the hypotheses

Here is the mean quality of outputs

In [45]:
hypotheses_scores = {}

### By LABSE scores

In [46]:
labse = SentenceTransformer('sentence-transformers/LaBSE')
labse.cuda();

Load pretrained SentenceTransformer: sentence-transformers/LaBSE
Use pytorch device: cuda


In [47]:
def score_pair(src, trg):
    embs = labse.encode([src, trg], show_progress_bar=False)
    return embs[0].dot(embs[1])

In [48]:
def argmax(values, criterion):
    best = -np.infty
    candidate = None
    for v in values:
        score = criterion(v)
        if score > best:
            best = score
            candidate = v
    return candidate

In [49]:
hypotheses_scores['LABSE'] = {
    k: [[score_pair(x, smpl.iloc[i].src) for x in hyps] for i, hyps in enumerate(vs)]
    for k, vs in tqdm(smpl_diverse.items(), total=len(smpl_diverse))
}

  0%|          | 0/12 [00:00<?, ?it/s]

In [50]:
labse.to('cpu')
cleanup();

### By COMET-QE

In [51]:
model_path = comet.download_model("wmt20-comet-qe-da-v2")
model = comet.load_from_checkpoint(model_path)

wmt20-comet-qe-da-v2 is already in cache.
Created a temporary directory at /tmp/tmpfh1qnfdl
Writing /tmp/tmpfh1qnfdl/_remote_module_non_scriptable.py
Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Encoder model frozen.


In [52]:
def score_pair(src, trg):
    seg_scores, sys_score = model.predict([{'src': src, 'mt': trg}], batch_size=8, gpus=0)
    # with 0 gpus, this is actually faster
    return seg_scores[0]

print(score_pair('hallo Welt', 'hello world'))
print(score_pair('hello world', 'hallo Welt'))
print(score_pair('hallo Welt', 'halo over my head'))
print(score_pair('halo over my head', 'hallo Welt'))

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.89it/s]
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(


0.7208645939826965


Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.07it/s]
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(


0.8370150923728943


Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.00it/s]
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(


-0.4274933338165283


Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.93it/s]

-0.10516911745071411





In [53]:
def score_pairs(src, trg, batch_size=8, gpus=0):
    seg_scores, sys_score = model.predict(
        [{'src': s, 'mt': t} for s, t in zip(src, trg)], 
        batch_size=batch_size, 
        gpus=gpus,
    )
    # with 0 gpus, this is actually faster
    return seg_scores


def get_scores_batched(hyp_sets, sources, **kwargs):
    srcs = []
    tgts = []
    ids = []
    for i, (hyp_set, src) in enumerate(zip(hyp_sets, sources)):
        id_old = len(srcs)
        for mt in hyp_set:
            srcs.append(src)
            tgts.append(mt)
        ids.append((id_old, len(srcs)))
    scores = score_pairs(srcs, tgts, **kwargs)
    results = []
    for i, (start, end) in enumerate(ids):
        results.append(scores[start:end])
    return results

In [54]:
hypotheses_scores['COMET-QE'] = {
    k: get_scores_batched(vs, smpl.src.tolist(), gpus=1)
    for k, vs in tqdm(smpl_diverse.items(), total=len(smpl_diverse))
}

  0%|          | 0/12 [00:00<?, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0:   2%|███▋                                                                                                                                                                                    | 1/50 [00:00<00:02, 17.84it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:03<00:00, 13.18it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0:   0%|▎                                                                                                                                                                                      | 1/500 [00:00<00:40, 12.37it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:38<00:00, 12.92it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0:   0%|▎                                                                                                                                                                                      | 1/500 [00:00<00:29, 16.71it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:33<00:00, 15.08it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0:   0%|▎                                                                                                                                                                                      | 1/500 [00:00<00:27, 18.13it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:31<00:00, 15.92it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0:   0%|▎                                                                                                                                                                                      | 1/500 [00:00<00:28, 17.22it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:30<00:00, 16.29it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0:   0%|▎                                                                                                                                                                                      | 1/500 [00:00<00:35, 14.04it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:30<00:00, 16.34it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible


Predicting DataLoader 0:   0%|▎                                                                                                                                                                                      | 1/500 [00:00<00:28, 17.54it/s]

	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:31<00:00, 16.04it/s]

	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:31<00:00, 15.94it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0:   0%|▎                                                                                                                                                                                      | 1/500 [00:00<00:28, 17.57it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:29<00:00, 16.67it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0:   0%|▎                                                                                                                                                                                      | 1/500 [00:00<00:27, 17.84it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:29<00:00, 16.84it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0:   0%|▎                                                                                                                                                                                      | 1/500 [00:00<00:30, 16.11it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:30<00:00, 16.35it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0:   0%|▎                                                                                                                                                                                      | 1/500 [00:00<00:27, 18.02it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:30<00:00, 16.44it/s]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0:   0%|▎                                                                                                                                                                                      | 1/500 [00:00<00:27, 17.91it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:31<00:00, 15.93it/s]


### By LASER2

In [57]:
spm_tokenizer = spm.SentencePieceProcessor()
spm_tokenizer.Load(LASER_DIR + '/laser2.spm')

laser_encoder = SentenceEncoder(
    LASER_DIR + '/laser2.pt',
    max_sentences=None,
    max_tokens=None,
    spm_vocab=LASER_DIR + '/laser2.pt',
)

In [58]:
def encode_sents(sents):
    tokenized_sents = [
        " ".join(spm_tokenizer.EncodeAsPieces(sent))
        for sent in sents
    ]
    emb = laser_encoder.encode_sentences(tokenized_sents)
    return emb / ((emb**2).sum(1, keepdims=True) ** 0.5)

In [59]:
def score_pair(src, trg):
    embs = encode_sents([src, trg])
    return embs[0].dot(embs[1])

print(score_pair('hallo Welt', 'hello world'))
print(score_pair('hallo Welt', 'halo over my hed'))

0.8148769
0.67164296


In [60]:
hypotheses_scores['LASER2'] = {
    k: [[score_pair(x, smpl.iloc[i].src) for x in hyps] for i, hyps in enumerate(vs)]
    for k, vs in tqdm(smpl_diverse.items(), total=len(smpl_diverse))
}

  0%|          | 0/12 [00:00<?, ?it/s]

In [61]:
laser_encoder.encoder.to('cpu');
cleanup();

### By ALTI+ 

In [62]:
hub = FairseqTransformerHub(cfg=de2en.cfg, models=de2en.models, task=de2en.task)
hub.cuda();

In [63]:
def score_pair(src, trg):
    with torch.inference_mode():
        alti = compute_alti_nllb(hub, src, trg)
    scores = compute_alti_metrics(*alti)
    return scores['avg_sc']

print(score_pair('hallo Welt', 'hello world'))
print(score_pair('hallo Welt', 'halo over my head'))

0.73475236
0.4881617


In [64]:
hypotheses_scores['ALTI_avg_sc'] = {
    k: [[score_pair(x, smpl.iloc[i].src) for x in hyps] for i, hyps in enumerate(vs)]
    for k, vs in tqdm(smpl_diverse.items(), total=len(smpl_diverse))
}

  0%|          | 0/12 [00:00<?, ?it/s]

In [65]:
hub.to('cpu');
cleanup()

### By XNLI

In [66]:
mname = 'joeddav/xlm-roberta-large-xnli'

model = AutoModelForSequenceClassification.from_pretrained(mname).cuda()
tokenizer = AutoTokenizer.from_pretrained(mname)

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [67]:
def get_clf_scores(texts1, texts2, batch_size=32, label='entailment', verbose=True):
    scores = []
    t = trange if verbose else range
    for i in t(0, len(texts1), batch_size):
        xx, yy = texts1[i:i+batch_size], texts2[i:i+batch_size]
        with torch.inference_mode():
            inputs = tokenizer(xx, yy, truncation=True, padding=True, return_tensors='pt').to(model.device)
            proba = torch.softmax(model(**inputs).logits, -1)[:, model.config.label2id[label]].cpu().numpy()
        scores.append(proba)
    scores = np.concatenate(scores)
    return scores

def get_nli_scores(texts1, texts2, verbose=True):
    return get_clf_scores(texts1, texts2, verbose=verbose) * get_clf_scores(texts2, texts1, verbose=verbose)

In [68]:
def score_pair(src, trg):
    return get_nli_scores([src], [trg], verbose=False)[0]

print(score_pair('hallo Welt', 'hello world'))
print(score_pair('hallo Welt', 'halo over my head'))

0.99302286
0.049959093


In [69]:
hypotheses_scores['XNLI'] = {
    k: [[score_pair(x, smpl.iloc[i].src) for x in hyps] for i, hyps in enumerate(vs)]
    for k, vs in tqdm(smpl_diverse.items(), total=len(smpl_diverse))
}

  0%|          | 0/12 [00:00<?, ?it/s]

In [70]:
model.to('cpu')
cleanup();

#### By ref-ChrF++ (oracle)

In [71]:
chrfpp = CHRF(word_order=2)

hypotheses_scores['ref_chrf'] = {
    k: [[chrfpp.sentence_score(x, [smpl.iloc[i].ref]).score for x in hyps] for i, hyps in enumerate(vs)]
    for k, vs in tqdm(smpl_diverse.items(), total=len(smpl_diverse))
}


  0%|          | 0/12 [00:00<?, ?it/s]

### Now compute the selections based on the hypotheses

In [74]:
selections = {
    score_method: {
        gen_method: [
            hyps[np.argmax(hypotheses_scores[score_method][gen_method][i])]
            for i, hyps in enumerate(hyps_list)
        ]
        for gen_method, hyps_list in smpl_diverse.items()
    } 
    for score_method in hypotheses_scores
}

### The reference

In [75]:
selections['ref'] = {
    k: smpl.ref.tolist()
    for k, vs in tqdm(smpl_diverse.items(), total=len(smpl_diverse))
}

  0%|          | 0/12 [00:00<?, ?it/s]

### By default 

In [76]:
selections['first'] = {
    k: [hyps[0] for i, hyps in enumerate(vs)]
    for k, vs in tqdm(smpl_diverse.items(), total=len(smpl_diverse))
}

  0%|          | 0/12 [00:00<?, ?it/s]

In [77]:
len(selections['first']['sampling'])

400

the baseline (default translation) corresponds to taking the first hypothesis from beam search.

# Evaluate the selections

### src-NLI

In [78]:
mname = 'joeddav/xlm-roberta-large-xnli'

model = AutoModelForSequenceClassification.from_pretrained(mname).cuda()
tokenizer = AutoTokenizer.from_pretrained(mname)

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [79]:
def get_clf_scores(texts1, texts2, batch_size=32, label='entailment', verbose=True):
    scores = []
    t = trange if verbose else range
    for i in t(0, len(texts1), batch_size):
        xx, yy = texts1[i:i+batch_size], texts2[i:i+batch_size]
        with torch.inference_mode():
            inputs = tokenizer(xx, yy, truncation=True, padding=True, return_tensors='pt').to(model.device)
            proba = torch.softmax(model(**inputs).logits, -1)[:, model.config.label2id[label]].cpu().numpy()
        scores.append(proba)
    scores = np.concatenate(scores)
    return scores

def get_nli_scores(texts1, texts2, verbose=True):
    return get_clf_scores(texts1, texts2, verbose=verbose) * get_clf_scores(texts2, texts1, verbose=verbose)

In [80]:
sel_src_nli_raw = {
    selector: {
        sampler: get_nli_scores(sampled, smpl.src.tolist(), verbose=False).tolist()#.mean()
        for sampler, sampled in by_sampler.items()
    }
    for selector, by_sampler in tqdm(selections.items())
}

  0%|          | 0/8 [00:00<?, ?it/s]

In [81]:
sel_src_nli = {k1: {k2: np.mean(v2) for k2, v2 in v1.items()} for k1, v1 in sel_src_nli_raw.items()}

In [82]:
bl_nli = get_nli_scores(smpl.mt.tolist(), smpl.src.tolist())
bl_nli.mean()

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

0.512619

### src-ref-COMET

In [84]:
model_path = comet.download_model("wmt20-comet-da")
model = comet.load_from_checkpoint(model_path)

wmt20-comet-da is already in cache.
Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Encoder model frozen.


In [85]:
smpl_diverse.keys()

dict_keys(['default', 'sampling', 'sampling_p08', 'beam_search', 'beam_diversity_1', 'beam_diversity_3', 'beam_diversity_10', 'beam_dbs_1', 'beam_dbs_3', 'beam_dbs_10', 'beam_dropout', 'greedy_dropout'])

In [86]:
data_for_comet = pd.DataFrame([
    {'mt': hyp, 'src': smpl.src.iloc[i], 'ref': smpl.ref.iloc[i]}
    for gen, by_gen in smpl_diverse.items()
    for i, hyps in enumerate(by_gen)
    for hyp in hyps
] + [
    {'mt': mt, 'src': smpl.src.iloc[i], 'ref': smpl.ref.iloc[i]}
    for selector, by_sampler in selections.items()
    for sampler, sampled in by_sampler.items()
    for i, mt in enumerate(sampled)
])
print(data_for_comet.shape)
data_for_comet = data_for_comet.drop_duplicates().reset_index(drop=True)
print(data_for_comet.shape)

(82800, 3)
(30117, 3)


In [87]:
seg_scores_comet_ref, sys_score_comet_ref = model.predict(
    data_for_comet.to_dict('records'), batch_size=32, gpus=1
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...


Predicting DataLoader 0:   0%|                                                                                                                                                                                               | 0/942 [00:00<?, ?it/s]

	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 942/942 [05:11<00:00,  3.03it/s]

	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 942/942 [05:11<00:00,  3.02it/s]


In [88]:
texts2comet = {}
for i, row in data_for_comet.iterrows():
    texts2comet[(row.src, row.mt, row.ref)] =  seg_scores_comet_ref[i] 

Add scores for all the hypotheses

In [89]:
hypotheses_scores['COMET'] = {
    gen_method: [
        [texts2comet[(smpl.src.iloc[i], hyp, smpl.ref.iloc[i])] for hyp in hyps]
        for i, hyps in enumerate(by_gen)
    ]
    for gen_method, by_gen in smpl_diverse.items()
}

In [90]:
for selector, by_sampler in selections.items():
    for sampler, sampled in by_sampler.items():
        for i, mt in enumerate(sampled):
            _ = texts2comet[(smpl.src.iloc[i], mt, smpl.ref.iloc[i])]

In [91]:
sel_comet_raw = {
    selector: {
        sampler: [
            texts2comet[(smpl.src.iloc[i], mt, smpl.ref.iloc[i])] 
            for i, mt in enumerate(sampled)
        ]
        for sampler, sampled in by_sampler.items()
    }
    for selector, by_sampler in selections.items()
}

In [92]:
sel_comet = {k1: {k2: np.mean(v2) for k2, v2 in v1.items()} for k1, v1 in sel_comet_raw.items()}

In [93]:
sel_comet = {k1: {k2: np.mean(v2) for k2, v2 in v1.items()} for k1, v1 in sel_comet_raw.items()}

# Save the results

In [96]:
os.makedirs('../computed_data', exist_ok=True)
with open('../computed_data/diverse-decoding-results.json', 'w') as f:
    json.dump({
        'data': smpl.to_dict(orient='records'),
        'candidates': smpl_diverse,
        'candidate_scores': {k1: {k2: 
                                  [[float(h) for h in hl] for hl in v2] 
                                  for k2, v2 in v1.items()
                                 } for k1, v1 in hypotheses_scores.items()},
        'selections': selections,
        'nli_scores': sel_src_nli_raw, 
        'comet_scores': sel_comet_raw
    }, f, ensure_ascii=False, indent=2)