This notebook computes various quality metrics for the German-English translation dataset annotated with error types (https://github.com/deep-spin/hallucinations-in-nmt). 

Before running it, make sure that you have installed the requirements and downloaded the translation model and the LASER2 sentence encoder (see `../README.md`).

The results are stored in `../computed_data/detection_metrics.tsv` and later analyzed in the notebook `02_Detection_analysis.ipynb`.

# Dependencies

In [1]:
# Path to the translation model and its vocabulary, in order to compute ALTI correctly.
# MODEL_DIR is also where the SentencePiece model of the translation system is looked up.
MODEL_DIR = '../model'
# Passed as `data_name_or_path` when the translation model is loaded.
DATA_DIR = '../model/wmt18_de-en'
# Directory with the LASER2 sentence encoder files (see `../README.md`).
LASER_DIR = '../laser'
# When True, the models are moved to the GPU before inference.
USE_GPU = True

In [1]:
import os
# cuBLAS workspace configuration; this is for comet to behave.
# The variable used to be assigned twice in a row (':16:8', then ':4096:8');
# only the last assignment ever took effect, so the dead one is dropped.
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

In [4]:
import torch

In [29]:
# Deterministic algorithms are switched off explicitly: otherwise, comet complains.
torch.use_deterministic_algorithms(False)
# Installation hint, kept disabled on purpose:
#!pip install unbabel-comet==1.1.2 --use-feature=2020-resolver
import comet

In [19]:
from stopes.eval.alti.wrappers.transformer_wrapper import FairseqTransformerHub
from stopes.eval.alti.alti_metrics.alti_metrics_utils import compute_alti_metrics, compute_alti_nllb, get_loss

In [36]:
from stopes.modules.preprocess.laser_sentence_encoder import SentenceEncoder, spm

In [62]:
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [9]:
import gc

def cleanup():
    """Run the garbage collector and, if CUDA is available, empty the CUDA cache."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

In [10]:
from tqdm.auto import tqdm, trange

In [64]:
import pandas as pd
import numpy as np

In [57]:
from sacrebleu import CHRF

In [103]:
from sklearn.metrics import roc_auc_score
from scipy.stats import spearmanr

# Data

In [14]:
# Annotated translations: one row per sentence pair, with error-type annotation columns.
gt = pd.read_csv('../annotated_data/guerreiro2022_corpus_w_annotations.csv')

In [15]:
# Aggregated targets derived from the individual annotation columns.
# any_mistake: complement of the `correctness` flag.
gt['any_mistake'] = 1 - gt['correctness']
# any_detached: row-wise maximum over the two "unsupport" columns.
gt['any_detached'] = gt[['strong-unsupport', 'full-unsupport']].max(axis=1)
# repeat_or_detached: same, with the `repetitions` column included.
gt['repeat_or_detached'] = gt[['repetitions', 'strong-unsupport', 'full-unsupport']].max(axis=1)

Compute the most severe error for each translation

In [99]:
# Error types ordered from the most to the least severe.
error_rank = ['full-unsupport', 'strong-unsupport', 'repetitions', 'omission', 'named-entities']
# All classes: the ranked errors, then the two fallback labels.
error_classes = error_rank + ['other_error', 'correct']


def get_most_important_error(row):
    """Return the label of the most severe error annotated for one translation.

    `row` is indexed by the names in `error_rank` and by 'correctness'.
    The first truthy entry in `error_rank` order wins. If none is set, the
    result is 'correct' when `row['correctness']` is truthy, else 'other_error'.
    """
    first_flagged = next((name for name in error_rank if row[name]), None)
    if first_flagged is not None:
        return first_flagged
    return error_classes[-1] if row['correctness'] else error_classes[-2]

In [100]:
# Label every translation (row) with its most severe error class.
gt['error_class'] = gt.apply(get_most_important_error, axis=1)

# Compute metrics

### ALTI

In [8]:
# Load the translation model through the ALTI wrapper; the SentencePiece model
# is read from the same directory as the checkpoint.
hub = FairseqTransformerHub.from_pretrained(
    MODEL_DIR,
    checkpoint_file="checkpoint_best.pt",
    data_name_or_path=DATA_DIR,
    bpe='sentencepiece',
    sentencepiece_model=f'{MODEL_DIR}/sentencepiece.joint.bpe.model',
)
# Display the device the freshly loaded model lives on.
hub.device

2022-12-19 07:25:51 | INFO | fairseq.file_utils | loading archive file ../model
2022-12-19 07:25:51 | INFO | fairseq.checkpoint_utils | load_model_ensemble_and_task is_moe=False
2022-12-19 07:25:54 | INFO | fairseq.checkpoint_utils | Rank 0: Done reading from disk
2022-12-19 07:25:55 | INFO | fairseq.tasks.translation | [de] dictionary: 32032 types
2022-12-19 07:25:55 | INFO | fairseq.tasks.translation | [en] dictionary: 32032 types
2022-12-19 07:25:55 | INFO | fairseq.checkpoint_utils | Done loading state dict
2022-12-19 07:25:55 | INFO | fairseq.models.fairseq_model | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 10, 'log_format': None, 'log_file': None, 'tensorboard_logdir': None, 'wandb_project': 'mt-hallucinations', 'azureml_logging': False, 'seed': 42, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': False, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_win

device(type='cpu')

In [16]:
# Move the translation model to the GPU only when requested in the config cell.
if USE_GPU:
    hub.cuda()

On GPU, the computation is fast: 3415 sentence pairs are processed in about 5 minutes. 

In [18]:
# One ALTI result per (source, translation) pair, in the row order of `gt`.
computed_alti = [
    compute_alti_nllb(hub, row.src, row.mt)
    for _, row in tqdm(gt.iterrows(), total=gt.shape[0])
]

  0%|          | 0/3415 [00:00<?, ?it/s]

Look at an example of ALTI source-target contributions

In [20]:
# Each ALTI result is a 4-tuple; show the contributions for the third sentence pair.
alti_matrix, source_sentence, target_sentence, predicted_sentence = computed_alti[2]
# Keep only the first len(source_sentence) columns of the matrix;
# rows are labelled with the predicted tokens, columns with the source tokens.
pd.DataFrame(
    data=alti_matrix[:, :len(source_sentence)],
    columns=source_sentence,
    index=predicted_sentence,
).style.background_gradient()

Unnamed: 0,▁Lassen,▁Sie,▁mich,▁zunächst,▁meine,▁Ver,legenheit,▁zum,▁Ausdruck,▁bringen,.,Unnamed: 12
▁Let,0.18723,0.054757,0.152082,0.17953,0.065776,0.033765,0.050423,0.012787,0.059994,0.035203,0.011735,0.003673
▁me,0.177234,0.036178,0.172876,0.06896,0.029418,0.008563,0.023114,0.009291,0.050244,0.029518,0.009452,0.003174
▁begin,0.105086,0.026848,0.118675,0.232698,0.048564,0.018145,0.051057,0.013129,0.07683,0.040446,0.009672,0.00244
▁by,0.067121,0.020307,0.075587,0.233973,0.06066,0.012521,0.029689,0.008087,0.04076,0.022088,0.007429,0.00183
▁expressing,0.061585,0.016918,0.060454,0.104339,0.054063,0.028572,0.095226,0.014208,0.113486,0.052795,0.00743,0.001544
▁my,0.035415,0.009602,0.031961,0.041206,0.162535,0.049221,0.167154,0.009681,0.091974,0.037068,0.006908,0.001144
▁embar,0.023746,0.006623,0.022936,0.024855,0.112323,0.109145,0.401771,0.00585,0.041487,0.018518,0.005279,0.000874
r,0.011822,0.003594,0.009987,0.009004,0.036264,0.056879,0.237397,0.002786,0.013612,0.007376,0.001992,0.00029
ass,0.010152,0.003053,0.008315,0.007592,0.026204,0.036452,0.145966,0.002191,0.011672,0.006172,0.001527,0.000253
ment,0.011504,0.003347,0.010321,0.009686,0.038061,0.046663,0.196906,0.002677,0.01547,0.007741,0.002025,0.000322


In [19]:
# NOTE(review): this cell repeats the previous one verbatim; the displayed values
# differ from the earlier run only in the last decimal place. Consider removing one copy.
alti_matrix, source_sentence, target_sentence, predicted_sentence = computed_alti[2]
# Only the first len(source_sentence) columns are shown, labelled with the source tokens.
pd.DataFrame(
    alti_matrix[:, :len(source_sentence)], 
    index=predicted_sentence, columns=source_sentence,
).style.background_gradient()

Unnamed: 0,▁Lassen,▁Sie,▁mich,▁zunächst,▁meine,▁Ver,legenheit,▁zum,▁Ausdruck,▁bringen,.,Unnamed: 12
▁Let,0.18723,0.054757,0.152082,0.17953,0.065776,0.033765,0.050423,0.012787,0.059994,0.035203,0.011735,0.003673
▁me,0.177234,0.036178,0.172876,0.06896,0.029418,0.008563,0.023113,0.009291,0.050243,0.029518,0.009452,0.003174
▁begin,0.105086,0.026849,0.118675,0.232698,0.048564,0.018145,0.051057,0.013128,0.07683,0.040446,0.009672,0.00244
▁by,0.06712,0.020307,0.075587,0.233973,0.06066,0.012521,0.029689,0.008087,0.04076,0.022088,0.007429,0.001829
▁expressing,0.061585,0.016918,0.060454,0.104339,0.054063,0.028572,0.095226,0.014207,0.113486,0.052795,0.00743,0.001544
▁my,0.035415,0.009602,0.03196,0.041206,0.162535,0.049221,0.167154,0.00968,0.091974,0.037068,0.006908,0.001144
▁embar,0.023746,0.006623,0.022936,0.024855,0.112323,0.109145,0.401772,0.005849,0.041486,0.018517,0.005279,0.000874
r,0.011822,0.003594,0.009987,0.009004,0.036264,0.056879,0.237397,0.002786,0.013612,0.007376,0.001992,0.00029
ass,0.010152,0.003053,0.008315,0.007592,0.026204,0.036452,0.145966,0.002191,0.011672,0.006172,0.001527,0.000253
ment,0.011504,0.003347,0.010321,0.009686,0.038061,0.046663,0.196906,0.002677,0.01547,0.007741,0.002025,0.000322


In [72]:
# Turn every stored ALTI result into one row of aggregate metrics.
alti_token_metrics = pd.DataFrame([
    compute_alti_metrics(*alti_output)
    for alti_output in tqdm(computed_alti)
])

  0%|          | 0/3415 [00:00<?, ?it/s]

In [73]:
# Sanity check: both frames should have the same number of rows.
alti_token_metrics.shape, gt.shape

((3415, 18), (3415, 13))

### Log probability

In [23]:
# Translation-model loss for every (source, translation) pair, in the row order of `gt`.
mt_losses = [
    get_loss(hub, row.src, row.mt)
    for _, row in tqdm(gt.iterrows(), total=gt.shape[0])
]

  0%|          | 0/3415 [00:00<?, ?it/s]

In [24]:
# The translation model is no longer needed on the GPU: move it back and free cached memory.
hub.to('cpu')
cleanup()

### Comet

In [27]:
# Download (or reuse from cache) the COMET checkpoint that is applied to (src, mt) pairs only, then load it.
model_path = comet.download_model("wmt20-comet-qe-da-v2")
model = comet.load_from_checkpoint(model_path)

2022-12-19 07:37:09 | INFO | comet.download_utils | wmt20-comet-qe-da-v2 is already in cache.
Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2022-12-19 07:37:18 | INFO | comet.models.base | Encoder model frozen.


In [33]:
# Score every (src, mt) pair without a reference; returns segment-level scores and a system-level score.
seg_scores_comet, sys_score_comet = model.predict(
    gt[['src', 'mt']].to_dict('records'), batch_size=8, gpus=int(USE_GPU)
)

2022-12-19 07:39:52 | INFO | pytorch_lightning.utilities.rank_zero | GPU available: True, used: True
2022-12-19 07:39:52 | INFO | pytorch_lightning.utilities.rank_zero | TPU available: False, using: 0 TPU cores
2022-12-19 07:39:52 | INFO | pytorch_lightning.utilities.rank_zero | IPU available: False, using: 0 IPUs
2022-12-19 07:39:52 | INFO | pytorch_lightning.utilities.rank_zero | HPU available: False, using: 0 HPUs
2022-12-19 07:39:53 | INFO | pytorch_lightning.accelerators.gpu | LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 427/427 [00:28<00:00, 14.85it/s]


#### COMET with reference

In [34]:
# Download (or reuse from cache) the COMET checkpoint that also takes the reference, then load it.
# NOTE: `model` and `model_path` are rebound here, replacing the previously loaded COMET model.
model_path = comet.download_model("wmt20-comet-da")
model = comet.load_from_checkpoint(model_path)

2022-12-19 07:40:24 | INFO | comet.download_utils | wmt20-comet-da is already in cache.
Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2022-12-19 07:40:31 | INFO | comet.models.base | Encoder model frozen.


In [35]:
# Score every (src, mt, ref) triple; returns segment-level scores and a system-level score.
seg_scores_comet_ref, sys_score_comet_ref = model.predict(
    gt[['src', 'mt', 'ref']].to_dict('records'), batch_size=8, gpus=int(USE_GPU)
)

2022-12-19 07:40:32 | INFO | pytorch_lightning.utilities.rank_zero | GPU available: True, used: True
2022-12-19 07:40:32 | INFO | pytorch_lightning.utilities.rank_zero | TPU available: False, using: 0 TPU cores
2022-12-19 07:40:32 | INFO | pytorch_lightning.utilities.rank_zero | IPU available: False, using: 0 IPUs
2022-12-19 07:40:32 | INFO | pytorch_lightning.utilities.rank_zero | HPU available: False, using: 0 HPUs
2022-12-19 07:40:32 | INFO | pytorch_lightning.accelerators.gpu | LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting DataLoader 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 427/427 [00:42<00:00, 10.16it/s]


### LASER

In [38]:
# Load the SentencePiece model used to tokenize text for the LASER encoder.
# The path is built with os.path.join: LASER_DIR has no trailing separator, so
# plain string concatenation produced '../laserlaser2.spm'.
spm_tokenizer = spm.SentencePieceProcessor()
spm_tokenizer.Load(os.path.join(LASER_DIR, 'laser2.spm'))

True

In [39]:
# Build the LASER sentence encoder.
# Paths are built with os.path.join: LASER_DIR has no trailing separator, so
# plain string concatenation produced '../laserlaser2.pt'.
# NOTE(review): `spm_vocab` used to point at the model checkpoint (laser2.pt);
# it now points at the vocabulary file, assumed to be named laser2.cvocab —
# confirm the file name against the download instructions in ../README.md.
laser_encoder = SentenceEncoder(
    os.path.join(LASER_DIR, 'laser2.pt'),
    max_sentences=None,
    max_tokens=None,
    spm_vocab=os.path.join(LASER_DIR, 'laser2.cvocab'),
)

In [45]:
def encode_sents(sents):
    """Embed sentences with the LASER encoder and scale each embedding to unit L2 norm.

    Every sentence is split into SentencePiece pieces, and the pieces are joined
    with single spaces before being handed to `laser_encoder.encode_sentences`.
    """
    pieces_per_sentence = map(spm_tokenizer.EncodeAsPieces, sents)
    tokenized_sents = [" ".join(pieces) for pieces in pieces_per_sentence]
    emb = laser_encoder.encode_sentences(tokenized_sents)
    # Row-wise Euclidean norms, kept 2-D so that the division broadcasts per row.
    norms = (emb**2).sum(1, keepdims=True) ** 0.5
    return emb / norms

In [46]:
%%time
# Normalized LASER embeddings of the source sentences.
emb_src = encode_sents(gt.src.tolist())

CPU times: user 18.9 s, sys: 36.1 ms, total: 18.9 s
Wall time: 18.9 s


In [47]:
# Normalized LASER embeddings of the translations.
emb_mt = encode_sents(gt.mt.tolist())

In [48]:
# Row-wise dot product of the normalized embeddings, i.e. their cosine similarity.
laser_sims = (emb_src * emb_mt).sum(axis=1)
laser_sims.shape

(3415,)

### LaBSE

In [81]:
# !pip install sentence_transformers

In [52]:
# Load the LaBSE sentence encoder and, if requested, move it to the GPU.
labse = SentenceTransformer('sentence-transformers/LaBSE')
if USE_GPU:
    labse.cuda()

2022-12-19 07:43:43 | INFO | sentence_transformers.SentenceTransformer | Load pretrained SentenceTransformer: sentence-transformers/LaBSE
2022-12-19 07:43:48 | INFO | sentence_transformers.SentenceTransformer | Use pytorch device: cuda


In [53]:
# LaBSE embeddings of the source sentences.
labse_emb_src = labse.encode(gt.src.tolist(), show_progress_bar=True)

Batches:   0%|          | 0/107 [00:00<?, ?it/s]

In [54]:
# LaBSE embeddings of the translations.
labse_emb_mt = labse.encode(gt.mt.tolist(), show_progress_bar=True)

Batches:   0%|          | 0/107 [00:00<?, ?it/s]

In [55]:
# Row-wise dot product of source and translation embeddings.
# NOTE(review): this equals cosine similarity only if the embeddings are unit-normalized — confirm.
labse_sims = (labse_emb_src * labse_emb_mt).sum(axis=1)
labse_sims.shape

(3415,)

In [56]:
# Move LaBSE back to the CPU; the trailing semicolon suppresses the cell output.
labse.to('cpu');

### NLI models

In [60]:
# NLI checkpoints to evaluate; the commented-out entries are alternatives that are currently disabled.
nli_models = [
    'joeddav/xlm-roberta-large-xnli',   # use only the first one
    #'MoritzLaurer/mDeBERTa-v3-base-mnli-xnli', 
    #'vicgalle/xlm-roberta-large-xnli-anli',
]

In [65]:
batch_size = 32
nli_scores = {}


def _entailment_probas(model, tokenizer, first_texts, second_texts, batch_size):
    """Return the 'entailment' probability for every (first, second) text pair.

    The two lists are aligned element-wise and processed in batches of
    `batch_size`. The result is a 1-D numpy array with one probability per pair.
    """
    entailment_id = model.config.label2id['entailment']
    probas = []
    for start in trange(0, len(first_texts), batch_size):
        stop = start + batch_size
        with torch.inference_mode():
            inputs = tokenizer(
                first_texts[start:stop], second_texts[start:stop],
                truncation=True, padding=True, return_tensors='pt',
            ).to(model.device)
            logits = model(**inputs).logits
            probas.append(torch.softmax(logits, -1)[:, entailment_id].cpu().numpy())
    return np.concatenate(probas)


src_texts = gt.src.tolist()
mt_texts = gt.mt.tolist()

for mname in nli_models:
    print(mname)
    model = AutoModelForSequenceClassification.from_pretrained(mname)
    if USE_GPU:
        model.cuda()
    cleanup()
    tokenizer = AutoTokenizer.from_pretrained(mname)

    # forward scores: the source is the first segment, the translation the second
    nli_scores[f'nli_f_{mname}'] = _entailment_probas(model, tokenizer, src_texts, mt_texts, batch_size)
    # backward scores: the translation is the first segment, the source the second
    nli_scores[f'nli_b_{mname}'] = _entailment_probas(model, tokenizer, mt_texts, src_texts, batch_size)
    # their product
    nli_scores[f'nli_bf_{mname}'] = nli_scores[f'nli_b_{mname}'] * nli_scores[f'nli_f_{mname}']

    # Offload each model as soon as it has been scored; this used to run only once,
    # after the loop, and therefore only for the last model.
    model.to('cpu')
    cleanup()

joeddav/xlm-roberta-large-xnli


Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/107 [00:00<?, ?it/s]

  0%|          | 0/107 [00:00<?, ?it/s]

### ChrF++

In [66]:
# ChrF++ scorer (character n-grams plus word n-grams up to order 2).
chrfpp = CHRF(word_order=2)

# Sentence-level score of every translation against its single reference.
ref_chrfpp = [
    chrfpp.sentence_score(hypothesis, [reference]).score
    for hypothesis, reference in tqdm(zip(gt.mt, gt.ref), total=gt.shape[0])
]

  0%|          | 0/3415 [00:00<?, ?it/s]

# Save the data

In [75]:
# Gather all computed metrics side by side, one row per translation.
# Quality-like scores are flipped (negated, or subtracted from 1 for COMET) so that a
# larger value means a more suspicious translation; the losses are kept as they are.
# NOTE: the 'laser_sim' and 'labse_sim' columns therefore hold NEGATED similarities,
# despite their names.
all_metrics = pd.concat([
    pd.DataFrame(mt_losses).add_prefix('mt_log_'),
    pd.DataFrame({
        'comet_qa_neg': 1 - np.array(seg_scores_comet), 
        'comet_ref_neg': 1 - np.array(seg_scores_comet_ref),
        'ref_chrfpp_neg': -np.array(ref_chrfpp),
        'laser_sim': -laser_sims, 
        'labse_sim': -labse_sims, 
    }),
    -pd.DataFrame(nli_scores),
    -alti_token_metrics.add_prefix('alti_t_'),
], axis=1)

In [101]:
# Annotations and metrics in one frame; the two are joined column-wise on their row index.
detection_data = pd.concat([gt, all_metrics], axis=1)

In [102]:
# Persist the annotations together with the metrics for the analysis notebook.
os.makedirs('../computed_data', exist_ok=True)
# `index` is a boolean flag: pass False explicitly instead of relying on None being falsy.
detection_data.to_csv('../computed_data/detection_metrics.tsv', sep='\t', index=False)

# Evaluate the metrics

In [76]:
# Binary annotation columns against which each metric is evaluated as a detector.
target_columns = ['any_mistake', 'repeat_or_detached', 'any_detached', 'full-unsupport']

In [77]:
# Let pandas display up to 100 rows so that the metric tables below are not truncated.
pd.set_option('display.max_rows', 100)

In [80]:
# ROC AUC of every metric (rows) as a detector of every target label (columns).
aucs = pd.DataFrame({
    target: all_metrics.apply(lambda scores: roc_auc_score(gt[target], scores))
    for target in target_columns
})
aucs.sort_values('repeat_or_detached', ascending=False)

Unnamed: 0,any_mistake,repeat_or_detached,any_detached,full-unsupport
labse_sim,0.759625,0.917212,0.942578,0.98473
nli_bf_joeddav/xlm-roberta-large-xnli,0.728699,0.909248,0.932571,0.986676
nli_b_joeddav/xlm-roberta-large-xnli,0.725535,0.89999,0.924351,0.971583
nli_f_joeddav/xlm-roberta-large-xnli,0.661679,0.880811,0.905899,0.985456
alti_t_top_sc_mean,0.694632,0.865731,0.874418,0.972283
alti_t_avg_sc,0.616799,0.849173,0.870443,0.986556
alti_t_sc_above_50,0.618442,0.847574,0.87122,0.983288
alti_t_avg_sc_wo_lang,0.594139,0.847189,0.873031,0.987469
alti_t_avg_sc_wo_eos,0.600112,0.844565,0.867781,0.986171
comet_ref_neg,0.777755,0.834186,0.840474,0.877104


In [81]:
# Spearman rank correlation between every metric (rows) and every target label (columns).
spearmans = pd.DataFrame({
    target: all_metrics.apply(lambda scores: spearmanr(gt[target], scores).correlation)
    for target in target_columns
})
spearmans.sort_values('repeat_or_detached', ascending=False)

Unnamed: 0,any_mistake,repeat_or_detached,any_detached,full-unsupport
labse_sim,0.440652,0.422939,0.429378,0.320131
nli_bf_joeddav/xlm-roberta-large-xnli,0.388162,0.414866,0.41967,0.321417
nli_b_joeddav/xlm-roberta-large-xnli,0.382793,0.405481,0.411695,0.311449
nli_f_joeddav/xlm-roberta-large-xnli,0.274413,0.386038,0.393793,0.320611
alti_t_top_sc_mean,0.330342,0.370751,0.363251,0.311911
alti_t_avg_sc,0.198239,0.353966,0.359395,0.321337
alti_t_sc_above_50,0.201429,0.35305,0.360869,0.319818
alti_t_avg_sc_wo_lang,0.159779,0.351955,0.361906,0.32194
alti_t_avg_sc_wo_eos,0.169916,0.349295,0.356812,0.321083
comet_ref_neg,0.471423,0.338773,0.330319,0.249052
