This notebook runs the same experiments as `03_Mitigation.ipynb`, but with fewer generation methods and a larger pool of hypotheses (50 instead of 10).

# Dependencies

In [None]:
# Paths to the translation model and its vocabulary (needed to compute ALTI correctly),
# and to the LASER resources.
MODEL_DIR = '../model'  # checkpoint_best.pt and the sentencepiece model are loaded from here
DATA_DIR = '../model/wmt18_de-en'  # passed as `data_name_or_path` when the translation model is loaded
LASER_DIR = '../laser'  # laser2.spm and laser2.pt are loaded from here
# NOTE(review): USE_GPU is not read anywhere in the visible notebook; the models below are
# moved to the GPU with unconditional .cuda() calls.
USE_GPU = True

In [None]:
import os
# cuBLAS workspace configuration, placed in the environment before torch is imported
# in the next cell. Set exactly once so it is obvious which value is in effect.
# (this is for comet to behave)
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

In [None]:
import torch

In [None]:
# Deterministic algorithms are explicitly switched off before comet is imported.
torch.use_deterministic_algorithms(False) # otherwise, comet complains
# Installation command kept for reference (pinned comet version):
#!pip install unbabel-comet==1.1.2 --use-feature=2020-resolver
import comet

In [None]:
from fairseq.models.transformer import TransformerModel

In [None]:
from stopes.eval.alti.wrappers.transformer_wrapper import FairseqTransformerHub
from stopes.eval.alti.alti_metrics.alti_metrics_utils import compute_alti_metrics, compute_alti_nllb, get_loss

In [None]:
from stopes.modules.preprocess.laser_sentence_encoder import SentenceEncoder, spm

In [None]:
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [None]:
import gc

def cleanup():
    """Run the garbage collector and, if CUDA is available, empty the CUDA cache."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

In [None]:
from tqdm.auto import tqdm, trange

In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
from sacrebleu import CHRF

# Data

In [None]:
# Annotated corpus. Columns read later in this notebook include `correctness`, the
# error-type columns, and the `src` / `mt` / `ref` texts.
gt = pd.read_csv('../annotated_data/guerreiro2022_corpus_w_annotations.csv')

In [None]:
# Derived label columns.
# NOTE(review): the arithmetic below assumes the annotation columns are 0/1 flags — confirm.
gt['any_mistake'] = 1 - gt.correctness
# row-wise maximum over the two "unsupport" columns
gt['any_detached'] = gt[['strong-unsupport', 'full-unsupport']].max(1)
# row-wise maximum over repetitions and the two "unsupport" columns
gt['repeat_or_detached'] = gt[['repetitions', 'strong-unsupport', 'full-unsupport']].max(1)
# what is left of any_mistake after subtracting the named error categories
gt['other_errors'] = gt['any_mistake']-gt['named-entities']-gt['omission']-gt['repeat_or_detached']
# sum of three indicators; under the 0/1 assumption this gives severity levels 0..3,
# which are the groups used for stratified sampling below
gt['error_class'] = gt['any_detached'] + gt['full-unsupport'] + gt['any_mistake']
# display the size of each class
gt['error_class'].value_counts()

Sample 400 source texts (100 from each error class)

In [None]:
# 100 rows from every error_class group; random_state is fixed so the sample is repeatable.
smpl = gt.groupby('error_class').sample(100, random_state=1)

# Creating the translations

In [None]:
MAX_BEAM_SIZE = 10  # beam width of each dropout-based generation call
MAX_HYP_NUMBER = 50  # number of sampling / dropout draws per source, and beam width of the beam-search variants

In [None]:
# Translation model: checkpoint and sentencepiece model come from MODEL_DIR,
# the data directory from DATA_DIR.
de2en = TransformerModel.from_pretrained(
    MODEL_DIR,
    checkpoint_file='checkpoint_best.pt',
    data_name_or_path=DATA_DIR,
    bpe='sentencepiece',
    sentencepiece_model=f'{MODEL_DIR}/sentencepiece.joint.bpe.model',
)

In [None]:
# Move the translation model to the GPU; the trailing `;` suppresses the cell output.
de2en.cuda();

In [None]:
# Diverse translations of the data sample.
# key: name of the generation method; value: one list of translation hypotheses per sampled source.
smpl_diverse = {}

### Baseline translation

In [None]:
# One translation per sampled source text, produced with beam size 5.
new_tran = [de2en.translate(t, beam=5) for t in tqdm(smpl.src)]

In [None]:
key = 'default'
# each baseline translation is wrapped in a one-element list so that this entry
# has the same list-of-lists layout as the other generation methods
smpl_diverse[key] = [[mt] for mt in new_tran]

### Random sampling

In [None]:
key = 'sampling_p08'
# Sampling with sampling_topp=0.8: MAX_HYP_NUMBER separate generate calls per source text,
# keeping the first decoded hypothesis of each call.
smpl_diverse[key] = []
for text in tqdm(smpl.src):
    enc = [de2en.encode(text)]
    options = []
    for _ in range(MAX_HYP_NUMBER):
        hypos = de2en.generate(enc, sampling=True, sampling_topp=0.8, beam=1)[0]
        decoded = [de2en.decode(hypo['tokens']) for hypo in hypos]
        options.append(decoded[0])
    smpl_diverse[key].append(options)

### Beam search

In [None]:
key = 'beam_search'
# Plain beam search: a single generate call with beam=MAX_HYP_NUMBER per source text.
# generate receives a one-element list, so batched_hypos[0] holds the hypotheses of that
# single input; all of them are decoded and kept.
smpl_diverse[key] = []
for text in tqdm(smpl.src):
    enc = [de2en.encode(text)]
    batched_hypos = de2en.generate(enc, beam=MAX_HYP_NUMBER)
    smpl_diverse[key].append([de2en.decode(h['tokens']) for h in batched_hypos[0]])

In [None]:
key = 'beam_diversity_1'
# Beam search with diversity_rate=1.0: a single generate call with beam=MAX_HYP_NUMBER per
# source text; every hypothesis returned for the single input is decoded and kept.
smpl_diverse[key] = []
for text in tqdm(smpl.src):
    enc = [de2en.encode(text)]
    batched_hypos = de2en.generate(enc, beam=MAX_HYP_NUMBER, diversity_rate=1.0)
    smpl_diverse[key].append([de2en.decode(h['tokens']) for h in batched_hypos[0]])

In [None]:
key = 'beam_dbs_3'
# Diverse beam search: the number of groups equals the beam size (MAX_HYP_NUMBER) and the
# diversity strength is 3; every hypothesis returned for the single input is decoded and kept.
smpl_diverse[key] = []
for text in tqdm(smpl.src):
    enc = [de2en.encode(text)]
    batched_hypos = de2en.generate(
        enc,
        beam=MAX_HYP_NUMBER,
        diverse_beam_groups=MAX_HYP_NUMBER,
        diverse_beam_strength=3,
    )
    smpl_diverse[key].append([de2en.decode(h['tokens']) for h in batched_hypos[0]])

### Dropout methods

In [None]:
# an easy way to randomize the model: set `apply_during_inference` on every module
# whose qualified name contains 'dropout'
for module_name, module in de2en.named_modules():
    if 'dropout' not in module_name:
        continue
    module.apply_during_inference = True

In [None]:
key = 'beam_dropout'
# Dropout-randomised beam search: MAX_HYP_NUMBER separate generate calls per source text,
# each with beam=MAX_BEAM_SIZE and retain_dropout=True.
smpl_diverse[key] = []
for text in tqdm(smpl.src):
    options = []
    enc = [de2en.encode(text)]
    for _ in range(MAX_HYP_NUMBER):
        batched_hypos = de2en.generate(enc, beam=MAX_BEAM_SIZE, retain_dropout=True)
        # only the first returned hypothesis of each pass is kept, so only that one is
        # decoded rather than the whole beam
        options.append(de2en.decode(batched_hypos[0][0]['tokens']))
    smpl_diverse[key].append(options)

In [None]:
# reset `apply_during_inference` on every module whose qualified name contains 'dropout'
for module_name, module in de2en.named_modules():
    if 'dropout' not in module_name:
        continue
    module.apply_during_inference = False

# Scoring the hypotheses

Here is the mean quality of outputs

In [None]:
# scoring method -> generation method -> per-source list of per-hypothesis scores
hypotheses_scores = {}

### By LABSE scores

In [None]:
# LaBSE sentence encoder, moved to the GPU (the trailing `;` suppresses the cell output).
labse = SentenceTransformer('sentence-transformers/LaBSE')
labse.cuda();

In [None]:
def score_pair(src, trg):
    """Return the dot product of the LaBSE embeddings of `src` and `trg`."""
    src_emb, trg_emb = labse.encode([src, trg], show_progress_bar=False)
    return src_emb.dot(trg_emb)

In [None]:
def argmax(values, criterion):
    """Return the element of `values` for which `criterion(element)` is highest.

    Args:
        values: iterable of candidates.
        criterion: callable that maps a candidate to a comparable score.

    Returns:
        The best candidate. Ties go to the earliest candidate because the comparison
        is strict. None is returned when `values` is empty or when no score exceeds
        negative infinity.
    """
    # float('-inf') is used instead of `-np.infty`: that alias was removed in NumPy 2.0.
    best = float('-inf')
    candidate = None
    for v in values:
        score = criterion(v)
        if score > best:
            best = score
            candidate = v
    return candidate

In [None]:
# LaBSE similarity between every hypothesis and its source text.
# The score is a dot product, so the argument order of score_pair does not affect it.
hypotheses_scores['LABSE'] = {
    gen_method: [
        [score_pair(hyp, source) for hyp in hyps]
        for hyps, source in zip(hyps_list, smpl.src)
    ]
    for gen_method, hyps_list in tqdm(smpl_diverse.items(), total=len(smpl_diverse))
}

In [None]:
# Move LaBSE off the GPU, then collect garbage and empty the CUDA cache.
labse.to('cpu')
cleanup();

### By COMET-QE

In [None]:
# COMET-QE checkpoint.
# NOTE(review): the global name `model` is rebound by later cells (XNLI classifier,
# reference-based COMET); the scoring helpers look it up at call time.
model_path = comet.download_model("wmt20-comet-qe-da-v2")
model = comet.load_from_checkpoint(model_path)

In [None]:
def score_pair(src, trg):
    """Return the segment-level score that `model.predict` gives to one (src, mt) pair."""
    sample = {'src': src, 'mt': trg}
    # with 0 gpus, this is actually faster
    seg_scores, _ = model.predict([sample], batch_size=8, gpus=0)
    return seg_scores[0]

# smoke test: each pair is scored in both directions
print(score_pair('hallo Welt', 'hello world'))
print(score_pair('hello world', 'hallo Welt'))
print(score_pair('hallo Welt', 'halo over my head'))
print(score_pair('halo over my head', 'hallo Welt'))

In [None]:
def score_pairs(src, trg, batch_size=8, gpus=0):
    """Score aligned lists of sources and translations with `model.predict`.

    Args:
        src: iterable of source texts.
        trg: iterable of translations, aligned with `src`.
        batch_size: forwarded to `model.predict`.
        gpus: forwarded to `model.predict` (defaults to 0).

    Returns:
        The segment-level scores returned by `model.predict`; the system-level
        score is discarded.
    """
    samples = [{'src': s, 'mt': t} for s, t in zip(src, trg)]
    seg_scores, _ = model.predict(samples, batch_size=batch_size, gpus=gpus)
    return seg_scores


def get_scores_batched(hyp_sets, sources, **kwargs):
    """Score all hypotheses in one `score_pairs` call and regroup the scores per source.

    Args:
        hyp_sets: one collection of hypotheses per source.
        sources: source texts, aligned with `hyp_sets`.
        **kwargs: forwarded to `score_pairs`.

    Returns:
        A list with one slice of scores per hypothesis set, in the input order.
    """
    flat_sources, flat_hyps, spans = [], [], []
    for hyp_set, src in zip(hyp_sets, sources):
        start = len(flat_hyps)
        flat_hyps.extend(hyp_set)
        # repeat the source once for each hypothesis that was just added
        flat_sources.extend([src] * (len(flat_hyps) - start))
        spans.append((start, len(flat_hyps)))
    flat_scores = score_pairs(flat_sources, flat_hyps, **kwargs)
    return [flat_scores[start:end] for start, end in spans]

In [None]:
# COMET-QE score of every hypothesis, computed in one batched call per generation method (on 1 GPU).
hypotheses_scores['COMET-QE'] = {
    gen_method: get_scores_batched(hyps_list, smpl.src.tolist(), gpus=1)
    for gen_method, hyps_list in tqdm(smpl_diverse.items(), total=len(smpl_diverse))
}

### By LASER2

In [None]:
# SentencePiece tokenizer used to pre-tokenize text for the LASER encoder.
spm_tokenizer = spm.SentencePieceProcessor()
spm_tokenizer.Load(LASER_DIR + '/laser2.spm')

# NOTE(review): `spm_vocab` points at the same .pt checkpoint as the model path rather than
# at a separate vocabulary file — confirm this is intended.
laser_encoder = SentenceEncoder(
    LASER_DIR + '/laser2.pt',
    max_sentences=None,
    max_tokens=None,
    spm_vocab=LASER_DIR + '/laser2.pt',
)

In [None]:
def encode_sents(sents):
    """Encode sentences with the LASER encoder and scale each embedding to unit L2 norm.

    Each sentence is split into SentencePiece pieces that are joined with spaces
    before being passed to the encoder.
    """
    tokenized_sents = []
    for sent in sents:
        pieces = spm_tokenizer.EncodeAsPieces(sent)
        tokenized_sents.append(" ".join(pieces))
    emb = laser_encoder.encode_sentences(tokenized_sents)
    row_norms = (emb**2).sum(1, keepdims=True) ** 0.5
    return emb / row_norms

In [None]:
def score_pair(src, trg):
    """Return the dot product of the normalised LASER embeddings of `src` and `trg`."""
    src_emb, trg_emb = encode_sents([src, trg])
    return src_emb.dot(trg_emb)

# smoke test
print(score_pair('hallo Welt', 'hello world'))
print(score_pair('hallo Welt', 'halo over my hed'))

In [None]:
# LASER2 similarity between every hypothesis and its source text.
# The score is a dot product, so the argument order of score_pair does not affect it.
hypotheses_scores['LASER2'] = {
    gen_method: [
        [score_pair(hyp, source) for hyp in hyps]
        for hyps, source in zip(hyps_list, smpl.src)
    ]
    for gen_method, hyps_list in tqdm(smpl_diverse.items(), total=len(smpl_diverse))
}

In [None]:
# Move the LASER encoder network off the GPU, then collect garbage and empty the CUDA cache.
laser_encoder.encoder.to('cpu');
cleanup();

### By ALTI+ 

In [None]:
# ALTI wrapper built from the config, models and task of the already loaded translation model.
hub = FairseqTransformerHub(cfg=de2en.cfg, models=de2en.models, task=de2en.task)
hub.cuda();

In [None]:
def score_pair(src, trg):
    """Return the 'avg_sc' ALTI metric for a source text and its translation.

    `src` and `trg` are forwarded to `compute_alti_nllb` in that order.
    """
    with torch.inference_mode():
        alti_outputs = compute_alti_nllb(hub, src, trg)
    return compute_alti_metrics(*alti_outputs)['avg_sc']

# smoke test
print(score_pair('hallo Welt', 'hello world'))
print(score_pair('hallo Welt', 'halo over my head'))

In [None]:
# ALTI 'avg_sc' of every hypothesis with respect to its source text.
# score_pair takes (src, trg) and forwards them in that order to compute_alti_nllb, so the
# source text must be the first argument and the hypothesis the second. Unlike the
# dot-product similarity scores, the order matters here.
hypotheses_scores['ALTI_avg_sc'] = {
    k: [[score_pair(src, x) for x in hyps] for hyps, src in zip(vs, smpl.src)]
    for k, vs in tqdm(smpl_diverse.items(), total=len(smpl_diverse))
}

In [None]:
# Move the ALTI wrapper off the GPU, then collect garbage and empty the CUDA cache.
hub.to('cpu');
cleanup()

### By XNLI

In [None]:
# XNLI classifier and its tokenizer.
# NOTE(review): this rebinds the global `model`, which previously held the COMET-QE model.
mname = 'joeddav/xlm-roberta-large-xnli'

model = AutoModelForSequenceClassification.from_pretrained(mname).cuda()
tokenizer = AutoTokenizer.from_pretrained(mname)

In [None]:
def get_clf_scores(texts1, texts2, batch_size=32, label='entailment', verbose=True):
    """Return the classifier probability of `label` for each aligned text pair.

    Uses the global `model` and `tokenizer`, which are looked up at call time.

    Args:
        texts1: list of first texts.
        texts2: list of second texts, aligned with `texts1`.
        batch_size: number of pairs per forward pass.
        label: key of `model.config.label2id` whose probability is returned.
        verbose: show a progress bar over the batches.

    Returns:
        A 1-D numpy array with one probability per pair; an empty array for empty input.
    """
    if len(texts1) == 0:
        # np.concatenate raises on an empty list of batches, so return early
        return np.zeros(0, dtype=np.float32)
    label_id = model.config.label2id[label]
    make_range = trange if verbose else range
    scores = []
    for i in make_range(0, len(texts1), batch_size):
        xx, yy = texts1[i:i+batch_size], texts2[i:i+batch_size]
        with torch.inference_mode():
            inputs = tokenizer(xx, yy, truncation=True, padding=True, return_tensors='pt').to(model.device)
            proba = torch.softmax(model(**inputs).logits, -1)[:, label_id].cpu().numpy()
        scores.append(proba)
    return np.concatenate(scores)

def get_nli_scores(texts1, texts2, verbose=True):
    """Return the elementwise product of the 'entailment' probabilities of the pairs in both orders."""
    forward = get_clf_scores(texts1, texts2, verbose=verbose)
    backward = get_clf_scores(texts2, texts1, verbose=verbose)
    return forward * backward

In [None]:
def score_pair(src, trg):
    """Return the bidirectional NLI score of a single text pair."""
    pair_scores = get_nli_scores([src], [trg], verbose=False)
    return pair_scores[0]

# smoke test
print(score_pair('hallo Welt', 'hello world'))
print(score_pair('hallo Welt', 'halo over my head'))

In [None]:
# Bidirectional NLI score between every hypothesis and its source text.
# All hypotheses of one source go through a single get_nli_scores call, so they are
# batched through the classifier instead of running separate forward passes per hypothesis.
# NOTE(review): batching adds padding, so scores may differ from one-pair-at-a-time
# scoring in the last decimal places.
hypotheses_scores['XNLI'] = {
    k: [
        # an empty hypothesis list is passed through as an empty score list
        get_nli_scores(hyps, [src] * len(hyps), verbose=False).tolist() if hyps else []
        for hyps, src in zip(vs, smpl.src)
    ]
    for k, vs in tqdm(smpl_diverse.items(), total=len(smpl_diverse))
}

In [None]:
# Move the XNLI classifier off the GPU, then collect garbage and empty the CUDA cache.
model.to('cpu')
cleanup();

### By ref-ChrF++ (oracle)

In [None]:
chrfpp = CHRF(word_order=2)

# Sentence-level chrF score of every hypothesis against the reference of its source.
hypotheses_scores['ref_chrf'] = {
    gen_method: [
        [chrfpp.sentence_score(hyp, [ref]).score for hyp in hyps]
        for hyps, ref in zip(hyps_list, smpl.ref)
    ]
    for gen_method, hyps_list in tqdm(smpl_diverse.items(), total=len(smpl_diverse))
}


### Now compute the selections based on the hypotheses

In [None]:
# scoring method -> generation method -> for every source, the hypothesis with the highest score
selections = {
    score_method: {
        gen_method: [
            hyps[np.argmax(scores)]
            for hyps, scores in zip(hyps_list, scores_by_gen[gen_method])
        ]
        for gen_method, hyps_list in smpl_diverse.items()
    }
    for score_method, scores_by_gen in hypotheses_scores.items()
}

### The reference

In [None]:
# The reference translations do not depend on the generation method:
# every method gets its own copy of the reference list.
selections['ref'] = {
    gen_method: smpl.ref.tolist()
    for gen_method in tqdm(smpl_diverse, total=len(smpl_diverse))
}

### By default 

In [None]:
# No re-ranking: take the first hypothesis of every list.
selections['first'] = {
    gen_method: [hyps[0] for hyps in hyps_list]
    for gen_method, hyps_list in tqdm(smpl_diverse.items(), total=len(smpl_diverse))
}

The baseline (default translation) corresponds to taking the first hypothesis from beam search.

# Evaluate the selections

### src-NLI

In [None]:
# Same checkpoint as in the XNLI scoring section. It is loaded again and placed on the GPU,
# because the earlier instance was moved to the CPU after scoring.
mname = 'joeddav/xlm-roberta-large-xnli'

model = AutoModelForSequenceClassification.from_pretrained(mname).cuda()
tokenizer = AutoTokenizer.from_pretrained(mname)

In [None]:
# NOTE(review): this function is also defined in the XNLI scoring section above; the
# definition is repeated here so that the evaluation section can be run on its own.
def get_clf_scores(texts1, texts2, batch_size=32, label='entailment', verbose=True):
    """Return the classifier probability of `label` for each aligned text pair.

    Uses the global `model` and `tokenizer`, which are looked up at call time.

    Args:
        texts1: list of first texts.
        texts2: list of second texts, aligned with `texts1`.
        batch_size: number of pairs per forward pass.
        label: key of `model.config.label2id` whose probability is returned.
        verbose: show a progress bar over the batches.

    Returns:
        A 1-D numpy array with one probability per pair; an empty array for empty input.
    """
    if len(texts1) == 0:
        # np.concatenate raises on an empty list of batches, so return early
        return np.zeros(0, dtype=np.float32)
    label_id = model.config.label2id[label]
    make_range = trange if verbose else range
    scores = []
    for i in make_range(0, len(texts1), batch_size):
        xx, yy = texts1[i:i+batch_size], texts2[i:i+batch_size]
        with torch.inference_mode():
            inputs = tokenizer(xx, yy, truncation=True, padding=True, return_tensors='pt').to(model.device)
            proba = torch.softmax(model(**inputs).logits, -1)[:, label_id].cpu().numpy()
        scores.append(proba)
    return np.concatenate(scores)

def get_nli_scores(texts1, texts2, verbose=True):
    """Return the elementwise product of the 'entailment' probabilities of the pairs in both orders."""
    forward = get_clf_scores(texts1, texts2, verbose=verbose)
    backward = get_clf_scores(texts2, texts1, verbose=verbose)
    return forward * backward

In [None]:
# selector -> sampler -> list with one bidirectional NLI score (selected translation vs.
# source) per sampled source; the scores are kept per item and averaged in the next cell.
sel_src_nli_raw = {
    selector: {
        sampler: get_nli_scores(sampled, smpl.src.tolist(), verbose=False).tolist()  # per-item scores, not the mean
        for sampler, sampled in by_sampler.items()
    }
    for selector, by_sampler in tqdm(selections.items())
}

In [None]:
# selector -> sampler -> mean of the per-item NLI scores
sel_src_nli = {k1: {k2: np.mean(v2) for k2, v2 in v1.items()} for k1, v1 in sel_src_nli_raw.items()}

In [None]:
# NLI scores of the `mt` column of the sample against the sources, and their mean.
bl_nli = get_nli_scores(smpl.mt.tolist(), smpl.src.tolist())
bl_nli.mean()

### src-ref-COMET

In [None]:
# Reference-based COMET checkpoint.
# NOTE(review): this rebinds the global `model`, replacing the NLI classifier loaded above.
model_path = comet.download_model("wmt20-comet-da")
model = comet.load_from_checkpoint(model_path)

In [None]:
# display the names of the generation methods collected above
smpl_diverse.keys()

In [None]:
# One (mt, src, ref) record for every generated hypothesis ...
hypothesis_records = [
    {'mt': hyp, 'src': smpl.src.iloc[i], 'ref': smpl.ref.iloc[i]}
    for gen, by_gen in smpl_diverse.items()
    for i, hyps in enumerate(by_gen)
    for hyp in hyps
]
# ... and one for every selected translation (this also covers the references).
selection_records = [
    {'mt': mt, 'src': smpl.src.iloc[i], 'ref': smpl.ref.iloc[i]}
    for selector, by_sampler in selections.items()
    for sampler, sampled in by_sampler.items()
    for i, mt in enumerate(sampled)
]
data_for_comet = pd.DataFrame(hypothesis_records + selection_records)
print(data_for_comet.shape)
# each distinct triple is scored only once; the index is reset so that it matches positions
data_for_comet = data_for_comet.drop_duplicates().reset_index(drop=True)
print(data_for_comet.shape)

In [None]:
# Score every distinct (mt, src, ref) record with the reference-based COMET model on 1 GPU.
seg_scores_comet_ref, sys_score_comet_ref = model.predict(
    data_for_comet.to_dict('records'), batch_size=32, gpus=1
)

In [None]:
# (src, mt, ref) -> COMET segment score; the row label `i` is used as the position in the
# score list, which relies on the index of data_for_comet having been reset
texts2comet = {
    (row.src, row.mt, row.ref): seg_scores_comet_ref[i]
    for i, row in data_for_comet.iterrows()
}

Add scores for all the hypotheses

In [None]:
# COMET score of every hypothesis, looked up in texts2comet by its (src, mt, ref) triple.
hypotheses_scores['COMET'] = {
    gen_method: [
        [texts2comet[(src, hyp, ref)] for hyp in hyps]
        for hyps, src, ref in zip(by_gen, smpl.src, smpl.ref)
    ]
    for gen_method, by_gen in smpl_diverse.items()
}

In [None]:
# Sanity check: every selected translation must have a COMET score;
# a missing (src, mt, ref) key raises KeyError here.
for selector, by_sampler in selections.items():
    for sampler, sampled in by_sampler.items():
        for i, mt in enumerate(sampled):
            _ = texts2comet[(smpl.src.iloc[i], mt, smpl.ref.iloc[i])]

In [None]:
# selector -> sampler -> list with the COMET score of the selected translation of every source
sel_comet_raw = {
    selector: {
        sampler: [
            texts2comet[(src, mt, ref)]
            for mt, src, ref in zip(sampled, smpl.src, smpl.ref)
        ]
        for sampler, sampled in by_sampler.items()
    }
    for selector, by_sampler in selections.items()
}

In [None]:
# selector -> sampler -> mean of the per-item COMET scores
sel_comet = {k1: {k2: np.mean(v2) for k2, v2 in v1.items()} for k1, v1 in sel_comet_raw.items()}

In [None]:
# NOTE(review): verbatim repeat of the previous cell; it recomputes the same means.
sel_comet = {k1: {k2: np.mean(v2) for k2, v2 in v1.items()} for k1, v1 in sel_comet_raw.items()}

# Save the results

In [None]:
# Persist the sample, all candidates with their scores, the selections and the evaluation scores.
os.makedirs('../computed_data', exist_ok=True)
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding is pinned to
# UTF-8 instead of being left to the platform default.
with open('../computed_data/diverse-decoding-results-more-hypotheses.json', 'w', encoding='utf-8') as f:
    json.dump({
        'data': smpl.to_dict(orient='records'),
        'candidates': smpl_diverse,
        # every score is converted with float() before serialisation
        'candidate_scores': {k1: {k2: 
                                  [[float(h) for h in hl] for hl in v2] 
                                  for k2, v2 in v1.items()
                                 } for k1, v1 in hypotheses_scores.items()},
        'selections': selections,
        'nli_scores': sel_src_nli_raw, 
        'comet_scores': sel_comet_raw
    }, f, ensure_ascii=False, indent=2)