In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import sacrebleu
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer_en2vi = AutoTokenizer.from_pretrained("vinai/vinai-translate-en2vi-v2", src_lang="en_XX")
model_en2vi = AutoModelForSeq2SeqLM.from_pretrained("vinai/vinai-translate-en2vi-v2")

In [3]:
def translate_en2vi(en_text: str) -> str:
    input_ids = tokenizer_en2vi(en_text, return_tensors="pt").input_ids
    output_ids = model_en2vi.generate(
        input_ids,
        decoder_start_token_id=tokenizer_en2vi.lang_code_to_id["vi_VN"],
        num_return_sequences=1,
        num_beams=5,
        early_stopping=True
    )
    vi_text = tokenizer_en2vi.batch_decode(output_ids, skip_special_tokens=True)
    vi_text = " ".join(vi_text)
    return vi_text

In [4]:
en_text = "That report was written by 620 scientists from 40 countries ."
print(translate_en2vi(en_text))

Báo cáo đó được viết bởi 620 nhà khoa học từ 40 quốc gia.


In [11]:
def read_data(source_file, target_file):
    """
        Đọc dữ liệu từ 2 file source và target
        Trả về: 1D array chứa các câu với từng source và target
    """
    source_data = open(source_file, encoding='utf-8').read().strip().split("\n")
    target_data = open(target_file, encoding='utf-8').read().strip().split("\n")
    return source_data, target_data

In [12]:
def preprocess_seq(seq):
        seq = re.sub(
        r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", str(seq))
        seq = re.sub(r"[ ]+", " ", seq)
        seq = re.sub(r"\!+", "!", seq)
        seq = re.sub(r"\,+", ",", seq)
        seq = re.sub(r"\?+", "?", seq)
        seq = seq.lower()
        return seq

In [13]:
def calculate_bleu_score_vinai(configs):
    valid_src_data, valid_trg_data = read_data('en_vi\data\tst2012.en', 'en_vi\data\tst2012.en')

    pred_sents = []
    for sentence in tqdm(valid_src_data):
        pred_trg = translate_en2vi(sentence)
        pred_sents.append(pred_trg)
    
    # write prediction to file
    with open("./predict_valid_vinai.txt", "w") as f:
        for sent in pred_sents:
            f.write(f"{sent}\n")

    hypotheses = [preprocess_seq(sent) for sent in pred_sents]
    references = [[sent] for sent in valid_trg_data]
    
    return sacrebleu.corpus_bleu(hypotheses, references)

In [14]:
valid_src_data, valid_trg_data = read_data('en_vi\\data\\tst2012.en', 'en_vi\\data\\tst2012.vi')

In [16]:
hypotheses = open('predict_valid_vinai.txt', encoding='utf-8').read().strip().split("\n")

In [17]:
hypotheses = [preprocess_seq(sent) for sent in hypotheses]

In [18]:
references = [[sent] for sent in valid_trg_data]

In [19]:
blue_score = sacrebleu.corpus_bleu(hypotheses, references)

In [20]:
blue_score.score

39.88954172519978