In [1]:
import sacrebleu
from sacremoses import MosesDetokenizer
from transformers import MarianMTModel, MarianTokenizer
from datasets import load_metric
import pandas as pd
import re

In [6]:
tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-ROMANCE')
model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-ROMANCE')
model.to('cuda')

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(65001, 512, padding_idx=65000)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(65001, 512, padding_idx=65000)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0): MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
   

In [2]:
df = pd.read_csv('data/mt/eng_-french.csv')
df['English words/sentences'] = df['English words/sentences'].apply(lambda x: re.sub(r"[^a-zA-Z0-9]+", ' ', x))
df['French words/sentences'] = df['French words/sentences'].apply(lambda x: re.sub(r"[^a-zA-Z0-9]+", ' ', x))

In [3]:
english_text = df['English words/sentences'].tolist()
french_text = df['French words/sentences'].tolist()

In [7]:
french_code = tokenizer.supported_language_codes[0]
md = MosesDetokenizer(lang='es')
metric = load_metric('bleu')

In [8]:
refs = []
preds = []
count = 0
for eng_txt, fr_txt in zip(english_text, french_text):
    input_text = french_code +  ' ' + eng_txt
    tokenized_text = tokenizer(input_text, return_tensors="pt", padding=True)
    input_ids = tokenized_text['input_ids']
    attn_mask = tokenized_text['attention_mask']
    
    gen_text = model.generate(input_ids.to('cuda'))
    with tokenizer.as_target_tokenizer():
        gen_text = tokenizer.decode(gen_text.squeeze().detach().cpu(), skip_special_tokens=True)
        preds.append(re.sub(r"[^a-zA-Z0-9]+", ' ', gen_text).split())
    refs.append([md.detokenize(fr_txt.strip().split()).split()])
    
    if count == 3000:
        break
        
    count += 1

In [9]:
metric.compute(predictions=preds, references=refs)

{'bleu': 0.407941799968416,
 'precisions': [0.5927515974440895,
  0.4518888096935139,
  0.35849514563106794,
  0.2884066247858367],
 'brevity_penalty': 1.0,
 'length_ratio': 1.0084575110753122,
 'translation_length': 10016,
 'reference_length': 9932}