<a href="https://colab.research.google.com/github/gyasifred/msc-thesis/blob/main/bleu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install datasets transformers sacrebleu torch sentencepiece transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import nltk
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
from transformers import MarianMTModel, MarianTokenizer
import re

In [7]:
class Bleu():
    """Translate source sentences with a model and score them with BLEU.

    ``translator`` must expose ``generate(**encoded)`` and ``tokenizer`` must
    be callable on a string and expose ``decode(ids, skip_special_tokens=...)``
    (the interface used by the Hugging Face Marian model/tokenizer pair).
    """

    def __init__(self, translator, tokenizer):
        self.translator = translator
        self.tokenizer = tokenizer

    @staticmethod
    def _load_sentences(source):
        """Return the sentences of ``source`` (a text-file path or a list)."""
        if isinstance(source, str):
            # Explicit UTF-8: the corpora contain non-ASCII characters and the
            # platform default encoding is not guaranteed to handle them.
            with open(source, "r", encoding="utf-8") as handle:
                return [line.rstrip("\r\n") for line in handle]
        if isinstance(source, (list, tuple)):
            return list(source)
        raise TypeError('File must be txt or python list')

    def get_bleuscore(self, testfile, referencefile, smothingfunction=None,
                      weights=(0.58, 0, 0, 0)):
        """Translate every source sentence and return the mean BLEU score.

        Each source sentence is translated, lower-cased and whitespace-split,
        then scored on its own against the matching reference; the returned
        figure is the arithmetic mean of those per-sentence scores (not a
        single corpus-level BLEU).

        Args:
            testfile: path to a text file with one source sentence per line,
                or a list of source sentences.
            referencefile: path to a text file with one reference translation
                per line, or a list of reference translations.
            smothingfunction: optional NLTK smoothing function, passed to
                ``corpus_bleu`` unchanged.
            weights: n-gram weights passed to ``corpus_bleu``. The default is
                the value this notebook has always used.
                NOTE(review): these weights do not sum to 1, which raises the
                unigram precision to the power 0.58 -- confirm this is intended.

        Returns:
            The string ``'BLEU SCORE: <mean>'``.

        Raises:
            TypeError: if an input is neither a path string nor a list.
            ValueError: if the inputs are empty or differ in length.
        """
        sources = self._load_sentences(testfile)
        references = self._load_sentences(referencefile)

        if len(sources) != len(references):
            raise ValueError(
                f'Got {len(sources)} test sentences but '
                f'{len(references)} reference sentences')
        if not sources:
            raise ValueError('No sentences to score')

        bleu_total = 0
        for source, reference in zip(sources, references):
            groundtruth = reference.lower().split()
            print(f'Ground Truth {groundtruth}')

            # Strip the line terminator so the model never sees a stray newline.
            generated = self.translator.generate(
                **self.tokenizer(source.strip(), return_tensors="pt", padding=True))
            decoded = [self.tokenizer.decode(ids, skip_special_tokens=True)
                       for ids in generated]
            # Join the decoded strings directly; slicing the list's repr would
            # corrupt text containing quotes or backslashes.
            candidate = " ".join(decoded).lower().split()
            print(f'Translated {candidate}')

            # corpus_bleu expects, per hypothesis, a list of tokenised references.
            bleu_total += corpus_bleu(
                [[groundtruth]], [candidate], weights,
                smoothing_function=smothingfunction, auto_reweigh=True)

        return f'BLEU SCORE: {bleu_total/len(sources)}'

In [8]:
# Load the pretrained English -> Twi checkpoint and score the test set.
MODEL_NAME = "Helsinki-NLP/opus-mt-en-tw"
SOURCE_PATH = "/content/english_test.txt"
REFERENCE_PATH = "/content/twi_test.txt"

tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)
model = MarianMTModel.from_pretrained(MODEL_NAME)
smooth = SmoothingFunction()
bleu = Bleu(model, tokenizer)
score_report = bleu.get_bleuscore(SOURCE_PATH, REFERENCE_PATH, smooth.method7)
print(score_report)



Ground Truth ['ɛdeɛn', 'na', 'ɛrekɔso', 'wɔ', 'aha?']
Translated ['dɛn', 'na', 'ɛrekɔ', 'so', 'wɔ', 'ha?']
Ground Truth ['sɔre']
Translated ["w'abam", 'bebu']
Ground Truth ['ɔba', 'ha', 'fiada', 'biara']
Translated ['ɔbaa', 'ha', 'fida', 'biara']
Ground Truth ['sua', 'nyansa']
Translated ['sua', 'sɛ', 'wobɛyɛ', "w'ade", 'nyansam']
Ground Truth ['mannwene', 'da', 'sɛ', 'wo', 'bɛyera']
Translated ['ná', 'misusuw', 'sɛ', 'wode', 'wo', 'nan', 'besi', 'fam']
Ground Truth ['sɛ', 'wo', 'pɛ', 'sɛ', 'wo', 'tintim', 'brɔfo', 'ne', 'twi', 'nsɛmfua', 'nnum', 'ne', 'akyire', 'a']
Translated ['sɛ', 'wopɛ', 'sɛ', 'wode', 'wo', 'ho', 'hyɛ', 'afiase', 'bɛboro', 'mfe', 'anum', 'mu', 'a']
Ground Truth ['me', 'mma', 'reha', "m'adwene"]
Translated ['me', 'mma', 'no', 'rehaw', 'me.']
Ground Truth ['me', 'ne', 'simeon']
Translated ['mene', 'simeon']
Ground Truth ['dwene', 'wo', 'ho']
Translated ['susuw', 'wo', 'ho', 'hwɛ']
Ground Truth ['eyi', 'yɛ', 'asɛm', 'papa']
Translated ['eyi', 'yɛ', 'asɛmpa']
Ground T