In [1]:
#import

import json
import nltk
# Download the NLTK data packages used by the evaluation cell further down.
# 'punkt' and 'punkt_tab' are tokenizer packages (see the "tokenizers/..." paths
# in the download log); sent_tokenize / word_tokenize are the tokenizers used below.
nltk.download('punkt')
nltk.download('punkt_tab')
# NOTE(review): no code visible in this notebook section references the stopwords
# corpus — confirm whether this download is still needed.
nltk.download('stopwords')
# Presumably required by meteor_score (WordNet-based matching) — verify against the NLTK docs.
# As the last expression of the cell, this call's return value is what the cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [15]:
#read jsons

# Languages to evaluate (Tibetan removed due to formatting errors).
language_strings = ["malay", "nahuatl", "quechua", "wolof"]

# Maps each language name to the parsed contents of "<language>.json",
# read from the current working directory.
language_jsons = {}
for language_string in language_strings:
    with open(f"{language_string}.json", 'r', encoding='utf-8') as fp:
        language_jsons[language_string] = json.load(fp)

In [19]:
#evaluation

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.translate import bleu_score
from nltk.translate.meteor_score import meteor_score
from nltk.translate.bleu_score import SmoothingFunction
from collections import defaultdict

# Running totals of the per-passage scores, keyed by language name.
bleu_score_chat = defaultdict(lambda:0.0)
bleu_score_google = defaultdict(lambda:0.0)
meteor_score_chat = defaultdict(lambda:0.0)
meteor_score_google = defaultdict(lambda:0.0)

# Number of passages actually scored per language. The per-language averages
# divide by this count instead of a hard-coded number of passages, so they stay
# correct if a language file has more or fewer scorable passages.
scored_passage_count = defaultdict(int)

# Passages 5 and 6 do not have an official translation, so they cannot be scored.
PASSAGES_WITHOUT_OFFICIAL = ("5", "6")


def _tokenize_passage(text):
  """Split ``text`` into sentences and return one list of word tokens per sentence."""
  return [word_tokenize(sentence) for sentence in sent_tokenize(text)]


def _average(total, count):
  """Return ``total / count``, or 0.0 when ``count`` is 0 (nothing was scored)."""
  return total / count if count else 0.0


def _passage_bleu(reference_sentences, hypothesis_sentences):
  """Corpus-level BLEU of one translated passage against the official translation.

  Every hypothesis (translated sentence) gets *all* sentences of the official
  translation as its references. The passages were translated per paragraph
  rather than per sentence, so we do not know which official sentence
  corresponds to which translated sentence.

  We use corpus_bleu() to calculate the score.
  Per https://www.nltk.org/api/nltk.translate.bleu_score.html:
  it calculates a single corpus-level BLEU score (aka. system-level BLEU) for all
  the hypotheses and their respective references. Instead of averaging the
  sentence-level BLEU scores (i.e. macro-average precision), the original BLEU
  metric (Papineni et al. 2002) accounts for the micro-average precision (i.e.
  summing the numerators and denominators for each hypothesis-reference(s) pair
  before the division).

  Args:
    reference_sentences: tokenized sentences of the official translation.
    hypothesis_sentences: tokenized sentences of the machine translation.

  Returns:
    The BLEU score computed with SmoothingFunction().method2.
  """
  # One (shared) reference list per hypothesis sentence.
  references = [reference_sentences] * len(hypothesis_sentences)
  return bleu_score.corpus_bleu(references, hypothesis_sentences, smoothing_function=SmoothingFunction().method2)


def _passage_meteor(reference_sentences, hypothesis_sentences, system_name, passage_number, language_string):
  """Print the METEOR score of every translated sentence and return their mean.

  Same as for BLEU, all sentences of the official translation are used as the
  references for every translated sentence in the paragraph. meteor_score()
  scores one hypothesis at a time, so each sentence is evaluated separately and
  the results are averaged to get the passage-level METEOR score.

  Per https://www.nltk.org/api/nltk.translate.meteor_score.html:
  meteor_score() calculates the METEOR score for a hypothesis with multiple
  references as described in "Meteor: An Automatic Metric for MT Evaluation with
  High Levels of Correlation with Human Judgments" by Alon Lavie and Abhaya
  Agarwal (https://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf).
  In case of multiple references the best score is chosen.

  Args:
    reference_sentences: tokenized sentences of the official translation.
    hypothesis_sentences: tokenized sentences of the machine translation.
    system_name: translator name used as the prefix of the printed lines.
    passage_number: passage identifier, only used in the printed lines.
    language_string: language name, only used in the printed lines.

  Returns:
    The mean sentence-level METEOR score, or 0.0 if there are no sentences.
  """
  total = 0.0
  for i, sentence in enumerate(hypothesis_sentences):
    sentence_score = meteor_score(reference_sentences, sentence)
    total += sentence_score
    print(f"{system_name} METEOR score for passage {passage_number} sentence {i+1} for {language_string} language:", sentence_score)
  return _average(total, len(hypothesis_sentences))


for language_string, language_json in language_jsons.items():

  print("\n" + language_string + "\n")

  for passage_number in language_json['passages']:

    if passage_number in PASSAGES_WITHOUT_OFFICIAL:
      continue

    passage = language_json['passages'][passage_number]

    official_tokenized = _tokenize_passage(passage['official'])
    chat_tokenized = _tokenize_passage(passage['chatgpt'])
    google_tokenized = _tokenize_passage(passage['google'])

    # BLEU: one corpus-level score per passage and translator.
    chat_score_bleu = _passage_bleu(official_tokenized, chat_tokenized)
    print(f"ChatGPT BLEU score for passage {passage_number} for language {language_string}:", chat_score_bleu)
    bleu_score_chat[language_string] += chat_score_bleu

    google_score_bleu = _passage_bleu(official_tokenized, google_tokenized)
    print(f"Google Translate BLEU score for passage {passage_number} for language {language_string}:", google_score_bleu)
    bleu_score_google[language_string] += google_score_bleu

    print("\n")

    # METEOR: per-sentence scores are printed, their mean is the passage score.
    avg_chat_meteor = _passage_meteor(official_tokenized, chat_tokenized, "ChatGPT", passage_number, language_string)
    avg_google_meteor = _passage_meteor(official_tokenized, google_tokenized, "Google Translate", passage_number, language_string)

    print("\n")

    print(f"Averaged ChatGPT METEOR score for passage {passage_number} for {language_string} language:", avg_chat_meteor)
    print(f"Averaged Google Translate METEOR score for passage {passage_number} for {language_string} language:", avg_google_meteor)
    meteor_score_chat[language_string] += avg_chat_meteor
    meteor_score_google[language_string] += avg_google_meteor

    scored_passage_count[language_string] += 1

    print("\n")

for language_string in language_strings:
  # Average over the passages that were actually scored for this language.
  passage_count = scored_passage_count[language_string]
  print(f"Averaged ChatGPT BLEU score for language {language_string}:", _average(bleu_score_chat[language_string], passage_count))
  print(f"Averaged Google Translate BLEU score for language {language_string}:", _average(bleu_score_google[language_string], passage_count))
  print(f"Averaged ChatGPT METEOR score for language {language_string}:", _average(meteor_score_chat[language_string], passage_count))
  print(f"Averaged Google Translate METEOR score for language {language_string}:", _average(meteor_score_google[language_string], passage_count))
  print("\n")


malay

ChatGPT BLEU score for passage 1 for language malay: 0.04444176295633874
Google Translate BLEU score for passage 1 for language malay: 0.04422615079668136


ChatGPT METEOR score for passage 1 sentence 1 for malay language: 0.1293103448275862
ChatGPT METEOR score for passage 1 sentence 2 for malay language: 0.0819672131147541
ChatGPT METEOR score for passage 1 sentence 3 for malay language: 0.4179775280898876
ChatGPT METEOR score for passage 1 sentence 4 for malay language: 0.2937420178799489
ChatGPT METEOR score for passage 1 sentence 5 for malay language: 0.1149425287356322
ChatGPT METEOR score for passage 1 sentence 6 for malay language: 0.11904761904761905
ChatGPT METEOR score for passage 1 sentence 7 for malay language: 0.38490853658536583
Google Translate METEOR score for passage 1 sentence 1 for malay language: 0.1293103448275862
Google Translate METEOR score for passage 1 sentence 2 for malay language: 0.08130081300813008
Google Translate METEOR score for passage 1 sente