## This notebook fetches several translation examples from Google's translation API and rates those translations using the METEOR score metric.

#### METEOR (Metric for Evaluation of Translation with Explicit ORdering) is an algorithm used to score and evaluate the accuracy of a machine translation by comparing how close it is to a human-derived translation. Unlike the more common BLEU score, it is intended to be used on individual sentences rather than an entire corpus. See https://en.wikipedia.org/wiki/METEOR for more information.

In [1]:
import os
import json
import random
import six
import json
from google.cloud import translate_v2 as translate
from nltk.translate.meteor_score import single_meteor_score
from spacy.lang.en import English

In [2]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/curtismitchell/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# get JSON file of sentence pairs
benchmark_directory = os.getcwd()
os.chdir(os.path.join(benchmark_directory, './..'))
sentences_file = open('./subtitle_corpus.json', 'r')
sentences_json = json.load(sentences_file)

In [4]:
os.chdir(os.path.join(benchmark_directory, './..'))
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = './credentials/Google-Cloud-a8fd3d0a789d.json'
ACCESS_CODE = os.getenv("accessCode")
translate_client = translate.Client()

In [5]:
# grab 100 random sentence pairs
num_sentences = len(sentences_json) - 1
random.seed(42)
selected_sentence_pairs = []
for _ in range(100):
  i = random.randrange(0, num_sentences)
  selected_sentence_pairs.append(sentences_json[i])

In [6]:
# method for retrieving individual translations via Google Translation API
def retrieve_google_translation(text):
  if isinstance(text, six.binary_type):
      text = text.decode('utf-8')

  # Text can also be a sequence of strings, in which case this method
  # will return a sequence of results for each text.
  result = translate_client.translate(
      text,
      source_language = 'ja',
      target_language='en')

  # print(u'Text: {}'.format(result['input']))
  # print(u'Translation: {}'.format(result['translatedText']))
  return result['translatedText']


In [7]:
benchmark_translations = []

for sentence_pair in selected_sentence_pairs:
  ja_text = sentence_pair['j']
  sentence_pair['gc_result'] = retrieve_google_translation(ja_text)
  benchmark_translations.append(sentence_pair)

In [8]:
# Create a blank Spacy tokenizer with just the English vocab
nlp = English()
tokenizer = nlp.Defaults.create_tokenizer(nlp)

In [9]:
# Count number of unique words in sample sentences as we want at least 1000 unique words
word_list = []
for translation in benchmark_translations:
    word_list += [token.text for token in tokenizer(translation['e'])]

print(len(set(word_list)))

351


In [10]:
# Calculate METEOR for each sentence
meteor_scores = []
for translation in benchmark_translations:
    score = single_meteor_score(translation['e'], translation['gc_result'])
    meteor_scores.append(round(score, 4))

In [11]:
# Descriptive stats for meteor scores
from statistics import mean, stdev, median

print("max: " + str(max(meteor_scores)))
print("min: " + str(min(meteor_scores)))
print("mean: " + str(mean(meteor_scores)))
print("stdev: " + str(stdev(meteor_scores)))
print("median: " + str(median(meteor_scores)))

max: 0.755
min: 0.0
mean: 0.224539
stdev: 0.21867785443247773
median: 0.16


#### Older scores from Kyoto Corpus:
max: 0.9375

min: 0.0

mean: 0.316994

stdev: 0.21899833301335025

median: 0.2957