In [1]:
import pandas as pd
from googletrans import Translator
from pyvi import ViTokenizer
from difflib import SequenceMatcher
from nltk.translate.bleu_score import sentence_bleu

# Initialize the translator
translator = Translator()

# Load your English captions CSV file
english_captions = pd.read_csv('forrest_gump_transcript_en.csv')
human_translations = pd.read_csv('forrest_gump_transcript_vn.csv')

In [2]:
# Step 1: Translate English captions to Vietnamese
english_captions['translated_vietnamese'] = english_captions['Transcript Line'].head(3).apply(
    lambda x: translator.translate(x, src='en', dest='vi').text
)

In [3]:
# Step 2: Tokenize the machine-translated Vietnamese text
english_captions['tokenized_vietnamese'] = english_captions['translated_vietnamese'].head(3).apply(ViTokenizer.tokenize)

In [4]:
# Step 3: Tokenize the human-translated Vietnamese captions
human_translations['tokenized_caption'] = human_translations['Transcript Line'].head(3).apply(ViTokenizer.tokenize)

In [5]:
# Step 4: Define a function to calculate similarity ratio (Levenshtein-based)
def similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [6]:
# Step 5: Define a function to calculate BLEU score
def bleu_score(reference, candidate):
    return sentence_bleu([reference.split()], candidate.split())

In [7]:
# Step 6: Apply similarity and BLEU score comparisons between machine and human translations
english_captions['similarity'] = english_captions.head(3).apply(
    lambda row: similarity(row['tokenized_vietnamese'], human_translations.loc[row.name, 'tokenized_caption']),
    axis=1
)

english_captions['bleu_score'] = english_captions.head(3).apply(
    lambda row: bleu_score(human_translations.loc[row.name, 'tokenized_caption'], row['tokenized_vietnamese']),
    axis=1
)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [8]:
# Step 7: Save the comparison results to a CSV file
english_captions.head(3).to_csv('pyvi_comparison_results.csv', index=False)

In [9]:
# Print some of the results for quick inspection
print(english_captions[['Line Number', 'Transcript Line', 'translated_vietnamese', 'tokenized_vietnamese', 'similarity', 'bleu_score']].head(3))

   Line Number                                   Transcript Line  \
0            1         Hello. My name's Forrest... Forrest Gump.   
1            2                          Do you want a chocolate?   
2            3  I could eat about a million and a half of these.   

                               translated_vietnamese  \
0      Xin chào.Tên tôi là Forrest ... Forrest Gump.   
1                          Bạn có muốn một sô cô la?   
2  Tôi có thể ăn khoảng một triệu rưỡi trong số này.   

                                tokenized_vietnamese  similarity  \
0   Xin chào . Tên tôi là Forrest ... Forrest_Gump .    0.895833   
1                         Bạn có muốn một sô cô la ?    0.562500   
2  Tôi có_thể ăn khoảng một triệu rưỡi trong số n...    0.672897   

      bleu_score  
0   7.598357e-01  
1  7.176382e-155  
2  7.428368e-155  
