In [1]:
from collections import Counter

import nltk
from tqdm import tqdm

In [2]:
from nltk.translate.bleu_score import corpus_bleu

In [3]:
from nltk import ngrams

In [4]:
# Paths of the parallel corpus files; by the variable names, the .fr files are
# the source side and the .en files the target side.
train_source_text_path, train_target_text_path = (
    "../data/sup_train.en-fr.fr",
    "../data/sup_train.en-fr.en",
)
dev_source_text_path, dev_target_text_path = (
    "../data/sup_valid.en-fr.fr",
    "../data/sup_valid.en-fr.en",
)

In [5]:
def retrieve_data(file_path, encoding="utf-8"):
    """Read a text file and return its lines as a list of stripped strings.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            accented characters decode the same way regardless of the
            platform's locale default.

    Returns:
        list[str]: One entry per line, in file order, with leading and trailing
        whitespace removed. Blank lines are kept as empty strings, so line i of
        one file still pairs with line i of a parallel file.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        # Iterate the handle directly; no need to materialise readlines() first.
        return [line.strip() for line in f]

In [6]:
# Load both sides of the training corpus; entry i of each list comes from line i
# of the corresponding file.
train_source_sentences, train_target_sentences = (
    retrieve_data(train_source_text_path),
    retrieve_data(train_target_text_path),
)

In [7]:
# Keep only the first half of each training list.
# NOTE: each list is reassigned from itself, so re-running this cell halves the
# data again; re-run the loading cell first to get back to the full lists.
# Each list is halved by its own length, so the halves only stay aligned if the
# two lists had equal lengths to begin with.
train_source_sentences = train_source_sentences[:len(train_source_sentences)//2]
train_target_sentences = train_target_sentences[:len(train_target_sentences)//2]

In [8]:
def create_parallel_dict(source_data, target_data):
    """Map every source sentence to the target sentence at the same index.

    Keys are the French source sentences and values their English
    translations. A source sentence that occurs more than once keeps the
    translation of its last occurrence.

    Args:
        source_data: Sequence of source sentences.
        target_data: Sequence of target sentences, indexed in parallel.

    Returns:
        dict: source sentence -> target sentence.
    """
    return {
        src_sent: target_data[position]
        for position, src_sent in enumerate(source_data)
    }

def ngram_overlap(input_sent, train_src_sentences, n=2):
    """Return the training source sentence sharing the most n-grams with ``input_sent``.

    Both sides are lower-cased and tokenized with ``nltk.word_tokenize``. For
    each training sentence, every n-gram position whose n-gram also occurs in
    the input sentence counts as one overlap. The translation of the returned
    sentence can then be used as the translation of the input sentence.

    Args:
        input_sent: Sentence to find a match for.
        train_src_sentences: Iterable of training source sentences to search.
        n: n-gram size, i.e. the number of consecutive tokens per n-gram.

    Returns:
        The training sentence with the highest overlap count (the earliest one
        wins ties), or ``None`` when no training sentence shares any n-gram
        with the input.
    """
    input_sent_tokens = nltk.word_tokenize(input_sent.lower())
    # n-grams are compared as token tuples: a space-joined n-gram string could
    # never be found in a list of single tokens once n is greater than 1.
    input_ngrams = set(zip(*(input_sent_tokens[i:] for i in range(n))))
    max_overlap = 0
    max_train_sent = None
    for train_src_sent in train_src_sentences:
        train_src_sent_tokens = nltk.word_tokenize(train_src_sent.lower())
        train_ngrams = zip(*(train_src_sent_tokens[i:] for i in range(n)))
        n_gram_overlap = sum(1 for gram in train_ngrams if gram in input_ngrams)
        # Strict comparison keeps the first sentence that reaches the maximum.
        if n_gram_overlap > max_overlap:
            max_overlap = n_gram_overlap
            max_train_sent = train_src_sent
    return max_train_sent

In [9]:
# Cache filled by create_src_ngrams: training source sentence -> list of its
# n-gram tuples (whitespace tokenization).
n_grams_dict = {}
def create_src_ngrams(n):
    """Store the n-grams of every training source sentence in ``n_grams_dict``.

    Each sentence in ``train_source_sentences`` is split on whitespace and its
    n-grams are stored as a list under the sentence string itself; an existing
    entry for the same sentence is overwritten.

    Args:
        n: n-gram size.
    """
    n_grams_dict.update(
        (sent, list(ngrams(sent.split(), n))) for sent in train_source_sentences
    )

In [10]:
# n-gram size used to fill n_grams_dict. compute_ngram_overlap reads that cache
# but builds the query n-grams from its own n argument, so the two values
# should match (n-gram tuples of different lengths never compare equal).
n = 3
create_src_ngrams(n)

In [11]:
def compute_ngram_overlap(input_sentence, n):
    """Find the training sentence sharing the most distinct n-grams with the input.

    The input is split on whitespace; training n-grams are read from the
    precomputed ``n_grams_dict`` cache, so ``n`` should equal the size the
    cache was built with. Note that a later cell defines another function with
    this same name but a three-argument signature, which replaces this one
    once that cell has run.

    Args:
        input_sentence: Source sentence to translate.
        n: n-gram size used for the input sentence.

    Returns:
        tuple: ``(max_overlap_count, best_translation_sent)`` where the count
        is the number of distinct n-grams shared with the best training
        sentence and the translation is the entry of
        ``train_target_sentences`` at that sentence's index. The earliest
        training sentence wins ties, so with zero overlap everywhere the first
        training translation is returned. With no training sentences the
        result is ``(float('-inf'), "")``.
    """
    max_overlap_count = float('-inf')
    best_translation_sent = ""

    # Build the query n-gram set once; each cached training list is then only
    # iterated, instead of being converted to a fresh set per comparison.
    input_sent_n_grams = set(ngrams(input_sentence.split(), n))
    for index, sent in enumerate(train_source_sentences):
        matches = len(input_sent_n_grams.intersection(n_grams_dict[sent]))
        if matches > max_overlap_count:
            max_overlap_count = matches
            best_translation_sent = train_target_sentences[index]
    return max_overlap_count, best_translation_sent

In [12]:
# Validation data comes as a source/target pair of files; the test set only has
# a source file.
test_source_text_path = "../data/test.en-fr.fr"
valid_source_sentences, valid_target_sentences = (
    retrieve_data(dev_source_text_path),
    retrieve_data(dev_target_text_path),
)
test_source_sentences = retrieve_data(test_source_text_path)

In [26]:
# Spot-check on the first validation sentence: displays the best overlap count
# together with the training translation that achieved it.
compute_ngram_overlap(valid_source_sentences[0], n=3)

(5,
 'mr president , what the investigations into this incident show is what can best be described as a culture of management incompetence and one of complacency .')

In [None]:
# Retrieve a translation for every validation sentence and accumulate the best
# overlap counts so their average can be reported.
all_predictions = []
all_targets = []
total_overlap = 0
total_examples = 0
for index, source_sent in tqdm(enumerate(valid_source_sentences), total=len(valid_source_sentences)):
    # Best overlap count and the training translation that achieved it.
    max_overlap_count, translation_sent = compute_ngram_overlap(source_sent, n=3)
    all_predictions.append(translation_sent)
    # Reference translation stored at the same index as the source sentence.
    all_targets.append(valid_target_sentences[index])
    total_overlap += max_overlap_count
    total_examples += 1

# Raises ZeroDivisionError when valid_source_sentences is empty.
average_overlap = total_overlap / total_examples
print(f"Average max n-gram overlap: {average_overlap}")

 19%|████████████████████████████████▉                                                                                                                                         | 387/2000 [15:23<1:05:09,  2.42s/it]

In [11]:
# Tokenize every training source sentence once up front and pair the tokens with
# the (untokenized) target sentence stored at the same index.
train_data_tokenized = [
    (nltk.word_tokenize(src_sent), train_target_sentences[index])
    for index, src_sent in enumerate(train_source_sentences)
]

In [12]:
def compute_ngram_overlap(sent1, sent2, n):
    """Count the n-grams shared by two token sequences, respecting multiplicity.

    An n-gram occurring a times in ``sent1`` and b times in ``sent2``
    contributes ``min(a, b)`` to the result. This definition replaces the
    two-argument ``compute_ngram_overlap(input_sentence, n)`` defined in an
    earlier cell once this cell has run.

    Args:
        sent1: Sequence of tokens of the first sentence.
        sent2: Sequence of tokens of the second sentence.
        n: n-gram size.

    Returns:
        int: Total number of overlapping n-grams; 0 when either sequence has
        fewer than ``n`` tokens.
    """
    # zip over n progressively shifted views yields the consecutive n-token tuples.
    sent1_ngrams = Counter(zip(*(sent1[i:] for i in range(n))))
    sent2_ngrams = Counter(zip(*(sent2[i:] for i in range(n))))
    # Counter '&' is multiset intersection: the per-key minimum of both counts.
    return sum((sent1_ngrams & sent2_ngrams).values())

In [13]:
# For every validation sentence, scan all pre-tokenized training sentences and
# keep the target of the one with the highest 3-gram overlap as the prediction.
total_overlap = 0
total_examples = 0
all_predictions = []
all_targets = []
# NOTE(review): total_bleu_score is never read or updated in the cells shown here.
total_bleu_score = 0
for index, source_sent in tqdm(enumerate(valid_source_sentences), total=len(valid_source_sentences)):
    target_sent = valid_target_sentences[index]
    source_sent_tokenized = nltk.word_tokenize(source_sent)
    # Starting from 0 / '' means a sentence with no overlapping 3-gram at all
    # gets the empty string as its prediction.
    max_overlap = 0
    max_overlap_target = ''
    for train_sent_tokenized, train_target_sent in train_data_tokenized:
        overlap = compute_ngram_overlap(source_sent_tokenized, train_sent_tokenized, n=3)
        # Strict comparison: the first training sentence reaching the maximum wins.
        if overlap > max_overlap:
            max_overlap = overlap
            max_overlap_target = train_target_sent

    all_predictions.append(max_overlap_target)
    all_targets.append(target_sent)
    total_overlap += max_overlap
    total_examples += 1

# Raises ZeroDivisionError when valid_source_sentences is empty.
average_overlap = total_overlap / total_examples
print(f"Average max n-gram overlap: {average_overlap}")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [4:34:17<00:00,  8.23s/it]

Average max n-gram overlap: 6.1745





In [16]:
# Whitespace-tokenize predictions and targets; each prediction is scored against
# a one-element list holding its single reference translation.
candidates = list(map(str.split, all_predictions))
references = [[reference_tokens] for reference_tokens in map(str.split, all_targets)]
dev_bleu = corpus_bleu(references, candidates)
print('Dev BLEU score: ', dev_bleu)

Dev BLEU score:  0.06364323991478013
