In [17]:
import pickle
from nltk.corpus import comtrans
import csv
import json
import math
from nltk.metrics import *
import nltk

In [18]:
def create_corpus_en_hi(source_lang_file, target_lang_file):
    sentence_pairs = []
    e_file = open(target_lang_file, "r")
    f_file = open(source_lang_file, "r")
    e_sentences = e_file.readlines()
    f_sentences = f_file.readlines()

    for e_s, f_s in zip(e_sentences, f_sentences):
        s_pair = {'e': e_s.lower(), 'f': f_s}
        sentence_pairs.append(s_pair)

    e_file.close()
    f_file.close()

    return sentence_pairs

In [19]:
def create_corpus_en_fr(aligned_sentences):
    sentence_pairs = []
    alignments = []
    for aligned_sentence in aligned_sentences:
        e_s = " ".join(aligned_sentence.words)
        f_s = " ".join(aligned_sentence.mots)
        s_pair = {'e': e_s.lower(), 'f': f_s.lower()}
        sentence_pairs.append(s_pair)
        alignments.append(aligned_sentence.alignment)
    return sentence_pairs, alignments

In [20]:
def create_vocabulary(sentence_pairs):
    e_words_set = set()
    f_words_set = set()
    for pair in sentence_pairs:
        for word in pair['e'].split():
            e_words_set.add(word)
        for word in pair['f'].split():
            f_words_set.add(word)
    words = {'e': e_words_set, 'f': f_words_set}
    return words

In [21]:
def train_model(sentence_pairs):
    words = create_vocabulary(sentence_pairs)
    e_words_size = len(words['e'])
    f_words_size = len(words['f'])

    translation_prob = {}

    num_iterations = 20
    curr_iteration = 1
    while curr_iteration <= num_iterations:
        print("Iteration number:", curr_iteration)
        count = {}
        total = {f_word: 0 for f_word in words['f']}

        s_total = {e_word: 0 for e_word in words['e']}
        for pair in sentence_pairs:
            for e_word in pair['e'].split():
                if e_word not in translation_prob:
                    translation_prob[e_word] = {}
                if e_word not in count:
                    count[e_word] = {}
                s_total[e_word] = 0
                for f_word in pair['f'].split():
                    if f_word not in translation_prob[e_word]:
                        translation_prob[e_word][f_word] = 1 / f_words_size
                    if f_word not in count[e_word]:
                        count[e_word][f_word] = 0
                    s_total[e_word] += translation_prob[e_word][f_word]

            for e_word in pair['e'].split():
                for f_word in pair['f'].split():
                    count[e_word][f_word] += translation_prob[e_word][f_word] / \
                        s_total[e_word]
                    total[f_word] += translation_prob[e_word][f_word] / \
                        s_total[e_word]

        for e_word in translation_prob:
            for f_word in translation_prob[e_word]:
                translation_prob[e_word][f_word] = count[e_word][f_word] / \
                    total[f_word]

        curr_iteration += 1

    return translation_prob

In [22]:
def alignment_scores(sentence_pairs, translation_prob):
    my_alignments = []
    for sent in sentence_pairs:
        e_words = sent['e'].split()
        f_words = sent['f'].split()
        sent_alignment = []
        for e in e_words:
            max_prob = 0
            ind = -1
            for f in f_words:
                if f in translation_prob[e]:
                    if translation_prob[e][f] > max_prob:
                        max_prob = translation_prob[e][f]
                        ind = f_words.index(f)
            curr_alignment = (e_words.index(e), ind)
            sent_alignment.append(curr_alignment)
        my_alignments.append(sent_alignment)
    return my_alignments

In [23]:
# English French corpus
aligned_sentences = comtrans.aligned_sents('alignment-en-fr.txt')
sentence_pairs, alignments = create_corpus_en_fr(aligned_sentences)
ind = math.floor(0.8 * len(sentence_pairs))
sentence_pairs_train = sentence_pairs[:ind]
sentence_pairs_test = sentence_pairs[ind:]
translation_prob = train_model(sentence_pairs)

outfile = open("translation_probablities_en-fr", "wb")
pickle.dump(translation_prob, outfile)
outfile.close()
json_dump = json.dumps(translation_prob)
f = open("translation_probabilities_en-fr.json", "w")
f.write(json_dump)
f.close()

Iteration number: 1
Iteration number: 2
Iteration number: 3
Iteration number: 4
Iteration number: 5
Iteration number: 6
Iteration number: 7
Iteration number: 8
Iteration number: 9
Iteration number: 10
Iteration number: 11
Iteration number: 12
Iteration number: 13
Iteration number: 14
Iteration number: 15
Iteration number: 16
Iteration number: 17
Iteration number: 18
Iteration number: 19
Iteration number: 20


In [24]:
file = open("translation_probabilities_en-fr.txt", "w")
for e in translation_prob:
    for f in translation_prob[e]:
        if translation_prob[e][f] > 0.3:
            file.write(e + " " + f + " : " + str(translation_prob[e][f]) + "\n")
file.close()

In [25]:
# infile = open('translation_probablities_en-fr', 'rb')
# translation_prob = pickle.load(infile)
# infile.close()
file = open("translation_probabilities_highest_en-fr.txt", "w")
for e in translation_prob:
    max_prob = 0
    aligned_word = ''
    for f in translation_prob[e]:
        if translation_prob[e][f] > max_prob:
            max_prob = translation_prob[e][f]
            aligned_word = f
    file.write(e + " : " + aligned_word + " = " + str(max_prob) + "\n")
file.close()

In [26]:
my_alignments = alignment_scores(sentence_pairs_test, translation_prob)
total_prec = 0
total_rec = 0
total_aer = 0
total_num = 0
for (als, my_als) in zip(alignments[ind:], my_alignments):
    my_als_set = set()
    for al in my_als:
        my_als_set.add(al)
    als_set = set(als)
    total_prec += precision(als_set, my_als_set)
    total_rec += recall(als_set, my_als_set)
    total_aer += nltk.metrics.alignment_error_rate(als_set, my_als_set)
    total_num += 1
prec = total_prec / total_num
rec = total_rec / total_num
aer = total_aer / total_num
print("Precision =", prec)
print("Recall =", rec)
print("Alignment Error Rate =", aer)

Precision = 0.6834336688145523
Recall = 0.571224106012276
Alignment Error Rate = 0.38116104308386844


In [27]:
# English Hindi corpus
data_directory = './Data'
source_lang_file = './Data/train.hi'
target_lang_file = './Data/train.en'
sentence_pairs = create_corpus_en_hi(source_lang_file, target_lang_file)
translation_prob = train_model(sentence_pairs)

outfile = open("translation_probablities_en-hi", "wb")
pickle.dump(translation_prob, outfile)
outfile.close()
json_dump = json.dumps(translation_prob)
f = open("translation_probabilities_en-hi.json", "w")
f.write(json_dump)
f.close()

Iteration number: 1
Iteration number: 2
Iteration number: 3
Iteration number: 4
Iteration number: 5
Iteration number: 6
Iteration number: 7
Iteration number: 8
Iteration number: 9
Iteration number: 10
Iteration number: 11
Iteration number: 12
Iteration number: 13
Iteration number: 14
Iteration number: 15
Iteration number: 16
Iteration number: 17
Iteration number: 18
Iteration number: 19
Iteration number: 20


In [28]:
file = open("translation_probabilities_en-hi.txt", "w")
for e in translation_prob:
    for f in translation_prob[e]:
        if translation_prob[e][f] > 0.3:
            file.write(e + " " + f + " : " + str(translation_prob[e][f]) + "\n")
file.close()

In [29]:
# infile = open('translation_probablities_en-hi', 'rb')
# translation_prob = pickle.load(infile)
# infile.close()
file = open("translation_probabilities_highest_en-hi.txt", "w")
for e in translation_prob:
    max_prob = 0
    aligned_word = ''
    for f in translation_prob[e]:
        if translation_prob[e][f] > max_prob:
            max_prob = translation_prob[e][f]
            aligned_word = f
    file.write(e + " : " + aligned_word + " = " + str(max_prob) + "\n")
file.close()