# In [None]:
import numpy as np   
import pandas as pd
import sys

# Paths to the line-aligned parallel corpus: line k of the English file is
# paired with line k of the French file.
english_data = "data/hansards.e"
french_data = "data/hansards.f"

# sentence_pairs[k] == [english_tokens, french_tokens] for the k-th line pair.
sentence_pairs = []
english_vocab = set()
french_vocab = set()

# Context managers guarantee both files are closed, even if reading fails.
# NOTE(review): the files are read with the platform default encoding, as
# before; pass an explicit encoding here if the corpus encoding is known.
with open(english_data) as english_file, open(french_data) as french_file:
    # zip() stops at the shorter file, so unmatched trailing lines are ignored.
    for english_line, french_line in zip(english_file, french_file):
        # str.split() with no argument already discards leading/trailing
        # whitespace, so a separate strip() is unnecessary.
        english_tokens = english_line.split()
        french_tokens = french_line.split()
        sentence_pairs.append([english_tokens, french_tokens])

        # Collect unique words directly instead of storing every token first.
        english_vocab.update(english_tokens)
        french_vocab.update(french_tokens)

# Sorted so that the word -> index mapping is reproducible between runs
# (iteration order of a set of strings varies with hash randomisation).
english_words = sorted(english_vocab)
french_words = sorted(french_vocab)

# Map each word to its row (English) / column (French) index.
english_words_i = {english: i for i, english in enumerate(english_words)}
french_words_i = {french: i for i, french in enumerate(french_words)}

# Number of EM passes over the corpus.
em_iterations = 5

# trans_prob[e, f] is the translation probability of English word e given
# French word f.  It starts uniform over the English vocabulary, so every
# column (French word) sums to 1.
trans_prob = np.ones([len(english_words), len(french_words)], dtype=np.float32) / len(english_words)

for _ in range(em_iterations):

    # Expected counts collected during the E-step of this iteration.
    count_pair = np.zeros([len(english_words), len(french_words)], dtype=np.float32)
    count_french = np.zeros(len(french_words), dtype=np.float32)

    for english, french in sentence_pairs:
        if not english or not french:
            # Nothing to align; an empty side contributes no counts.
            continue

        index_e = [english_words_i[k] for k in english]  # row index per English token
        index_f = [french_words_i[k] for k in french]  # column index per French token
        pair_index = np.ix_(index_e, index_f)

        # Fancy indexing returns a fresh (len(english), len(french)) array.
        sub_array = trans_prob[pair_index]
        # For each English token, spread one unit of count over the French
        # tokens of the sentence in proportion to the current probabilities.
        norm = sub_array / sub_array.sum(axis=1, keepdims=True)

        # np.add.at accumulates correctly when a word occurs more than once in
        # a sentence.  A plain `a[idx] = a[idx] + v` keeps only the last write
        # for a repeated index and silently drops the other contributions.
        np.add.at(count_pair, pair_index, norm)
        np.add.at(count_french, index_f, norm.sum(axis=0))

    # M-step: renormalise per French word.  Columns whose total is zero (a
    # French word seen only next to empty English sentences) are set to 0
    # instead of producing NaN from 0/0.
    trans_prob = np.divide(
        count_pair,
        count_french,
        out=np.zeros_like(count_pair),
        where=count_french > 0,
    )

# Alignment: for every French token, pick the English token of the same
# sentence with the highest trans_prob entry and print one line per sentence
# made of "<french position>-<english position> " items.
for english_sentence, french_sentence in sentence_pairs:
    # Resolve the English row indices once per sentence.
    english_rows = [english_words_i[word] for word in english_sentence]

    links = []
    for french_pos, french_word in enumerate(french_sentence):
        column = french_words_i[french_word]

        # Position 0 is reported when no score is strictly greater than 0.
        top_score, top_pos = 0, 0
        for english_pos, row in enumerate(english_rows):
            score = trans_prob[row, column]
            if score > top_score:
                # Strict comparison: on ties the earliest position wins.
                top_score, top_pos = score, english_pos

        links.append("%i-%i " % (french_pos, top_pos))

    sys.stdout.write("".join(links))
    sys.stdout.write("\n")