# IBM Model I

In [204]:
with open('corpus.en', 'r') as f, open('corpus.es', 'r') as g:
    english_sentences, spanish_sentences = f.readlines(), g.readlines()

## Reference Sentences

In [205]:
english_sentences, spanish_sentences = ['the dog', 'the chicken'], ['el perro', 'el pollo']

In [206]:
english_sentences, spanish_sentences = [sentence.split() for sentence in english_sentences], [sentence.split() for sentence in spanish_sentences]

# Compute $n(e)$ for Every English Word

In [207]:
from collections import defaultdict

candidate_mapping = defaultdict(set)

# Get the total number of spanish words that appear in the same sentence as this english word
for english_sentence in english_sentences:
    for english_word in english_sentence:
        # Already computed the all spanish words that occur in reference translations with this english word?
        if english_word in candidate_mapping:
            continue

        # Go through all pairs of english-spanish sentences, collecting all spanish words that appear in a translation with the english word
        candidate_mapping[english_word].update(
            spanish_word
            for english_sent, spanish_sent in zip(english_sentences, spanish_sentences)
            if english_word in english_sent for spanish_word in spanish_sent)

# For every $e$, initialize $t(f | e) = \frac{1}{n(e)}$ for all $f$ that occur with $e$

In [208]:
from collections import defaultdict

translation_prob = defaultdict(lambda: defaultdict(float))

english_words = set(word for sentence in english_sentences for word in sentence)
spanish_words = set(word for sentence in spanish_sentences for word in sentence)

for english_word in english_words:
    for spanish_word in candidate_mapping[english_word]:
        print 'translation_prob({} | {}) = {}'.format(spanish_word, english_word, 1./len(candidate_mapping[english_word]))
        translation_prob[spanish_word][english_word] = 1. / len(candidate_mapping[english_word])

translation_prob(el | the) = 0.333333333333
translation_prob(perro | the) = 0.333333333333
translation_prob(pollo | the) = 0.333333333333
translation_prob(el | dog) = 0.5
translation_prob(perro | dog) = 0.5
translation_prob(el | chicken) = 0.5
translation_prob(pollo | chicken) = 0.5


In [209]:
import dill

with open('translation_probs.p', 'wb') as f:
    dill.dump(translation_prob, f)

In [210]:
import dill

with open('translation_probs.p', 'rb') as f:
    translation_probs = dill.load(f)

# View Translation Probabilities

In [211]:
for spanish_word in spanish_words:
    for english_word in english_words:
        print 'translation_prob({} | {}) = {}'.format(spanish_word, english_word, translation_prob[spanish_word][english_word])

translation_prob(el | the) = 0.333333333333
translation_prob(el | dog) = 0.5
translation_prob(el | chicken) = 0.5
translation_prob(perro | the) = 0.333333333333
translation_prob(perro | dog) = 0.5
translation_prob(perro | chicken) = 0.0
translation_prob(pollo | the) = 0.333333333333
translation_prob(pollo | dog) = 0.0
translation_prob(pollo | chicken) = 0.5


# EM

# Compute $\delta(k, i, j)$ for all $k, i, j$

In [212]:
delta = {}

for k, (english_sentence, spanish_sentence) in enumerate(zip(english_sentences, spanish_sentences)):
    for i, spanish_word in enumerate(spanish_sentence):
        print '=========================================================================================='
        print 'Computing normalizing term \Sum_j translation_prob(s={} | e_j={})...'.format(spanish_word, english_sentence)
        print
        normalizing_term = 0
        for english_word in english_sentence:
            print '    translation_prob({} | {}) = {}'.format(spanish_word, english_word, translation_prob[spanish_word][english_word])
            normalizing_term += translation_prob[spanish_word][english_word]
            
        print
        
        print 'English sentence: {}'.format(zip(range(len(english_sentence)), english_sentence))
        print 'Spanish sentence: {}'.format(zip(range(len(spanish_sentence)), spanish_sentence))
        print
        
        for j, english_word in enumerate(english_sentence):
            delta[(k, i, j)] = translation_prob[spanish_word][english_word] / float(normalizing_term)
            print 'delta(k={}, i={}, j={}) = {}'.format(k, i, j, delta[(k, i, j)])
        print '=========================================================================================='
        
        print
        print

Computing normalizing term \Sum_j translation_prob(s=el | e_j=['the', 'dog'])...

    translation_prob(el | the) = 0.333333333333
    translation_prob(el | dog) = 0.5

English sentence: [(0, 'the'), (1, 'dog')]
Spanish sentence: [(0, 'el'), (1, 'perro')]

delta(k=0, i=0, j=0) = 0.4
delta(k=0, i=0, j=1) = 0.6


Computing normalizing term \Sum_j translation_prob(s=perro | e_j=['the', 'dog'])...

    translation_prob(perro | the) = 0.333333333333
    translation_prob(perro | dog) = 0.5

English sentence: [(0, 'the'), (1, 'dog')]
Spanish sentence: [(0, 'el'), (1, 'perro')]

delta(k=0, i=1, j=0) = 0.4
delta(k=0, i=1, j=1) = 0.6


Computing normalizing term \Sum_j translation_prob(s=el | e_j=['the', 'chicken'])...

    translation_prob(el | the) = 0.333333333333
    translation_prob(el | chicken) = 0.5

English sentence: [(0, 'the'), (1, 'chicken')]
Spanish sentence: [(0, 'el'), (1, 'pollo')]

delta(k=1, i=0, j=0) = 0.4
delta(k=1, i=0, j=1) = 0.6


Computing normalizing term \Sum_j translati

# Compute Expected Alignment Counts

In [213]:
from collections import defaultdict

expected_alignment_count = defaultdict(float)

for k, (english_sentence, spanish_sentence) in enumerate(zip(english_sentences, spanish_sentences)):
    for i, spanish_word in enumerate(spanish_sentence):  
        for j, english_word in enumerate(english_sentence):
            expected_alignment_count[(english_word, spanish_word)] += delta[(k, i, j)]
            expected_alignment_count[english_word] += delta[(k, i, j)]

In [214]:
for elem, count in expected_alignment_count.items():
    print '{}: {}'.format(elem, count)

('dog', 'el'): 0.6
('the', 'perro'): 0.4
('chicken', 'el'): 0.6
('the', 'el'): 0.8
('the', 'pollo'): 0.4
dog: 1.2
('chicken', 'pollo'): 0.6
chicken: 1.2
the: 1.6
('dog', 'perro'): 0.6


# Re-Estimate Translation Probabilities

In [215]:
print 'Old translation probabilities...'
print
for english_word in english_words:
    for spanish_word in translation_prob:
        if translation_prob[spanish_word][english_word]:
            print '    translation_problation_prob({} | {}) = {}'.format(spanish_word, english_word, translation_prob[spanish_word][english_word])

print
print
        
for english_word in english_words:
    for spanish_word in translation_prob:
        if translation_prob[spanish_word][english_word]:
            new_prob = expected_alignment_count[(english_word, spanish_word)] / expected_alignment_count[english_word]
            translation_prob[spanish_word][english_word] = expected_alignment_count[(english_word, spanish_word)] / expected_alignment_count[english_word]

print 'New translation probabilities...'
print
for english_word in english_words:
    for spanish_word in translation_prob:
        if translation_prob[spanish_word][english_word]:
            print '    translation_prob({} | {}) = {}'.format(spanish_word, english_word, translation_prob[spanish_word][english_word])

Old translation probabilities...

    translation_problation_prob(el | the) = 0.333333333333
    translation_problation_prob(perro | the) = 0.333333333333
    translation_problation_prob(pollo | the) = 0.333333333333
    translation_problation_prob(el | dog) = 0.5
    translation_problation_prob(perro | dog) = 0.5
    translation_problation_prob(el | chicken) = 0.5
    translation_problation_prob(pollo | chicken) = 0.5


New translation probabilities...

    translation_prob(el | the) = 0.5
    translation_prob(perro | the) = 0.25
    translation_prob(pollo | the) = 0.25
    translation_prob(el | dog) = 0.5
    translation_prob(perro | dog) = 0.5
    translation_prob(el | chicken) = 0.5
    translation_prob(pollo | chicken) = 0.5


# Recover Alignments

In [220]:
alignments = [''] * len(spanish_sentences)

for k, (english_sentence, spanish_sentence) in enumerate(zip(english_sentences, spanish_sentences)):
    
    alignment = [0] * len(spanish_sentence)
    for i, spanish_word in enumerate(spanish_sentence):
        
        most_likely_alignment = (None, 0)
        for english_word in english_sentence:
            print 'translation_prob({} | {}) = {}'.format(spanish_word, english_word, translation_prob[spanish_word][english_word])
            most_likely_alignment = (english_word, translation_prob[spanish_word][english_word]) if translation_prob[spanish_word][english_word] > most_likely_alignment[1] else most_likely_alignment
        
        print 'Setting the i={} alignment to {}...'.format(i, most_likely_alignment)
        alignment[i] = most_likely_alignment
    
    print 'Setting the k={} alignment to {}...'.format(k, alignments[i])
    alignments[k] = alignment

translation_prob(el | the) = 0.5
translation_prob(el | dog) = 0.5
Setting the i=0 alignment to ('the', 0.5)...
translation_prob(perro | the) = 0.25
translation_prob(perro | dog) = 0.5
Setting the i=1 alignment to ('dog', 0.5)...
Setting the k=0 alignment to ...
translation_prob(el | the) = 0.5
translation_prob(el | chicken) = 0.5
Setting the i=0 alignment to ('the', 0.5)...
translation_prob(pollo | the) = 0.25
translation_prob(pollo | chicken) = 0.5
Setting the i=1 alignment to ('chicken', 0.5)...
Setting the k=1 alignment to ...


In [221]:
alignments

[[('the', 0.5), ('dog', 0.5)], [('the', 0.5), ('chicken', 0.5)]]