# IBM Model I

In [320]:
# Read both sides of the parallel corpus. readlines() keeps the trailing
# newline, so every entry is one raw line of the corresponding file.
with open('corpus.en', 'r') as english_file:
    with open('corpus.es', 'r') as spanish_file:
        english_sentences = english_file.readlines()
        spanish_sentences = spanish_file.readlines()

## Reference Sentences

In [321]:
# Tiny hand-written parallel corpus. Rebinding both names replaces whatever
# they held before this cell ran.
english_sentences = ['the dog', 'the chicken']
spanish_sentences = ['el perro', 'el pollo']

In [322]:
# Tokenise every sentence on whitespace. Each English sentence additionally
# gets a leading 'NULL' token, so position 0 on the English side is always NULL.
# NOTE: this rebinds the input names, so the cell cannot be run twice in a row
# (the second run would call .split() on lists).
spanish_sentences = [line.split() for line in spanish_sentences]
english_sentences = [['NULL'] + line.split() for line in english_sentences]

# Compute $n(e)$ for Every English Word

In [323]:
from collections import defaultdict

# candidate_mapping[e] is the set of Spanish words that appear in at least one
# sentence pair together with the English word e; n(e) is the size of that set.
candidate_mapping = defaultdict(set)

# One pass over the aligned pairs is enough: every English word of a pair
# co-occurs with every Spanish word of the same pair. This builds the same
# sets as re-scanning the whole corpus once per distinct English word, without
# the vocabulary-times-corpus cost.
for english_sentence, spanish_sentence in zip(english_sentences, spanish_sentences):
    # set() so a word repeated inside one sentence is only processed once.
    for english_word in set(english_sentence):
        candidate_mapping[english_word].update(spanish_sentence)

# For every $e$, initialize $t(f | e) = \frac{1}{n(e)}$ for all $f$ that occur with $e$

In [326]:
from collections import defaultdict

# translation_prob[f][e] stores t(f | e). Both levels are defaultdicts, so
# reading an entry that was never set yields 0.0 (and inserts it).
translation_prob = defaultdict(lambda: defaultdict(float))

# Vocabulary of each side, collected over all tokenised sentences.
english_words = {word for sentence in english_sentences for word in sentence}
spanish_words = {word for sentence in spanish_sentences for word in sentence}

# Uniform start: each Spanish candidate of e receives 1 / n(e).
for english_word in english_words:
    candidates = candidate_mapping[english_word]
    for spanish_word in candidates:
        translation_prob[spanish_word][english_word] = 1. / len(candidates)

In [303]:
# import dill

# with open('translation_probs.p', 'wb') as f:
#     dill.dump(translation_prob, f)

# import dill

# with open('translation_probs.p', 'rb') as f:
#     translation_prob = dill.load(f)

# View Translation Probabilities

In [327]:
# Print every stored t(f | e), grouped by Spanish word.
for spanish_word, per_english_word in translation_prob.items():
    for english_word, probability in per_english_word.items():
        print('translation_prob({} | {}) = {}'.format(spanish_word, english_word, probability))

translation_prob(el | the) = 0.333333333333
translation_prob(el | NULL) = 0.333333333333
translation_prob(el | dog) = 0.5
translation_prob(el | chicken) = 0.5
translation_prob(perro | the) = 0.333333333333
translation_prob(perro | NULL) = 0.333333333333
translation_prob(perro | dog) = 0.5
translation_prob(pollo | the) = 0.333333333333
translation_prob(pollo | NULL) = 0.333333333333
translation_prob(pollo | chicken) = 0.5


# EM

# Compute $\delta(k, i, j)$ for all $k, i, j$

In [328]:
# delta[(k, i, j)] = t(f_i | e_j) / sum over j' of t(f_i | e_j'), computed for
# Spanish position i and English position j of sentence pair k.
delta = {}

for k, (english_sentence, spanish_sentence) in enumerate(zip(english_sentences, spanish_sentences)):
    for i, spanish_word in enumerate(spanish_sentence):
        print('==========================================================================================')
        print('Computing normalizing term \\Sum_j translation_prob(s={} | e_j={})...'.format(spanish_word, english_sentence))
        print('')

        # Look up t(spanish_word | e_j) once per English position; the same
        # values feed both the normaliser and the individual deltas below.
        scores = []
        for english_word in english_sentence:
            score = translation_prob[spanish_word][english_word]
            print('    translation_prob({} | {}) = {}'.format(spanish_word, english_word, score))
            scores.append(score)
        normalizing_term = sum(scores)

        print('')

        print('English sentence: {}'.format(list(enumerate(english_sentence))))
        print('Spanish sentence: {}'.format(list(enumerate(spanish_sentence))))
        print('')

        for j, score in enumerate(scores):
            delta[(k, i, j)] = score / float(normalizing_term)
            print('delta(k={}, i={}, j={}) = {}'.format(k, i, j, delta[(k, i, j)]))
        print('==========================================================================================')

        print('')
        print('')

Computing normalizing term \Sum_j translation_prob(s=el | e_j=['NULL', 'the', 'dog'])...

    translation_prob(el | NULL) = 0.333333333333
    translation_prob(el | the) = 0.333333333333
    translation_prob(el | dog) = 0.5

English sentence: [(0, 'NULL'), (1, 'the'), (2, 'dog')]
Spanish sentence: [(0, 'el'), (1, 'perro')]

delta(k=0, i=0, j=0) = 0.285714285714
delta(k=0, i=0, j=1) = 0.285714285714
delta(k=0, i=0, j=2) = 0.428571428571


Computing normalizing term \Sum_j translation_prob(s=perro | e_j=['NULL', 'the', 'dog'])...

    translation_prob(perro | NULL) = 0.333333333333
    translation_prob(perro | the) = 0.333333333333
    translation_prob(perro | dog) = 0.5

English sentence: [(0, 'NULL'), (1, 'the'), (2, 'dog')]
Spanish sentence: [(0, 'el'), (1, 'perro')]

delta(k=0, i=1, j=0) = 0.285714285714
delta(k=0, i=1, j=1) = 0.285714285714
delta(k=0, i=1, j=2) = 0.428571428571


Computing normalizing term \Sum_j translation_prob(s=el | e_j=['NULL', 'the', 'chicken'])...

    transl

# Compute Expected Alignment Counts

In [329]:
from collections import defaultdict

# A single table holds both kinds of expected count:
#   (english_word, spanish_word) -> sum of delta over every occurrence of the pair
#   english_word                 -> sum of delta over every occurrence of the word
expected_alignment_count = defaultdict(float)

for k, (english_sentence, spanish_sentence) in enumerate(zip(english_sentences, spanish_sentences)):
    for i, spanish_word in enumerate(spanish_sentence):
        for j, english_word in enumerate(english_sentence):
            posterior = delta[(k, i, j)]
            expected_alignment_count[(english_word, spanish_word)] += posterior
            expected_alignment_count[english_word] += posterior

# View Expected Alignment Counts

In [330]:
# Keys are either an English word or an (english_word, spanish_word) tuple.
for elem in expected_alignment_count:
    print('{}: {}'.format(elem, expected_alignment_count[elem]))

('dog', 'el'): 0.428571428571
('the', 'perro'): 0.285714285714
('NULL', 'el'): 0.571428571429
('chicken', 'el'): 0.428571428571
('the', 'el'): 0.571428571429
('NULL', 'perro'): 0.285714285714
('the', 'pollo'): 0.285714285714
dog: 0.857142857143
('chicken', 'pollo'): 0.428571428571
('NULL', 'pollo'): 0.285714285714
chicken: 0.857142857143
the: 1.14285714286
NULL: 1.14285714286
('dog', 'perro'): 0.428571428571


# Re-Estimate Translation Probabilities

In [331]:
def _show_translation_probs(table):
    """Print every t(f | e) stored in ``table``, one indented line per entry.

    ``table`` is indexed as ``table[spanish_word][english_word]``.
    """
    for spanish_word in table:
        for english_word in table[spanish_word]:
            print('    translation_prob({} | {}) = {}'.format(spanish_word, english_word, table[spanish_word][english_word]))


print('Old translation probabilities...')
print('')
_show_translation_probs(translation_prob)

print('')
print('')

# Re-estimate in place: t(f | e) = c(e, f) / c(e). Only existing entries are
# overwritten, so the dictionaries keep their size while being iterated.
for spanish_word in translation_prob:
    for english_word in translation_prob[spanish_word]:
        new_prob = expected_alignment_count[(english_word, spanish_word)] / expected_alignment_count[english_word]
        translation_prob[spanish_word][english_word] = new_prob

print('New translation probabilities...')
print('')
_show_translation_probs(translation_prob)

Old translation probabilities...

    translation_problation_prob(el | the) = 0.333333333333
    translation_problation_prob(el | NULL) = 0.333333333333
    translation_problation_prob(el | dog) = 0.5
    translation_problation_prob(el | chicken) = 0.5
    translation_problation_prob(perro | the) = 0.333333333333
    translation_problation_prob(perro | NULL) = 0.333333333333
    translation_problation_prob(perro | dog) = 0.5
    translation_problation_prob(pollo | the) = 0.333333333333
    translation_problation_prob(pollo | NULL) = 0.333333333333
    translation_problation_prob(pollo | chicken) = 0.5


New translation probabilities...

    translation_prob(el | the) = 0.5
    translation_prob(el | NULL) = 0.5
    translation_prob(el | dog) = 0.5
    translation_prob(el | chicken) = 0.5
    translation_prob(perro | the) = 0.25
    translation_prob(perro | NULL) = 0.25
    translation_prob(perro | dog) = 0.5
    translation_prob(pollo | the) = 0.25
    translation_prob(pollo | NULL) = 0

# Recover Alignments

In [332]:
# alignments[k] ends up as a list with one (english_word, probability, j)
# triple per Spanish word of sentence pair k. Slots of sentences that have no
# English counterpart keep the '' placeholder.
alignments = [''] * len(spanish_sentences)

for k, (english_sentence, spanish_sentence) in enumerate(zip(english_sentences, spanish_sentences)):
    print('Aligning {} to {}...'.format(spanish_sentence, english_sentence))
    print('')

    alignment = []
    for spanish_word in spanish_sentence:
        print('    Aligning {}...'.format(spanish_word))

        # The sentinel is a triple like every real entry, so the unpacking
        # further down cannot fail. The first English position always replaces
        # it, even with probability 0; afterwards only a strictly larger
        # probability wins, so ties resolve to the earliest position.
        most_likely_alignment = (None, 0, None)
        for j, english_word in enumerate(english_sentence):
            probability = translation_prob[spanish_word][english_word]
            print('        translation_prob({} | {}) = {}'.format(spanish_word, english_word, probability))
            if most_likely_alignment[2] is None or probability > most_likely_alignment[1]:
                most_likely_alignment = (english_word, probability, j)

        print('')
        print('        Aligning {} to {}...'.format(spanish_word, most_likely_alignment))
        print('')
        alignment.append(most_likely_alignment)

    aligned_words = [entry[0] for entry in alignment]
    print('k={} alignment: {}'.format(k, list(zip(spanish_sentence, aligned_words))))
    alignments[k] = alignment
    print('')
    print('')

Aligning ['el', 'perro'] to ['NULL', 'the', 'dog']...

    Aligning el...
        translation_prob(el | NULL) = 0.5
        translation_prob(el | the) = 0.5
        translation_prob(el | dog) = 0.5

        Aligning el to ('NULL', 0.5, 0)...

    Aligning perro...
        translation_prob(perro | NULL) = 0.25
        translation_prob(perro | the) = 0.25
        translation_prob(perro | dog) = 0.5

        Aligning perro to ('dog', 0.5, 2)...

k=0 alignment: [('el', 'NULL'), ('perro', 'dog')]


Aligning ['el', 'pollo'] to ['NULL', 'the', 'chicken']...

    Aligning el...
        translation_prob(el | NULL) = 0.5
        translation_prob(el | the) = 0.5
        translation_prob(el | chicken) = 0.5

        Aligning el to ('NULL', 0.5, 0)...

    Aligning pollo...
        translation_prob(pollo | NULL) = 0.25
        translation_prob(pollo | the) = 0.25
        translation_prob(pollo | chicken) = 0.5

        Aligning pollo to ('chicken', 0.5, 2)...

k=1 alignment: [('el', 'NULL'), ('poll

# Post-Process Alignments

In [334]:
# Reduce every (english_word, probability, j) triple to its English position j.
# NOTE: this rebinds `alignments`, so the cell cannot be run twice in a row.
alignments = [
    [english_position for _word, _probability, english_position in sentence_alignment]
    for sentence_alignment in alignments
]

In [335]:
# Bare last expression: the notebook renders the value of `alignments`.
alignments

[[0, 2], [0, 2]]