# IBM Model I

In [1]:
# Read the parallel corpus: one list of raw lines per language, in file order.
# Both files are opened before either one is read.
with open('corpus.en', 'r') as english_file, open('corpus.es', 'r') as spanish_file:
    english_sentences = english_file.readlines()
    spanish_sentences = spanish_file.readlines()

## Reference Sentences

In [2]:
# Tiny hand-written parallel corpus; entry i of each list is a translation pair.
english_sentences = ['the dog', 'the chicken']
spanish_sentences = ['el perro', 'el pollo']

In [3]:
# Tokenise on whitespace: each sentence string becomes a list of word strings.
# Both names are rebound in place, so this cell expects untokenised strings.
english_sentences = [sentence.split() for sentence in english_sentences]
spanish_sentences = [sentence.split() for sentence in spanish_sentences]

# Compute $n(e)$ for Every English Word

In [4]:
from collections import defaultdict

# candidate_mapping[e] is the set of Spanish words that appear in at least one
# sentence pair whose English side contains the English word e.
candidate_mapping = defaultdict(set)

# Single pass over the aligned sentence pairs: every Spanish word in a pair is
# a candidate translation for every English word in that same pair. This
# replaces a full rescan of the corpus for each distinct English word.
# Sentences are paired positionally by zip, so any trailing sentence without a
# partner on the other side contributes nothing.
for english_sentence, spanish_sentence in zip(english_sentences, spanish_sentences):
    for english_word in english_sentence:
        candidate_mapping[english_word].update(spanish_sentence)

# For every $e$, initialize $t(f | e) = \frac{1}{n(e)}$ for all $f$ that occur with $e$

In [5]:
from collections import defaultdict

# translation_prob[f][e] stores t(f | e); missing entries read as 0.0.
translation_prob = defaultdict(lambda: defaultdict(float))

# Vocabularies: every distinct word on each side of the corpus.
english_words = {word for sentence in english_sentences for word in sentence}
spanish_words = {word for sentence in spanish_sentences for word in sentence}

# Uniform initialisation: each Spanish word that co-occurs with e gets 1 / n(e),
# where n(e) is the number of distinct Spanish words co-occurring with e.
for english_word in english_words:
    candidates = candidate_mapping[english_word]
    if not candidates:
        # No co-occurring Spanish word: nothing to initialise, and skipping
        # here avoids dividing by zero below.
        continue

    uniform_prob = 1. / len(candidates)
    for spanish_word in candidates:
        # Single-argument print call behaves the same on Python 2 and Python 3.
        print('translation_prob({} | {}) = {}'.format(spanish_word, english_word, uniform_prob))
        translation_prob[spanish_word][english_word] = uniform_prob

translation_prob(el | the) = 0.333333333333
translation_prob(perro | the) = 0.333333333333
translation_prob(pollo | the) = 0.333333333333
translation_prob(el | dog) = 0.5
translation_prob(perro | dog) = 0.5
translation_prob(el | chicken) = 0.5
translation_prob(pollo | chicken) = 0.5


In [19]:
import dill

# Persist the translation table to disk in binary mode using dill.
with open('translation_probs.p', 'wb') as out_file:
    dill.dump(translation_prob, out_file)

In [6]:
import dill

# Reload the saved translation table from disk into `translation_probs`.
# NOTE(review): dill, like pickle, can run arbitrary code while loading, so
# only load files that this notebook itself produced.
with open('translation_probs.p', 'rb') as in_file:
    translation_probs = dill.load(in_file)

# View Translation Probabilities

In [7]:
# Print t(f | e) for every (Spanish, English) word pair; pairs with no stored
# probability are shown as 0.0.
# NOTE(review): this reads `translation_prob`, while the load cell above binds
# `translation_probs` (plural) -- confirm which table is meant to be viewed.
for spanish_word in spanish_words:
    # Use .get() rather than [] so that viewing the table does not make the
    # nested defaultdicts insert a 0.0 entry for every unseen pair.
    row = translation_prob.get(spanish_word, {})
    for english_word in english_words:
        # Single-argument print call behaves the same on Python 2 and Python 3.
        print('translation_prob({} | {}) = {}'.format(spanish_word, english_word, row.get(english_word, 0.0)))

translation_prob(el | the) = 0.333333333333
translation_prob(el | dog) = 0.5
translation_prob(el | chicken) = 0.5
translation_prob(perro | the) = 0.333333333333
translation_prob(perro | dog) = 0.5
translation_prob(perro | chicken) = 0.0
translation_prob(pollo | the) = 0.333333333333
translation_prob(pollo | dog) = 0.0
translation_prob(pollo | chicken) = 0.5
