# Statistical Machine Translation

In [81]:
from collections import defaultdict
import numpy as np
import string

def clean_sentence(sentence):
    '''
    Separate out punctuation into their own "words".
    arg:
        sentence (str): Sentence as a string (should not include new line character).
    return:
        cleaned_sentence (str): Sentence with all punctuation separated by whitespace.
    '''
    chars = []
    last_index = len(sentence) - 1
    for i, current in enumerate(sentence):
        if current not in string.punctuation:
            chars.append(current)
            continue
        # Always start from the bare punctuation mark so it is emitted even when
        # it is already surrounded by whitespace on both sides.
        token = current
        # Pad on the left unless we are at the start of the sentence or the text
        # emitted so far already ends in a space.
        if chars and not chars[-1].endswith(" "):
            token = " " + token
        # Pad on the right unless this is the last character or a space follows.
        if i != last_index and sentence[i + 1] != " ":
            token = token + " "
        chars.append(token)
    # Joining once, outside the loop, also makes the empty sentence return "".
    return ''.join(chars)

def read_text_into_array(filename):
    '''
    Load a text file as a list of lines with surrounding whitespace removed.
    arg:
        filename (str): Path of the text file to read.
    return:
        corpus_stripped (list): One entry per line of the file, with leading and
            trailing whitespace (including the new line character) stripped.
    '''
    corpus_stripped = []
    with open(filename, 'r') as handle:
        # Iterating the file object yields the same lines readlines() would.
        for raw_line in handle:
            corpus_stripped.append(raw_line.strip())
    return corpus_stripped

# Load the raw corpus, then split punctuation off every sentence.
corpus = read_text_into_array("corpus.txt")
corpus_clean = [clean_sentence(raw_sentence) for raw_sentence in corpus]
# Bare expression: echoes the cleaned corpus as the cell's output.
corpus_clean


['I love the boy .',
 'I love the dog .',
 'They love the dog .',
 'They talk to the girl .',
 'They talk to the dog .',
 'I talk to the mother .']

In [82]:
def unigram_count(corpus_clean):
    """
    Counts unigrams in a corpus.

    arg:
        corpus_clean (iterable of str): Sentences whose tokens are separated by whitespace.
    return:
        word_count (defaultdict): Maps each token to its number of occurrences
            (stored as a float); tokens that were never seen read as 0.0.
    """
    word_count = defaultdict(float)
    for sentence in corpus_clean:
        # Split each sentence once; the total is not tracked here because callers
        # can recover it from the returned counts.
        for word in sentence.split():
            word_count[word] += 1
    return word_count

# Count how often each token occurs across the cleaned corpus.
count_dict = unigram_count(corpus_clean)
# Bare expression: echoes the counts as the cell's output.
count_dict

defaultdict(float,
            {'I': 3.0,
             'love': 3.0,
             'the': 6.0,
             'boy': 1.0,
             '.': 6.0,
             'dog': 3.0,
             'They': 3.0,
             'talk': 3.0,
             'to': 3.0,
             'girl': 1.0,
             'mother': 1.0})

In [83]:
def unigram_prob(count_dict):
    """
    Turn unigram counts into relative frequencies.

    arg:
        count_dict (dict): Maps each word to its count.
    return:
        prob_dict (defaultdict): Maps each word to count / (sum of all counts);
            words that are not present read as 0.0.
    """
    total_words = sum(count_dict.values())
    relative_freqs = {
        token: occurrences / total_words
        for token, occurrences in count_dict.items()
    }
    return defaultdict(float, relative_freqs)

# Convert the raw token counts into unigram probabilities.
unigram_prob_dict = unigram_prob(count_dict)
# Bare expression: echoes the probabilities as the cell's output.
unigram_prob_dict

defaultdict(float,
            {'I': 0.09090909090909091,
             'love': 0.09090909090909091,
             'the': 0.18181818181818182,
             'boy': 0.030303030303030304,
             '.': 0.18181818181818182,
             'dog': 0.09090909090909091,
             'They': 0.09090909090909091,
             'talk': 0.09090909090909091,
             'to': 0.09090909090909091,
             'girl': 0.030303030303030304,
             'mother': 0.030303030303030304})

## Probability of each sentence in the corpus using a unigram model

In [84]:
# Under a unigram model a sentence's probability is the product of its
# words' individual probabilities.
# NOTE: the printed number is that raw product (a probability between 0 and 1),
# not a percentage, despite the '%' suffix in the output format.
for sentence in corpus_clean:
    sentence_prob = [unigram_prob_dict[token] for token in sentence.split()]
    print(f'{np.prod(sentence_prob)}%: {sentence}')

8.27895097412207e-06%: I love the boy .
2.483685292236621e-05%: I love the dog .
2.483685292236621e-05%: They love the dog .
7.526319067383701e-07%: They talk to the girl .
2.25789572021511e-06%: They talk to the dog .
7.526319067383701e-07%: I talk to the mother .


### Using the Bigram Model

In [85]:
def get_bigrams(corpus_clean):
    """
    Collect every pair of adjacent tokens from each sentence of the corpus.

    arg:
        corpus_clean (iterable of str): Sentences whose tokens are separated by whitespace.
    return:
        bigrams (list): Two-element lists [token, next_token], in corpus order.
    """
    bigrams = []
    for sentence in corpus_clean:
        tokens = sentence.split()
        # Pairing the tokens with themselves shifted by one yields each adjacent pair.
        bigrams.extend([left, right] for left, right in zip(tokens, tokens[1:]))
    return bigrams

def ngram_count(ngrams):
    """
    Tally how many times each n-gram occurs.

    arg:
        ngrams (iterable): N-grams, each given as a sequence of tokens.
    return:
        ngram_count_dict (defaultdict): Maps each n-gram (as a tuple) to its
            number of occurrences (stored as a float).
    """
    ngram_count_dict = defaultdict(float)
    for gram in ngrams:
        # Lists are unhashable, so the n-gram is converted to a tuple to serve as a key.
        key = tuple(gram)
        ngram_count_dict[key] = ngram_count_dict[key] + 1
    return ngram_count_dict

def bigram_prob(bigram_count_dict, count_dict):
    """
    Estimate the conditional probability of each bigram.

    arg:
        bigram_count_dict (dict): Maps each bigram (tuple of two tokens) to its count.
        count_dict (dict): Maps each single token to its count.
    return:
        bigram_prob_dict (defaultdict): Maps each bigram to
            count(bigram) / count(first token of the bigram).
    """
    bigram_prob_dict = defaultdict(float)
    # Divide each bigram count by the count of the single word that precedes
    # the second token, i.e. the bigram's first element.
    bigram_prob_dict.update(
        (pair, occurrences / count_dict[pair[0]])
        for pair, occurrences in bigram_count_dict.items()
    )
    return bigram_prob_dict

# Wrap every sentence in "start" and "end" tokens, then rebuild the unigram
# statistics on the wrapped corpus so the boundary tokens get counts too.
corpus_mod = [f"<s> {line} </s>" for line in corpus_clean]
count_dict = unigram_count(corpus_mod)
unigram_prob_dict = unigram_prob(count_dict)

# Bigram statistics over the wrapped corpus.
bigrams = get_bigrams(corpus_mod)
bigram_count_dict = ngram_count(bigrams)
bigram_prob_dict = bigram_prob(bigram_count_dict, count_dict)

for line in corpus_mod:
    line_split = line.split()
    # Seed the product with the unigram probability of the first token (<s>),
    # then multiply in one bigram probability per adjacent token pair.
    line_probs = [unigram_prob_dict[line_split[0]]]
    line_probs.extend(
        bigram_prob_dict[pair] for pair in zip(line_split, line_split[1:])
    )
    print(f'{np.prod(line_probs)} - {line}')


0.007407407407407406 - <s> I love the boy . </s>
0.02222222222222222 - <s> I love the dog . </s>
0.01111111111111111 - <s> They love the dog . </s>
0.007407407407407406 - <s> They talk to the girl . </s>
0.02222222222222222 - <s> They talk to the dog . </s>
0.003703703703703703 - <s> I talk to the mother . </s>


### Using Trigrams

In [86]:
def get_trigrams(corpus):
    '''
    Collect every run of three adjacent tokens from each sentence of the corpus.
    arg:
        corpus (iterable of str): Sentences whose tokens are separated by whitespace.
    return:
        trigrams (list): Three-element lists of consecutive tokens, in corpus order.
    '''
    trigrams = []
    for sentence in corpus:
        tokens = sentence.split()
        # Zipping the tokens with copies shifted by one and two yields each window of three.
        for window in zip(tokens, tokens[1:], tokens[2:]):
            trigrams.append(list(window))
    return trigrams

def trigram_prob(trigram_count_dict, bigram_count_dict):
    '''
    Estimate the conditional probability of each trigram.
    arg:
        trigram_count_dict (dict): Maps each trigram (tuple of tokens) to its count.
        bigram_count_dict (dict): Maps each bigram (tuple of two tokens) to its count.
    return:
        trigram_prob_dict (defaultdict): Maps each trigram to
            count(trigram) / count(its first two tokens as a bigram).
    '''
    conditional = {
        gram: occurrences / bigram_count_dict[gram[:2]]
        for gram, occurrences in trigram_count_dict.items()
    }
    return defaultdict(float, conditional)

# Trigram statistics over the corpus with start/end tokens.
trigrams = get_trigrams(corpus_mod)
trigram_count_dict = ngram_count(trigrams)
trigram_prob_dict = trigram_prob(trigram_count_dict, bigram_count_dict)

for line in corpus_mod:
    line_split = line.split()
    # Seed the product with the bigram probability p(word1 | <s>), which covers
    # the first word, then multiply in one trigram probability per window of three.
    line_probs = [bigram_prob_dict[tuple(line_split[:2])]]
    line_probs.extend(
        trigram_prob_dict[window]
        for window in zip(line_split, line_split[1:], line_split[2:])
    )
    print(f'{np.prod(line_probs)} - {line_split}')


0.1111111111111111 - ['<s>', 'I', 'love', 'the', 'boy', '.', '</s>']
0.2222222222222222 - ['<s>', 'I', 'love', 'the', 'dog', '.', '</s>']
0.1111111111111111 - ['<s>', 'They', 'love', 'the', 'dog', '.', '</s>']
0.1111111111111111 - ['<s>', 'They', 'talk', 'to', 'the', 'girl', '.', '</s>']
0.1111111111111111 - ['<s>', 'They', 'talk', 'to', 'the', 'dog', '.', '</s>']
0.05555555555555555 - ['<s>', 'I', 'talk', 'to', 'the', 'mother', '.', '</s>']
