In [1]:
from collections import Counter

In [2]:
# Size of the closed vocabulary: only the M most frequent words are kept.
M = 3
word_counts = {
    'happy': 5,
    'because': 3,
    'i': 2,
    'am': 2,
    'learning': 3,
    '.': 1,
}

# most_common yields (word, count) pairs; only the words are needed.
vocabulary = [word for word, _ in Counter(word_counts).most_common(M)]

print(f'the new vocabulary containing {M} most frequent words: '
      f'{vocabulary}\n')

the new vocabulary containing 3 most frequent words: ['happy', 'because', 'learning']



In [3]:
sentence = ['am', 'i', 'learning']
print(f'input sentence: {sentence}')

# Keep in-vocabulary words as they are; every other word becomes '<UNK>'.
output_sentence = [
    token if token in vocabulary else '<UNK>' for token in sentence]
print(f'output sentence: {output_sentence}')

input sentence: ['am', 'i', 'learning']
output sentence: ['<UNK>', '<UNK>', 'learning']


In [4]:
# Target frequency: print every word whose count in word_counts equals f.
f = 3
for word in (w for w, count in word_counts.items() if count == f):
    print(word)

because
learning


In [5]:
# Toy corpus, and the same corpus with some of its words replaced by '<UNK>'.
training_set = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '.']
training_set_unk = ['i', 'am', '<UNK>', '<UNK>', 'i', 'am', '<UNK>', '<UNK>']

# Sentence the perplexity is evaluated on, with and without '<UNK>'.
test_set = ['i', 'am', 'learning']
test_set_unk = ['i', 'am', '<UNK>']

# M is the number of words in the test set; it normalises the perplexity.
M = len(test_set)
probability = 1
probability_unk = 1

# Hand-entered bigram probabilities for each variant of the corpus.
bigram_probabilities = {
    ('i', 'am'): 1.0,
    ('am', 'happy'): 0.5,
    ('happy', 'because'): 1.0,
    ('because', 'i'): 1.0,
    ('am', 'learning'): 0.5,
    ('learning', '.'): 1.0,
}
bigram_probabilities_unk = {
    ('i', 'am'): 1.0,
    ('am', '<UNK>'): 1.0,
    ('<UNK>', '<UNK>'): 0.5,
    ('<UNK>', 'i'): 0.25,
}

# Walk over consecutive word pairs of both test sentences in lockstep and
# multiply the bigram probabilities together.
for bigram, bigram_unk in zip(zip(test_set, test_set[1:]),
                              zip(test_set_unk, test_set_unk[1:])):
    probability *= bigram_probabilities[bigram]
    probability_unk *= bigram_probabilities_unk[bigram_unk]

# Perplexity = P(sentence) ** (-1 / M).
perplexity = probability ** (-1 / M)
perplexity_unk = probability_unk ** (-1 / M)

# The values are computed on test_set / test_set_unk, so label them as such.
print(f'perplexity for the test set: {perplexity}')
print(f'perplexity for the test set with <UNK>: {perplexity_unk}')

perplexity for the training set: 1.2599210498948732
perplexity for the training set with <UNK>: 1.0


# Smoothing

In [6]:
def add_k_smoothing_probability(
        k, vocabulary_size, n_gram_count, n_gram_prefix_count):
    """Return the add-k smoothed probability estimate of an n-gram.

    The estimate is
    ``(n_gram_count + k) / (n_gram_prefix_count + k * vocabulary_size)``.

    Args:
        k: smoothing constant added to the n-gram count.
        vocabulary_size: number of words in the vocabulary.
        n_gram_count: count of the full n-gram.
        n_gram_prefix_count: count of the n-gram's prefix.

    Returns:
        The smoothed probability.

    Raises:
        ZeroDivisionError: if the denominator evaluates to zero.
    """
    return (n_gram_count + k) / (n_gram_prefix_count + k * vocabulary_size)

In [8]:
# NOTE: despite the "_probabilities" suffix, these two dictionaries hold raw
# n-gram counts, which is what add_k_smoothing_probability takes as input.
trigram_probabilities = {('i', 'am', 'happy'): 2}
# The prefix (conditioning context) of the trigram ('i', 'am', 'happy') is the
# bigram ('i', 'am'), so that is the count the smoothing denominator needs.
bigram_probabilities = {('i', 'am'): 10}
vocabulary_size = 5
k = 1

# Trigram that was seen in the corpus.
probability_known_trigram = add_k_smoothing_probability(
    k,
    vocabulary_size,
    trigram_probabilities[('i', 'am', 'happy')],
    bigram_probabilities[('i', 'am')])

# Trigram that was never seen: both its count and its prefix count are 0.
probability_unknown_trigram = add_k_smoothing_probability(
    k,
    vocabulary_size,
    0,
    0)

print(f'probability_known_trigram: {probability_known_trigram}')
print(f'probability_unknown_trigram: {probability_unknown_trigram}')

probability_known_trigram: 0.2
probability_unknown_trigram: 0.2


# Back-off

In [9]:
# Known n-gram probabilities; the stored trigram probability is 0.
trigram_probabilities = {('i', 'am', 'happy'): 0}
bigram_probabilities = {('am', 'happy'): 0.3}
unigram_probabilities = {'happy': 0.4}

# Discount applied once for every back-off step to a lower-order n-gram.
lambda_factor = 0.4
probability_hat_trigram = 0

# Trigram to estimate, together with its lower-order suffixes.
trigram = ('are', 'you', 'happy')
bigram, unigram = trigram[-2:], trigram[-1]
print(f'besides the trigram {trigram} we also use bigram {bigram} and '
      f'unigram ({unigram})\n')

besides the trigram ('are', 'you', 'happy') we also use bigram ('you', 'happy') and unigram (happy)



In [10]:
# Back-off: use the highest-order n-gram with a non-zero probability and
# multiply by lambda_factor once for every level stepped down.
if trigram_probabilities.get(trigram, 0) != 0:
    probability_hat_trigram = trigram_probabilities[trigram]
else:
    print(f'probability for trigram {trigram} not found')
    if bigram_probabilities.get(bigram, 0) != 0:
        probability_hat_trigram = (lambda_factor
                                   * bigram_probabilities[bigram])
    else:
        print(f'probability for bigram {bigram} not found')
        if unigram not in unigram_probabilities:
            # nothing left to back off to
            probability_hat_trigram = 0
        else:
            print(f'probability for unigram {unigram} found\n')
            # two back-off steps, hence two lambda factors
            probability_hat_trigram = (lambda_factor
                                       * lambda_factor
                                       * unigram_probabilities[unigram])
print(f'probability for trigram {trigram} estimated as '
      f'{probability_hat_trigram}')

probability for trigram ('are', 'you', 'happy') not found
probability for bigram ('you', 'happy') not found
probability for unigram happy found

probability for trigram ('are', 'you', 'happy') estimated as 0.06400000000000002


# Interpolation

In [11]:
# Known probabilities of the trigram and of its lower-order suffixes.
trigram_probabilities = {('i', 'am', 'happy'): 0.15}
bigram_probabilities = {('am', 'happy'): 0.3}
unigram_probabilities = {'happy': 0.4}

# Interpolation weights for the trigram, bigram and unigram terms
# (0.8 + 0.15 + 0.05 = 1).
lambda_1 = 0.8
lambda_2 = 0.15
lambda_3 = 0.05

trigram = ('i', 'am', 'happy')
bigram = trigram[1: 3]
unigram = trigram[2]
# The first literal must end with a space: adjacent f-string pieces are
# concatenated as-is, which previously printed "andunigram".
print(f'besides the trigram {trigram} we also use bigram {bigram} and '
      f'unigram ({unigram})\n')

# Linear interpolation: weighted sum of the three n-gram probabilities.
probability_hat_trigram = (lambda_1 * trigram_probabilities[trigram]
                           + lambda_2 * bigram_probabilities[bigram]
                           + lambda_3 * unigram_probabilities[unigram])
print(f'estimated probability of the input trigram {trigram} is '
      f'{probability_hat_trigram}')

besides the trigram ('i', 'am', 'happy') we also use bigram ('am', 'happy') andunigram (happy)

estimated probability of the input trigram ('i', 'am', 'happy') is 0.185
