In [33]:
from collections import defaultdict

START = 'START'
STOP = 'STOP'
k = 1

# This is the training data
x1 = 'START a b c STOP'
x2 = 'START a c d STOP'
x3 = 'START b d e a STOP'

V = set()

_FILL_IN_ = 'FILL IN'

In [34]:
# Get the relevant counts
S = [x1, x2, x3]
count_2 = defaultdict(int)
count_1 = defaultdict(int)
theta_1 = defaultdict(float)
theta = defaultdict(float)

In [35]:
# Split the data and get what you need
for x in S:
    words = x.split()  # Split the sentence into words
    for word in words:
        V.add(word)  # Add the word to the vocabulary set
    for i in range(len(words) - 1):
        count_2[(words[i], words[i + 1])] += 1  # Count bigrams
        count_1[words[i]] += 1  # Count unigrams
    count_1[words[-1]] += 1  # Count the last word of the sentence for unigrams


In [36]:
assert(len(V) == 7)
V.remove(START)
# Unigram ML estimates
# Note that START should not be a key
# Don't remove START from count_1 as we will need it in the demoninator
for u in V:
    theta_1[u] = count_1[u] / (sum(count_1.values()) - count_1[START])


In [37]:
# Remove START and STOP form the dictionary - these are not true words
V.discard(STOP)
V.discard(START)
assert(len(V) == 5)

In [38]:
# Get the smoothed estimates
for u in V:
    for v in V | set([STOP]):
        theta[(u, v)] = (count_2[(u, v)] + k*theta_1[v]) / (count_1[u] + k)


# For u = START, get the smoothed probabilities
for v in V | set([STOP]):
    theta[(START, v)] = (count_2[(START, v)] + k*theta_1[v]) / (count_1[START] + k)


In [39]:
# Check that the sum is 1, as it should be
for u in V | set([START]):
    p_sum = 0.0
    p_sum_1 = 0.0
    for v in V | set([STOP]):
        p_sum += theta[(u, v)]
        p_sum_1 += theta_1[v]
    p_sum_1 += theta_1[START]
    assert(abs(p_sum_1 - 1.0) ** 2 <= 0.00001)
    assert(abs(p_sum - 1.0) ** 2 <= 0.0001)

In [40]:
print("Vocabulary V:", V)

print("Unigram probabilities theta_1:")
for word, unigram_prob in theta_1.items():
    print(f"P({word}) = {unigram_prob}")

print("Smoothed Bigram probabilities theta:")
for bigram, bigram_prob in theta.items():
    print(f"P({bigram[0]}|{bigram[1]}) = {bigram_prob}")


Vocabulary V: {'b', 'a', 'd', 'e', 'c'}
Unigram probabilities theta_1:
P(STOP) = 0.23076923076923078
P(b) = 0.15384615384615385
P(a) = 0.23076923076923078
P(d) = 0.15384615384615385
P(e) = 0.07692307692307693
P(c) = 0.15384615384615385
P(START) = 0.0
Smoothed Bigram probabilities theta:
P(b|STOP) = 0.07692307692307693
P(b|b) = 0.05128205128205129
P(b|d) = 0.3846153846153846
P(b|e) = 0.025641025641025644
P(b|c) = 0.3846153846153846
P(b|a) = 0.07692307692307693
P(a|STOP) = 0.3076923076923077
P(a|b) = 0.28846153846153844
P(a|d) = 0.038461538461538464
P(a|e) = 0.019230769230769232
P(a|c) = 0.28846153846153844
P(a|a) = 0.057692307692307696
P(d|STOP) = 0.4102564102564103
P(d|b) = 0.05128205128205129
P(d|d) = 0.05128205128205129
P(d|e) = 0.358974358974359
P(d|c) = 0.05128205128205129
P(d|a) = 0.07692307692307693
P(e|STOP) = 0.11538461538461539
P(e|b) = 0.07692307692307693
P(e|d) = 0.07692307692307693
P(e|e) = 0.038461538461538464
P(e|c) = 0.07692307692307693
P(e|a) = 0.6153846153846154
P(c|ST

In [41]:
import numpy as np

likelihood = 1
log_likelihood = 0

for x in S:
    words = x.split()
    bigrams = zip(words, words[1:])

    likelihood *= np.prod([theta[bigram] for bigram in bigrams])
    log_likelihood += np.sum([np.log(theta[bigram]) for bigram in bigrams])


In [42]:
specific_bigrams_counts = {
    ('START', 'a'): count_2[('START', 'a')],
    ('b', 'c'): count_2[('b', 'c')],
    ('a', 'e'): count_2[('a', 'e')]
}


In [44]:
print("Likelihood:", likelihood)
print("Log Likelihood:", log_likelihood)
print("Counts for specific bigrams:", specific_bigrams_counts)


Likelihood: 4.859263663359929e-06
Log Likelihood: 0.0
Counts for specific bigrams: {('START', 'a'): 2, ('b', 'c'): 1, ('a', 'e'): 0}
