In [4]:
from collections import defaultdict
import math

# Toy training corpus: four short sentences with overlapping vocabulary so
# that some bigrams repeat and others never occur.
corpus = [
    "I love natural language processing",
    "I love machine learning",
    "natural language processing is fun",
    "machine learning is interesting",
]

# Whitespace tokenization: str.split() with no argument splits on runs of
# whitespace. Case and punctuation are left untouched.
tokenized_corpus = list(map(str.split, corpus))

def create_bigrams(tokenized_sentences):
    """Collect every adjacent token pair from each tokenized sentence.

    Args:
        tokenized_sentences: Iterable of token sequences (e.g. lists of
            words). Each sequence must support slicing.

    Returns:
        A list of ``(token, next_token)`` tuples in corpus order. Pairs never
        span a sentence boundary; a sentence with fewer than two tokens
        contributes nothing.
    """
    return [
        pair
        for sentence in tokenized_sentences
        # Zipping a sentence with itself shifted by one yields adjacent pairs.
        for pair in zip(sentence, sentence[1:])
    ]

# Unigram frequency table: how often each token occurs anywhere in the corpus.
unigram_counts = defaultdict(int)
for sentence in tokenized_corpus:
    for word in sentence:
        unigram_counts[word] += 1

# Bigram frequency table, built from the flat list of adjacent token pairs.
bigrams = create_bigrams(tokenized_corpus)
bigram_counts = defaultdict(int)
for bigram in bigrams:
    bigram_counts[bigram] += 1

# Number of distinct token types; this is the V term in add-one smoothing.
vocab_size = len(unigram_counts)

def unsmoothed_bigram_prob(bigram):
    """Return the maximum-likelihood estimate of P(bigram[1] | bigram[0]).

    The estimate is count(bigram) / count(bigram[0]), read from the
    module-level ``bigram_counts`` and ``unigram_counts`` tables.

    Args:
        bigram: A ``(context_word, next_word)`` tuple.

    Returns:
        The relative frequency as a float, or ``0.0`` when the context word
        has a zero count (previously this raised ``ZeroDivisionError``).
    """
    # Use .get() rather than [] so that a query never inserts a zero-count
    # key into the defaultdict tables as a side effect.
    context_count = unigram_counts.get(bigram[0], 0)
    if context_count == 0:
        return 0.0
    return bigram_counts.get(bigram, 0) / context_count

def laplace_smoothed_bigram_prob(bigram):
    """Return the add-one (Laplace) smoothed estimate of P(bigram[1] | bigram[0]).

    The estimate is (count(bigram) + 1) / (count(bigram[0]) + vocab_size),
    read from the module-level ``bigram_counts``, ``unigram_counts`` and
    ``vocab_size``.

    Args:
        bigram: A ``(context_word, next_word)`` tuple.

    Returns:
        The smoothed probability as a float. Unseen bigrams and unseen
        context words are treated as having a count of zero.
    """
    # Use .get() rather than [] so that querying an unseen bigram or context
    # word does not add a zero-count entry to the defaultdict tables.
    pair_count = bigram_counts.get(bigram, 0)
    context_count = unigram_counts.get(bigram[0], 0)
    return (pair_count + 1) / (context_count + vocab_size)

# Demo queries: three bigrams that occur in the corpus and one
# ("language", "machine") that never does, to show the effect of smoothing.
test_bigrams = [
    ("I", "love"),
    ("machine", "learning"),
    ("is", "fun"),
    ("language", "machine"),
]

print("Bigram probabilities without smoothing:")
for bigram in test_bigrams:
    # Unseen bigrams are reported as probability 0 without calling the
    # estimator at all.
    if bigram_counts[bigram] > 0:
        prob = unsmoothed_bigram_prob(bigram)
    else:
        prob = 0
    print(f"P({bigram[1]}|{bigram[0]}) = {prob:.4f}")

print("\nBigram probabilities with Laplace smoothing:")
for bigram in test_bigrams:
    smoothed_prob = laplace_smoothed_bigram_prob(bigram)
    print(f"P({bigram[1]}|{bigram[0]}) = {smoothed_prob:.4f}")


Bigram probabilities without smoothing:
P(love|I) = 1.0000
P(learning|machine) = 1.0000
P(fun|is) = 0.5000
P(machine|language) = 0.0000

Bigram probabilities with Laplace smoothing:
P(love|I) = 0.2500
P(learning|machine) = 0.2500
P(fun|is) = 0.1667
P(machine|language) = 0.0833
