Define a language model to compute probabilities for individual English sentences.

Implement a bigram language model as described in the lecture, and use it to compute the probability of a short sentence.
What happens if you try to compute the probability of a sentence that contains a word that did not appear in the training texts? And what happens if your sentence is very long (e.g. 100 words or more)? Optionally, change your code so that it can handle these challenges.

In [4]:
from collections import defaultdict

In [5]:
class BigramLanguageModel:
    """Bigram language model with maximum-likelihood estimates.

    Text is tokenised by splitting on whitespace. The whole corpus is
    treated as one continuous token stream, so bigrams are also counted
    across line boundaries.
    """

    def __init__(self):
        # bigram_counts[w1][w2]: how often w2 directly followed w1 in training.
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        # unigram_counts[w]: how often w occurred in training.
        self.unigram_counts = defaultdict(int)

    def train(self, corpus):
        """Accumulate unigram and bigram counts from ``corpus``.

        Args:
            corpus: Training text; tokens are separated by whitespace.

        Calling ``train`` repeatedly adds to the existing counts. No bigram
        is created between the last token of one call and the first token
        of the next. An empty or whitespace-only corpus is a no-op.
        """
        previous_word = None
        # Single pass without index arithmetic or a sliced copy of the
        # (potentially very large) token list.
        for word in corpus.split():
            self.unigram_counts[word] += 1
            if previous_word is not None:
                self.bigram_counts[previous_word][word] += 1
            previous_word = word

    def calculate_probability(self, current_word, next_word):
        """Return the MLE estimate of P(next_word | current_word).

        Computed as count(current_word, next_word) / count(current_word).

        Returns:
            The estimated probability, or 0.0 when the bigram was never
            observed (including when either word is unknown).
        """
        # .get() avoids inserting empty entries into the defaultdicts when
        # querying words that were never seen.
        followers = self.bigram_counts.get(current_word)
        if followers is None or next_word not in followers:
            # Unseen bigram: plain MLE assigns it zero probability.
            return 0.0
        return followers[next_word] / self.unigram_counts[current_word]

In [6]:
# Training corpus location. Judging by the file name this is the English
# (".en") side of the Swedish-English Europarl data, not the Swedish side.
training_data_path = 'data/europarl-v7.sv-en.lc.en'

# Load the entire corpus into memory as one string.
with open(training_data_path, mode='r', encoding='utf-8') as corpus_file:
    training_data = corpus_file.read()

# Build the model and fit it on the lowercased text so that lookups are
# case-insensitive.
bigram_model = BigramLanguageModel()
bigram_model.train(training_data.lower())

In [7]:
short_sentence = "it is the case of alexander nikitin ."

long_sentence = """indeed , it is quite in keeping with the positions this house has always adopted ."""

unseen_sentence = "the happy dog ."

# (label, sentence) pairs to score, in the order they are reported.
named_sentences = [
    ("short_sentence", short_sentence),
    ("long_sentence", long_sentence),
    ("unseen_sentence", unseen_sentence),
]

for sentence_name, sentence in named_sentences:
    words = sentence.lower().split()
    # Markov (bigram) assumption: the sentence score is the product
    #   P(w2 | w1) * P(w3 | w2) * ... * P(wn | wn-1).
    # No P(w1) factor is applied here, so the result is conditional on the
    # first word. A single bigram with probability 0.0 zeroes the whole
    # product, and long sentences drive the product towards underflow.
    sentence_probability = 1.0
    for current_word, next_word in zip(words, words[1:]):
        sentence_probability *= bigram_model.calculate_probability(current_word, next_word)
    print(f"The probability of {sentence_name} is: {sentence_probability}")



The probability of short_sentence is: 1.3991701237103848e-09
The probability of long_sentence is: 4.40313841760489e-25
The probability of unseen_sentence is: 0.0
