For reference implementations of language modelling, see: https://colab.research.google.com/drive/1ISx9SAXQpLTdlaO4AZ8Kzi3BDcwq9Dty#scrollTo=ccQePt1h1tW6

In [1]:
# Colab only: mount Google Drive so files under /content/drive/MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import nltk
# Fetch the "punkt" data package; the call reports and skips it when already up to date.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
data_path = "/content/drive/MyDrive/TXSA_Assignment/Assignment Data/Data_3.txt"
# Read the whole file in one go; the context manager closes the handle even
# if reading fails (the handle was previously left open).
with open(data_path, "r") as file:
    text = file.read()
# Echo the raw file so the training corpus and the test sentence are visible.
print(text, end="")

Training Corpus
~~~~~~~~~~~~~
<s> He read a book </s>
<s> I read a different book </s>
<s> He read a book by Danielle </s>

Calculate sentence probability for the following sentence
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<s> I read a book by Danielle </s>

In [4]:
import re

# Keep only the "<s> ... </s>" lines and split each one into whitespace-separated tokens.
text_corpus = [tagged.split() for tagged in re.findall("<s>.+</s>", text)]
# The final tagged line in the file is the sentence to be scored, so remove it
# from the training sentences and keep it as a plain string.
test_sentence = " ".join(text_corpus.pop())
print(text_corpus, test_sentence, sep="\n")

[['<s>', 'He', 'read', 'a', 'book', '</s>'], ['<s>', 'I', 'read', 'a', 'different', 'book', '</s>'], ['<s>', 'He', 'read', 'a', 'book', 'by', 'Danielle', '</s>']]
<s> I read a book by Danielle </s>


In [5]:
from nltk.lm.preprocessing import flatten
# Collapse the list of tokenised sentences into one flat token list.
# Guard: if this cell is re-run, text_corpus is already flat, and flattening a
# list of strings would split every token into single characters.
if text_corpus and not isinstance(text_corpus[0], str):
    text_corpus = list(flatten(text_corpus))
text_corpus

['<s>',
 'He',
 'read',
 'a',
 'book',
 '</s>',
 '<s>',
 'I',
 'read',
 'a',
 'different',
 'book',
 '</s>',
 '<s>',
 'He',
 'read',
 'a',
 'book',
 'by',
 'Danielle',
 '</s>']

# Compute sentence probabilities manually

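Using the unsmoothed (maximum-likelihood) bigram model, $P(w_i \mid w_{i-1}) = \frac{C(w_{i-1}\, w_i)}{C(w_{i-1})}$: <br>
$P(\text{<s> I read a book by Danielle </s>}) $ <br>
$= P(\text{I} \mid \text{<s>}) \cdot P(\text{read} \mid \text{I}) \cdot P(\text{a} \mid \text{read}) \cdot P(\text{book} \mid \text{a}) \cdot P(\text{by} \mid \text{book}) \cdot P(\text{Danielle} \mid \text{by}) \cdot P(\text{</s>} \mid \text{Danielle})$ <br>
$ = \frac{1}{3} \cdot \frac{1}{1} \cdot \frac{3}{3} \cdot \frac{2}{3} \cdot \frac{1}{3} \cdot \frac{1}{1} \cdot \frac{1}{1} = \frac{2}{27} \approx 0.07407407407$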

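Using the add-one smoothed bigram model, $P(w_i \mid w_{i-1}) = \frac{C(w_{i-1}\, w_i) + 1}{C(w_{i-1}) + V + 1}$ with vocabulary size $V = 10$ (the same denominator the code below uses; textbook Laplace smoothing uses $C(w_{i-1}) + V$, so the extra $+1$ presumably reserves probability mass for an out-of-vocabulary token): <br>
$P(\text{<s> I read a book by Danielle </s>}) $ <br>
$= P(\text{I} \mid \text{<s>}) \cdot P(\text{read} \mid \text{I}) \cdot P(\text{a} \mid \text{read}) \cdot P(\text{book} \mid \text{a}) \cdot P(\text{by} \mid \text{book}) \cdot P(\text{Danielle} \mid \text{by}) \cdot P(\text{</s>} \mid \text{Danielle})$ <br>
$ = \frac{1 + 1}{3 + 10 + 1} \cdot \frac{1 + 1}{1 + 10 + 1} \cdot \frac{3 + 1}{3 + 10 + 1} \cdot \frac{2 + 1}{3 + 10 + 1} \cdot \frac{1 + 1}{3 + 10 + 1} \cdot \frac{1 + 1}{1 + 10 + 1} \cdot \frac{1 + 1}{1 + 10 + 1} \approx 0.00000578462677588$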

# Compute sentence probabilities using Python code

In [6]:
# The vocabulary is the set of distinct training tokens, <s> and </s> included.
vocabs = set(text_corpus)
print(vocabs, len(vocabs), sep="\n")

{'I', '<s>', 'He', 'a', 'by', 'book', 'Danielle', 'different', 'read', '</s>'}
10


In [7]:
def compute_sentence_probability_unsmoothed_bigram(sentence, vocabs, text_corpus):
    """Return the unsmoothed (maximum-likelihood) bigram probability of a sentence.

    Each factor is ``count(prev, cur) / count(prev)``, with both counts taken
    from ``text_corpus``; the sentence probability is the product of the
    factors over every adjacent token pair in ``sentence``.

    Args:
        sentence: Whitespace-separated tokens, e.g. ``"<s> I read </s>"``.
        vocabs: Collection of known words; only corpus tokens contained in it
            contribute to the context (unigram) counts.
        text_corpus: Flat sequence of training tokens. Bigrams are taken from
            every adjacent pair in this sequence, so pairs that straddle two
            sentences (``('</s>', '<s>')``) are counted as well.

    Returns:
        The product of the bigram probabilities. It is ``0`` as soon as a
        bigram was never observed, and ``0.0`` when a context word does not
        occur in the corpus at all (previously this raised ``TypeError``).
        A sentence with fewer than two tokens yields ``1``.
    """
    from collections import Counter

    tokens = sentence.split()

    # Count every adjacent token pair in the training stream.
    bigram_counts = Counter(zip(text_corpus, text_corpus[1:]))
    # Count each in-vocabulary token in a single pass over the corpus.
    vocab_counts = Counter(word for word in text_corpus if word in vocabs)

    sentence_prob = 1
    for previous, current in zip(tokens, tokens[1:]):
        context_count = vocab_counts[previous]
        if context_count == 0:
            # Unseen context word: the MLE estimate is undefined (0/0), so the
            # sentence is assigned zero probability instead of crashing.
            return 0.0
        sentence_prob *= bigram_counts[(previous, current)] / context_count
    return sentence_prob

In [8]:
# Unsmoothed probability of the test sentence; expected to equal the hand-computed 2/27.
compute_sentence_probability_unsmoothed_bigram(test_sentence, vocabs, text_corpus)

0.07407407407407407

In [9]:
def compute_sentence_probability_smoothed_bigram(sentence, vocabs, text_corpus):
    """Return the add-one smoothed bigram probability of a sentence.

    Each factor is ``(count(prev, cur) + 1) / (count(prev) + len(vocabs) + 1)``,
    with both counts taken from ``text_corpus``; the sentence probability is
    the product of the factors over every adjacent token pair in ``sentence``.

    Args:
        sentence: Whitespace-separated tokens, e.g. ``"<s> I read </s>"``.
        vocabs: Collection of known words; its size enters the denominator and
            only corpus tokens contained in it contribute to context counts.
        text_corpus: Flat sequence of training tokens. Bigrams are taken from
            every adjacent pair in this sequence, so pairs that straddle two
            sentences (``('</s>', '<s>')``) are counted as well.

    Returns:
        The product of the smoothed bigram probabilities. A context word that
        never occurs in the corpus is treated as having count 0 (previously
        this raised ``TypeError``). A sentence with fewer than two tokens
        yields ``1``.
    """
    from collections import Counter

    tokens = sentence.split()

    # Count every adjacent token pair in the training stream.
    bigram_counts = Counter(zip(text_corpus, text_corpus[1:]))
    # Count each in-vocabulary token in a single pass over the corpus.
    vocab_counts = Counter(word for word in text_corpus if word in vocabs)

    # Smoothing mass added to every context count: vocabulary size plus one.
    smoothing_mass = len(vocabs) + 1

    sentence_prob = 1
    for previous, current in zip(tokens, tokens[1:]):
        # Counter returns 0 for unseen bigrams and unseen context words alike.
        numerator = bigram_counts[(previous, current)] + 1
        denominator = vocab_counts[previous] + smoothing_mass
        sentence_prob *= numerator / denominator
    return sentence_prob

In [10]:
# Smoothed probability of the test sentence; expected to match the hand-computed ~5.78e-06.
compute_sentence_probability_smoothed_bigram(test_sentence, vocabs, text_corpus)

5.784626775880419e-06