This exercise notebook demostrates how to compute the query term likelihood given a unigram, or a bigram, or a trigram language model

In [3]:
import numpy as np
import nltk
nltk.download('popular')
nltk.download('brown')
from nltk.corpus import brown

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

In [4]:
words = list(brown.words())

In [5]:
len(words)

1161192

In [7]:
words_dict = dict()
for word in words:
    if word not in words_dict.keys():
        words_dict[word] = 1
    else:
        words_dict[word] +=1

In [11]:
# buil the list with the words occuring only once

occured_once = [i for i,j in words_dict.items() if j ==1]

In [12]:
# this iteration will take some time, be patient waiting for 20 minutes

count = 0
for num, word in enumerate(words):
    if word in occured_once:
        words[num] = "OOV"

In [13]:
sentence = "I have never given it much thought"

In [14]:
tokens = sentence.split()
tokens

['I', 'have', 'never', 'given', 'it', 'much', 'thought']

In [15]:
for num,token in enumerate(tokens):
    if not token in words:
        tokens[num] = "OOV"

In [16]:
# convert all tokens to lowercase

tokens = [token.lower() for token in tokens]
words = [word.lower() for word in words]

In [17]:
def bigram_probability(bigram,words):
    count = [0,0]
    for i in range(len(words)-1):
        if (words[i] == bigram[0]):
            count[1] += 1
            if (words[i+1] == bigram[1]):
                count[0] += 1
    return count[0]/count[1], count

In [18]:
bigrams = []
for i in range(len(tokens)-1):
    bigrams.append((tokens[i],tokens[i+1]))

In [19]:
for bigram in bigrams:
    print(bigram_probability(bigram,words)[1])

[259, 5164]
[24, 3942]
[2, 697]
[7, 377]
[2, 8760]
[2, 937]


In [20]:
bigrams

[('i', 'have'),
 ('have', 'never'),
 ('never', 'given'),
 ('given', 'it'),
 ('it', 'much'),
 ('much', 'thought')]

In [21]:
def bigram_prob_sentence(tokens, bigrams):
    prob = []
    for bigram in bigrams:
        p = bigram_probability(bigram,words)[0]
        prob.append(p)
    return np.prod(prob)

In [22]:
bigram_prob_sentence(tokens, bigrams)

7.928268578305691e-15

In [23]:
trigrams = []
for i in range(len(tokens)-2):
    trigrams.append((tokens[i],tokens[i+1], tokens[i+2]))

In [24]:
trigrams

[('i', 'have', 'never'),
 ('have', 'never', 'given'),
 ('never', 'given', 'it'),
 ('given', 'it', 'much'),
 ('it', 'much', 'thought')]

In [25]:
def trigram_probability(trigram,words):
    count = [0,0]
    for i in range(len(words)-2):
        if (words[i] == trigram[0]) and (words[i+1] == trigram[1]):
            count[1] += 1
            if (words[i+2] == trigram[2]):
                count[0] += 1
    return count[0]/count[1], count

In [26]:
for trigram in trigrams:
    print(trigram_probability(trigram,words)[1])

[8, 259]
[0, 24]
[0, 2]
[1, 7]
[1, 2]


In [27]:
def trigram_prob_sentence(tokens, trigrams):
    prob = []
    for trigram in trigrams:
        p = trigram_probability(trigram,words)[0]
        prob.append(p)
    return np.prod(prob)

In [28]:
trigram_prob_sentence(tokens, trigrams)

0.0

In [29]:
unigrams = []
for i in range(len(tokens)):
    unigrams.append(tokens[i])

In [30]:
for unigram in unigrams:
    print(unigram)

i
have
never
given
it
much
thought


In [31]:
def unigram_probability(unigram,words):
    count = [0,0]
    for i in range(len(words)):
        count[1] += 1
        if (words[i] == unigram[0]):
            count[0] += 1
    return count[0]/count[1], count

In [32]:
for unigram in unigrams:
    print(unigram_probability(unigram,words)[1])

[5164, 1161192]
[23, 1161192]
[38, 1161192]
[16, 1161192]
[5164, 1161192]
[16, 1161192]
[77, 1161192]


In [33]:
def unigram_prob_sentence(tokens, unigrams):
    prob = []
    for unigram in unigrams:
        p = unigram_probability(unigram,words)[0]
        prob.append(p)
    return np.prod(prob)

In [34]:
unigram_prob_sentence(tokens, unigrams)

1.6139361322466987e-28