# Find the bigram probabilities of the sentence tokens

In [0]:
import nltk

## Load Data

In [4]:
from nltk.corpus import brown
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [5]:
# Corpus
words = brown.words()
words

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [7]:
words=[w.lower() for w in words]

words[0:10]

['the',
 'fulton',
 'county',
 'grand',
 'jury',
 'said',
 'friday',
 'an',
 'investigation',
 'of']

## Unigram frequency 


In [8]:
# Unigram frequency 
uni_freq = nltk.FreqDist(w.lower() for w in words)
uni_freq

FreqDist({'the': 69971,
          'fulton': 17,
          'county': 155,
          'grand': 48,
          'jury': 67,
          'said': 1961,
          'friday': 60,
          'an': 3740,
          'investigation': 51,
          'of': 36412,
          "atlanta's": 4,
          'recent': 179,
          'primary': 96,
          'election': 77,
          'produced': 90,
          '``': 8837,
          'no': 2139,
          'evidence': 204,
          "''": 8789,
          'that': 10594,
          'any': 1344,
          'irregularities': 8,
          'took': 426,
          'place': 570,
          '.': 49346,
          'further': 218,
          'in': 21337,
          'term-end': 1,
          'presentments': 1,
          'city': 393,
          'executive': 55,
          'committee': 168,
          ',': 58334,
          'which': 3561,
          'had': 5133,
          'over-all': 35,
          'charge': 122,
          'deserves': 16,
          'praise': 17,
          'and': 28853,
          'th

## Size of corpus


In [9]:
# Size of corpus
total_words = len(words)
total_words

1161192

In [10]:
print('Frequency of tokens of the sample sentence:',total_words)


Frequency of tokens of the sample sentence: 1161192


In [0]:
#Sentence 
test_sentence_tokens=['this','is','a','sunny','day','.','however','i','am','not','feeling','well','lots','of','cold']

In [12]:
for word in test_sentence_tokens:
    print('Frequency of "',word,'" is ',uni_freq[word])
print('\n\n')

Frequency of " this " is  5145
Frequency of " is " is  10109
Frequency of " a " is  23195
Frequency of " sunny " is  13
Frequency of " day " is  687
Frequency of " . " is  49346
Frequency of " however " is  552
Frequency of " i " is  5164
Frequency of " am " is  237
Frequency of " not " is  4610
Frequency of " feeling " is  172
Frequency of " well " is  897
Frequency of " lots " is  42
Frequency of " of " is  36412
Frequency of " cold " is  171





## Creating bigrams


In [13]:
# Creating bigrams

bigram_words = []
previous = 'EMPTY'
sentences = 0
for word in words:
    if previous in ['EMPTY','.','?','!']:
        ## insert word_boundaries at beginning of Brown,
        bigram_words.append('*start_end*')
    else:
        bigram_words.append(word)
    
    previous = word
    bigram_words.append('*start_end*') ## assume one additional *start_end* at the end of Brown

updated_uni_freq  = nltk.FreqDist(w.lower() for w in bigram_words)


print('Calculating bigram probalities for sentence, including bigrams with sentence boundaries, i.e., *start_end*')



Calculating bigram probalities for sentence, including bigrams with sentence boundaries, i.e., *start_end*


In [14]:
updated_uni_freq

FreqDist({'*start_end*': 1216827,
          'fulton': 16,
          'county': 151,
          'grand': 48,
          'jury': 66,
          'said': 1945,
          'friday': 59,
          'an': 3565,
          'investigation': 51,
          'of': 36279,
          "atlanta's": 4,
          'recent': 169,
          'primary': 94,
          'election': 75,
          'produced': 90,
          '``': 4884,
          'no': 1892,
          'evidence': 201,
          "''": 8702,
          'that': 10321,
          'any': 1308,
          'irregularities': 8,
          'took': 425,
          'place': 549,
          '.': 49346,
          'further': 194,
          'in': 19689,
          'term-end': 1,
          'presentments': 1,
          'the': 63695,
          'city': 389,
          'executive': 55,
          'committee': 167,
          ',': 58330,
          'which': 3549,
          'had': 5103,
          'over-all': 35,
          'charge': 121,
          'deserves': 16,
          'praise': 17,
   

In [17]:
# Bigram corpus
bigrams = nltk.bigrams(w.lower() for w in bigram_words)
bigrams

<generator object bigrams at 0x7f8ce64d4b48>

## Calculating Bigram probabilities


In [20]:
# Bigram probabilities
conditional_freq = nltk.ConditionalFreqDist(bigrams)
conditional_freq

ConditionalFreqDist(nltk.probability.FreqDist, {})

In [0]:
# Function to calculate bigram probability
def get_bigram_probability(first,second):
    
    bigram_freq = conditional_freq[first][second]
    unigram_freq = updated_uni_freq[first]

    bigram_prob = (bigram_freq)/(unigram_freq)
    
    return bigram_prob


In [22]:
## Calculating the bigram probability

prob_list=[]
previous = '*start_end*'

for token in test_sentence_tokens:
    next_probability = get_bigram_probability(previous,token)
    print(previous,token,(float('%.3g' % next_probability)))
    previous = token
    prob_list.append(next_probability)
  


*start_end* this 0.0
this is 0.0
is a 0.0
a sunny 0.0
sunny day 0.0
day . 0.0
. however 0.0
however i 0.0
i am 0.0
am not 0.0
not feeling 0.0
feeling well 0.0
well lots 0.0
lots of 0.0
of cold 0.0


In [23]:
# Creating bigrams

bigram_words = []
previous = 'EMPTY'
sentences = 0
for word in words:
    if previous in ['EMPTY','.','?','!']:
        ## insert word_boundaries at beginning of Brown,
        bigram_words.append('*start_end*')
    else:
        bigram_words.append(word)
    
    previous = word


    
    
bigram_words.append('*start_end*') ## assume one additional *start_end* at the end of Brown

updated_uni_freq  = nltk.FreqDist(w.lower() for w in bigram_words)


print('Calculating bigram probalities for sentence, including bigrams with sentence boundaries, i.e., *start_end*')


# Bigram corpus
bigrams = nltk.bigrams(w.lower() for w in bigram_words)


# Bigram probabilities
conditional_freq = nltk.ConditionalFreqDist(bigrams)



# Code begins here


# Function to calculate bigram probability
def get_bigram_probability(first,second):
    
    bigram_freq = conditional_freq[first][second]
    unigram_freq = updated_uni_freq[first]

    bigram_prob = (bigram_freq)/(unigram_freq) #without Laplacian smoothing

    
    return bigram_prob

## Calculating the bigram probability

prob_list=[]
previous = '*start_end*'

for token in test_sentence_tokens:
    next_probability = get_bigram_probability(previous,token)
    print(previous,token,(float('%.3g' % next_probability)))
    previous = token
    prob_list.append(next_probability)


    
# For the final term    
next_probability = get_bigram_probability(previous,'*start_end*')
print(previous,'*start_end*',next_probability)
prob_list.append(next_probability)    

print(prob_list)    


Calculating bigram probalities for sentence, including bigrams with sentence boundaries, i.e., *start_end*
*start_end* this 0.0083
this is 0.0503
is a 0.0861
a sunny 4.51e-05
sunny day 0.154
day . 0.163
. however 0.0
however i 0.0
i am 0.0396
am not 0.106
not feeling 0.0
feeling well 0.0
well lots 0.0
lots of 0.7
of cold 0.000138
cold *start_end* 0.0
[0.008303975842979365, 0.05030826140567201, 0.08609535184632229, 4.5083630133898384e-05, 0.15384615384615385, 0.16251830161054173, 0.0, 0.0, 0.0396452790818988, 0.10638297872340426, 0.0, 0.0, 0.0, 0.7, 0.00013782077786047024, 0.0]


## Find the perplexity and total probabilities of the given sentences

In [24]:
prob_list=[0.1, 0.023 ,0.09]


perplexity=1

# Calculating N
N=len(prob_list)-1


# Calculating the perplexity
for val in prob_list:
    perplexity = perplexity * (1/val)

perplexity = pow(perplexity, 1/float(N)) 

print("Perplexity= :",perplexity)


Perplexity= : 69.5048046856916


In [25]:


"""For the sentence: 'this is a sunny day' """ 
prob_list_1=[0.008303975842979365, 0.05030826140567201, 0.08609535184632229, 4.5083630133898384e-05, 0.15384615384615385]



total_prob_1 = 1

# Multiplying all the values of the probability and storing it
for val in prob_list_1:
    total_prob_1 *= val


print("For the sentence- 'this is a sunny day'")
print("Total probability:",total_prob_1)


perplexity_1=1

# Calculating N
N=len(prob_list_1)-1


# Calculating the perplexity
for val in prob_list_1:
    perplexity_1 = perplexity_1 * (1/val)

perplexity_1 = pow(perplexity_1, 1/float(N)) 

print("Perplexity:",perplexity_1)



"""For the sentence: 'this place is beautiful' """
prob_list_2=[0.008303975842979365, 0.0022194821208384712, 0.02185792349726776, 9.953219866626854e-05]

total_prob_2 = 1

# Multiplying all the values of the probability and storing it
for val in prob_list_2:
    total_prob_2 *= val

print("\n\nFor the sentence- 'this place is beautiful'")    
print("Total probability: ",total_prob_2)


perplexity_2=1

# Calculating N
N=len(prob_list_2)-1

# Calculating perplexity
for val in prob_list_2:
    perplexity_2 = perplexity_2 * (1/val)

perplexity_2 = pow(perplexity_2, 1/float(N)) 

print("Perplexity: ",perplexity_2)



For the sentence- 'this is a sunny day'
Total probability: 2.494655687321879e-10
Perplexity: 251.62126814544143


For the sentence- 'this place is beautiful'
Total probability:  4.009684736463708e-11
Perplexity:  2921.6616783932823


## Calculate the probability using Laplace smoothing

In [30]:
import nltk
from nltk.corpus import brown

# Corpus
words = brown.words()
words=[w.lower() for w in words]

# Unigram frequency 
uni_freq = nltk.FreqDist(w.lower() for w in words)

# Size of corpus
total_words = len(words)

print('Frequency of tokens of the sample sentence:')

for word in test_sentence_tokens:
    print(word,uni_freq[word])

    
# Creating bigrams

bigram_words = []
previous = 'EMPTY'
sentences = 0
for word in words:
    if previous in ['EMPTY','.','?','!']:
        ## insert word_boundaries at beginning of Brown,
        bigram_words.append('*start_end*')
    else:
        bigram_words.append(word)
    
    previous = word


    
    
bigram_words.append('*start_end*') ## assume one additional *start_end* at the end of Brown

updated_uni_freq  = nltk.FreqDist(w.lower() for w in bigram_words)


print('\nCalculating bigram counts for sentence, including bigrams with sentence boundaries, i.e., *BEGIN* and *END*')


# Bigram corpus
bigrams = nltk.bigrams(w.lower() for w in bigram_words)


# Bigram probabilities
conditional_freq = nltk.ConditionalFreqDist(bigrams)

#Sentence 
test_sentence_tokens=['sunset','looks','magnificient','.']

# Code begins here



V=len(set(words))


# Function to calculate bigram probability
def get_bigram_probability(first,second):
    
    bigram_freq = conditional_freq[first][second]
    unigram_freq = updated_uni_freq[first]

    bigram_prob = (bigram_freq + 1)/(unigram_freq + V) # with Laplacian Smoothing
    
    return bigram_prob

# Calculating the bigram probability

prob_list=[]
previous = '*start_end*'
for token in test_sentence_tokens:
    next_probability = get_bigram_probability(previous,token)
    print(previous,token,(float('%.3g' % next_probability)))
    previous = token
    prob_list.append(next_probability)

    
# For the final term    
next_probability = get_bigram_probability(previous,'*start_end*')
print(previous,'*start_end*',next_probability)
prob_list.append(next_probability)    

print(prob_list)    



# Calculating the total probability

total_prob = 1
for val in prob_list:
    total_prob *= val

print("\nTotal probability:",total_prob)

Frequency of tokens of the sample sentence:
sunset 14
looks 78
magnificient 0
. 49346

Calculating bigram counts for sentence, including bigrams with sentence boundaries, i.e., *BEGIN* and *END*
*start_end* sunset 9.48e-06
sunset looks 2.01e-05
looks magnificient 2e-05
magnificient . 2.01e-05
. *start_end* 0.49764524359375156
[9.48307744829352e-06, 2.0068634730779264e-05, 2.004329351399022e-05, 2.007427481682224e-05, 0.49764524359375156]

Total probability: 3.8106225670516194e-20


## Calculate the probability using Backoff method

In [27]:

import nltk
from nltk.corpus import brown

#Sentence 
test_sentence_tokens=['this','is','a','very','sunny','day','.']


# Corpus
words = brown.words()
words=[w.lower() for w in words]

# Unigram frequency 
uni_freq = nltk.FreqDist(w.lower() for w in words)

# Size of corpus
total_words = len(words)

print('Frequency of tokens of the sample sentence:')

for word in test_sentence_tokens:
    print(word,uni_freq[word])

    
# Creating bigrams

bigram_words = []
previous = 'EMPTY'
sentences = 0
for word in words:
    if previous in ['EMPTY','.','?','!']:
        ## insert word_boundaries at beginning of Brown,
        bigram_words.append('*start_end*')
    else:
        bigram_words.append(word)
    
    previous = word


    
    
bigram_words.append('*start_end*') ## assume one additional *start_end* at the end of Brown

updated_uni_freq  = nltk.FreqDist(w.lower() for w in bigram_words)


print('\nCalculating bigram counts for sentence, including bigrams with sentence boundaries, i.e., *BEGIN* and *END*')


# Bigram corpus
bigrams = nltk.bigrams(w.lower() for w in bigram_words)


# Bigram probabilities
conditional_freq = nltk.ConditionalFreqDist(bigrams)


# Code begins here


V=len(set(words))


# Function to calculate bigram probability
def get_bigram_probability(first,second):

    if not second in conditional_freq[first]:
        print('Backing Off to Unigram Probability for',second)
        unigram_prob = updated_uni_freq[second]/len(words)
        return unigram_prob 
    

    bigram_freq = conditional_freq[first][second]
    unigram_freq = updated_uni_freq[first]
    bigram_prob = bigram_freq/unigram_freq
    
    return bigram_prob


# Calculating the bigram probability

prob_list=[]
previous = '*start_end*'
for token in test_sentence_tokens:
    next_probability = get_bigram_probability(previous,token)
    print(previous,token,(float('%.3g' % next_probability)))
    previous = token
    prob_list.append(next_probability)

    
# For the final term    
next_probability = get_bigram_probability(previous,'*start_end*')
print(previous,'*start_end*',next_probability)
prob_list.append(next_probability)    

print(prob_list)    



# Calculating the total probability

total_prob = 1
for val in prob_list:
    total_prob *= val

print("\nTotal probability:",total_prob)

Frequency of tokens of the sample sentence:
this 5145
is 10109
a 23195
very 796
sunny 13
day 687
. 49346

Calculating bigram counts for sentence, including bigrams with sentence boundaries, i.e., *BEGIN* and *END*
*start_end* this 0.0083
this is 0.0503
is a 0.0861
a very 0.00613
Backing Off to Unigram Probability for sunny
very sunny 1.12e-05
sunny day 0.154
day . 0.163
. *start_end* 1.0
[0.008303975842979365, 0.05030826140567201, 0.08609535184632229, 0.00613137369821018, 1.1195392320994288e-05, 0.15384615384615385, 0.16251830161054173, 1.0]

Total probability: 6.172926606098926e-14
