<a href="https://colab.research.google.com/github/ejini6969/Text-Analytics/blob/main/Q3_4bigram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import nltk
nltk.download('punkt')

# Read Text File
textFile = open("/content/Text Corpus.txt", "r")
corpus = [line.strip() for line in textFile]
# Perform Tokenization and Decapitalization
tokenizedCorpus = [nltk.tokenize.word_tokenize(corpus[i].lower()) for i in range(len(corpus))]

['<s> He read a book </s>', '<s> I read a different book </s>', '<s> He read a book my Danielle </s>']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
paddings = ['>', '<', 's', '/s']

# Filter Paddings
filteredCorpus = [[j for j in tokenizedCorpus[i] if j not in paddings] for i in range(len(tokenizedCorpus))]

print(filteredCorpus)

[['he', 'read', 'a', 'book'], ['i', 'read', 'a', 'different', 'book'], ['he', 'read', 'a', 'book', 'my', 'danielle']]


In [3]:
from nltk.util import bigrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten

# Insert Paddings at Both Ends and Form Bigram
print(list(pad_both_ends(filteredCorpus[0], n=2)))
print(list(bigrams(pad_both_ends(filteredCorpus[0], n=2))), "\n")

print(list(pad_both_ends(filteredCorpus[1], n=2)))
print(list(bigrams(pad_both_ends(filteredCorpus[0], n=2))), "\n")

print(list(pad_both_ends(filteredCorpus[2], n=2)))
print(list(bigrams(pad_both_ends(filteredCorpus[0], n=2))), "\n")


['<s>', 'he', 'read', 'a', 'book', '</s>']
[('<s>', 'he'), ('he', 'read'), ('read', 'a'), ('a', 'book'), ('book', '</s>')] 

['<s>', 'i', 'read', 'a', 'different', 'book', '</s>']
[('<s>', 'he'), ('he', 'read'), ('read', 'a'), ('a', 'book'), ('book', '</s>')] 

['<s>', 'he', 'read', 'a', 'book', 'my', 'danielle', '</s>']
[('<s>', 'he'), ('he', 'read'), ('read', 'a'), ('a', 'book'), ('book', '</s>')] 



In [4]:
# Perform Flattening
print(list(flatten(pad_both_ends(sent, n=2) for sent in filteredCorpus)))

['<s>', 'he', 'read', 'a', 'book', '</s>', '<s>', 'i', 'read', 'a', 'different', 'book', '</s>', '<s>', 'he', 'read', 'a', 'book', 'my', 'danielle', '</s>']


In [8]:

print('\n--------------------------------------------------')
print('SMOOTHED BIGRAM PROBABILITY')
print('------------------# Preprocess the Tokenized Text for Bigram Language Modelling')
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import Laplace # Laplace Smoothed
from nltk.lm import Vocabulary

# Create Lookup Vocabulary
vocab = Vocabulary(["he", "read", "a", "book", "i", "different", "my", "danielle", "<s>", "</s"])

#Bigram
trainData, paddedSents = padded_everygram_pipeline(2, filteredCorpus)

smoothedModel = Laplace(vocabulary = vocab, order = 2) # Training Bigram Laplace Smoothed Model
smoothedModel.fit(trainData, paddedSents) # Building Model--------------------------------')

print('P(He|<s>): ', round(smoothedModel.score('he', '<s>'.split()),4)) # P(He|<s>)
print('P(I|<s>): ', round(smoothedModel.score('i', '<s>'.split()),4)) # P(I|<s>)
print('P(read|<He>): ', round(smoothedModel.score('read', 'he'.split()),4)) # P(read|He)
print('P(a|read): ', round(smoothedModel.score('a', 'read'.split()),4)) # P(a|read)
print('P(book|a): ', round(smoothedModel.score('book', 'a'.split()),4)) # P(book|a)
print('P(different|a): ', round(smoothedModel.score('different', 'a'.split()),4)) # P(different|a)
print('P(my|book): ', round(smoothedModel.score('my', 'book'.split()),4)) # P(my|book)
print('P(</s>|book): ', round(smoothedModel.score('</s>', 'book'.split()),4)) # P(</s>|book)
print('P(read|I): ', round(smoothedModel.score('read', 'i'.split()),4)) # P(read|I)
print('P(book|different): ', round(smoothedModel.score('book', 'different'.split()),4)) # P(book|different)
print('P(Danielle|my): ', round(smoothedModel.score('danielle', 'my'.split()),4)) # P(Danielle|my)
print('P(</s>|Danielle): ', round(smoothedModel.score('</s>', 'danielle'.split()),4)) # P(</s>|Danielle)


--------------------------------------------------
SMOOTHED BIGRAM PROBABILITY
------------------# Preprocess the Tokenized Text for Bigram Language Modelling
P(He|<s>):  0.2143
P(I|<s>):  0.1429
P(read|<He>):  0.2308
P(a|read):  0.2857
P(book|a):  0.2143
P(different|a):  0.1429
P(my|book):  0.1429
P(</s>|book):  0.2143
P(read|I):  0.1667
P(book|different):  0.1667
P(Danielle|my):  0.1667
P(</s>|Danielle):  0.1667


In [24]:
# compute sentence probability based on smoothed bigram model above
for sent in corpus:
  tk = sent.lower().split()
  padded_bigram = list(bigrams(tk)) # generate bigrams for each sentence
  probability = 1
  for x1, x2 in padded_bigram:
    val = smoothedModel.score(x2, x1.split())
    probability *= val
  print(f'P("{sent}") = {probability}')

P("<s> He read a book </s>") = 0.0006487681414795117
P("<s> I read a different book </s>") = 3.4707760655282514e-05
P("<s> He read a book my Danielle </s>") = 1.2014224842213177e-05
