## Imports

In [1]:
from nltk import word_tokenize, sent_tokenize
from pylab import *
import matplotlib.pyplot as plt
import os
import numpy as np
import re

In [2]:
from nltk.probability import FreqDist,ConditionalProbDist,ConditionalFreqDist,LidstoneProbDist
from nltk.util import ngrams
from collections import Counter
import operator

In [3]:
import math

In [14]:
book = open("Sherlock Holmes.txt","r")
content = book.read()
sent3=content
# created a corpus where the sentence begin and end with <s> and <!s>
sos = "<s>"
eos = "<!s>"

## Cleaning Data : Removing newlines, special characters mentioned oterwise ,roman numbers etc.
#### Q1 and Q2 below

In [15]:
sent3 = content
sent3 = re.sub(r'[^\w\s.]|',r'',sent3) # removing
sent3 = re.sub(r'[\r\n]+','\n',sent3)
sent3 = re.sub(r'[I|V|X]+[.]','.',sent3)
sent3 = re.sub(r'[\n]',' ',sent3)
sent3 = re.sub(r'[\s]+',' ',sent3)

In [71]:
sentences = sent_tokenize(sent3)
sent_list = [re.sub(r'[\s]+[.]','',s) for s in sentences ]
list_size = len(sent_list)
#maintaining original corpus 
corpus=''
tokens = 0 # all the words in corpus

unique_words = set() #types
for i in range(list_size):
    sent_list[i] = re.sub(r'[.]','',sent_list[i])
    l = sent_list[i].split()
    tokens += len(l)
    unique_words = unique_words.union(l)
    sent_list[i] = ('<s> '+sent_list[i]+' <!s>').lower()
    corpus = corpus+sent_list[i]+' '

# number of unique words in corpus
# used later in Add -1 smoothing
types = len(unique_words)

### Building Models
#### from above we have following information:
##### Corpus : a cleared string with sos an eos
##### sent_list: a list of string where each string is a sentence from corpus
##### Q3 are done below

##### Preparing the 80% data and 20% data

In [6]:
_80dataIndex = int(5850*0.8)
trainData = sent_list[0:4681]
testData = sent_list[4681:5851]

### Unigram Model

In [7]:
def uniGramModel(data,smoothing):
    global sent_list,corpus,tokens,types
    MLE_uniGram = dict()
    unigram_count = dict()
    for s in data:
        myUniGram = dict(Counter(ngrams(s.split(),1)))
        for key in myUniGram:
            string = ''.join(key)
            unigram_count[string]=unigram_count.get(string,0)+myUniGram[key]
    for key in unigram_count:
        value = unigram_count[key]
        if smoothing:
            MLE_uniGram[key] =MLE_uniGram.get(key,0)+ (value+1) / (tokens+types)
        else:
            MLE_uniGram[key] =MLE_uniGram.get(key,0)+ value / tokens
    return MLE_uniGram,unigram_count

In [18]:
# ugc : unigram_count
# uMLE : unigramMLE
uMLE,ugc = uniGramModel(trainData,False)

In [202]:
len(ugc)

7665

### Bigram Model

In [37]:
# _pairs : represent the 
def biGramModel(data,smoothing,ugc):
    #global sent_list,corpus
    global types
    #global ugc
    MLE_biGram = dict()
    biGram_count= dict()
    for s in data:
        myBigram = dict(Counter(ngrams(s.split(),2)))
        for key in myBigram:
            _keys = ' '.join(key) # joining the wi-1 and wi, so to form a single string seperated by comma
            _value = myBigram[key] # fetching the value for the key  
            biGram_count[_keys] = biGram_count.get(_keys,0)+_value
    for key in biGram_count:
        pair = key.split()
        W0 = pair[0]
        if smoothing==True:
            MLE_biGram[key] = MLE_biGram.get(key,0)+ ((biGram_count[key]+1)/(ugc[W0]+types))
        else:
            MLE_biGram[key] = MLE_biGram.get(key,0)+ (biGram_count[key]/ugc[W0])
    return MLE_biGram,biGram_count

In [42]:
bMLE, bgc = biGramModel(trainData,False,ugc)

In [64]:
bMLE

{'test a': 0.25,
 'were lounging': 0.004291845493562232,
 'turned over': 0.09090909090909091,
 'it appeared,': 0.0010964912280701754,
 'the mantelpiece': 0.0013003901170351106,
 'by an': 0.008583690987124463,
 'using my': 0.25,
 'no question': 0.005235602094240838,
 'can you': 0.04424778761061947,
 '"never" <!s>': 1.0,
 'writing lately,': 0.3333333333333333,
 'never so': 0.02857142857142857,
 'lestrade, whom': 0.2,
 "received' <!s>": 1.0,
 'shorter to': 1.0,
 '<s> some,': 0.0004272591326639607,
 'cocaine injections,': 0.3333333333333333,
 'mouth <!s>': 0.25,
 'make me': 0.05,
 'was thrown': 0.003293084522502744,
 'indirectly responsible': 1.0,
 'cane came': 0.5,
 'rushed forward,': 0.047619047619047616,
 'struggle, so,': 1.0,
 'skin the': 0.5,
 'wonder at': 0.25,
 'know the': 0.02830188679245283,
 'for having': 0.006224066390041493,
 'my wife,': 0.005,
 'play <!s>': 0.2,
 'if an': 0.00641025641025641,
 'travelled back': 0.2,
 'to reason': 0.001142204454597373,
 'safe were': 0.111111111

### TriGram Model

In [10]:
def triGramModel(data,smoothing,bgc):
    global sent_list,corpus
    #global bgc
    MLE_triGram = dict()
    triGram_count = dict()
    for s in data:
        myTriGram = dict(Counter(ngrams(s.split(),3)))
        for key in myTriGram:
            _keys = ' '.join(key)
            _value = myTriGram[key]
            triGram_count[_keys]= triGram_count.get(_keys,0)+ _value
    for key in triGram_count:
        pair = key.split()
        W0 = pair[0]
        W1 = pair[1]
        W2 = pair[2]
        bgcW0W1 = bgc[W0+' '+W1] # value of bigram
        if smoothing:
            MLE_triGram[key] = MLE_triGram.get(key,0) + (triGram_count[key]+1) / (bgcW0W1+types)
        else:
            MLE_triGram[key] = MLE_triGram.get(key,0) + (triGram_count[key] / bgcW0W1)
    return MLE_triGram,triGram_count

In [43]:
tMLE,tgc = triGramModel(trainData,False,bgc)

### QuadGrams Model

In [11]:
def quadGramModel():
    global sent_list
    global tgc # trigram count
    MLE_qGram = dict()
    qGram_count = dict()
    for s in sent_list:
        myQGram = dict(Counter(ngrams(s.split(),4)))
        for key in myQGram:
            _keys = ' '.join(key)
            _value = myQGram[key]
            qGram_count[_keys]= qGram_count.get(_keys,0)+ _value
    for key in qGram_count:
        pair = key.split()
        W0 = pair[0]
        W1 = pair[1]
        W2 = pair[2]
        tgcW0W1W2 = tgc[W0+' '+W1+' '+W2] # value of bigram
        MLE_qGram[key] = MLE_qGram.get(key,0) + (qGram_count[key] / tgcW0W1W2)
    return MLE_qGram,qGram_count

In [113]:
qMLE,qgc = quadGramModel()

In [67]:
UMLE,UGC = uniGramModel(sent_list,False)
BMLE,BGC = biGramModel(sent_list,False,UGC)
TMLE,TGC = triGramModel(sent_list,False,BGC)


### Q3: How many ngrams possible and how many exist

In [69]:
print("Calculated - Unigram | Bigram | Trigram")
print("-------------",len(UMLE),"|",len(BMLE),"|",len(TMLE))

Calculated - Unigram | Bigram | Trigram
------------- 8711 | 51288 | 87575


In [70]:
print("Actual Unigram | Bigram | Trigram")
print("------",math.pow(types,1),"|",math.pow(types,2),"|",math.pow(types,3))

Actual Unigram | Bigram | Trigram
------ 9417.0 | 88679889.0 | 835098514713.0


### Add -1 smoothing on Training Set

In [74]:
# unsmoothed = us
# smoothed = s
us_uMLE,us_ugc = uniGramModel(trainData,False)
us_bMLE,us_bgc = biGramModel(trainData,False,us_ugc)

s_uMLE,s_ugc = uniGramModel(trainData,True)
s_bMLE,s_bgc = biGramModel(trainData,True,s_ugc)

#### Q5

In [79]:
print("After smoothing -","interposed your - probability",0.00021235931195582927) 
print("Before smoothening -","interposed your - probability",1)
print("After smoothing -","received' <!s> - probability",0.00021235931195582927) 
print("Before smoothening -","received' <!s> - probability",1)
print("After smoothening -","lestrade, whom - probability",0.00021226915729144556)
print("Before smoothening -","lestrade, whom - probability",0.2)
#'interposed your': 1.0, before smoothening

After smoothing - interposed your - probability 0.00021235931195582927
Before smoothening - interposed your - probability 1
After smoothing - received' <!s> - probability 0.00021235931195582927
Before smoothening - received' <!s> - probability 1
After smoothening - lestrade, whom - probability 0.00021226915729144556
Before smoothening - lestrade, whom - probability 0.2


In [78]:
s_bMLE

{'test a': 0.00021229168878038424,
 'were lounging': 0.0002072538860103627,
 'turned over': 0.0004232804232804233,
 'it appeared,': 0.00019362958660083262,
 'the mantelpiece': 0.00045242044940431306,
 'by an': 0.000310880829015544,
 'using my': 0.00021229168878038424,
 'no question': 0.00020815986677768527,
 'can you': 0.0006295907660020986,
 '"never" <!s>': 0.0003185051491665782,
 'writing lately,': 0.00021231422505307856,
 'never so': 0.00031622219879835566,
 'lestrade, whom': 0.00021226915729144556,
 "received' <!s>": 0.00021235931195582927,
 'shorter to': 0.00021235931195582927,
 '<s> some,': 0.00021279614129663782,
 'cocaine injections,': 0.00021231422505307856,
 'mouth <!s>': 0.00021229168878038424,
 'make me': 0.000317225335730147,
 'was thrown': 0.0003872966692486445,
 'indirectly responsible': 0.00021235931195582927,
 'cane came': 0.00021233676611105213,
 'rushed forward,': 0.00021190930281839374,
 'struggle, so,': 0.00021235931195582927,
 'skin the': 0.00021233676611105213,
 

In [77]:
us_bMLE

{'test a': 0.25,
 'were lounging': 0.004291845493562232,
 'turned over': 0.09090909090909091,
 'it appeared,': 0.0010964912280701754,
 'the mantelpiece': 0.0013003901170351106,
 'by an': 0.008583690987124463,
 'using my': 0.25,
 'no question': 0.005235602094240838,
 'can you': 0.04424778761061947,
 '"never" <!s>': 1.0,
 'writing lately,': 0.3333333333333333,
 'never so': 0.02857142857142857,
 'lestrade, whom': 0.2,
 "received' <!s>": 1.0,
 'shorter to': 1.0,
 '<s> some,': 0.0004272591326639607,
 'cocaine injections,': 0.3333333333333333,
 'mouth <!s>': 0.25,
 'make me': 0.05,
 'was thrown': 0.003293084522502744,
 'indirectly responsible': 1.0,
 'cane came': 0.5,
 'rushed forward,': 0.047619047619047616,
 'struggle, so,': 1.0,
 'skin the': 0.5,
 'wonder at': 0.25,
 'know the': 0.02830188679245283,
 'for having': 0.006224066390041493,
 'my wife,': 0.005,
 'play <!s>': 0.2,
 'if an': 0.00641025641025641,
 'travelled back': 0.2,
 'to reason': 0.001142204454597373,
 'safe were': 0.111111111

### Good Turing Smoothing on Training Set

In [81]:
def GoodTuring(train_gram_count):
    #fof is frequency of frequency
    fof =[0,1,2,3,4,5,6,7,8,9,10,11] # refer 'c': count of each frequecncy: as per slides
    Nc  ={0:0,1:0,2:0,3:0,4:0,5:0,6:0,7:0,8:0,9:0,10:0,11:0,12:0}
    # Building fof for those unigrams whos have occurence between [1,10]
    total_N=0
    gramCount = train_gram_count
    for key in gramCount:
        if 1<=gramCount[key] and gramCount[key]<=12:
            Nc[gramCount[key]] +=1
            total_N +=1
    N = len(ugc) # all the different Ngrams
    count={0:0,1:0,2:0,3:0,4:0,5:0,6:0,7:0,8:0,9:0}
    c_star =[]
    for i in fof:
        if i!=0:
            count[i]=(i+1)*Nc[i+1]/Nc[i]
            c_star.append(count[i])
        else:
            count[i]=(i+1)*Nc[i+1]/N
            c_star.append(count[i])        
    d = list(map(operator.sub,fof,c_star))
    return d,count,c_star,total_N

### Q6: the value of d is fluctuating though oscillating around 0.92

In [83]:
# Good Turing Perform
dtrain,count_train,c_star_train,train_total_N = GoodTuring(bgc)
print(dtrain)

[-3.2183070866141734, 0.7395559361428834, 0.9262564584311883, 1.0866141732283465, 0.982167352537723, 0.7999999999999998, 1.3636363636363633, 1.3529411764705879, 1.375, 2.2075471698113205, -1.7638888888888893, 3.0519480519480515]


### Perlexity Value considering both Add -1 smoothing and Good Turing

In [20]:
# first considering the Add -1 case:
#unigram
test_uMLE,test_ugc = uniGramModel(testData,True)
#bigram
test_bMLE,test_bgc = biGramModel(testData,True,test_ugc)
#trigram
test_tMLE,test_tgc = triGramModel(testData,True,test_bgc)

### Add 1

In [21]:
Add1 = {'u':test_uMLE,'b':test_bMLE,'t':test_tMLE}
Add1_PPW=[]
for key in Add1:
    MLE = Add1[key]
    nodu =  len(MLE)
    prob_sum=0
    for key in MLE:
        prob_sum +=math.log(MLE[key],2)
    prob_sum = -1*prob_sum*1/nodu
    Add1_PPW.append(round(pow(2,prob_sum)))
print(Add1_PPW)

[4265, 4597, 40902]


### Good Turing

In [235]:
test_du,test_ucount,test_uc_star,uN = GoodTuring(test_ugc)
test_db,test_bcount,test_bc_star,bN = GoodTuring(test_bgc)
test_dt,test_tcount,test_tc_star,tN = GoodTuring(test_tgc)

### Q7: Good Turing performs better as compared to Add-1 smoothing

### Q4b

In [60]:
sentence_list = ["<s> the boscombe valley mystery <!s>","<s> i had seen little of holmes lately <!s>","<s> the man with the twisted lip <!s>"]

def probability(sentences,model_name):
    sumP=0
    log_prob=[]
    if(model_name=="unigram"):
        MLE=uMLE
        for sentence in sentences:
            for word in sentence.split():
                sumP +=math.log(MLE[word],2)
            log_prob.append(sumP)
    elif (model_name=="bigram"):
        MLE=bMLE
        for sentence in sentences:
            myBigram = dict(Counter(ngrams(sentence.split(),2)))
            for key in myBigram:
                _keys = ' '.join(key) # joining the wi-1 and wi, so to form a single string seperated by comma
                sumP += math.log(MLE[_keys],2)
            log_prob.append(sumP)
    elif (model_name=="trigram"):
        MLE=tMLE
        for sentence in sentences:
            myTrigram = dict(Counter(ngrams(sentence.split(),3)))
            for key in myTrigram:
                _keys = ' '.join(key) # joining the wi-1 and wi, so to form a single string seperated by comma
                sumP += math.log(MLE.get(_keys,1),2)
            log_prob.append(sumP)
    return log_prob

In [85]:
lp = probability(sentence_list,"unigram")
print(lp)
lp = probability(sentence_list,"bigram")
print(lp)

[-53.7864001369231, -126.49674521697675, -190.08044322793666]
[-18.965407573619615, -61.17952440325384, -92.54516922004123]


### Q4a

In [63]:
def Generator(model_name):
    if(model_name=='bigram'):
        sent_begin = "<s>"
        all_bigrams = list(bMLE.keys())
        bigram_probability=[]
        for bgs in all_bigram:
            bigram_probability.append(bMLE[bgs])
           
        while last_word != eos:
            
            
        else:
        print("pass a valid bigramModel")

SyntaxError: unexpected EOF while parsing (<ipython-input-63-dad037528d9d>, line 1)