In [1]:
from nltk import bigrams , ConditionalFreqDist
import numpy as np
import itertools
import pickle
import pandas as pd

In [2]:
def readFile(fileName,sheet):
    """Read one worksheet without a header row and forward-fill empty cells.

    Args:
        fileName: Path of the Excel workbook to read.
        sheet: Worksheet name (or index) handed to ``pd.read_excel``.

    Returns:
        pandas.DataFrame: the sheet with integer column labels (``header=None``)
        where every missing cell has been replaced by the closest non-missing
        value above it in the same column.
    """
    raw = pd.read_excel(fileName, sheet_name=sheet, header=None)
    # DataFrame.ffill() replaces fillna(method='ffill'), which pandas has
    # deprecated; the forward-fill semantics are the same.
    return raw.ffill()

In [3]:
# Load the training and the test workbook (both from the sheet 'Sheet1').
df, dftest = (
    readFile(workbook, 'Sheet1')
    for workbook in ('viterbi_train.xlsx', 'viterbi_test.xlsx')
)

In [4]:
# Once the pickle files have been created, call loadSentenceAndTags() to read them back.
def createSentenceAndTags(df,sentfirst,sentlast):
    """Slice ``df`` into per-sentence word/tag columns and pickle both lists.

    For every id in ``range(sentfirst, sentlast)`` (``sentlast`` excluded) the
    rows whose column 0 equals ``'Sentence: <id>'`` are selected; column 1 of
    those rows is stored as the sentence's words and column 2 as its tags.
    An id with no matching rows contributes an empty selection.

    The two lists are written to ``sentences2.data`` and
    ``sentence_tags2.data`` in the current working directory, replacing any
    existing files. Nothing is returned.
    """
    per_sentence_rows = [
        df[df[0] == ('Sentence: ' + str(sentence_id))]
        for sentence_id in range(sentfirst, sentlast)
    ]
    sentences = [rows[1] for rows in per_sentence_rows]
    sentence_tags = [rows[2] for rows in per_sentence_rows]

    # Same two files, written in the same order, as binary pickle streams.
    outputs = (
        ('sentences2.data', sentences),
        ('sentence_tags2.data', sentence_tags),
    )
    for path, payload in outputs:
        with open(path, 'wb') as filehandle:
            pickle.dump(payload, filehandle)

In [5]:
def loadSentenceAndTags():
    """Load the two pickled lists stored in the current working directory.

    Returns:
        tuple: ``(sentences, sentence_tags)`` as unpickled from
        ``sentences2.data`` and ``sentence_tags2.data`` respectively.
    """
    loaded = []
    # NOTE: pickle.load executes whatever the file encodes; only use this on
    # files this notebook wrote itself.
    for path in ('sentences2.data', 'sentence_tags2.data'):
        with open(path, 'rb') as filehandle:
            loaded.append(pickle.load(filehandle))
    sentences, sentence_tags = loaded
    return sentences, sentence_tags

In [6]:
# Training split: pickle sentence ids 30000 up to (not including) 34000,
# then read the two lists straight back from the pickle files.
createSentenceAndTags(df, sentfirst=30000, sentlast=34000)
(sentences, sentence_tags) = loadSentenceAndTags()

In [7]:
# Test split: sentence ids 34000 up to and including 35000.
# This call rewrites the same two pickle files, so after this cell the files
# on disk hold the test split only.
createSentenceAndTags(dftest, sentfirst=34000, sentlast=35001)
(testSentences, testSentence_tags) = loadSentenceAndTags()

In [8]:
# Vocabulary: every training word, converted to str and lower-cased.
vocab = {str(token).lower() for sentence in sentences for token in sentence}

In [9]:
# Tag inventory: the 'POS_list' column of pos_list.xlsx plus the two
# sentence-boundary markers.
xl_file = pd.read_excel('pos_list.xlsx')
tagf = pd.DataFrame(xl_file, columns=['POS_list'])
vocabtags = {*tagf['POS_list'], '-#start#-', '-#end#-'}

In [10]:
def createSentTagPair(sentences,sentence_tags):
    """Pair each word with its tag and wrap every sentence in boundary markers.

    Words and tags are matched by position within their sentence, so the
    inputs may be plain lists or pandas Series with any index labels. (The
    earlier implementation looked elements up by a running global row label,
    which only worked when the Series labels started at 0 and were contiguous
    across sentences.)

    Args:
        sentences: Sequence of sentences, each an iterable of words.
        sentence_tags: Sequence of tag iterables, parallel to ``sentences``.

    Returns:
        list[list[tuple]]: for every sentence
        ``[('-#start#-', '-#start#-'), (word, tag), ..., ('-#end#-', '-#end#-')]``.

    Raises:
        ValueError: if the two inputs differ in length, or a sentence and its
            tag sequence differ in length.
    """
    if len(sentences) != len(sentence_tags):
        raise ValueError('sentences and sentence_tags must have the same length')

    start_marker = ('-#start#-', '-#start#-')
    end_marker = ('-#end#-', '-#end#-')

    sentenceTag = []
    for words, tags in zip(sentences, sentence_tags):
        if len(words) != len(tags):
            raise ValueError('a sentence and its tag sequence differ in length')
        sent = [start_marker]
        sent.extend(zip(words, tags))
        sent.append(end_marker)
        sentenceTag.append(sent)
    return sentenceTag

In [11]:
# Build (word, tag) sequences with boundary markers for both splits.
train_data, test_data = (
    createSentTagPair(words, tags)
    for words, tags in (
        (sentences, sentence_tags),
        (testSentences, testSentence_tags),
    )
)

In [12]:
# Split the test pairs into parallel per-sentence lists: lower-cased words
# (converted with str first) and their gold tags. Boundary markers are kept.
testWords = [[str(word).lower() for (word, _) in sent] for sent in test_data]
testTags = [[tag for (_, tag) in sent] for sent in test_data]

In [13]:
def createEmissionProb():
    """Estimate emission probabilities P(word | tag) from ``train_data``.

    Reads the module-level ``train_data`` (a list of sentences, each a list of
    ``(word, tag)`` pairs). Words are converted with ``str`` and lower-cased
    before counting.

    Returns:
        dict[tag, dict[word, float]]: for every tag seen in ``train_data``,
        the relative frequency of each word observed with that tag. No
        smoothing is applied; unseen (tag, word) pairs are simply absent.
    """
    # Pass 1: count how often each (lower-cased) word occurs with each tag.
    tagWords = {}
    for sent in train_data:
        for (word, tag) in sent:
            word = str(word).lower()
            counts = tagWords.setdefault(tag, {})
            counts[word] = counts.get(word, 0) + 1

    # Pass 2: normalise per tag. The tag total is computed once per tag
    # instead of once per word.
    emissionProb = {}
    for tag, counts in tagWords.items():
        total = sum(counts.values())
        emissionProb[tag] = {word: count / total for word, count in counts.items()}
    return emissionProb

In [14]:
# Emission table built from train_data: emissionProb[tag][word] is the
# relative frequency of the lower-cased word among all words seen with tag.
emissionProb = createEmissionProb()

In [15]:
def createTransitionProb():
    """Estimate tag-to-tag transition probabilities P(tag2 | tag1).

    Reads the module-level ``train_data``. The tags of all sentences are
    flattened into one sequence before bigrams are taken, so the pair
    ``('-#end#-', '-#start#-')`` that joins consecutive sentences is counted
    as well. This is what makes ``'-#end#-'`` a key of the result, and
    ``viterbi`` depends on that because it takes its tag set from these keys.

    Returns:
        dict[tag1, dict[tag2, float]]: relative frequency of ``tag2``
        directly following ``tag1``. No smoothing is applied; unseen
        transitions are absent.
    """
    tags = [tag for sent in train_data for (_, tag) in sent]
    tag_pair_counts = ConditionalFreqDist(bigrams(tags))

    transitionProb = {}
    for tag1, followers in tag_pair_counts.items():
        # Row total computed once per preceding tag.
        total = sum(followers.values())
        transitionProb[tag1] = {
            tag2: count / total for tag2, count in followers.items()
        }
    return transitionProb

In [16]:
# Transition table built from train_data: transitionProb[tag1][tag2] is the
# relative frequency of tag2 directly following tag1.
transitionProb = createTransitionProb()
# Show which tags occur as the first element of a bigram; viterbi() iterates
# over exactly these keys as its candidate tag set.
transitionProb.keys()

dict_keys(['-#start#-', 'JJ', 'NNP', 'VBZ', 'VBN', 'TO', 'VB', 'DT', 'NN', '.', '-#end#-', ',', 'CC', 'POS', 'VBP', 'VBG', 'IN', 'NNS', 'PRP', 'RP', 'VBD', 'MD', 'JJR', 'RB', 'EX', '$', 'CD', 'RBR', 'PRP$', 'JJS', 'WP', 'WRB', 'WDT', 'PDT', '``', 'NNPS', 'RBS', 'WP$', 'LRB', 'RRB', ':', ';'])


dict_keys(['-#start#-', 'JJ', 'NNP', 'VBZ', 'VBN', 'TO', 'VB', 'DT', 'NN', '.', '-#end#-', ',', 'CC', 'POS', 'VBP', 'VBG', 'IN', 'NNS', 'PRP', 'RP', 'VBD', 'MD', 'JJR', 'RB', 'EX', '$', 'CD', 'RBR', 'PRP$', 'JJS', 'WP', 'WRB', 'WDT', 'PDT', '``', 'NNPS', 'RBS', 'WP$', 'LRB', 'RRB', ':', ';'])

In [17]:
def viterbi():
    """Decode a tag sequence for every sentence in ``testWords``.

    Reads the module-level ``testWords``, ``vocab``, ``emissionProb`` and
    ``transitionProb``; the candidate tags are the keys of ``transitionProb``.
    Each sentence is expected to start with the ``'-#start#-'`` token and end
    with the ``'-#end#-'`` token.

    For every position from 1 onwards a column ``{tag: [best_previous_tag,
    score]}`` is built. A candidate is scored with the real
    ``transition * emission`` product only when the word is in ``vocab``, the
    tag has an emission entry for the word and the transition was observed;
    otherwise the previous score is multiplied by a fixed floor of 1e-8.
    Ties are resolved in favour of the first previous tag in column order.

    Returns:
        list[list[str]]: one list per sentence, the same length as the
        sentence, beginning with ``'-#start#-'`` and ending with ``'-#end#-'``.

    NOTE(review): scores are raw probability products, so very long sentences
    could underflow to 0.0; log-space scoring may be worth considering.
    NOTE(review): ``vocab`` as built in this notebook appears to hold only
    corpus words, so the ``'-#end#-'`` token would always take the floor
    branch and transitions into the end tag would be ignored; confirm this is
    intended.
    """
    start_tag = '-#start#-'
    end_tag = '-#end#-'
    floor = 0.00000001  # factor used when no real probability is available

    predictedTags = []
    for sent in testWords:
        viterbiMatrix = {}
        for pos, word in enumerate(sent):
            if pos < 1:
                # Position 0 is the start token itself; it gets no column.
                continue

            column = {}
            viterbiMatrix[pos] = column

            if pos == 1:
                # First real word: the only possible predecessor is the start tag.
                for tag in transitionProb:
                    scorable = (
                        word in vocab
                        and word in emissionProb[tag]
                        and tag in transitionProb[start_tag]
                    )
                    if scorable:
                        score = transitionProb[start_tag][tag] * emissionProb[tag][word]
                    else:
                        score = floor
                    column[tag] = [start_tag, score]
                continue

            # Every later position: extend the best path from the previous column.
            previous = viterbiMatrix[pos - 1]
            previous_states = list(previous)
            for tag in transitionProb:
                if tag == start_tag:
                    continue
                candidates = []
                for ptag in previous_states:
                    scorable = (
                        word in vocab
                        and word in emissionProb[tag]
                        and tag in transitionProb[ptag]
                    )
                    if scorable:
                        candidates.append(
                            previous[ptag][1] * transitionProb[ptag][tag] * emissionProb[tag][word]
                        )
                    else:
                        candidates.append(previous[ptag][1] * floor)
                top = max(candidates)
                # list.index returns the first maximum, which fixes tie-breaking.
                column[tag] = [previous_states[candidates.index(top)], top]

        # Walk the back-pointers from the end token down to the start token.
        pred_tags = []
        last = len(sent) - 1
        if last >= 1:
            pred_tags.append(end_tag)
            pred_tags.append(viterbiMatrix[last][end_tag][0])
            for step in range(last - 1, 0, -1):
                pred_tags.append(viterbiMatrix[step][pred_tags[-1]][0])
        predictedTags.append(pred_tags[::-1])
    return predictedTags

In [18]:
# Decode all test sentences; each returned sequence still carries the
# '-#start#-' / '-#end#-' boundary tags.
predictedTags = viterbi() 

In [19]:
# Drop the first and last entry (the boundary tags) of every predicted sequence.
pred = [tag_sequence[1:-1] for tag_sequence in predictedTags]
    

In [20]:
# Flatten the per-sentence predictions into one list, in sentence order.
result = list(itertools.chain.from_iterable(pred))

In [21]:
# Attach the flat predictions as a new column; this modifies dftest in place
# and needs exactly one entry in result per row of dftest.
dftest['predict'] = result

In [22]:
# Write the test frame, including the 'predict' column, to outputHMM.xlsx.
# ExcelWriter.save() no longer exists in current pandas; the context manager
# saves and closes the workbook when the block exits.
with pd.ExcelWriter('outputHMM.xlsx') as writer:
    dftest.to_excel(writer)

In [23]:
from sklearn.metrics import accuracy_score
# Token-level accuracy: column 2 of dftest (the gold tags) against the flat predictions.
print('accuracy = ',accuracy_score(dftest[2],result))

accuracy =  0.8871004001618632


In [25]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Precision over all tags, using scikit-learn's 'weighted' averaging.
print('Precision = ',precision_score(dftest[2],result,average='weighted'))

Precision =  0.9274104498228184


In [27]:
# Recall over all tags, using scikit-learn's 'weighted' averaging.
print('recall = ',recall_score(dftest[2],result,average='weighted'))

recall =  0.8871004001618632
