# In [1]:
import nltk
import numpy as np
import pandas as pd
import tkinter as tk
from tkinter import *

# Create the application's root window and set its title, requested size
# and background colour.
gui = tk.Tk()
gui.title("POS Tagging using HMM (BT20CSE188)")
gui.geometry("10000x10000")
gui.configure(bg="black")
# Training data: the tagged sentences of NLTK's treebank corpus, requested
# with the 'universal' tagset and materialised into a list.
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))
# Alternative corpus (Brown), kept for reference:
# nltk_data=list(nltk.corpus.brown.tagged_sents(tagset='universal'))
# Flatten the sentences into one list of (word, tag) pairs; this list is the
# default training bag of the probability helpers defined below.
train_tagged_words = [ tup for sent in nltk_data for tup in sent ]
# (regex, tag) rules for the rule-based fallback tagger.  Order matters: the
# rules are meant to be tried top to bottom, so the specific suffix rules come
# first and the catch-all NOUN rule comes last.
patterns = [
    (r'.*ing$', 'VERB'),                   # gerund
    (r'.*ed$', 'VERB'),                    # past tense
    (r'.*es$', 'VERB'),                    # verb
    (r'.*\'s$', 'NOUN'),                   # possessive nouns
    (r'.*s$', 'NOUN'),                     # plural nouns
    (r'\*T?\*?-[0-9]+$', 'X'),             # X
    # Digits optionally grouped by '.' or ',' (e.g. 42, -3.14, 1,000).  The
    # separator is an explicit character class: a bare '.' would match any
    # character and tag strings such as "1a5" as numbers.
    (r'^-?[0-9]+([.,][0-9]+)*$', 'NUM'),   # cardinal numbers
    (r'.*', 'NOUN')                        # nouns
]

# Regex-driven tagger built from `patterns`; Viterbi consults it for every
# word and falls back to its tag when no HMM score is positive.
rule_based_tagger = nltk.RegexpTagger(patterns)

def emissionProb(word, tag, train_bag = train_tagged_words):
    """Count the evidence for the emission probability P(word | tag).

    Walks ``train_bag`` (a sequence of ``(word, tag)`` pairs) once and
    returns raw counts; the caller performs the division.

    Returns:
        A tuple ``(count_w_given_tag, count_tag)``: how many times ``word``
        appears tagged as ``tag``, and how many times ``tag`` appears at all.
    """
    tag_total = 0
    word_with_tag = 0
    for pair in train_bag:
        if pair[1] != tag:
            continue
        tag_total += 1
        if pair[0] == word:
            word_with_tag += 1
    return (word_with_tag, tag_total)


def transitionProb(t2, t1, train_bag = train_tagged_words):
    """Count the evidence for the transition probability P(t2 | t1).

    The tags of ``train_bag`` are read as one continuous sequence, and
    adjacent positions are compared pairwise.

    Returns:
        A tuple ``(count_t2_t1, count_t1)``: how many times ``t1`` is
        immediately followed by ``t2``, and how many times ``t1`` occurs.
    """
    tag_sequence = [pair[1] for pair in train_bag]
    occurrences_t1 = sum(1 for current in tag_sequence if current == t1)
    # Pair every tag with its successor; zip stops at the shorter iterable,
    # so the last tag is never used as the start of a pair.
    followed_by_t2 = sum(
        1
        for current, following in zip(tag_sequence, tag_sequence[1:])
        if current == t1 and following == t2
    )
    return (followed_by_t2, occurrences_t1)


def Viterbi(words, trans_df, train_bag = train_tagged_words):
    """Tag ``words`` from left to right using HMM scores with a regex fallback.

    For each word, every candidate tag is scored as
    ``P(word | tag) * P(tag | previous tag)`` and the best-scoring tag is
    kept before moving to the next word; earlier choices are never revisited.
    The first word uses ``'.'`` as its previous tag.

    The rule-based tagger overrides the HMM in two cases: when every score is
    zero, and when the rule-based tag is ``'X'``.

    Args:
        words: Sequence of word strings to tag.
        trans_df: Table indexed as ``trans_df.loc[previous_tag, tag]`` giving
            the transition probability; it must contain a ``'.'`` row.
        train_bag: ``(word, tag)`` pairs supplying the tag set and the
            emission counts.

    Returns:
        A list of ``(word, tag)`` tuples, one per input word.
    """
    state = []
    # Sorted so that ties between equally scored tags are always resolved the
    # same way; iteration order of a set is not something to rely on.
    candidate_tags = sorted({pair[1] for pair in train_bag})

    for key, word in enumerate(words):
        previous_tag = '.' if key == 0 else state[-1]

        scores = []
        for tag in candidate_tags:
            transition_p = trans_df.loc[previous_tag, tag]
            # One corpus scan per (word, tag), over the same bag the tag set
            # came from, so count_tag is never zero.
            count_w_given_tag, count_tag = emissionProb(word, tag, train_bag)
            emission_p = count_w_given_tag / count_tag
            scores.append(emission_p * transition_p)

        best_score = max(scores)
        rule_tag = rule_based_tagger.tag([word])[0][1]
        if best_score == 0 or rule_tag == 'X':
            state_max = rule_tag
        else:
            state_max = candidate_tags[scores.index(best_score)]

        state.append(state_max)
    return list(zip(words, state))


def Take_input():
    """Tag the sentence typed into ``textInput`` and show the working.

    Appends to the ``printOut`` text widget, in order: a header line, the
    tag-to-tag transition probability matrix, the tag-by-word emission
    probability matrix for the entered words, and the tagged sentence
    returned by ``Viterbi``.  Matrix entries are rounded to three decimals.
    """
    # Freeze the tag set into a list once so rows and columns of both
    # matrices share a single ordering.
    tags = list({tag for word, tag in train_tagged_words})
    test_sent = textInput.get("1.0", "end-1c")
    words = test_sent.split()
    printOut.insert(END, "                                             *Treebank corpus is used to calculate the correct POS tags*\n")
    printOut.insert(END, "\nState Transition Probability Matrix :=> \n\n")

    # Rows are the previous tag (t1), columns the following tag (t2).
    transMat = np.zeros((len(tags), len(tags)), dtype='float32')
    for i, t1 in enumerate(tags):
        for j, t2 in enumerate(tags):
            # Each helper call scans the whole corpus, so call it once per cell.
            count_t2_t1, count_t1 = transitionProb(t2, t1)
            transMat[i, j] = round(count_t2_t1 / count_t1, 3)
    trans_df = pd.DataFrame(transMat, columns=tags, index=tags)
    printOut.insert(END, trans_df)

    printOut.insert(END, "\n\n\nEmission Probability Matrix :=> \n\n")
    # Rows are tags, columns are the words of the entered sentence.
    emissionMat = np.zeros((len(tags), len(words)), dtype='float32')
    for key, word in enumerate(words):
        for j, tag in enumerate(tags):
            count_w_given_tag, count_tag = emissionProb(word, tag)
            emissionMat[j, key] = round(count_w_given_tag / count_tag, 3)

    emiss_df = pd.DataFrame(emissionMat, columns=list(words), index=tags)
    printOut.insert(END, emiss_df)

    tagged_sentence = Viterbi(words, trans_df)
    printOut.insert(END, "\n\nPOS Tagging using Viterbi Algorithm :=> ")
    printOut.insert(END, tagged_sentence)

# Build the widgets and pack them in creation order.  Every widget names
# `gui` as its master explicitly rather than relying on the implicit default
# root window.
label1 = Label(gui, text="Enter a sentence", font=("Courier", 17), bg='black', fg='white')
label1.pack()

# Single-line box the sentence is read from by Take_input.
textInput = Text(gui, height=1, width=35, font=("Courier", 15))
textInput.pack()

button1 = Button(gui, height=2, width=15, text="Get POS", font=("Courier", 12), command=Take_input, bg="Pink")
button1.pack()

# Output area that Take_input appends its matrices and tagged sentence to.
printOut = Text(gui, height=37, width=10000, background='grey')
printOut.pack()

# Hand control to the Tk event loop.
gui.mainloop()
