## POS tagging using modified Viterbi

### Data Preparation

In [1]:
#Importing libraries
import nltk
import numpy as np
import pandas as pd
import time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from collections import Counter

In [2]:
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [3]:
# first few tagged sentences
print(nltk_data[:5])

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')], [('Rudolph', 'NOUN'), ('Agnew', 'NOUN'), (',', '.'), ('55', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), ('and', 'CONJ'), ('former', 'ADJ'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Consolidated', 'NOUN'), ('Gold', 'NOUN'), ('Fields', 'NOUN'), ('PLC', 'NOUN'), (',', '.'), ('was', 'VERB'), ('named', 'VERB'), ('*-1', 'X'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('of', 'ADP'), ('this', 'DET'), ('British', 'ADJ'), ('industrial', 'ADJ'), ('

### Train and Test Set

In [4]:
# Splitting into train and test
random.seed(1234)
train_set, test_set = train_test_split(nltk_data,test_size=0.05)

print(len(train_set))
print(len(test_set))

3718
196


In [5]:
# Getting list of tagged train words
train_tagged_words = [tup for sent in train_set for tup in sent]
len(train_tagged_words)

95607

In [6]:
# tokens 
tokens = [pair[0] for pair in train_tagged_words]
tokens[:5]

['20', 'billion', 'yen', 'of', '6']

In [7]:
# vocabulary
V = set(tokens)
print(len(V))
N_Words=int(len(V))

12044


In [8]:
# number of tags
T = set([pair[1] for pair in train_tagged_words])
print(len(T))
print(T)
print(list(T)[0])

12
{'ADV', 'ADJ', 'ADP', 'DET', 'NOUN', 'PRON', 'PRT', '.', 'X', 'NUM', 'CONJ', 'VERB'}
ADV


In [9]:
# computing P(w/t) and storing in T x V matrix
t = len(T)
v = len(V)
w_given_t = np.zeros((t, v))

In [10]:
# most frequent tag in the corpus
tags = [pair[1] for pair in train_tagged_words]
tag_counts = Counter(tags)
#print(tags)
# the most common tags can be seen using the most_common() method of Counter
most_common_tags =tag_counts.most_common(5)
print(most_common_tags)

[('NOUN', 27420), ('VERB', 12894), ('.', 11108), ('ADP', 9380), ('DET', 8308)]


In [11]:
# Getting list of tagged test words

# Running on entire test dataset would take more than 3-4hrs. 
# Let's test our Viterbi algorithm on a few sample sentences of test dataset

random.seed(1234)

# choose random 5 sents
rndom = [random.randint(1,len(test_set)) for x in range(5)]

# list of sents
test_run = [test_set[i] for i in rndom]

# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]

# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]
len(test_tagged_words)

144

### Emission Probabilities

In [12]:
# compute word given tag: Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
    
    return (count_w_given_tag, count_tag)

In [13]:
# testing the Emission Probability function
print("\n", "chairman")
print(word_given_tag('chairman', 'CONJ'))
print(word_given_tag('chairman', 'NOUN'))
print(word_given_tag('chairman', 'VERB'), "\n")


 chairman
(0, 2151)
(41, 27420)
(0, 12894) 



### Transition Probabilities

In [14]:
# compute tag given tag: tag2(t2) given tag1 (t1), i.e. Transition Probability

def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [15]:
# testing Transition Probability
print(t2_given_t1(t2='DET', t1='NOUN'))
print(t2_given_t1('VERB', 'NOUN'))
print(t2_given_t1('CONJ', 'VERB'))
print(t2_given_t1(',', 'NNP'))
print(t2_given_t1('DT', '.'))
print(t2_given_t1('VBG', '.'))
print(t2_given_t1('NN', '.'))
print(t2_given_t1('NNP', '.'))

(372, 27420)
(4034, 27420)
(69, 12894)
(0, 0)
(0, 11108)
(0, 11108)
(0, 11108)
(0, 11108)


### Transition matrix of tags

In [16]:
# creating t x t transition matrix of tags
# each column is t2, each row is t1
# thus M(i, j) represents P(tj given ti)

tags_matrix = np.zeros((len(T), len(T)), dtype='float32')
for i, t1 in enumerate(list(T)):
    for j, t2 in enumerate(list(T)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

In [17]:
tags_matrix

array([[7.83272460e-02, 1.28443405e-01, 1.18154660e-01, 6.87022880e-02,
        3.25257219e-02, 1.55990710e-02, 1.39395948e-02, 1.36077002e-01,
        2.32326593e-02, 3.28576155e-02, 6.96979742e-03, 3.45170915e-01],
       [4.76034125e-03, 6.74655288e-02, 7.74786621e-02, 4.92449105e-03,
        7.00262666e-01, 6.56598830e-04, 1.08338809e-02, 6.41825348e-02,
        2.08470132e-02, 1.98621135e-02, 1.65791195e-02, 1.21470783e-02],
       [1.38592748e-02, 1.05650321e-01, 1.64179113e-02, 3.23880583e-01,
        3.23880583e-01, 6.80170581e-02, 1.38592755e-03, 3.96588482e-02,
        3.50746252e-02, 6.26865700e-02, 8.52878438e-04, 8.63539428e-03],
       [1.23976888e-02, 2.04742417e-01, 9.26817488e-03, 5.53683192e-03,
        6.38902247e-01, 3.24987969e-03, 2.40731824e-04, 1.80548877e-02,
        4.52575833e-02, 2.20269617e-02, 4.81463649e-04, 3.98411155e-02],
       [1.70678329e-02, 1.21079506e-02, 1.77097008e-01, 1.35667399e-02,
        2.64040858e-01, 4.70459508e-03, 4.38001454e-02, 2.39

In [18]:
# convert the matrix to a df for better readability
tags_df = pd.DataFrame(tags_matrix, columns = list(T), index=list(T))

In [19]:
tags_df

Unnamed: 0,ADV,ADJ,ADP,DET,NOUN,PRON,PRT,.,X,NUM,CONJ,VERB
ADV,0.078327,0.128443,0.118155,0.068702,0.032526,0.015599,0.01394,0.136077,0.023233,0.032858,0.00697,0.345171
ADJ,0.00476,0.067466,0.077479,0.004924,0.700263,0.000657,0.010834,0.064183,0.020847,0.019862,0.016579,0.012147
ADP,0.013859,0.10565,0.016418,0.323881,0.323881,0.068017,0.001386,0.039659,0.035075,0.062687,0.000853,0.008635
DET,0.012398,0.204742,0.009268,0.005537,0.638902,0.00325,0.000241,0.018055,0.045258,0.022027,0.000481,0.039841
NOUN,0.017068,0.012108,0.177097,0.013567,0.264041,0.004705,0.0438,0.23957,0.028811,0.009373,0.042743,0.147119
PRON,0.033924,0.073246,0.022359,0.009252,0.204318,0.008096,0.013107,0.041249,0.093292,0.007325,0.005397,0.488435
PRT,0.010108,0.084773,0.021193,0.103358,0.249755,0.018259,0.001956,0.040756,0.013694,0.055103,0.002282,0.398761
.,0.052305,0.044653,0.092366,0.174289,0.220022,0.066619,0.002341,0.093896,0.027188,0.079402,0.057796,0.089035
X,0.025858,0.017558,0.144613,0.053631,0.062251,0.055547,0.184677,0.163288,0.07502,0.001756,0.010535,0.205267
NUM,0.002413,0.034691,0.035596,0.003318,0.348718,0.001508,0.028356,0.118854,0.211463,0.182202,0.013876,0.019005


### Build the vanilla Viterbi based POS tagger

In [20]:
# Viterbi Heuristic
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

### Evaluating on Test Set

In [21]:
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end-start
print("tagged_seq: ", tagged_seq)
print("Time taken in seconds: ", difference)

tagged_seq:  [('J.P.', 'ADV'), ('Bolduc', 'ADV'), (',', '.'), ('vice', 'NOUN'), ('chairman', 'NOUN'), ('of', 'ADP'), ('W.R.', 'NOUN'), ('Grace', 'NOUN'), ('&', 'CONJ'), ('Co.', 'NOUN'), (',', '.'), ('which', 'DET'), ('*T*-10', 'X'), ('holds', 'VERB'), ('a', 'DET'), ('83.4', 'ADV'), ('%', 'NOUN'), ('interest', 'NOUN'), ('in', 'ADP'), ('this', 'DET'), ('energy-services', 'ADV'), ('company', 'NOUN'), (',', '.'), ('was', 'VERB'), ('elected', 'VERB'), ('*-10', 'X'), ('a', 'DET'), ('director', 'NOUN'), ('.', '.'), ('Elisa', 'ADV'), ('Hollis', 'ADV'), ('launched', 'VERB'), ('a', 'DET'), ('diaper', 'NOUN'), ('service', 'NOUN'), ('last', 'ADJ'), ('year', 'NOUN'), ('because', 'ADP'), ('State', 'NOUN'), ('College', 'NOUN'), (',', '.'), ('Pa.', 'NOUN'), (',', '.'), ('where', 'ADV'), ('she', 'PRON'), ('lives', 'VERB'), ('*T*-1', 'X'), (',', '.'), ('did', 'VERB'), ("n't", 'ADV'), ('have', 'VERB'), ('one', 'NUM'), ('.', '.'), ('Rally', 'NOUN'), ("'s", 'PRT'), ('Inc.', 'NOUN'), ('said', 'VERB'), ('0',

In [22]:
# accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 
accuracy = len(check)/len(tagged_seq)
print(accuracy)

0.9236111111111112


In [23]:
# incorrect tags
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq, test_run_base)) if j[0]!=j[1]]
print(incorrect_tagged_cases)

[[("''", '.'), (('J.P.', 'ADV'), ('J.P.', 'NOUN'))], [('J.P.', 'NOUN'), (('Bolduc', 'ADV'), ('Bolduc', 'NOUN'))], [('a', 'DET'), (('83.4', 'ADV'), ('83.4', 'NUM'))], [('this', 'DET'), (('energy-services', 'ADV'), ('energy-services', 'ADJ'))], [('.', '.'), (('Elisa', 'ADV'), ('Elisa', 'NOUN'))], [('Elisa', 'NOUN'), (('Hollis', 'ADV'), ('Hollis', 'NOUN'))], [('Management', 'NOUN'), (('blurred', 'ADV'), ('blurred', 'VERB'))], [('blurred', 'VERB'), (('that', 'ADP'), ('that', 'DET'))], [('get', 'VERB'), (('across', 'ADP'), ('across', 'PRT'))], [('*-3', 'X'), (('governor', 'ADV'), ('governor', 'NOUN'))], [('the', 'DET'), (('onus', 'ADV'), ('onus', 'NOUN'))]]


In [24]:
# most frequent tag in the corpus
def get_tagcounts(tagged_cases):
    tag_list = []
    for item in tagged_cases:
        tag_list.append(item[0])
        tag_list.append(item[1][0])
        tag_list.append(item[1][1])
    return tag_list

tags_list = get_tagcounts(incorrect_tagged_cases)

tags = [pair[1] for pair in tags_list]
tag_counts = Counter(tags)
# the most common tags can be seen using the most_common() method of Counter
most_common_tags =tag_counts.most_common(5)
print(most_common_tags)

[('ADV', 9), ('NOUN', 9), ('DET', 4), ('VERB', 3), ('.', 2)]


### Solve the problem of unknown words

In [25]:
# Viterbi Heuristic
#Modifying the vannila viterbi
def Viterbi_techniqueone(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    V=[i[0] for i in train_bag]
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
     
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]      
            if words[key] in V:
                state_probability = emission_p * transition_p
            else:
                state_probability = transition_p    #Considering only the transition prob as emission will be zero        
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        
        state.append(state_max)      
    return list(zip(words, state))

In [26]:
# tagging the test sentences
start = time.time()
tagged_seq_unk = Viterbi_techniqueone(test_tagged_words)
end = time.time()
difference = end-start
print("tagged_seq: ", tagged_seq_unk )
print("Time taken in seconds: ", difference)

tagged_seq:  [('J.P.', 'NOUN'), ('Bolduc', 'NOUN'), (',', '.'), ('vice', 'NOUN'), ('chairman', 'NOUN'), ('of', 'ADP'), ('W.R.', 'NOUN'), ('Grace', 'NOUN'), ('&', 'CONJ'), ('Co.', 'NOUN'), (',', '.'), ('which', 'DET'), ('*T*-10', 'X'), ('holds', 'VERB'), ('a', 'DET'), ('83.4', 'NOUN'), ('%', 'NOUN'), ('interest', 'NOUN'), ('in', 'ADP'), ('this', 'DET'), ('energy-services', 'NOUN'), ('company', 'NOUN'), (',', '.'), ('was', 'VERB'), ('elected', 'VERB'), ('*-10', 'X'), ('a', 'DET'), ('director', 'NOUN'), ('.', '.'), ('Elisa', 'NOUN'), ('Hollis', 'NOUN'), ('launched', 'VERB'), ('a', 'DET'), ('diaper', 'NOUN'), ('service', 'NOUN'), ('last', 'ADJ'), ('year', 'NOUN'), ('because', 'ADP'), ('State', 'NOUN'), ('College', 'NOUN'), (',', '.'), ('Pa.', 'NOUN'), (',', '.'), ('where', 'ADV'), ('she', 'PRON'), ('lives', 'VERB'), ('*T*-1', 'X'), (',', '.'), ('did', 'VERB'), ("n't", 'ADV'), ('have', 'VERB'), ('one', 'NUM'), ('.', '.'), ('Rally', 'NOUN'), ("'s", 'PRT'), ('Inc.', 'NOUN'), ('said', 'VERB'),

#### Evaluating tagging accuracy

In [27]:
# accuracy
check_unk = [i for i, j in zip(tagged_seq_unk, test_run_base) if i == j]
accuracy_unk = len(check_unk)/len(tagged_seq_unk)
print(accuracy_unk)

0.9583333333333334


In [28]:
# incorrect tags
incorrect_tagged_cases_unk = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq_unk, test_run_base)) if j[0]!=j[1]]
print(incorrect_tagged_cases_unk)

[[('a', 'DET'), (('83.4', 'NOUN'), ('83.4', 'NUM'))], [('this', 'DET'), (('energy-services', 'NOUN'), ('energy-services', 'ADJ'))], [('Management', 'NOUN'), (('blurred', 'NOUN'), ('blurred', 'VERB'))], [('blurred', 'VERB'), (('that', 'ADP'), ('that', 'DET'))], [('get', 'VERB'), (('across', 'ADP'), ('across', 'PRT'))], [('*-3', 'X'), (('governor', 'VERB'), ('governor', 'NOUN'))]]


### 2nd Technique 2 using patterns to replace tags for unknown words

In [29]:
## Modifying using RegEx
def Viterbi_techniquetwo(incorrect_tagged_cases, tagged_seq_to_improve):
    patterns = [
    (r'.*es$', 'VERB'),
    (r'.*ing$', 'VERB'),
    (r'\d?[a-z]?-[a-z]', 'ADP'),         
    (r'.*ed$', 'VERB'),
    (r'^an?$|the$', 'DET'),              
    (r'.*ful$', 'ADJ'), 
    (r'.*ous$', 'ADJ'),
    (r'.*ble$', 'ADJ'),
    (r'^-?[0-9]+(.[0-9]+)?$', 'NUM'), 
    (r'.*', 'NOUN')
    ]
    # create a regex tagger 
    regexp_tagger = nltk.RegexpTagger(patterns)
 
    #tag the incorrect words properly this time 
    incorrect_words=[i[1][0] for i in incorrect_tagged_cases]
    regex_result=regexp_tagger.tag_sents(incorrect_words)
    for i in incorrect_words[:]:
        tagged_seq_to_improve.remove(i)
    for i in regex_result:
        tagged_seq_to_improve.append(i[0])
    return tagged_seq_to_improve

In [30]:
#The accuracy for the modified code 
tagged_seq_validation_temp = tagged_seq_unk
tagged_seq_validation_result = Viterbi_techniquetwo(incorrect_tagged_cases_unk, tagged_seq_validation_temp)

tagged_seq_validation_result.sort()
test_run_base.sort()
check_validation = [i for i, j in zip(tagged_seq_validation_result, test_run_base) if i == j] 
accuracy_validation = len(check_validation)/len(tagged_seq_validation_result)
print(accuracy_validation)

0.9791666666666666


In [31]:
# incorrect tags
incorrect_tag_cases_modified_validation =[[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq_validation_result, test_run_base)) if j[0]!=j[1]]
print(incorrect_tag_cases_modified_validation)

[[('able', 'ADJ'), (('across', 'NOUN'), ('across', 'PRT'))], [('elected', 'VERB'), (('energy-services', 'VERB'), ('energy-services', 'ADJ'))], [('that', 'ADP'), (('that', 'NOUN'), ('that', 'DET'))]]


### Compare the tagging accuracies of the modifications with the vanilla Viterbi algorithm

In [32]:
#The validation accuracy for the vanilla viterbi is  as below 
accuracy

0.9236111111111112

In [33]:
#The validation accuracy for the viterbi unknown is  as below 
accuracy_unk

0.9583333333333334

In [34]:
#The validation accuracy for the viterbi pattern using regex is  as below 
accuracy_validation

0.9791666666666666

### List down cases which were incorrectly tagged by original POS tagger and got corrected by your modifications

In [35]:
#The incorrect tag list for the vanilla viterbi is  as below 
incorrect_tagged_cases

[[("''", '.'), (('J.P.', 'ADV'), ('J.P.', 'NOUN'))],
 [('J.P.', 'NOUN'), (('Bolduc', 'ADV'), ('Bolduc', 'NOUN'))],
 [('a', 'DET'), (('83.4', 'ADV'), ('83.4', 'NUM'))],
 [('this', 'DET'), (('energy-services', 'ADV'), ('energy-services', 'ADJ'))],
 [('.', '.'), (('Elisa', 'ADV'), ('Elisa', 'NOUN'))],
 [('Elisa', 'NOUN'), (('Hollis', 'ADV'), ('Hollis', 'NOUN'))],
 [('Management', 'NOUN'), (('blurred', 'ADV'), ('blurred', 'VERB'))],
 [('blurred', 'VERB'), (('that', 'ADP'), ('that', 'DET'))],
 [('get', 'VERB'), (('across', 'ADP'), ('across', 'PRT'))],
 [('*-3', 'X'), (('governor', 'ADV'), ('governor', 'NOUN'))],
 [('the', 'DET'), (('onus', 'ADV'), ('onus', 'NOUN'))]]

In [36]:
#The incorrect tag list for the viterbi unk is  as below 
incorrect_tagged_cases_unk

[[('a', 'DET'), (('83.4', 'NOUN'), ('83.4', 'NUM'))],
 [('this', 'DET'), (('energy-services', 'NOUN'), ('energy-services', 'ADJ'))],
 [('Management', 'NOUN'), (('blurred', 'NOUN'), ('blurred', 'VERB'))],
 [('blurred', 'VERB'), (('that', 'ADP'), ('that', 'DET'))],
 [('get', 'VERB'), (('across', 'ADP'), ('across', 'PRT'))],
 [('*-3', 'X'), (('governor', 'VERB'), ('governor', 'NOUN'))]]

In [37]:
#The incorrect tag list for the viterbi pattern using regex is  as below 
incorrect_tag_cases_modified_validation

[[('able', 'ADJ'), (('across', 'NOUN'), ('across', 'PRT'))],
 [('elected', 'VERB'),
  (('energy-services', 'VERB'), ('energy-services', 'ADJ'))],
 [('that', 'ADP'), (('that', 'NOUN'), ('that', 'DET'))]]

### Validation on test sentences

In [38]:
# Read the test file
test_sentence = open("E:/AIML/Course4-Natural Language Processing/Module3-Syntactic Processing-Assignment/Test_sentences.txt", "r")
test_text_sentence = test_sentence.read()
test_sentence.close()
print(test_text_sentence)

Android is a mobile operating system developed by Google.
Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013.
Google and Twitter made a deal in 2015 that gave Google access to Twitter's firehose.
Twitter is an online news and social networking service on which users post and interact with messages known as tweets.
Before entering politics, Donald Trump was a domineering businessman and a television personality.
The 2018 FIFA World Cup is the 21st FIFA World Cup, an international football tournament contested once every four years.
This is the first World Cup to be held in Eastern Europe and the 11th time that it has been held in Europe.
Show me the cheapest round trips from Dallas to Atlanta
I would like to see flights from Denver to Philadelphia.
Show me the price of the flights leaving Atlanta at about 3 in the afternoon and arriving in San Francisco.
NASA invited social media users to experience the launch of ICESAT-2 Satellite.






In [39]:
## convert into word
#sentence_test = test_sentence.to_string(buf=None, columns=None, index=False)
words = word_tokenize(test_text_sentence)
print(words)

['Android', 'is', 'a', 'mobile', 'operating', 'system', 'developed', 'by', 'Google', '.', 'Android', 'has', 'been', 'the', 'best-selling', 'OS', 'worldwide', 'on', 'smartphones', 'since', '2011', 'and', 'on', 'tablets', 'since', '2013', '.', 'Google', 'and', 'Twitter', 'made', 'a', 'deal', 'in', '2015', 'that', 'gave', 'Google', 'access', 'to', 'Twitter', "'s", 'firehose', '.', 'Twitter', 'is', 'an', 'online', 'news', 'and', 'social', 'networking', 'service', 'on', 'which', 'users', 'post', 'and', 'interact', 'with', 'messages', 'known', 'as', 'tweets', '.', 'Before', 'entering', 'politics', ',', 'Donald', 'Trump', 'was', 'a', 'domineering', 'businessman', 'and', 'a', 'television', 'personality', '.', 'The', '2018', 'FIFA', 'World', 'Cup', 'is', 'the', '21st', 'FIFA', 'World', 'Cup', ',', 'an', 'international', 'football', 'tournament', 'contested', 'once', 'every', 'four', 'years', '.', 'This', 'is', 'the', 'first', 'World', 'Cup', 'to', 'be', 'held', 'in', 'Eastern', 'Europe', 'and',

In [40]:
# Tag words using POS
pos_tagged_test_sentence = nltk.pos_tag(words, tagset='universal')
print("pos_tagged_test_sentence", pos_tagged_test_sentence)

pos_tagged_test_sentence [('Android', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('mobile', 'ADJ'), ('operating', 'NOUN'), ('system', 'NOUN'), ('developed', 'VERB'), ('by', 'ADP'), ('Google', 'NOUN'), ('.', '.'), ('Android', 'NOUN'), ('has', 'VERB'), ('been', 'VERB'), ('the', 'DET'), ('best-selling', 'ADJ'), ('OS', 'NOUN'), ('worldwide', 'NOUN'), ('on', 'ADP'), ('smartphones', 'NOUN'), ('since', 'ADP'), ('2011', 'NUM'), ('and', 'CONJ'), ('on', 'ADP'), ('tablets', 'NOUN'), ('since', 'ADP'), ('2013', 'NUM'), ('.', '.'), ('Google', 'NOUN'), ('and', 'CONJ'), ('Twitter', 'NOUN'), ('made', 'VERB'), ('a', 'DET'), ('deal', 'NOUN'), ('in', 'ADP'), ('2015', 'NUM'), ('that', 'DET'), ('gave', 'VERB'), ('Google', 'NOUN'), ('access', 'NOUN'), ('to', 'PRT'), ('Twitter', 'NOUN'), ("'s", 'PRT'), ('firehose', 'NOUN'), ('.', '.'), ('Twitter', 'NOUN'), ('is', 'VERB'), ('an', 'DET'), ('online', 'ADJ'), ('news', 'NOUN'), ('and', 'CONJ'), ('social', 'ADJ'), ('networking', 'NOUN'), ('service', 'NOUN'), ('on', 'AD

In [41]:
# Validation to run on Viterbi
start = time.time()
word_tagged_seq = Viterbi(words)
end = time.time()
difference = end-start
print(word_tagged_seq)
print(difference)

[('Android', 'ADV'), ('is', 'VERB'), ('a', 'DET'), ('mobile', 'ADJ'), ('operating', 'NOUN'), ('system', 'NOUN'), ('developed', 'VERB'), ('by', 'ADP'), ('Google', 'ADV'), ('.', '.'), ('Android', 'ADV'), ('has', 'VERB'), ('been', 'VERB'), ('the', 'DET'), ('best-selling', 'ADJ'), ('OS', 'ADV'), ('worldwide', 'ADV'), ('on', 'ADP'), ('smartphones', 'ADV'), ('since', 'ADP'), ('2011', 'ADV'), ('and', 'CONJ'), ('on', 'ADP'), ('tablets', 'NOUN'), ('since', 'ADP'), ('2013', 'ADV'), ('.', '.'), ('Google', 'ADV'), ('and', 'CONJ'), ('Twitter', 'ADV'), ('made', 'VERB'), ('a', 'DET'), ('deal', 'NOUN'), ('in', 'ADP'), ('2015', 'ADV'), ('that', 'ADP'), ('gave', 'VERB'), ('Google', 'ADV'), ('access', 'NOUN'), ('to', 'PRT'), ('Twitter', 'ADV'), ("'s", 'PRT'), ('firehose', 'ADV'), ('.', '.'), ('Twitter', 'ADV'), ('is', 'VERB'), ('an', 'DET'), ('online', 'ADV'), ('news', 'NOUN'), ('and', 'CONJ'), ('social', 'ADJ'), ('networking', 'NOUN'), ('service', 'NOUN'), ('on', 'ADP'), ('which', 'DET'), ('users', 'NOU

In [42]:
# accuracy
word_check = [i for i, j in zip(word_tagged_seq, pos_tagged_test_sentence) if i == j] 
word_accuracy = len(word_check)/len(word_tagged_seq)
print(word_accuracy)

0.7624309392265194


In [43]:
# Incorrect Tags
incorrect_tagged_cases_word = [[pos_tagged_test_sentence[i-1],j] for i, j in enumerate(zip(word_tagged_seq, pos_tagged_test_sentence)) if j[0]!=j[1]]
print(incorrect_tagged_cases_word)

[[('.', '.'), (('Android', 'ADV'), ('Android', 'NOUN'))], [('by', 'ADP'), (('Google', 'ADV'), ('Google', 'NOUN'))], [('.', '.'), (('Android', 'ADV'), ('Android', 'NOUN'))], [('best-selling', 'ADJ'), (('OS', 'ADV'), ('OS', 'NOUN'))], [('OS', 'NOUN'), (('worldwide', 'ADV'), ('worldwide', 'NOUN'))], [('on', 'ADP'), (('smartphones', 'ADV'), ('smartphones', 'NOUN'))], [('since', 'ADP'), (('2011', 'ADV'), ('2011', 'NUM'))], [('since', 'ADP'), (('2013', 'ADV'), ('2013', 'NUM'))], [('.', '.'), (('Google', 'ADV'), ('Google', 'NOUN'))], [('and', 'CONJ'), (('Twitter', 'ADV'), ('Twitter', 'NOUN'))], [('in', 'ADP'), (('2015', 'ADV'), ('2015', 'NUM'))], [('2015', 'NUM'), (('that', 'ADP'), ('that', 'DET'))], [('gave', 'VERB'), (('Google', 'ADV'), ('Google', 'NOUN'))], [('to', 'PRT'), (('Twitter', 'ADV'), ('Twitter', 'NOUN'))], [("'s", 'PRT'), (('firehose', 'ADV'), ('firehose', 'NOUN'))], [('.', '.'), (('Twitter', 'ADV'), ('Twitter', 'NOUN'))], [('an', 'DET'), (('online', 'ADV'), ('online', 'ADJ'))], 

In [44]:
# Validation to run on Viterbi unknown
start = time.time()
word_tagged_seq_unk = Viterbi_techniqueone(words)
end = time.time()
difference = end-start
print(word_tagged_seq_unk)
print(difference)

[('Android', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('mobile', 'ADJ'), ('operating', 'NOUN'), ('system', 'NOUN'), ('developed', 'VERB'), ('by', 'ADP'), ('Google', 'DET'), ('.', '.'), ('Android', 'NOUN'), ('has', 'VERB'), ('been', 'VERB'), ('the', 'DET'), ('best-selling', 'ADJ'), ('OS', 'NOUN'), ('worldwide', 'NOUN'), ('on', 'ADP'), ('smartphones', 'DET'), ('since', 'ADP'), ('2011', 'DET'), ('and', 'CONJ'), ('on', 'ADP'), ('tablets', 'NOUN'), ('since', 'ADP'), ('2013', 'DET'), ('.', '.'), ('Google', 'NOUN'), ('and', 'CONJ'), ('Twitter', 'NOUN'), ('made', 'VERB'), ('a', 'DET'), ('deal', 'NOUN'), ('in', 'ADP'), ('2015', 'DET'), ('that', 'ADP'), ('gave', 'VERB'), ('Google', 'X'), ('access', 'NOUN'), ('to', 'PRT'), ('Twitter', 'VERB'), ("'s", 'PRT'), ('firehose', 'VERB'), ('.', '.'), ('Twitter', 'NOUN'), ('is', 'VERB'), ('an', 'DET'), ('online', 'NOUN'), ('news', 'NOUN'), ('and', 'CONJ'), ('social', 'ADJ'), ('networking', 'NOUN'), ('service', 'NOUN'), ('on', 'ADP'), ('which', 'DET'), ('user

In [45]:
# accuracy
word_check_unk = [i for i, j in zip(word_tagged_seq_unk, pos_tagged_test_sentence) if i == j] 
word_accuracy_unk = len(word_check_unk)/len(word_tagged_seq_unk)
print(word_accuracy_unk)

0.8674033149171271


In [46]:
# Incorerct Tags
incorrect_tagged_cases_word_unk = [[pos_tagged_test_sentence[i-1],j] for i, j in enumerate(zip(word_tagged_seq_unk, pos_tagged_test_sentence)) if j[0]!=j[1]]
print(incorrect_tagged_cases_word_unk)

[[('by', 'ADP'), (('Google', 'DET'), ('Google', 'NOUN'))], [('on', 'ADP'), (('smartphones', 'DET'), ('smartphones', 'NOUN'))], [('since', 'ADP'), (('2011', 'DET'), ('2011', 'NUM'))], [('since', 'ADP'), (('2013', 'DET'), ('2013', 'NUM'))], [('in', 'ADP'), (('2015', 'DET'), ('2015', 'NUM'))], [('2015', 'NUM'), (('that', 'ADP'), ('that', 'DET'))], [('gave', 'VERB'), (('Google', 'X'), ('Google', 'NOUN'))], [('to', 'PRT'), (('Twitter', 'VERB'), ('Twitter', 'NOUN'))], [("'s", 'PRT'), (('firehose', 'VERB'), ('firehose', 'NOUN'))], [('an', 'DET'), (('online', 'NOUN'), ('online', 'ADJ'))], [('with', 'ADP'), (('messages', 'DET'), ('messages', 'NOUN'))], [('as', 'ADP'), (('tweets', 'DET'), ('tweets', 'NOUN'))], [('a', 'DET'), (('domineering', 'NOUN'), ('domineering', 'ADJ'))], [('The', 'DET'), (('2018', 'NOUN'), ('2018', 'NUM'))], [('the', 'DET'), (('21st', 'NOUN'), ('21st', 'NUM'))], [('tournament', 'NOUN'), (('contested', 'NOUN'), ('contested', 'VERB'))], [('the', 'DET'), (('11th', 'ADJ'), ('11

In [47]:
#The validation accuracy for the modified code using regex
tagged_seq_validation_regex = word_tagged_seq_unk
tagged_seq_validation_result_regex = Viterbi_techniquetwo(incorrect_tagged_cases_word_unk, tagged_seq_validation_regex)

tagged_seq_validation_result_regex.sort()
pos_tagged_test_sentence.sort()
check_validation_regex = [i for i, j in zip(tagged_seq_validation_result_regex, pos_tagged_test_sentence) if i == j] 
accuracy_validation_regex = len(check_validation_regex)/len(tagged_seq_validation_result_regex)
print(accuracy_validation_regex)

0.9337016574585635


In [48]:
# Incorrect tags
incorrect_tagged_cases_word_regex = [[pos_tagged_test_sentence[i-1],j] for i, j in enumerate(zip(tagged_seq_validation_result_regex, pos_tagged_test_sentence)) if j[0]!=j[1]]
print(incorrect_tagged_cases_word_regex)

[[('.', '.'), (('11th', 'NOUN'), ('11th', 'NUM'))], [('2018', 'NUM'), (('21st', 'NOUN'), ('21st', 'NUM'))], [('Satellite', 'NOUN'), (('Show', 'NOUN'), ('Show', 'VERB'))], [('Show', 'VERB'), (('Show', 'NOUN'), ('Show', 'VERB'))], [('a', 'DET'), (('about', 'NOUN'), ('about', 'ADV'))], [('developed', 'VERB'), (('domineering', 'VERB'), ('domineering', 'ADJ'))], [('every', 'DET'), (('experience', 'NOUN'), ('experience', 'VERB'))], [('leaving', 'VERB'), (('like', 'NOUN'), ('like', 'VERB'))], [('media', 'NOUN'), (('messages', 'VERB'), ('messages', 'NOUN'))], [('once', 'ADV'), (('online', 'NOUN'), ('online', 'ADJ'))], [('since', 'ADP'), (('smartphones', 'VERB'), ('smartphones', 'NOUN'))], [('that', 'ADP'), (('that', 'NOUN'), ('that', 'DET'))]]
