## POS tagging using modified Viterbi

### Data Preparation

In [49]:
#Importing libraries
import nltk
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import random
import time
import re

In [2]:
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [3]:
nltk_data[0:5]

[[('Pierre', 'NOUN'),
  ('Vinken', 'NOUN'),
  (',', '.'),
  ('61', 'NUM'),
  ('years', 'NOUN'),
  ('old', 'ADJ'),
  (',', '.'),
  ('will', 'VERB'),
  ('join', 'VERB'),
  ('the', 'DET'),
  ('board', 'NOUN'),
  ('as', 'ADP'),
  ('a', 'DET'),
  ('nonexecutive', 'ADJ'),
  ('director', 'NOUN'),
  ('Nov.', 'NOUN'),
  ('29', 'NUM'),
  ('.', '.')],
 [('Mr.', 'NOUN'),
  ('Vinken', 'NOUN'),
  ('is', 'VERB'),
  ('chairman', 'NOUN'),
  ('of', 'ADP'),
  ('Elsevier', 'NOUN'),
  ('N.V.', 'NOUN'),
  (',', '.'),
  ('the', 'DET'),
  ('Dutch', 'NOUN'),
  ('publishing', 'VERB'),
  ('group', 'NOUN'),
  ('.', '.')],
 [('Rudolph', 'NOUN'),
  ('Agnew', 'NOUN'),
  (',', '.'),
  ('55', 'NUM'),
  ('years', 'NOUN'),
  ('old', 'ADJ'),
  ('and', 'CONJ'),
  ('former', 'ADJ'),
  ('chairman', 'NOUN'),
  ('of', 'ADP'),
  ('Consolidated', 'NOUN'),
  ('Gold', 'NOUN'),
  ('Fields', 'NOUN'),
  ('PLC', 'NOUN'),
  (',', '.'),
  ('was', 'VERB'),
  ('named', 'VERB'),
  ('*-1', 'X'),
  ('a', 'DET'),
  ('nonexecutive', 'ADJ'),
 

In [4]:
print('Size of nltk data: ', len(nltk_data))

Size of nltk data:  3914


### Build the vanilla Viterbi based POS tagger

In [5]:
train_set, test_set = train_test_split(nltk_data, test_size=0.3)
print('Size of training set', len(train_set))
print('Size of test set', len(test_set))

Size of training set 2739
Size of test set 1175


In [6]:
train_all_tuples = [tup for sent in train_set for tup in sent]
len(train_all_tuples)

70256

In [7]:
train_all_words = [tup[0] for tup in train_all_tuples]

train_all_tags = [tup[1] for tup in train_all_tuples]

In [8]:
print('Size of all words: ',len(train_all_words))
print('Size of all tags: ',len(train_all_tags))

Size of all words:  70256
Size of all tags:  70256


In [9]:
# Find the unique words in the list
train_uniq_words = set(train_all_words)
print('Size of unique words: ',len(train_uniq_words)) 

Size of unique words:  10211


In [10]:
# Find the unique tags in the list
train_uniq_tags = set(train_all_tags)
print('Size of unique tags: ',len(train_uniq_tags)) 
print('Tags--->', train_uniq_tags)

Size of unique tags:  12
Tags---> {'VERB', 'PRON', 'ADJ', 'PRT', 'CONJ', 'NOUN', 'ADV', 'NUM', 'X', '.', 'DET', 'ADP'}


In [11]:
# compute word given tag: Emission Probability
def calculateEmissionProb(word, tag, train_bag = train_all_tuples):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
    prob = count_w_given_tag/count_tag
    return prob

In [12]:
print(calculateEmissionProb('man','NOUN'))

0.00044997750112494374


In [13]:
# compute tag given tag: tag2(t2) given tag1 (t1), i.e. Transition Probability

def calculateTransitionProb(t2, t1, train_bag = train_all_tuples):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    prob = count_t2_t1/count_t1
    return prob

In [14]:
print(calculateTransitionProb('NOUN','DET'))

0.63787099983419


In [15]:
# creating t x t transition matrix of tags
# each column is t2, each row is t1
# thus M(i, j) represents P(tj given ti)

tags_matrix = np.zeros((len(train_uniq_tags), len(train_uniq_tags)), dtype='float32')
for i, t1 in enumerate(list(train_uniq_tags)):
    for j, t2 in enumerate(list(train_uniq_tags)): 
        tags_matrix[i, j] = calculateTransitionProb(t2,t1)


In [16]:
print('Size of tags matrix:', tags_matrix.shape)
tags_matrix

Size of tags matrix: (12, 12)


array([[1.67451590e-01, 3.52694914e-02, 6.66666701e-02, 3.12925167e-02,
        5.86080598e-03, 1.10518053e-01, 8.33071694e-02, 2.16640495e-02,
        2.18733653e-01, 3.57927792e-02, 1.31972790e-01, 9.14704353e-02],
       [4.90478635e-01, 8.74935649e-03, 7.10241869e-02, 1.49253728e-02,
        4.11734451e-03, 2.06896558e-01, 3.34534235e-02, 8.74935649e-03,
        8.90375674e-02, 4.16881107e-02, 8.23468901e-03, 2.26453934e-02],
       [1.28089888e-02, 4.49438201e-04, 6.47191033e-02, 1.01123592e-02,
        1.77528095e-02, 6.97528064e-01, 4.26966278e-03, 1.88764036e-02,
        2.13483144e-02, 6.69662952e-02, 4.04494395e-03, 8.11235979e-02],
       [4.15480435e-01, 1.73487552e-02, 9.11921710e-02, 2.66903918e-03,
        1.77935942e-03, 2.25533813e-01, 9.34163667e-03, 5.42704612e-02,
        1.29003562e-02, 4.40391451e-02, 1.02758005e-01, 2.26868335e-02],
       [1.63809523e-01, 6.15873002e-02, 1.18095241e-01, 5.71428565e-03,
        6.34920609e-04, 3.43492061e-01, 5.46031743e-02, 4.25

In [17]:
# convert the matrix to a df for better readability
tags_matrix_df = pd.DataFrame(tags_matrix, columns = list(train_uniq_tags), index=list(train_uniq_tags))
tags_matrix_df

Unnamed: 0,VERB,PRON,ADJ,PRT,CONJ,NOUN,ADV,NUM,X,.,DET,ADP
VERB,0.167452,0.035269,0.066667,0.031293,0.005861,0.110518,0.083307,0.021664,0.218734,0.035793,0.131973,0.09147
PRON,0.490479,0.008749,0.071024,0.014925,0.004117,0.206897,0.033453,0.008749,0.089038,0.041688,0.008235,0.022645
ADJ,0.012809,0.000449,0.064719,0.010112,0.017753,0.697528,0.00427,0.018876,0.021348,0.066966,0.004045,0.081124
PRT,0.41548,0.017349,0.091192,0.002669,0.001779,0.225534,0.009342,0.05427,0.0129,0.044039,0.102758,0.022687
CONJ,0.16381,0.061587,0.118095,0.005714,0.000635,0.343492,0.054603,0.04254,0.008254,0.029206,0.121905,0.050159
NOUN,0.149143,0.00465,0.011299,0.042098,0.042148,0.264387,0.017599,0.00955,0.029699,0.240988,0.013099,0.175341
ADV,0.333186,0.014563,0.131068,0.015887,0.007944,0.031774,0.086055,0.029568,0.022948,0.135922,0.070168,0.120918
NUM,0.019223,0.001227,0.03681,0.027403,0.015133,0.350102,0.002454,0.179959,0.213906,0.114519,0.00409,0.035174
X,0.202964,0.056486,0.015679,0.189648,0.010954,0.065292,0.024485,0.003436,0.074742,0.166881,0.050902,0.138531
.,0.08663,0.066425,0.045856,0.002178,0.056987,0.2219,0.052632,0.079976,0.026255,0.099456,0.173382,0.088203


In [18]:
# Viterbi Heuristic
def Viterbi(words, train_bag = train_all_tuples):
    state = []
    T = list(train_uniq_tags)
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_matrix_df.loc['.', tag]
            else:
                transition_p = tags_matrix_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = calculateEmissionProb(words[key], tag) 
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [19]:
# Running on entire test dataset would take more than 3-4hrs. 
# Let's test our Viterbi algorithm on a few sample sentences of test dataset

random.seed(1234)

# choose random 5 sents
rndom = [random.randint(1,len(test_set)) for x in range(5)]

# list of sents
test_run = [test_set[i] for i in rndom]

# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]

# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]
test_run

[[('20', 'NUM'),
  ('billion', 'NUM'),
  ('yen', 'NOUN'),
  ('of', 'ADP'),
  ('6', 'NUM'),
  ('%', 'NOUN'),
  ('Eurobonds', 'NOUN'),
  ('due', 'ADJ'),
  ('Nov.', 'NOUN'),
  ('21', 'NUM'),
  (',', '.'),
  ('1994', 'NUM'),
  (',', '.'),
  ('priced', 'VERB'),
  ('*', 'X'),
  ('at', 'ADP'),
  ('101', 'NUM'),
  ('3\\/4', 'NUM'),
  ('*', 'X'),
  ('to', 'PRT'),
  ('yield', 'VERB'),
  ('6.03', 'NUM'),
  ('%', 'NOUN'),
  ('less', 'ADV'),
  ('full', 'ADJ'),
  ('fees', 'NOUN'),
  (',', '.'),
  ('via', 'ADP'),
  ('Mitsui', 'NOUN'),
  ('Finance', 'NOUN'),
  ('International', 'NOUN'),
  ('.', '.')],
 [('This', 'DET'),
  ('is', 'VERB'),
  ('the', 'DET'),
  ('real', 'ADJ'),
  ('issue', 'NOUN'),
  ('raised', 'VERB'),
  ('*', 'X'),
  ('by', 'ADP'),
  ('the', 'DET'),
  ('Wedtech', 'NOUN'),
  ('scandal', 'NOUN'),
  ('.', '.')],
 [('Macmillan\\/McGraw', 'NOUN'),
  ('says', 'VERB'),
  ('0', 'X'),
  ('``', '.'),
  ('well', 'ADV'),
  ('over', 'ADP'),
  ('10', 'NUM'),
  ('million', 'NUM'),
  ("''", '.'),
  ('o

In [20]:
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end-start

In [21]:
print("Time taken in seconds: ", difference)
print(tagged_seq)

Time taken in seconds:  6.838514804840088
[('20', 'NUM'), ('billion', 'NUM'), ('yen', 'NOUN'), ('of', 'ADP'), ('6', 'NUM'), ('%', 'NOUN'), ('Eurobonds', 'VERB'), ('due', 'ADJ'), ('Nov.', 'NOUN'), ('21', 'NUM'), (',', '.'), ('1994', 'NUM'), (',', '.'), ('priced', 'VERB'), ('*', 'X'), ('at', 'ADP'), ('101', 'NUM'), ('3\\/4', 'NUM'), ('*', 'X'), ('to', 'PRT'), ('yield', 'VERB'), ('6.03', 'VERB'), ('%', 'NOUN'), ('less', 'ADJ'), ('full', 'ADJ'), ('fees', 'NOUN'), (',', '.'), ('via', 'ADP'), ('Mitsui', 'NOUN'), ('Finance', 'NOUN'), ('International', 'NOUN'), ('.', '.'), ('This', 'DET'), ('is', 'VERB'), ('the', 'DET'), ('real', 'ADJ'), ('issue', 'NOUN'), ('raised', 'VERB'), ('*', 'X'), ('by', 'ADP'), ('the', 'DET'), ('Wedtech', 'NOUN'), ('scandal', 'NOUN'), ('.', '.'), ('Macmillan\\/McGraw', 'NOUN'), ('says', 'VERB'), ('0', 'X'), ('``', '.'), ('well', 'ADV'), ('over', 'ADP'), ('10', 'NUM'), ('million', 'NUM'), ("''", '.'), ('of', 'ADP'), ('its', 'PRON'), ('Scoring', 'NOUN'), ('High', 'NOUN')

In [22]:
# accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 

In [23]:
accuracy = len(check)/len(tagged_seq)

In [24]:
accuracy

0.9349593495934959

In [25]:
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq, test_run_base)) if j[0]!=j[1]]

In [26]:
incorrect_tagged_cases

[[('%', 'NOUN'), (('Eurobonds', 'VERB'), ('Eurobonds', 'NOUN'))],
 [('yield', 'VERB'), (('6.03', 'VERB'), ('6.03', 'NUM'))],
 [('%', 'NOUN'), (('less', 'ADJ'), ('less', 'ADV'))],
 [('High', 'NOUN'),
  (('test-preparation', 'VERB'), ('test-preparation', 'ADJ'))],
 [('years', 'NOUN'), (('ago', 'ADP'), ('ago', 'ADV'))],
 [('a', 'DET'), (('wholesaler', 'VERB'), ('wholesaler', 'NOUN'))],
 [('of', 'ADP'), (('spirits', 'VERB'), ('spirits', 'NOUN'))],
 [('*T*-1', 'X'), (('controls', 'NOUN'), ('controls', 'VERB'))]]

### Solve the problem of unknown words

In [27]:
test_sentences = pd.read_csv('sample_test.txt', header=None, sep='\n')

In [28]:
test_sentences

Unnamed: 0,0
0,Android is a mobile operating system developed...
1,Android has been the best-selling OS worldwide...
2,Google and Twitter made a deal in 2015 that ga...
3,Twitter is an online news and social networkin...
4,"Before entering politics, Donald Trump was a d..."
5,The 2018 FIFA World Cup is the 21st FIFA World...
6,This is the first World Cup to be held in East...
7,Show me the cheapest round trips from Dallas t...
8,I would like to see flights from Denver to Phi...
9,Show me the price of the flights leaving Atlan...


In [29]:
test_sentences_list = list(test_sentences[0])
test_sentences_list

['Android is a mobile operating system developed by Google.',
 'Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013.',
 "Google and Twitter made a deal in 2015 that gave Google access to Twitter's firehose.",
 'Twitter is an online news and social networking service on which users post and interact with messages known as tweets.',
 'Before entering politics, Donald Trump was a domineering businessman and a television personality.',
 'The 2018 FIFA World Cup is the 21st FIFA World Cup, an international football tournament contested once every four years.',
 'This is the first World Cup to be held in Eastern Europe and the 11th time that it has been held in Europe.',
 'Show me the cheapest round trips from Dallas to Atlanta',
 'I would like to see flights from Denver to Philadelphia.',
 'Show me the price of the flights leaving Atlanta at about 3 in the afternoon and arriving in San Francisco.',
 'NASA invited social media users to experienc

In [30]:
test_sentences_list[0]
test_all_words = [sent.split() for sent in test_sentences_list]
test_all_words

[['Android',
  'is',
  'a',
  'mobile',
  'operating',
  'system',
  'developed',
  'by',
  'Google.'],
 ['Android',
  'has',
  'been',
  'the',
  'best-selling',
  'OS',
  'worldwide',
  'on',
  'smartphones',
  'since',
  '2011',
  'and',
  'on',
  'tablets',
  'since',
  '2013.'],
 ['Google',
  'and',
  'Twitter',
  'made',
  'a',
  'deal',
  'in',
  '2015',
  'that',
  'gave',
  'Google',
  'access',
  'to',
  "Twitter's",
  'firehose.'],
 ['Twitter',
  'is',
  'an',
  'online',
  'news',
  'and',
  'social',
  'networking',
  'service',
  'on',
  'which',
  'users',
  'post',
  'and',
  'interact',
  'with',
  'messages',
  'known',
  'as',
  'tweets.'],
 ['Before',
  'entering',
  'politics,',
  'Donald',
  'Trump',
  'was',
  'a',
  'domineering',
  'businessman',
  'and',
  'a',
  'television',
  'personality.'],
 ['The',
  '2018',
  'FIFA',
  'World',
  'Cup',
  'is',
  'the',
  '21st',
  'FIFA',
  'World',
  'Cup,',
  'an',
  'international',
  'football',
  'tournament',
  '

In [31]:
test_all_words_list = []
for sublist in test_all_words:
    for item in sublist:
        test_all_words_list.append(item)

In [32]:
test_all_words_list

['Android',
 'is',
 'a',
 'mobile',
 'operating',
 'system',
 'developed',
 'by',
 'Google.',
 'Android',
 'has',
 'been',
 'the',
 'best-selling',
 'OS',
 'worldwide',
 'on',
 'smartphones',
 'since',
 '2011',
 'and',
 'on',
 'tablets',
 'since',
 '2013.',
 'Google',
 'and',
 'Twitter',
 'made',
 'a',
 'deal',
 'in',
 '2015',
 'that',
 'gave',
 'Google',
 'access',
 'to',
 "Twitter's",
 'firehose.',
 'Twitter',
 'is',
 'an',
 'online',
 'news',
 'and',
 'social',
 'networking',
 'service',
 'on',
 'which',
 'users',
 'post',
 'and',
 'interact',
 'with',
 'messages',
 'known',
 'as',
 'tweets.',
 'Before',
 'entering',
 'politics,',
 'Donald',
 'Trump',
 'was',
 'a',
 'domineering',
 'businessman',
 'and',
 'a',
 'television',
 'personality.',
 'The',
 '2018',
 'FIFA',
 'World',
 'Cup',
 'is',
 'the',
 '21st',
 'FIFA',
 'World',
 'Cup,',
 'an',
 'international',
 'football',
 'tournament',
 'contested',
 'once',
 'every',
 'four',
 'years.',
 'This',
 'is',
 'the',
 'first',
 'World',

In [33]:
start = time.time()
tagged_seq = Viterbi(test_all_words_list)
end = time.time()
difference = end-start

In [34]:
print("Time taken in seconds: ", difference)
print(tagged_seq)

Time taken in seconds:  9.188700914382935
[('Android', 'VERB'), ('is', 'VERB'), ('a', 'DET'), ('mobile', 'ADJ'), ('operating', 'NOUN'), ('system', 'NOUN'), ('developed', 'VERB'), ('by', 'ADP'), ('Google.', 'VERB'), ('Android', 'VERB'), ('has', 'VERB'), ('been', 'VERB'), ('the', 'DET'), ('best-selling', 'VERB'), ('OS', 'VERB'), ('worldwide', 'VERB'), ('on', 'ADP'), ('smartphones', 'VERB'), ('since', 'ADP'), ('2011', 'VERB'), ('and', 'CONJ'), ('on', 'ADP'), ('tablets', 'NOUN'), ('since', 'ADP'), ('2013.', 'VERB'), ('Google', 'VERB'), ('and', 'CONJ'), ('Twitter', 'VERB'), ('made', 'VERB'), ('a', 'DET'), ('deal', 'NOUN'), ('in', 'ADP'), ('2015', 'VERB'), ('that', 'ADP'), ('gave', 'VERB'), ('Google', 'VERB'), ('access', 'NOUN'), ('to', 'PRT'), ("Twitter's", 'VERB'), ('firehose.', 'VERB'), ('Twitter', 'VERB'), ('is', 'VERB'), ('an', 'DET'), ('online', 'VERB'), ('news', 'NOUN'), ('and', 'CONJ'), ('social', 'ADJ'), ('networking', 'NOUN'), ('service', 'NOUN'), ('on', 'ADP'), ('which', 'DET'), (

#### Viterbi Modification 1 - making emission probability as 1 for words not in vocabulary

In [40]:

def Viterbi_Mod1(words, train_bag = train_all_tuples):
    state = []
    T = list(train_uniq_tags)
    V = list(train_uniq_words)
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        
        p = [] 
        
        for tag in T:
            if key == 0:
                transition_p = tags_matrix_df.loc['.', tag]
            else:
                transition_p = tags_matrix_df.loc[state[-1], tag]

            # compute emission and state probabilities
            if word in V:
                emission_p = calculateEmissionProb(words[key], tag) 
            else:
                emission_p = 1
            state_probability = emission_p * transition_p    
            p.append(state_probability)
        
           
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [45]:
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi_Mod1(test_tagged_words)
end = time.time()
difference = end-start

In [46]:
# accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 

accuracy = len(check)/len(tagged_seq)

print ('Accuracy after first modification: ', accuracy)

incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(test_tagged_seq, test_run_base)) if j[0]!=j[1]]

print ('incorrect_tagged_cases after first modification: ', incorrect_tagged_cases)

Accuracy after first modification:  0.959349593495935
incorrect_tagged_cases after first modification:  [[('.', '.'), (('Android', 'NOUN'), ('20', 'NUM'))], [('20', 'NUM'), (('is', 'VERB'), ('billion', 'NUM'))], [('billion', 'NUM'), (('a', 'DET'), ('yen', 'NOUN'))], [('yen', 'NOUN'), (('mobile', 'ADJ'), ('of', 'ADP'))], [('of', 'ADP'), (('operating', 'NOUN'), ('6', 'NUM'))], [('6', 'NUM'), (('system', 'NOUN'), ('%', 'NOUN'))], [('%', 'NOUN'), (('developed', 'VERB'), ('Eurobonds', 'NOUN'))], [('Eurobonds', 'NOUN'), (('by', 'ADP'), ('due', 'ADJ'))], [('due', 'ADJ'), (('Google.', 'NOUN'), ('Nov.', 'NOUN'))], [('Nov.', 'NOUN'), (('Android', 'NOUN'), ('21', 'NUM'))], [('21', 'NUM'), (('has', 'VERB'), (',', '.'))], [(',', '.'), (('been', 'VERB'), ('1994', 'NUM'))], [('1994', 'NUM'), (('the', 'DET'), (',', '.'))], [(',', '.'), (('best-selling', 'NOUN'), ('priced', 'VERB'))], [('priced', 'VERB'), (('OS', 'NOUN'), ('*', 'X'))], [('*', 'X'), (('worldwide', 'NOUN'), ('at', 'ADP'))], [('at', 'ADP'

In [48]:
test_tagged_seq = Viterbi_Mod1(test_all_words_list)
print('The tag list of the test sentences are: \n', test_tagged_seq)

The tag list of the test sentences are: 
 [('Android', 'NOUN'), ('is', 'VERB'), ('a', 'DET'), ('mobile', 'ADJ'), ('operating', 'NOUN'), ('system', 'NOUN'), ('developed', 'VERB'), ('by', 'ADP'), ('Google.', 'NOUN'), ('Android', 'NOUN'), ('has', 'VERB'), ('been', 'VERB'), ('the', 'DET'), ('best-selling', 'NOUN'), ('OS', 'NOUN'), ('worldwide', 'NOUN'), ('on', 'ADP'), ('smartphones', 'NOUN'), ('since', 'ADP'), ('2011', 'NOUN'), ('and', 'CONJ'), ('on', 'ADP'), ('tablets', 'NOUN'), ('since', 'ADP'), ('2013.', 'NOUN'), ('Google', 'NOUN'), ('and', 'CONJ'), ('Twitter', 'NOUN'), ('made', 'VERB'), ('a', 'DET'), ('deal', 'NOUN'), ('in', 'ADP'), ('2015', 'NOUN'), ('that', 'ADP'), ('gave', 'VERB'), ('Google', 'X'), ('access', 'NOUN'), ('to', 'PRT'), ("Twitter's", 'VERB'), ('firehose.', 'X'), ('Twitter', 'VERB'), ('is', 'VERB'), ('an', 'DET'), ('online', 'NOUN'), ('news', 'NOUN'), ('and', 'CONJ'), ('social', 'ADJ'), ('networking', 'NOUN'), ('service', 'NOUN'), ('on', 'ADP'), ('which', 'DET'), ('users

#### Viterbi Modification 2 - adding regular expressions for words not in vocabulary

In [67]:
#Viterbi Modification 2 - adding regular expressions for words not in vocabulary


def Viterbi_Mod2(words, train_bag = train_all_tuples):
    state = []
    T = list(train_uniq_tags)
    V = list(train_uniq_words)
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        if word in V:
            p = [] 

            for tag in T:
                if key == 0:
                    transition_p = tags_matrix_df.loc['.', tag]
                else:
                    transition_p = tags_matrix_df.loc[state[-1], tag]

                # compute emission and state probabilities
                if word in V:
                    emission_p = calculateEmissionProb(words[key], tag) 
                else:
                    emission_p = 1
                state_probability = emission_p * transition_p    
                p.append(state_probability)
            pmax = max(p)
            # getting state for which probability is maximum
            state_max = T[p.index(pmax)]
        else:
            if re.match(r'.*ing$',word):
                state_max='VBG'       # gerunds
            elif re.match(r'.*ed$',word):
                state_max='VBD' # simple past
            elif re.match(r'.*es$',word):
                state_max='VBZ'
            elif re.match(r'.*ould$',word):
                state_max='MD'
            elif re.match(r'.*\'s$',word):
                state_max='NN$'
            elif re.match(r'^-?[0-9]+(.[0-9]+)?$',word):
                state_max='NUM'
            else:
                state_max='NOUN'
            
        state.append(state_max)
    return list(zip(words, state))



In [68]:
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi_Mod2(test_tagged_words)
end = time.time()
difference = end-start


In [70]:
# accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 

accuracy = len(check)/len(tagged_seq)

print ('Accuracy after second modification: ', accuracy)

incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(test_tagged_seq, test_run_base)) if j[0]!=j[1]]

print ('\n\n incorrect_tagged_cases after second modification: ', incorrect_tagged_cases)

test_tagged_seq = Viterbi_Mod2(test_all_words_list)
print('The tag list of the test sentences are: \n', test_tagged_seq)

Accuracy after second modification:  0.967479674796748


 incorrect_tagged_cases after second modification:  [[('.', '.'), (('Android', 'NOUN'), ('20', 'NUM'))], [('20', 'NUM'), (('is', 'VERB'), ('billion', 'NUM'))], [('billion', 'NUM'), (('a', 'DET'), ('yen', 'NOUN'))], [('yen', 'NOUN'), (('mobile', 'ADJ'), ('of', 'ADP'))], [('of', 'ADP'), (('operating', 'NOUN'), ('6', 'NUM'))], [('6', 'NUM'), (('system', 'NOUN'), ('%', 'NOUN'))], [('%', 'NOUN'), (('developed', 'VERB'), ('Eurobonds', 'NOUN'))], [('Eurobonds', 'NOUN'), (('by', 'ADP'), ('due', 'ADJ'))], [('due', 'ADJ'), (('Google.', 'NOUN'), ('Nov.', 'NOUN'))], [('Nov.', 'NOUN'), (('Android', 'NOUN'), ('21', 'NUM'))], [('21', 'NUM'), (('has', 'VERB'), (',', '.'))], [(',', '.'), (('been', 'VERB'), ('1994', 'NUM'))], [('1994', 'NUM'), (('the', 'DET'), (',', '.'))], [(',', '.'), (('best-selling', 'NOUN'), ('priced', 'VERB'))], [('priced', 'VERB'), (('OS', 'NOUN'), ('*', 'X'))], [('*', 'X'), (('worldwide', 'NOUN'), ('at', 'ADP'))], [('at', 

KeyError: 'the label [VBZ] is not in the [index]'