## POS tagging using modified Viterbi

### Data Preparation

In [1]:
#Importing libraries
import nltk
import time
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [5]:
# Splitting into train and test
random.seed(1234)
train_set, test_set = train_test_split(nltk_data,test_size=0.05)

print(len(train_set))
print(len(test_set))
print(train_set[:40])

3718
196
[[(u'*-1', u'X'), (u'Taking', u'VERB'), (u'a', u'DET'), (u'cue', u'NOUN'), (u'from', u'ADP'), (u'California', u'NOUN'), (u',', u'.'), (u'more', u'ADJ'), (u'politicians', u'NOUN'), (u'will', u'VERB'), (u'launch', u'VERB'), (u'their', u'PRON'), (u'campaigns', u'NOUN'), (u'by', u'ADP'), (u'*-1', u'X'), (u'backing', u'VERB'), (u'initiatives', u'NOUN'), (u',', u'.'), (u'says', u'VERB'), (u'0', u'X'), (u'*T*-2', u'X'), (u'David', u'NOUN'), (u'Magleby', u'NOUN'), (u'of', u'ADP'), (u'Brigham', u'NOUN'), (u'Young', u'NOUN'), (u'University', u'NOUN'), (u'.', u'.')], [(u'And', u'CONJ'), (u'several', u'ADJ'), (u'new', u'ADJ'), (u'funds', u'NOUN'), (u'that', u'DET'), (u'*T*-44', u'X'), (u'are', u'VERB'), (u"n't", u'ADV'), (u'even', u'ADV'), (u'fully', u'ADV'), (u'invested', u'VERB'), (u'yet', u'ADV'), (u'have', u'VERB'), (u'jumped', u'VERB'), (u'*-1', u'X'), (u'to', u'PRT'), (u'trade', u'VERB'), (u'at', u'ADP'), (u'big', u'ADJ'), (u'premiums', u'NOUN'), (u'.', u'.')], [(u'Failure', u'NOUN'

In [6]:
# Getting list of tagged words
train_tagged_words = [tup for sent in train_set for tup in sent]
len(train_tagged_words)

95498

In [7]:
# tokens 
tokens = [pair[0] for pair in train_tagged_words]
tokens[:10]

[u'*-1',
 u'Taking',
 u'a',
 u'cue',
 u'from',
 u'California',
 u',',
 u'more',
 u'politicians',
 u'will']

In [8]:
# vocabulary
V = set(tokens)
print(len(V))

12019


In [9]:
# number of tags
T = set([pair[1] for pair in train_tagged_words])
len(T)

12

In [10]:
# number of tags
tags = [pair[1] for pair in train_tagged_words]
unique_tags = set(tags)
len(unique_tags)

12

In [11]:
print(unique_tags)

set([u'ADV', u'NOUN', u'NUM', u'ADP', u'PRT', u'DET', u'.', u'PRON', u'VERB', u'X', u'CONJ', u'ADJ'])


In [12]:
from collections import Counter
tag_counts = Counter(tags)
tag_counts

Counter({u'.': 11052,
         u'ADJ': 6087,
         u'ADP': 9364,
         u'ADV': 3015,
         u'CONJ': 2149,
         u'DET': 8298,
         u'NOUN': 27331,
         u'NUM': 3374,
         u'PRON': 2603,
         u'PRT': 3054,
         u'VERB': 12897,
         u'X': 6274})

In [13]:
tag_counts.most_common(5)

[(u'NOUN', 27331),
 (u'VERB', 12897),
 (u'.', 11052),
 (u'ADP', 9364),
 (u'DET', 8298)]

### Build the vanilla Viterbi based POS tagger

In [14]:
# compute word given tag: Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
    #print(count_w_given_tag, count_tag)
    return (count_w_given_tag, count_tag)

In [15]:
# compute tag given tag: tag2(t2) given tag1 (t1), i.e. Transition Probability

def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [16]:
# creating t x t transition matrix of tags
# each column is t2, each row is t1
# thus M(i, j) represents P(tj given ti)

tags_matrix = np.zeros((len(T), len(T)), dtype='float32')
for i, t1 in enumerate(list(T)):
    for j, t2 in enumerate(list(T)): 
        #print(t2_given_t1(t2, t1)[0])
        #print(t2_given_t1(t2, t1)[1])
        tags_matrix[i, j] = (float(t2_given_t1(t2, t1)[0])/float(t2_given_t1(t2, t1)[1]))
        #print("###########")
        #print(tags_matrix[i, j])

In [17]:
# convert the matrix to a df for better readability
tags_df = pd.DataFrame(tags_matrix, columns = list(T), index=list(T))

In [18]:
tags_df

Unnamed: 0,ADV,NOUN,NUM,ADP,PRT,DET,.,PRON,VERB,X,CONJ,ADJ
ADV,0.07927,0.031841,0.029851,0.119403,0.014262,0.067993,0.13665,0.015589,0.348259,0.022886,0.007297,0.1267
NOUN,0.017343,0.264864,0.00955,0.176905,0.044089,0.013355,0.237715,0.004793,0.147086,0.029051,0.042772,0.012477
NUM,0.002964,0.354475,0.186426,0.034973,0.02786,0.00326,0.117961,0.001482,0.017783,0.20658,0.013041,0.033195
ADP,0.013669,0.321017,0.062687,0.017407,0.001388,0.323473,0.040154,0.068988,0.008543,0.03428,0.000854,0.10754
PRT,0.010151,0.249181,0.057957,0.020629,0.001965,0.100851,0.043877,0.017027,0.398821,0.013752,0.001637,0.084152
DET,0.012413,0.637383,0.022295,0.009159,0.000241,0.005785,0.017836,0.003495,0.039769,0.045915,0.000482,0.20523
.,0.052117,0.217427,0.081162,0.091567,0.002443,0.175443,0.092924,0.066685,0.089577,0.027506,0.058089,0.044969
PRON,0.03496,0.208221,0.006915,0.021129,0.012294,0.009988,0.041106,0.007299,0.485209,0.094506,0.005378,0.072993
VERB,0.081492,0.109483,0.023261,0.091417,0.03148,0.134062,0.035124,0.035435,0.168954,0.217803,0.005583,0.065907
X,0.025661,0.060089,0.002869,0.145521,0.183615,0.054989,0.16417,0.055467,0.206726,0.073956,0.010041,0.016895


In [19]:
# Viterbi Heuristic
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = float(word_given_tag(words[key], tag)[0])/float(word_given_tag(words[key], tag)[1])
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        #print(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return zip(words, state)



In [20]:
# Running on entire test dataset would take more than 3-4hrs. 
# Let's test our Viterbi algorithm on a few sample sentences of test dataset

random.seed(1234)

# choose random 5 sents
rndom = [random.randint(1,len(test_set)) for x in range(5)]

# list of sents
test_run = [test_set[i] for i in rndom]

# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]

# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]
test_run

[[(u'Securities', u'NOUN'),
  (u'firms', u'NOUN'),
  (u'are', u'VERB'),
  (u'among', u'ADP'),
  (u'the', u'DET'),
  (u'biggest', u'ADJ'),
  (u'issuers', u'NOUN'),
  (u'of', u'ADP'),
  (u'commercial', u'ADJ'),
  (u'paper', u'NOUN'),
  (u',', u'.'),
  (u'or', u'CONJ'),
  (u'short-term', u'ADJ'),
  (u'corporate', u'ADJ'),
  (u'IOUs', u'NOUN'),
  (u',', u'.'),
  (u'which', u'DET'),
  (u'they', u'PRON'),
  (u'sell', u'VERB'),
  (u'*T*-1', u'X'),
  (u'*-2', u'X'),
  (u'to', u'PRT'),
  (u'finance', u'VERB'),
  (u'their', u'PRON'),
  (u'daily', u'ADJ'),
  (u'operations', u'NOUN'),
  (u'.', u'.')],
 [(u'Commonwealth', u'NOUN'),
  (u'Edison', u'NOUN'),
  (u'Co.', u'NOUN'),
  (u'was', u'VERB'),
  (u'ordered', u'VERB'),
  (u'*-1', u'X'),
  (u'*-2', u'X'),
  (u'to', u'PRT'),
  (u'refund', u'VERB'),
  (u'about', u'ADP'),
  (u'$', u'.'),
  (u'250', u'NUM'),
  (u'million', u'NUM'),
  (u'*U*', u'X'),
  (u'to', u'PRT'),
  (u'its', u'PRON'),
  (u'current', u'ADJ'),
  (u'and', u'CONJ'),
  (u'former', u'AD

In [21]:
test_tagged_words

[u'Securities',
 u'firms',
 u'are',
 u'among',
 u'the',
 u'biggest',
 u'issuers',
 u'of',
 u'commercial',
 u'paper',
 u',',
 u'or',
 u'short-term',
 u'corporate',
 u'IOUs',
 u',',
 u'which',
 u'they',
 u'sell',
 u'*T*-1',
 u'*-2',
 u'to',
 u'finance',
 u'their',
 u'daily',
 u'operations',
 u'.',
 u'Commonwealth',
 u'Edison',
 u'Co.',
 u'was',
 u'ordered',
 u'*-1',
 u'*-2',
 u'to',
 u'refund',
 u'about',
 u'$',
 u'250',
 u'million',
 u'*U*',
 u'to',
 u'its',
 u'current',
 u'and',
 u'former',
 u'ratepayers',
 u'for',
 u'illegal',
 u'rates',
 u'collected',
 u'*',
 u'for',
 u'cost',
 u'overruns',
 u'on',
 u'a',
 u'nuclear',
 u'power',
 u'plant',
 u'.',
 u'Bribe',
 u'by',
 u'bribe',
 u',',
 u'Mr.',
 u'Sternberg',
 u'and',
 u'his',
 u'co-author',
 u',',
 u'Matthew',
 u'C.',
 u'Harrison',
 u'Jr.',
 u',',
 u'lead',
 u'us',
 u'along',
 u'the',
 u'path',
 u'0',
 u'Wedtech',
 u'traveled',
 u'*T*-1',
 u',',
 u'from',
 u'its',
 u'inception',
 u'as',
 u'a',
 u'small',
 u'manufacturing',
 u'company',

In [22]:
# tagging the test sentences
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end-start

In [23]:
print("Time taken in seconds: ", difference)
print(tagged_seq)
#print(test_run_base)

('Time taken in seconds: ', 49.21199989318848)
[(u'Securities', u'NOUN'), (u'firms', u'NOUN'), (u'are', u'VERB'), (u'among', u'ADP'), (u'the', u'DET'), (u'biggest', u'ADJ'), (u'issuers', u'ADV'), (u'of', u'ADP'), (u'commercial', u'ADJ'), (u'paper', u'NOUN'), (u',', u'.'), (u'or', u'CONJ'), (u'short-term', u'ADJ'), (u'corporate', u'ADJ'), (u'IOUs', u'ADV'), (u',', u'.'), (u'which', u'DET'), (u'they', u'PRON'), (u'sell', u'VERB'), (u'*T*-1', u'X'), (u'*-2', u'X'), (u'to', u'PRT'), (u'finance', u'VERB'), (u'their', u'PRON'), (u'daily', u'ADJ'), (u'operations', u'NOUN'), (u'.', u'.'), (u'Commonwealth', u'NOUN'), (u'Edison', u'NOUN'), (u'Co.', u'NOUN'), (u'was', u'VERB'), (u'ordered', u'VERB'), (u'*-1', u'X'), (u'*-2', u'X'), (u'to', u'PRT'), (u'refund', u'VERB'), (u'about', u'ADP'), (u'$', u'.'), (u'250', u'NUM'), (u'million', u'NUM'), (u'*U*', u'X'), (u'to', u'PRT'), (u'its', u'PRON'), (u'current', u'ADJ'), (u'and', u'CONJ'), (u'former', u'ADJ'), (u'ratepayers', u'ADV'), (u'for', u'ADP'),

In [24]:
# accuracy of vanilla viterbi
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j]

In [25]:
#Accuracy of vanilla viterbi
accuracy = float(len(check))/float(len(tagged_seq))
accuracy

0.89937106918239

In [26]:
#Incorrect tagged cases by vanilla viterbi
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq, test_run_base)) if j[0]!=j[1]]
incorrect_tagged_cases



[[(u'biggest', u'ADJ'), ((u'issuers', u'ADV'), (u'issuers', u'NOUN'))],
 [(u'corporate', u'ADJ'), ((u'IOUs', u'ADV'), (u'IOUs', u'NOUN'))],
 [(u'former', u'ADJ'), ((u'ratepayers', u'ADV'), (u'ratepayers', u'NOUN'))],
 [(u'cost', u'NOUN'), ((u'overruns', u'ADV'), (u'overruns', u'NOUN'))],
 [(u'.', u'.'), ((u'Bribe', u'ADV'), (u'Bribe', u'NOUN'))],
 [(u'by', u'ADP'), ((u'bribe', u'ADV'), (u'bribe', u'NOUN'))],
 [(u',', u'.'), ((u'Matthew', u'ADV'), (u'Matthew', u'NOUN'))],
 [(u'C.', u'NOUN'), ((u'Harrison', u'ADV'), (u'Harrison', u'NOUN'))],
 [(u'the', u'DET'), ((u'path', u'ADV'), (u'path', u'NOUN'))],
 [(u'Wedtech', u'NOUN'), ((u'traveled', u'ADV'), (u'traveled', u'VERB'))],
 [(u'its', u'PRON'), ((u'inception', u'ADV'), (u'inception', u'NOUN'))],
 [(u'small', u'ADJ'),
  ((u'manufacturing', u'NOUN'), (u'manufacturing', u'VERB'))],
 [(u'of', u'ADP'), ((u'full-fledged', u'ADV'), (u'full-fledged', u'ADJ'))],
 [(u'producing', u'VERB'), ((u'vital', u'ADV'), (u'vital', u'ADJ'))],
 [(u'immediat

In [27]:
#No. of incorrectly tagged cases
len(incorrect_tagged_cases)

16

### Solve the problem of unknown words

# 1. Using a combined lexicon based tagger and a ruled based tagger 

#Approach:

The Lexicon tagger will be backed up by rule based tagger as per rules mentioned below.
Since the majority occurences ~27k are for NOUN pos, any unknown word will be tagged as NOUN if no other rule is satisfied.

In [28]:
# Based on the observation of the POS tags incorrectly classified by the Viterbi Model, the following rules can be brought in use.
# - A lot of verbs are missclassified as ADV ending in 'ing' or 'ed'
# - A lot of other nouns ending in 's' are misclassified as ADV
# - Numbers are clearly misclassified as ADV instead of NUM.

patterns = [
    (r'.*ing$', 'VERB'),              # gerund
    (r'.*ed$', 'VERB'),               # past tense
    (r'.*s$', 'NOUN'),                
    (r'.*', 'NOUN'),                   
    (r'^-?[0-9]+(.[0-9]+)?$', 'NUM')
]

In [29]:
# rule based tagger
rule_based_tagger = nltk.RegexpTagger(patterns)

# lexicon backed up by the rule-based tagger
lexicon_tagger = nltk.UnigramTagger(train_set, backoff=rule_based_tagger)

lexicon_tagger.evaluate(test_run)

0.9685534591194969

Clearly there is a lift of 7% as the new model shows an accuracy of ~97% while the vanilla Viterbi shows an accuracy of just 90%.

# 2. Using Stochastic Parsing technique

#Approach: 

Using the probabilities(Transition and emission) to find the state of the word. In case the word is present in corpus, the state probability = transition_p * emission_p
In case the word is out of corpus, the state probability = transition probability as the emission probability of the tag is not present.


In [31]:
# Viterbi Heuristic part 2 (modified approach)

def Viterbi_2(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    W = list(set([pair[0] for pair in train_bag]))
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = float(word_given_tag(words[key], tag)[0])/float(word_given_tag(words[key], tag)[1])
            print("eission out", emission_p )
            
            if word in W:
                
                #For words present in corpus state prob is prod of emission and trasition as both exists
                print("I am there")
                emission_p = float(word_given_tag(words[key], tag)[0])/float(word_given_tag(words[key], tag)[1])
                print("emission", emission_p)
                state_probability = emission_p * transition_p
            else:
                #For words not present in corpus, emission probability of tag is not present 
                print("I am not there")
                #state_probability = emission_p
                state_probability = transition_p
            p.append(state_probability)
            
        pmax = max(p)
        #print(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))


In [32]:
start = time.time()
tagged_seq = Viterbi_2(test_tagged_words)
end = time.time()
difference = end-start

('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0009513007207932384)
I am there
('emission', 0.0009513007207932384)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0011708316563609088)
I am there
('emission', 0.0011708316563609088)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am th

('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.003942828979793002)
I am there
('emission', 0.003942828979793002)
('eission out', 0.0)
I am not there
('eission out', 0.0)
I am not there
('eission out', 0.0)
I am not there
('eission out', 0.0)
I am not there
('eission out', 0.0)
I am not there
('eission out', 0.0)
I am not there
('eission out', 0.0)
I am not there
('eission out', 0.0)
I am not there
('eission out', 0.0)
I am not there
('eission out', 0.0)
I am not there
('eission out', 0.0)
I am not there
('eission out', 0.0)
I am not there
('eission out', 0.0)
I am there
('

('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.32935215345638796)
I am there
('emission', 0.32935215345638796)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0004390618711353408)
I am there
('emission', 0.0004390618711353408)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there


('eission out', 0.10966212211025489)
I am there
('emission', 0.10966212211025489)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.

('eission out', 0.0003316749585406302)
I am there
('emission', 0.0003316749585406302)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.08361811191798377)
I am there
('emission', 0.08361811191798377)
('eission out', 0.0003274394237066143)
I am there
('emission', 0.0003274394237066143)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.000365884892612784)
I am there
('emission', 0.000365884892612784)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission ou

('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 3.65884892612784e-05)
I am there
('emission', 3.65884892612784e-05)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0

('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 3.65884892612784e-05)
I am there
('emission', 3.65884892612784e-05)
('eission out', 0.0002963841138114997)
I am there
('emission', 0.0002963841138114997)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.4619185345866474)
I am there
('emission', 0.4619185345866474)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.000821422704123542)
I am there
('emission', 0.000821422704123542)
('eission out', 0.0)
I am not there
('eission out', 0.0)
I am not there
('eission out', 0.0)
I am not there
('eission out', 0.0)
I am not there
('ei

('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0005427618826083586)
I am there
('emission', 0.0005427618826083586)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.009073945336797043)
I am there
('emission', 0.009073945336797043)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am ther

('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.000821422704123542)
I am there
('emission', 0.000821422704123542)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 7.31769785225568e-05)
I am there
('emission', 7.31769785225568e-05)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there


('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.001152516327314637)
I am there
('emission', 0.001152516327314637)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0001642845408247084)
I am there
('emission', 0.0

('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0011630611770179111)
I am there
('emission', 0.0011630611770179111)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0002927079140902272)
I am there
('emission', 0.0002927079140902272)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am th

('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 7.31769785225568e-05)
I am there
('emission', 7.31769785225568e-05)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.00010679196924391286)
I am there
('emission', 0.00010679196924391286)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.6621684504420661)
I am there
('emission', 0.6621684504420661)
('eission out', 0.0004928536224741252)
I am there
('emission', 0.0004928536224741252)
('eission out', 0.0)
I am there
('emission',

('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.12209116990755499)
I am there
('emission', 0.12209116990755499)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.0)
I am there
('emission', 0.0)
('eission out', 0.32935215345638796)
I am there
('emission', 0.32935215345638796)
('eission out', 0.0)
I am there
('em

In [33]:
print("Time taken in seconds: ", difference)
print(tagged_seq)
#print(test_run_base)

('Time taken in seconds: ', 101.47399997711182)
[(u'Securities', u'NOUN'), (u'firms', u'NOUN'), (u'are', u'VERB'), (u'among', u'ADP'), (u'the', u'DET'), (u'biggest', u'ADJ'), (u'issuers', u'NOUN'), (u'of', u'ADP'), (u'commercial', u'ADJ'), (u'paper', u'NOUN'), (u',', u'.'), (u'or', u'CONJ'), (u'short-term', u'ADJ'), (u'corporate', u'ADJ'), (u'IOUs', u'NOUN'), (u',', u'.'), (u'which', u'DET'), (u'they', u'PRON'), (u'sell', u'VERB'), (u'*T*-1', u'X'), (u'*-2', u'X'), (u'to', u'PRT'), (u'finance', u'VERB'), (u'their', u'PRON'), (u'daily', u'ADJ'), (u'operations', u'NOUN'), (u'.', u'.'), (u'Commonwealth', u'NOUN'), (u'Edison', u'NOUN'), (u'Co.', u'NOUN'), (u'was', u'VERB'), (u'ordered', u'VERB'), (u'*-1', u'X'), (u'*-2', u'X'), (u'to', u'PRT'), (u'refund', u'VERB'), (u'about', u'ADP'), (u'$', u'.'), (u'250', u'NUM'), (u'million', u'NUM'), (u'*U*', u'X'), (u'to', u'PRT'), (u'its', u'PRON'), (u'current', u'ADJ'), (u'and', u'CONJ'), (u'former', u'ADJ'), (u'ratepayers', u'NOUN'), (u'for', u'AD

In [34]:
# accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j]

In [38]:
#Correctly classified cases
check

[(u'Securities', u'NOUN'),
 (u'firms', u'NOUN'),
 (u'are', u'VERB'),
 (u'among', u'ADP'),
 (u'the', u'DET'),
 (u'biggest', u'ADJ'),
 (u'issuers', u'NOUN'),
 (u'of', u'ADP'),
 (u'commercial', u'ADJ'),
 (u'paper', u'NOUN'),
 (u',', u'.'),
 (u'or', u'CONJ'),
 (u'short-term', u'ADJ'),
 (u'corporate', u'ADJ'),
 (u'IOUs', u'NOUN'),
 (u',', u'.'),
 (u'which', u'DET'),
 (u'they', u'PRON'),
 (u'sell', u'VERB'),
 (u'*T*-1', u'X'),
 (u'*-2', u'X'),
 (u'to', u'PRT'),
 (u'finance', u'VERB'),
 (u'their', u'PRON'),
 (u'daily', u'ADJ'),
 (u'operations', u'NOUN'),
 (u'.', u'.'),
 (u'Commonwealth', u'NOUN'),
 (u'Edison', u'NOUN'),
 (u'Co.', u'NOUN'),
 (u'was', u'VERB'),
 (u'ordered', u'VERB'),
 (u'*-1', u'X'),
 (u'*-2', u'X'),
 (u'to', u'PRT'),
 (u'refund', u'VERB'),
 (u'about', u'ADP'),
 (u'$', u'.'),
 (u'250', u'NUM'),
 (u'million', u'NUM'),
 (u'*U*', u'X'),
 (u'to', u'PRT'),
 (u'its', u'PRON'),
 (u'current', u'ADJ'),
 (u'and', u'CONJ'),
 (u'former', u'ADJ'),
 (u'ratepayers', u'NOUN'),
 (u'for', u'ADP

In [35]:
#Accuracy of the model
accuracy = float(len(check))/float(len(tagged_seq))
accuracy

0.9622641509433962

In [36]:
#Incorrect tagged cases by modified POS tagger
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq, test_run_base)) if j[0]!=j[1]]
incorrect_tagged_cases


[[(u'by', u'ADP'), ((u'bribe', u'DET'), (u'bribe', u'NOUN'))],
 [(u'Wedtech', u'NOUN'), ((u'traveled', u'NOUN'), (u'traveled', u'VERB'))],
 [(u'its', u'PRON'), ((u'inception', u'VERB'), (u'inception', u'NOUN'))],
 [(u'small', u'ADJ'),
  ((u'manufacturing', u'NOUN'), (u'manufacturing', u'VERB'))],
 [(u'of', u'ADP'), ((u'full-fledged', u'DET'), (u'full-fledged', u'ADJ'))],
 [(u'producing', u'VERB'), ((u'vital', u'X'), (u'vital', u'ADJ'))]]

In [37]:
#No. of incorrect cases
len(incorrect_tagged_cases)

6

#### Evaluating tagging accuracy

### Compare the tagging accuracies of the modifications with the vanilla Viterbi algorithm

### Cases which were incorrectly tagged by original POS tagger and got corrected by the modifications