## POS tagging using modified Viterbi

### Data Preparation

In [1]:
#Importing libraries
import nltk
import numpy as np
import pandas as pd

In [2]:
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [3]:
nltk_data[:2]

[[('Pierre', 'NOUN'),
  ('Vinken', 'NOUN'),
  (',', '.'),
  ('61', 'NUM'),
  ('years', 'NOUN'),
  ('old', 'ADJ'),
  (',', '.'),
  ('will', 'VERB'),
  ('join', 'VERB'),
  ('the', 'DET'),
  ('board', 'NOUN'),
  ('as', 'ADP'),
  ('a', 'DET'),
  ('nonexecutive', 'ADJ'),
  ('director', 'NOUN'),
  ('Nov.', 'NOUN'),
  ('29', 'NUM'),
  ('.', '.')],
 [('Mr.', 'NOUN'),
  ('Vinken', 'NOUN'),
  ('is', 'VERB'),
  ('chairman', 'NOUN'),
  ('of', 'ADP'),
  ('Elsevier', 'NOUN'),
  ('N.V.', 'NOUN'),
  (',', '.'),
  ('the', 'DET'),
  ('Dutch', 'NOUN'),
  ('publishing', 'VERB'),
  ('group', 'NOUN'),
  ('.', '.')]]

In [4]:
tagged_words = [ word for sent in nltk_data for word in sent ]

In [5]:
tagged_words[:2]

[('Pierre', 'NOUN'), ('Vinken', 'NOUN')]

In [6]:
len(tagged_words)

100676

**Checking the unique tags**

In [7]:
tags = [ tag[1] for tag in tagged_words]
set(tags)

{'.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X'}

In [8]:
from collections import Counter
tag_counts = Counter(tags)
print(tag_counts)
print('\n')
print(tag_counts.most_common)

Counter({'NOUN': 28867, 'VERB': 13564, '.': 11715, 'ADP': 9857, 'DET': 8725, 'X': 6613, 'ADJ': 6397, 'NUM': 3546, 'PRT': 3219, 'ADV': 3171, 'PRON': 2737, 'CONJ': 2265})


<bound method Counter.most_common of Counter({'NOUN': 28867, 'VERB': 13564, '.': 11715, 'ADP': 9857, 'DET': 8725, 'X': 6613, 'ADJ': 6397, 'NUM': 3546, 'PRT': 3219, 'ADV': 3171, 'PRON': 2737, 'CONJ': 2265})>


In [9]:
verb_tagged_words = [ word[0] for word in tagged_words if word[1]=='VERB' ]
words_ending_ing_ed = [word for word in verb_tagged_words if word.endswith('ed') or word.endswith('ing')]
print(len(words_ending_ing_ed)/len(verb_tagged_words))

0.3154674137422589


In [10]:
tokens = [word[0] for word in tagged_words]
vocab = set(tokens)

### Build the vanilla Viterbi based POS tagger

In [13]:
# t X v matrix
t = len(tags)
v = len(vocab)
print("Tag length : ",t)
print("Vocab length : ",v)
t_v_mat = np.zeros((t,v)) 

Tag length :  100676
Vocab length :  12408


In [18]:
import random
from sklearn.model_selection import train_test_split

random.seed(1234)
train_set, val_set = train_test_split(nltk_data,test_size=0.05)
print(len(train_set))
print(len(val_set))

3718
196


In [14]:
# Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
    
    return (count_w_given_tag, count_tag)

NameError: name 'train_tagged_words' is not defined

In [None]:
# Transition Probability

def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

### Solve the problem of unknown words

### Evaluating tagging accuracy

### Compare the tagging accuracies of the modifications with the vanilla Viterbi algorithm

### List down cases which were incorrectly tagged by original POS tagger and got corrected by your modifications