# Project 5 - Build your own POS Tagger

In [1]:
sentence = 'This NLP Workshop is being organized by Analytics Vidhya as part of the DataHack Summit 2018'
sentence

'This NLP Workshop is being organized by Analytics Vidhya as part of the DataHack Summit 2018'

In [2]:
import nltk
import pandas as pd

sentence_tokens = nltk.word_tokenize(sentence)
nltk_pos_tagged = nltk.pos_tag(sentence_tokens)
pd.DataFrame(nltk_pos_tagged, 
             columns=['Word', 'POS tag']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
Word,This,NLP,Workshop,is,being,organized,by,Analytics,Vidhya,as,part,of,the,DataHack,Summit,2018
POS tag,DT,NNP,NNP,VBZ,VBG,VBN,IN,NNS,NNP,IN,NN,IN,DT,NNP,NNP,CD


In [3]:
from nltk.corpus import treebank

data = treebank.tagged_sents()
train_data = data[:3500]
test_data = data[3500:]

len(train_data), len(test_data)

(3500, 414)

# Default Tagger

In [4]:
# default tagger
from nltk.tag import DefaultTagger
dt = DefaultTagger('NN')
dt

<DefaultTagger: tag=NN>

In [5]:
# accuracy on test data
dt.evaluate(test_data)

0.1454158195372253

In [6]:
# tagging our sample sentence
pos_tagged = dt.tag(nltk.word_tokenize(sentence))
pd.DataFrame(pos_tagged, 
             columns=['Word', 'POS tag']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
Word,This,NLP,Workshop,is,being,organized,by,Analytics,Vidhya,as,part,of,the,DataHack,Summit,2018
POS tag,NN,NN,NN,NN,NN,NN,NN,NN,NN,NN,NN,NN,NN,NN,NN,NN


# Regular Expression based Tagger

In [7]:
# regex tagger
from nltk.tag import RegexpTagger

# define regex tag patterns
patterns = [
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # simple past
        (r'.*es$', 'VBZ'),                # 3rd singular present
        (r'.*ould$', 'MD'),               # modals
        (r'.*\'s$', 'NN$'),               # possessive nouns
        (r'.*s$', 'NNS'),                 # plural nouns
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')                     # nouns (default) ... 
]

rt = RegexpTagger(patterns)
rt

<Regexp Tagger: size=8>

In [8]:
# accuracy on test data
rt.evaluate(test_data)

0.24039113176493368

In [9]:
# tagging our sample sentence
pos_tagged = rt.tag(nltk.word_tokenize(sentence))
pd.DataFrame(pos_tagged, 
             columns=['Word', 'POS tag']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
Word,This,NLP,Workshop,is,being,organized,by,Analytics,Vidhya,as,part,of,the,DataHack,Summit,2018
POS tag,NNS,NN,NN,NNS,VBG,VBD,NN,NNS,NN,NNS,NN,NN,NN,NN,NN,CD


# N-gram Taggers

In [10]:
## N gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)
tt

<TrigramTagger: size=41616>

## Your Turn: Evaluate the performance of each of these taggers on the test data

In [11]:
print('Unigram Tagger Performance: {}\nBigram Tagger Performance: {}\nTrigram Tagger Performance: {}'.format(
                                                                                               ut.evaluate(test_data),
                                                                                               bt.evaluate(test_data),
                                                                                               tt.evaluate(test_data))) 

Unigram Tagger Performance: 0.8607803272340013
Bigram Tagger Performance: 0.13466937748087907
Trigram Tagger Performance: 0.08064672281924679


# Combining Taggers with Backoffs

In [12]:
ct1 = UnigramTagger(train=train_data, backoff=rt)
ct1

<UnigramTagger: size=7326>

In [13]:
ct1.evaluate(test_data)

0.9008616516603737

In [14]:
ct2 = BigramTagger(train=train_data, backoff=ct1)
ct2

<BigramTagger: size=2121>

In [15]:
ct2.evaluate(test_data)

0.9103495014038145

## Your Turn: Build a tri-gram tagger using the previous chained tagger as backoff 

You know the drill! Leverage the `TrigramTagger and use the previous built tagger as the backoff  and also test the performance on the test data.

Then check the pos tags on our sample sentence.

In [16]:
ct3 = TrigramTagger(train=train_data, backoff=ct2)
ct3

<TrigramTagger: size=975>

In [17]:
ct3.evaluate(test_data)

0.9094781682641108

In [18]:
# tagging our sample sentence
pos_tagged = ct3.tag(nltk.word_tokenize(sentence))
pd.DataFrame(pos_tagged, 
             columns=['Word', 'POS tag']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
Word,This,NLP,Workshop,is,being,organized,by,Analytics,Vidhya,as,part,of,the,DataHack,Summit,2018
POS tag,DT,NN,NN,VBZ,VBG,JJ,IN,NNS,NN,IN,NN,IN,DT,NN,NN,CD


# Supervised Learning for POS Tagging

In [19]:
from nltk.classify import NaiveBayesClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger

nbt = ClassifierBasedPOSTagger(train=train_data,
                               classifier_builder=NaiveBayesClassifier.train)
nbt

<ClassifierBasedTagger: <nltk.classify.naivebayes.NaiveBayesClassifier object at 0x0000023DAC385208>>

## Your Turn: Test the performance of the model and tag the sample sentence

In [20]:
nbt.evaluate(test_data)

0.9306806079969019

In [21]:
# tagging our sample sentence
pos_tagged = nbt.tag(nltk.word_tokenize(sentence))
pd.DataFrame(pos_tagged, 
             columns=['Word', 'POS tag']).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
Word,This,NLP,Workshop,is,being,organized,by,Analytics,Vidhya,as,part,of,the,DataHack,Summit,2018
POS tag,DT,NN,WRB,VBZ,VBG,VBN,IN,NNS,NNPS,IN,NN,IN,DT,NNP,NN,CD
