## Backoff Tagging

In [1]:
# Default tagger

from nltk.tag import DefaultTagger
tagger = DefaultTagger('NN')

print(tagger.tag('Hello World'.split()))

[('Hello', 'NN'), ('World', 'NN')]


In [2]:
# Set up train/test sents

from nltk.corpus import treebank
from nltk.tag import untag


print(f'There are {len(treebank.tagged_sents())} tagged sentences in treebank.')

train_sents = treebank.tagged_sents()[:3000]
example_sent = treebank.sents()[3000]
test_sents = treebank.tagged_sents()[3001:]

sent_slice = slice(10,12)
print(train_sents[sent_slice])
print('\n')

for i, sent in enumerate(train_sents[sent_slice]):
    print(f'Sentence {i+1}:')
#     print(f'{" ".join([word for word, _ in sent])}\n')
    print(f'{" ".join(untag(sent))}\n')


There are 3914 tagged sentences in treebank.
[[('Neither', 'DT'), ('Lorillard', 'NNP'), ('nor', 'CC'), ('the', 'DT'), ('researchers', 'NNS'), ('who', 'WP'), ('*T*-3', '-NONE-'), ('studied', 'VBD'), ('the', 'DT'), ('workers', 'NNS'), ('were', 'VBD'), ('aware', 'JJ'), ('of', 'IN'), ('any', 'DT'), ('research', 'NN'), ('on', 'IN'), ('smokers', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Kent', 'NNP'), ('cigarettes', 'NNS'), ('.', '.')], [('``', '``'), ('We', 'PRP'), ('have', 'VBP'), ('no', 'DT'), ('useful', 'JJ'), ('information', 'NN'), ('on', 'IN'), ('whether', 'IN'), ('users', 'NNS'), ('are', 'VBP'), ('at', 'IN'), ('risk', 'NN'), (',', ','), ("''", "''"), ('said', 'VBD'), ('*T*-1', '-NONE-'), ('James', 'NNP'), ('A.', 'NNP'), ('Talcott', 'NNP'), ('of', 'IN'), ('Boston', 'NNP'), ("'s", 'POS'), ('Dana-Farber', 'NNP'), ('Cancer', 'NNP'), ('Institute', 'NNP'), ('.', '.')]]


Sentence 1:
Neither Lorillard nor the researchers who *T*-3 studied the workers were aware of any research on smokers of the

In [3]:
# Unigram tagger w. training data
# Note that are also bigram, trigram, etc. taggers, but they will not prove to be useful for lemmatization

from nltk.tag import UnigramTagger
tagger = UnigramTagger(train_sents)

print(tagger.tag('Hello World'.split()))

[('Hello', None), ('World', 'NNP')]


In [4]:
print(tagger.tag(example_sent))

[('At', 'IN'), ('Tokyo', 'NNP'), (',', ','), ('the', 'DT'), ('Nikkei', None), ('index', 'NN'), ('of', 'IN'), ('225', 'CD'), ('selected', None), ('issues', 'NNS'), (',', ','), ('which', 'WDT'), ('*T*-1', '-NONE-'), ('gained', 'VBD'), ('132', None), ('points', 'NNS'), ('Tuesday', 'NNP'), (',', ','), ('added', 'VBD'), ('14.99', None), ('points', 'NNS'), ('to', 'TO'), ('35564.43', None), ('.', '.')]


In [5]:
print(tagger.evaluate(test_sents))

0.8572231104965213


In [6]:
# Unigram Tagger w. dictionary

tagger = UnigramTagger(model={'Nikkei': 'NNP', 'selected': 'VBN'})

print(tagger.tag(example_sent))
print('\n')
print(tagger.evaluate(test_sents))

[('At', None), ('Tokyo', None), (',', None), ('the', None), ('Nikkei', 'NNP'), ('index', None), ('of', None), ('225', None), ('selected', 'VBN'), ('issues', None), (',', None), ('which', None), ('*T*-1', None), ('gained', None), ('132', None), ('points', None), ('Tuesday', None), (',', None), ('added', None), ('14.99', None), ('points', None), ('to', None), ('35564.43', None), ('.', None)]


8.642668856142777e-05


In [7]:
# Backoff tagging

backoff_tagger = DefaultTagger('NN')
tagger = UnigramTagger(train_sents, backoff=backoff_tagger)

print(tagger.tag(example_sent))
print('\n')
print(tagger.evaluate(test_sents))

[('At', 'IN'), ('Tokyo', 'NNP'), (',', ','), ('the', 'DT'), ('Nikkei', 'NN'), ('index', 'NN'), ('of', 'IN'), ('225', 'CD'), ('selected', 'NN'), ('issues', 'NNS'), (',', ','), ('which', 'WDT'), ('*T*-1', '-NONE-'), ('gained', 'VBD'), ('132', 'NN'), ('points', 'NNS'), ('Tuesday', 'NNP'), (',', ','), ('added', 'VBD'), ('14.99', 'NN'), ('points', 'NNS'), ('to', 'TO'), ('35564.43', 'NN'), ('.', '.')]


0.8742059547988419


In [8]:
# Regex tagging

from nltk.tag import RegexpTagger

patterns = [
    (r'\b\d+\b', 'CD'),
    (r'\b.+ed\b', 'VBD')
]

tagger = RegexpTagger(patterns)

print(tagger.tag(example_sent))
print('\n')
print(tagger.evaluate(test_sents))

[('At', None), ('Tokyo', None), (',', None), ('the', None), ('Nikkei', None), ('index', None), ('of', None), ('225', 'CD'), ('selected', 'VBD'), ('issues', None), (',', None), ('which', None), ('*T*-1', None), ('gained', 'VBD'), ('132', 'CD'), ('points', None), ('Tuesday', None), (',', None), ('added', 'VBD'), ('14.99', 'CD'), ('points', None), ('to', None), ('35564.43', 'CD'), ('.', None)]


0.053584546908085215


In [9]:
# Another backoff chain

# default_tagger = DefaultTagger('NN')
default_tagger = None
train_tagger = UnigramTagger(train_sents, backoff=default_tagger)
dict_tagger = UnigramTagger(model={'Nikkei': 'NNP', 'selected': 'VBN'}, backoff=train_tagger)
tagger = dict_tagger

from pprint import pprint

pprint(tagger.tag(example_sent))
print('\n')
print(tagger.evaluate(test_sents))

[('At', 'IN'),
 ('Tokyo', 'NNP'),
 (',', ','),
 ('the', 'DT'),
 ('Nikkei', 'NNP'),
 ('index', 'NN'),
 ('of', 'IN'),
 ('225', 'CD'),
 ('selected', 'VBN'),
 ('issues', 'NNS'),
 (',', ','),
 ('which', 'WDT'),
 ('*T*-1', '-NONE-'),
 ('gained', 'VBD'),
 ('132', None),
 ('points', 'NNS'),
 ('Tuesday', 'NNP'),
 (',', ','),
 ('added', 'VBD'),
 ('14.99', None),
 ('points', 'NNS'),
 ('to', 'TO'),
 ('35564.43', None),
 ('.', '.')]


0.8573095371850827


## Lemmatization as a backoff task