In [None]:
https://www.nltk.org/howto/tag.html

In [6]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('treebank')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [25]:
from nltk.tag import PerceptronTagger
from nltk.corpus import treebank
tagger = PerceptronTagger()
gold_data = treebank.tagged_sents()[10:20]
print(tagger.accuracy(gold_data))

0.8859315589353612


In [8]:
from nltk.tbl.template import Template
from nltk.tag.brill import Pos, Word
from nltk.tag import untag, RegexpTagger, BrillTaggerTrainer, UnigramTagger

In [9]:
from nltk.corpus import treebank
training_data = treebank.tagged_sents()[:100]
baseline_data = treebank.tagged_sents()[100:200]
gold_data = treebank.tagged_sents()[200:300]
testing_data = [untag(s) for s in gold_data]

In [10]:
backoff = RegexpTagger([
... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
... (r'(The|the|A|a|An|an)$', 'AT'),   # articles
... (r'.*able$', 'JJ'),                # adjectives
... (r'.*ness$', 'NN'),                # nouns formed from adjectives
... (r'.*ly$', 'RB'),                  # adverbs
... (r'.*s$', 'NNS'),                  # plural nouns
... (r'.*ing$', 'VBG'),                # gerunds
... (r'.*ed$', 'VBD'),                 # past tense verbs
... (r'.*', 'NN')                      # nouns (default)
... ])

In [24]:
backoff.accuracy(gold_data)

0.245014245014245

In [23]:
unigram_tagger = UnigramTagger(baseline_data)
unigram_tagger.accuracy(gold_data)

0.5811965811965812

In [13]:
unigram_tagger.tag(["I", "would", "like", "this", "sentence", "to", "be", "tagged"])

[('I', 'NNP'),
 ('would', 'MD'),
 ('like', None),
 ('this', 'DT'),
 ('sentence', None),
 ('to', 'TO'),
 ('be', 'VB'),
 ('tagged', None)]

In [22]:
baseline = UnigramTagger(baseline_data, backoff=backoff)
baseline.accuracy(gold_data)

0.7537647537647537

In [15]:
Template._cleartemplates() #clear any templates created in earlier tests
templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

In [16]:
tt = BrillTaggerTrainer(baseline, templates, trace=3)
tagger1 = tt.train(training_data, max_rules=10)

TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None)
Finding initial useful rules...
    Found 618 useful rules.

           B      |
   S   F   r   O  |        Score = Fixed - Broken
   c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
   o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
   r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
   e   d   n   r  |  e
------------------+-------------------------------------------------------
  13  14   1   4  | NN->VB if Pos:TO@[-1]
   8   8   0   0  | NN->VB if Pos:MD@[-1]
   7  10   3  22  | NN->IN if Pos:NNS@[-1]
   5   5   0   0  | NN->VBP if Pos:PRP@[-1]
   5   5   0   0  | VBD->VBN if Pos:VBZ@[-1]
   5   5   0   0  | NNS->NN if Pos:IN@[-1] & Word:asbestos@[0]
   4   4   0   0  | NN->-NONE- if Pos:WP@[-1]
   4   4   0   3  | NN->NNP if Pos:-NONE-@[-1]
   4   6   2   2  | NN->NNP if Pos:NNP@[-1]
   4   4   0   0  | NNS->VBZ if Pos:PRP@[-1]


In [17]:
tagger1.rules()[1:3]

(Rule('000', 'NN', 'VB', [(Pos([-1]),'MD')]),
 Rule('000', 'NN', 'IN', [(Pos([-1]),'NNS')]))

In [18]:
tagger1.print_template_statistics(printunused=False)

TEMPLATE STATISTICS (TRAIN)  2 templates, 10 rules)
TRAIN (   2417 tokens) initial   555 0.7704 final:   496 0.7948
#ID | Score (train) |  #Rules     | Template
--------------------------------------------
000 |    54   0.915 |   9   0.900 | Template(Pos([-1]))
001 |     5   0.085 |   1   0.100 | Template(Pos([-1]),Word([0]))




In [21]:
tagger1.accuracy(gold_data)

0.7692307692307693

In [20]:
tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data)
tagged[33][12:]

[('foreign', 'NN'),
 ('debt', 'NN'),
 ('of', 'IN'),
 ('$', '$'),
 ('64', 'CD'),
 ('billion', 'CD'),
 ('*U*', '-NONE-'),
 ('--', ':'),
 ('the', 'DT'),
 ('third-highest', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('developing', 'VBG'),
 ('world', 'NN'),
 ('.', '.')]

In [None]:
import nltk

In [26]:
print('The nltk version is {}.'.format(nltk.__version__))

The nltk version is 3.8.1.
