In [1]:
import nltk

In [2]:
from nltk.corpus import conll2002

In [3]:
# List the file identifiers the corpus reader exposes, to see which splits can be requested by name.
conll2002.fileids()

['esp.testa', 'esp.testb', 'esp.train', 'ned.testa', 'ned.testb', 'ned.train']

In [4]:
# Load IOB-annotated sentences; no fileid is given here, unlike the per-file loads further down.
tagged = conll2002.iob_sents()

In [5]:
# Peek at one sentence to see the token layout: 3-tuples whose last field is the IOB label.
tagged[0]

[('Sao', 'NC', 'B-LOC'),
 ('Paulo', 'VMI', 'I-LOC'),
 ('(', 'Fpa', 'O'),
 ('Brasil', 'NC', 'B-LOC'),
 (')', 'Fpt', 'O'),
 (',', 'Fc', 'O'),
 ('23', 'Z', 'O'),
 ('may', 'NC', 'O'),
 ('(', 'Fpa', 'O'),
 ('EFECOM', 'NP', 'B-ORG'),
 (')', 'Fpt', 'O'),
 ('.', 'Fp', 'O')]

In [6]:
def prepare_corpus(iob_sents):
    """Drop the middle field from every token of every sentence.

    Parameters
    ----------
    iob_sents : iterable of sentences, each an iterable of 3-tuples
        ``(word, tag, iob)``.

    Returns
    -------
    list of list of tuple
        One inner list per input sentence, holding ``(word, iob)`` pairs
        in their original order.
    """
    return [
        [(word, label) for (word, _tag, label) in sentence]
        for sentence in iob_sents
    ]

In [7]:
# Training sentences from the 'esp.train' file, reduced to (word, IOB label) pairs.
train = prepare_corpus(conll2002.iob_sents('esp.train'))

In [8]:
# Held-out sentences from 'esp.testa' in the same (word, IOB label) format; every tagger below is scored on these.
test = prepare_corpus(conll2002.iob_sents('esp.testa'))

In [9]:
# Sanity check: tokens should now be (word, IOB label) pairs, with the middle field gone.
train[0]

[('Melbourne', 'B-LOC'),
 ('(', 'O'),
 ('Australia', 'B-LOC'),
 (')', 'O'),
 (',', 'O'),
 ('25', 'O'),
 ('may', 'O'),
 ('(', 'O'),
 ('EFE', 'B-ORG'),
 (')', 'O'),
 ('.', 'O')]

# Using an HMM

In [10]:
# Fit a hidden Markov model tagger on the (word, IOB label) training sentences.
hmm = nltk.tag.HiddenMarkovModelTagger.train(train)

In [11]:
# Score the HMM on the held-out sentences; the result is printed (as a percentage over all tokens) rather than returned for display.
hmm.test(test)

accuracy over 52923 tokens: 91.64


# Compare with a baseline

In [12]:
import collections
# Count how often each IOB label occurs across the tokens of 'esp.train'.
tagcounter = collections.Counter(iob for w,t,iob in conll2002.iob_words('esp.train'))

In [13]:
# Label frequencies from most to least common; the top entry is the label used for the majority baseline.
tagcounter.most_common()

[('O', 231920),
 ('B-ORG', 7390),
 ('I-ORG', 4992),
 ('B-LOC', 4913),
 ('B-PER', 4321),
 ('I-PER', 3903),
 ('I-MISC', 3212),
 ('B-MISC', 2173),
 ('I-LOC', 1891)]

In [14]:
# Baseline: label every token 'O', the most frequent label in the training counts.
default = nltk.DefaultTagger('O')
# NOTE(review): newer NLTK releases appear to deprecate evaluate() in favour of accuracy() — confirm against the installed version.
default.evaluate(test)

0.8570186875271621

# Compare with a unigram tagger

In [15]:
# Unigram tagger trained on the same sentences, with no backoff tagger configured.
# NOTE(review): presumably words unseen in training receive no label at all, which would
# explain why this scores only slightly above the all-'O' baseline — verify.
unitagger = nltk.UnigramTagger(train)
unitagger.evaluate(test)

0.8688094023392476

In [16]:
# Same unigram tagger, this time with the all-'O' default tagger supplied as its backoff.
unitagger2 = nltk.UnigramTagger(train,backoff=default)
unitagger2.evaluate(test)

0.9184664512593769