In [1]:
import nltk

In [2]:
# Download the CoNLL-2002 corpus package before importing its corpus reader.
nltk.download("conll2002")
from nltk.corpus import conll2002

[nltk_data] Downloading package conll2002 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2002.zip.


In [3]:
conll2002.fileids()

['esp.testa', 'esp.testb', 'esp.train', 'ned.testa', 'ned.testb', 'ned.train']

In [4]:
# No file id is passed here, unlike the per-file calls used for train/test below.
# NOTE(review): presumably this reads sentences from every file returned by
# fileids() (both the esp.* and ned.* files) — confirm.
tagged = conll2002.iob_sents()

In [5]:
tagged[0]

[('Sao', 'NC', 'B-LOC'),
 ('Paulo', 'VMI', 'I-LOC'),
 ('(', 'Fpa', 'O'),
 ('Brasil', 'NC', 'B-LOC'),
 (')', 'Fpt', 'O'),
 (',', 'Fc', 'O'),
 ('23', 'Z', 'O'),
 ('may', 'NC', 'O'),
 ('(', 'Fpa', 'O'),
 ('EFECOM', 'NP', 'B-ORG'),
 (')', 'Fpt', 'O'),
 ('.', 'Fp', 'O')]

In [6]:
def prepare_corpus(iob_sents):
    """Drop the part-of-speech column from IOB-annotated sentences.

    Args:
        iob_sents: iterable of sentences, each an iterable of
            ``(word, pos_tag, iob_tag)`` triples.

    Returns:
        A list containing one list per input sentence, each holding
        ``(word, iob_tag)`` pairs in their original order.
    """
    return [
        [(word, iob_tag) for word, _pos, iob_tag in sentence]
        for sentence in iob_sents
    ]

In [7]:
# Training sentences as (word, IOB tag) pairs, taken from the 'esp.train' file.
train = prepare_corpus(conll2002.iob_sents('esp.train'))

In [8]:
# Evaluation sentences as (word, IOB tag) pairs, taken from the 'esp.testa' file.
# NOTE(review): fileids() also lists 'esp.testb', which is not used in the cells
# visible here — confirm which split is meant for the reported scores.
test = prepare_corpus(conll2002.iob_sents('esp.testa'))

In [9]:
train[0]

[('Melbourne', 'B-LOC'),
 ('(', 'O'),
 ('Australia', 'B-LOC'),
 (')', 'O'),
 (',', 'O'),
 ('25', 'O'),
 ('may', 'O'),
 ('(', 'O'),
 ('EFE', 'B-ORG'),
 (')', 'O'),
 ('.', 'O')]

# Using an HMM

In [10]:
# Train NLTK's HMM tagger directly on the (word, IOB tag) training sentences,
# so the labels it learns to predict are the IOB tags.
hmm = nltk.tag.HiddenMarkovModelTagger.train(train)

In [11]:
# Score the HMM on the held-out sentences; this prints a token-level accuracy line.
hmm.test(test)

accuracy over 52923 tokens: 91.64


# Compare with a baseline

In [12]:
import collections
# Count how often each IOB tag occurs over the tokens of the training file;
# the word and part-of-speech fields of each triple are ignored.
tagcounter = collections.Counter(
    iob_tag
    for _word, _pos, iob_tag in conll2002.iob_words('esp.train')
)

In [13]:
tagcounter.most_common()

[('O', 231920),
 ('B-ORG', 7390),
 ('I-ORG', 4992),
 ('B-LOC', 4913),
 ('B-PER', 4321),
 ('I-PER', 3903),
 ('I-MISC', 3212),
 ('B-MISC', 2173),
 ('I-LOC', 1891)]

In [14]:
# Majority-class baseline: tag every token 'O', by far the most frequent tag
# in the training counts.
# NOTE(review): recent NLTK releases deprecate evaluate() in favour of
# accuracy() — confirm against the installed version before switching.
default = nltk.DefaultTagger('O')
default.evaluate(test)

0.8570186875271621

# Compare with a unigram tagger

In [15]:
# Unigram tagger with no backoff.
# NOTE(review): presumably words never seen in training get no tag at all,
# which would explain a score barely above the all-'O' baseline — confirm.
unitagger = nltk.UnigramTagger(train)
unitagger.evaluate(test)

0.8689038792207546

In [16]:
# Same unigram tagger, but handing over to the all-'O' default tagger
# whenever the unigram model has no tag for a word.
unitagger2 = nltk.UnigramTagger(train,backoff=default)
unitagger2.evaluate(test)

0.918560928140884

### Error Analysis

In [17]:
def confusion_matrix(true_tagged_sents, predicted_tagged_sents):
    """Print a confusion matrix comparing reference tags with predicted tags.

    Args:
        true_tagged_sents: iterable of sentences, each an iterable of
            ``(word, tag)`` pairs carrying the reference tags.
        predicted_tagged_sents: iterable with the same layout carrying
            the predicted tags.

    The tags of each argument are flattened sentence by sentence into a
    single list (the words are ignored) and handed to
    ``nltk.ConfusionMatrix``, reference tags first. The label key and the
    formatted matrix are printed; nothing is returned.
    """
    def flatten_tags(tagged_sents):
        # Collect the tag of every (word, tag) pair, keeping token order.
        tags = []
        for sentence in tagged_sents:
            for _word, tag in sentence:
                tags.append(tag)
        return tags

    reference = flatten_tags(true_tagged_sents)
    predicted = flatten_tags(predicted_tagged_sents)
    matrix = nltk.ConfusionMatrix(reference, predicted)
    print(matrix.key()) # Print all the labels
    print(matrix.pretty_format(sort_by_count=True, show_percents=True))

In [18]:
# Strip the reference tags so the tagger is given only the words of each test sentence.
untagged_test = [nltk.untag(s) for s in test]

In [19]:
# Reference tags first, then the backoff unigram tagger's predictions for the same sentences.
confusion_matrix(test, [unitagger2.tag(s) for s in untagged_test])

Value key:
  0: B-LOC
  1: B-MISC
  2: B-ORG
  3: B-PER
  4: I-LOC
  5: I-MISC
  6: I-ORG
  7: I-PER
  8: O

       |                                                I      B        |
       |             B      I      B      B      I      -      -      I |
       |             -      -      -      -      -      M      M      - |
       |             O      O      P      L      P      I      I      L |
       |             R      R      E      O      E      S      S      O |
       |      O      G      G      R      C      R      C      C      C |
-------+----------------------------------------------------------------+
     O | <85.5%>  0.0%   0.0%   0.0%   0.0%   0.0%   0.1%   0.0%   0.0% |
 B-ORG |   0.8%  <1.9%>  0.0%   0.0%   0.3%   0.0%   0.0%   0.1%   0.0% |
 I-ORG |   1.3%   0.2%  <0.6%>  0.0%   0.3%   0.0%   0.1%   0.1%   0.0% |
 B-PER |   0.9%   0.0%   0.0%  <1.0%>  0.0%   0.3%   0.0%      .   0.0% |
 B-LOC |   0.2%   0.1%   0.0%   0.0%  <1.5%>  0.0%   0.0%   0.0%   0.0% |
 I-

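The largest cause of errors seems to be the presence of unknown names of organisations and persons: among the rows shown above, the I-ORG, B-PER and B-ORG rows have 1.3%, 0.9% and 0.8% of all tokens in the `O` column, i.e. entity tokens that ended up tagged `O`, which is the tag the backoff tagger assigns.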