<a href="https://colab.research.google.com/github/gksthdals/NLTK/blob/main/07.%20Extracting_Information_from_Text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# How can we build a system that extracts structured data, such as tables, from unstructured text?
# What are some robust methods for identifying the entities and relationships described in a text?
# Which corpora are appropriate for this work, and how do we use them for training and evaluating our models?

## 1. Information Extraction

In [None]:
locs = [('Omnicom', 'IN', 'New York'),
        ('DDB Needham', 'IN', 'New York'),
        ('Kaplan Thaler Group', 'IN', 'New York'),
        ('BBDO South', 'IN', 'Atlanta'),
        ('Georgia-Pacific', 'IN', 'Atlanta')]

In [None]:
query = [e1 for (e1, rel, e2) in locs if e2 == 'Atlanta']
query

['BBDO South', 'Georgia-Pacific']

### Information Extraction Architecture

In [None]:
# 1. sentence segmentation
# 2. tokenization
# 3. part of speech tagging
# 4. entity detection
# 5. relation detection

In [None]:
import nltk, re, pprint
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
def ie_preprocess(document):
  sentences = nltk.sent_tokenize(document)
  sentences = [nltk.word_tokenize(sent) for sent in sentences]
  sentences = [nltk.pos_tag(sent) for sent in sentences]

## 2. Chunking

### Noun Phrase Chunking

In [None]:
sentence = [('the', 'DT'), ('little', 'JJ'), ('yellow', 'JJ'), ('dog', 'NN'), ('barked', 'VBD'), ('at', 'IN'), ('the', 'DT'), ('cat', 'NN')]

grammar = "NP: {<DT>?<JJ>*<NN>}"

cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
print(result)

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


### Tag Patterns

In [None]:
# <DT>?<JJ.*>*<NN.*>+

### Chunking with Regular Expressions

In [None]:
grammar = r"""
  NP: {<DT|PP\$>?<JJ>*<NN>} # chunk determiner/possessive, adjectives and noun
      {<NNP>+}              # chunk sequences of proper nouns
"""

cp = nltk.RegexpParser(grammar)
sentence = [("Rapunzel", "NNP"), ("let", "VBD"), ("down", "RP"),
                 ("her", "PP$"), ("long", "JJ"), ("golden", "JJ"), ("hair", "NN")]

In [None]:
print(cp.parse(sentence))

(S
  (NP Rapunzel/NNP)
  let/VBD
  down/RP
  (NP her/PP$ long/JJ golden/JJ hair/NN))


In [None]:
nouns = [('money', 'NN'), ('market', 'NN'), ('fund', 'NN')]
grammar = "NP: {<NN><NN>} # Chunk two consecutive nouns"
cp = nltk.RegexpParser(grammar)
print(cp.parse(nouns))

(S (NP money/NN market/NN) fund/NN)


In [None]:
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [None]:
cp = nltk.RegexpParser('CHUNK: {<V.*> <TO> <V.*>}')
brown = nltk.corpus.brown
for sent in brown.tagged_sents()[:100]:
  tree = cp.parse(sent)
  for subtree in tree.subtrees():
    if subtree.label() == 'CHUNK': print(subtree)

(CHUNK combined/VBN to/TO achieve/VB)
(CHUNK continue/VB to/TO place/VB)
(CHUNK serve/VB to/TO protect/VB)
(CHUNK wanted/VBD to/TO wait/VB)
(CHUNK allowed/VBN to/TO place/VB)
(CHUNK expected/VBN to/TO become/VB)
(CHUNK expected/VBN to/TO approve/VB)
(CHUNK expected/VBN to/TO make/VB)
(CHUNK intends/VBZ to/TO make/VB)
(CHUNK seek/VB to/TO set/VB)
(CHUNK like/VB to/TO see/VB)


### Chinking

In [None]:
grammar = r"""
  NP:
    {<.*>+}       # Chunk everything
    }<VBD|IN>+{   # Chink sequences of VBD and IN
"""

sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
       ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),  ("the", "DT"), ("cat", "NN")]

cp = nltk.RegexpParser(grammar)
print(cp.parse(sentence))

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


### Representing Chunks: Tags vs Trees

## 3. Developing and Evaluating Chunkers

### Reading IOB Format and the CoNLL 2000 Corpus

In [None]:
text = """
he PRP B-NP
accepted VBD B-VP
the DT B-NP
position NN I-NP
of IN B-PP
vice NN B-NP
chairman NN I-NP
of IN B-PP
Carlyle NNP B-NP
Group NNP I-NP
, , O
a DT B-NP
merchant NN I-NP
banking NN I-NP
concern NN I-NP
. . O
"""
print(nltk.chunk.conllstr2tree(text, chunk_types=['NP']))

(S
  (NP he/PRP)
  accepted/VBD
  (NP the/DT position/NN)
  of/IN
  (NP vice/NN chairman/NN)
  of/IN
  (NP Carlyle/NNP Group/NNP)
  ,/,
  (NP a/DT merchant/NN banking/NN concern/NN)
  ./.)


In [None]:
nltk.download('conll2000')

[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!


True

In [None]:
from nltk.corpus import conll2000
print(conll2000.chunked_sents('train.txt')[99])

(S
  (PP Over/IN)
  (NP a/DT cup/NN)
  (PP of/IN)
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  (VP told/VBD)
  (NP his/PRP$ story/NN)
  ./.)


In [None]:
print(conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99])

(S
  Over/IN
  (NP a/DT cup/NN)
  of/IN
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  told/VBD
  (NP his/PRP$ story/NN)
  ./.)


### Simple Evaluation and Baselines

In [None]:
from nltk.corpus import conll2000
cp = nltk.RegexpParser("")
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  43.4%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%


In [None]:
grammar = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  87.7%%
    Precision:     70.6%%
    Recall:        67.8%%
    F-Measure:     69.2%%


In [None]:
class UnigramChunker(nltk.ChunkParserI):
  def __init__(self, train_sents):
    train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                  for sent in train_sents]
    self.tagger = nltk.UnigramTagger(train_data)

  def parse(self, sentence):
    pos_tags = [pos for (word, pos) in sentence]
    tagged_pos_tags = self.tagger.tag(pos_tags)
    chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
    conlltags = [(word, pos, chunktag) for ((word, pos), chunktag)
                 in zip(sentence, chunktags)]
    return nltk.chunk.conlltags2tree(conlltags)

In [None]:
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
unigram_chunker = UnigramChunker(train_sents)

print(unigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  92.9%%
    Precision:     79.9%%
    Recall:        86.8%%
    F-Measure:     83.2%%


In [None]:
postags = sorted(set(pos for sent in train_sents
                         for (word, pos) in sent.leaves()))

print(unigram_chunker.tagger.tag(postags))

[('#', 'B-NP'), ('$', 'B-NP'), ("''", 'O'), ('(', 'O'), (')', 'O'), (',', 'O'), ('.', 'O'), (':', 'O'), ('CC', 'O'), ('CD', 'I-NP'), ('DT', 'B-NP'), ('EX', 'B-NP'), ('FW', 'I-NP'), ('IN', 'O'), ('JJ', 'I-NP'), ('JJR', 'B-NP'), ('JJS', 'I-NP'), ('MD', 'O'), ('NN', 'I-NP'), ('NNP', 'I-NP'), ('NNPS', 'I-NP'), ('NNS', 'I-NP'), ('PDT', 'B-NP'), ('POS', 'B-NP'), ('PRP', 'B-NP'), ('PRP$', 'B-NP'), ('RB', 'O'), ('RBR', 'O'), ('RBS', 'B-NP'), ('RP', 'O'), ('SYM', 'O'), ('TO', 'O'), ('UH', 'O'), ('VB', 'O'), ('VBD', 'O'), ('VBG', 'O'), ('VBN', 'O'), ('VBP', 'O'), ('VBZ', 'O'), ('WDT', 'B-NP'), ('WP', 'B-NP'), ('WP$', 'B-NP'), ('WRB', 'O'), ('``', 'O')]


In [None]:
class BigramChunker(nltk.ChunkParserI):
  def __init__(self, train_sents):
    train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                  for sent in train_sents]
    self.tagger = nltk.BigramTagger(train_data)

  def parse(self, sentence):
    pos_tags = [pos for (word, pos) in sentence]
    tagged_pos_tags = self.tagger.tag(pos_tags)
    chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
    conlltags = [(word, pos, chunktag) for ((word, pos), chunktag)
                 in zip(sentence, chunktags)]
    return nltk.chunk.conlltags2tree(conlltags)

In [None]:
bigram_chunker = BigramChunker(train_sents)
print(bigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  93.3%%
    Precision:     82.3%%
    Recall:        86.8%%
    F-Measure:     84.5%%


### Training Classifier-Based Chunkers

In [None]:
class ConsecutiveNPChunkTagger(nltk.TaggerI):

  def __init__(self, train_sents):
    train_set = []
    for tagged_sent in train_sents:
      untagged_sent = nltk.tag.untag(tagged_sent)
      history = []
      for i, (word, tag) in enumerate(tagged_sent):
        featureset = npchunk_features(untagged_sent, i, history)
        train_set.append( (featureset, tag) )
        history.append(tag)
    self.classifier = nltk.MaxentClassifier.train(
        train_set, algorithm='megam', trace=0)

  def tag(self, sentence):
    history = []
    for i, word in enumerate(sentence):
      featureset = npchunk_features(sentence, i, history)
      tag = self.classifier.classify(featureset)
      history.append(tag)
    return zip(sentence, history)

In [None]:
class ConsecutiveNPChunker(nltk.ChunkParserI):
  def __init__(self, train_sents):
    tagged_sents = [[((w, t), c) for (w, t, c) in
                     nltk.chunk.tree2conlltags(sent)]
                    for sent in train_sents]
    self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

  def parse(self, sentence):
    tagged_sents = self.tagger.tag(sentence)
    conlltags = [(w, t, c) for ((w, t), c) in tagged_sents]
    return nltk.chunk.conlltags2tree(conlltags)

In [None]:
def npchunk_features(sentence, i, history):
  word, pos = sentence[i]
  return {'pos': pos}

chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

In [None]:
def npchunk_features(sentence, i, history):
  word, pos = sentence[i]
  if i == 0:
    prevword, prevpos = "<START>", "<START>"
  else:
    prevword, prevpos = sentence[i-1]
  
  return {'pos': pos, 'prevpos': prevpos}

chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

In [None]:
def npchunk_features(sentence, i, history):
  word, pos = sentence[i]
  if i == 0:
    prevword, prevpos = "<START>", "<START>"
  else:
    prevword, prevpos = sentence[i-1]
  return {'pos': pos, 'word': word, 'prevpos': prevpos}

chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

In [None]:
def npchunk_features(sentence, i, history):
  word, pos = sentence[i]
  if i == 0:
    prevword, prevpos = "<START>", "<START>"
  else:
    prevword, prevpos = sentence[i-1]
  
  if i == len(sentence) - 1:
    nextword, nextpos = "<END>", "<END>"
  else:
    nextword, nextpos = sentence[i+1]
  
  return {'pos': pos,
          'word': word,
          'prevpos': prevpos,
          'nextpos': nextpos,
          'prevpos+pos': "%s+%s" % (prevpos, pos),
          'pos+nextpos': "%s+%s" % (pos, nextpos),
          'tags-since-dt': tags_since_dt(sentence, i)}

In [None]:
def tags_since_dt(sentence, i):
  tags = set()
  for word, pos in sentence[:i]:
    if pos == 'DT':
      tags = set()
    else:
      tags.add(pos)
  
  return '+'.join(sorted(tags))

In [None]:
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

## 4. Recursion in Linguistic Structure

### Building Nested Structure with Cascaded Chunkers

In [None]:
grammar = r"""
NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
PP: {<IN><NP>}               # Chunk prepositions followed by NP
VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
CLAUSE: {<NP><VP>}           # Chunk NP, VP
"""

cp = nltk.RegexpParser(grammar)
sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
    ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]

print(cp.parse(sentence))

(S
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))


In [None]:
sentence = [("John", "NNP"), ("thinks", "VBZ"), ("Mary", "NN"),
            ("saw", "VBD"), ("the", "DT"), ("cat", "NN"), ("sit", "VB"),
            ("on", "IN"), ("the", "DT"), ("mat", "NN")]

print(cp.parse(sentence))

(S
  (NP John/NNP)
  thinks/VBZ
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))


In [None]:
cp = nltk.RegexpParser(grammar, loop=2)

print(cp.parse(sentence))

(S
  (NP John/NNP)
  thinks/VBZ
  (CLAUSE
    (NP Mary/NN)
    (VP
      saw/VBD
      (CLAUSE
        (NP the/DT cat/NN)
        (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))


### Trees

In [None]:
tree1 = nltk.Tree('NP', ['Alice'])
print(tree1)

(NP Alice)


In [None]:
tree2 = nltk.Tree('NP', ['the', 'rabbit'])
print(tree2)

(NP the rabbit)


In [None]:
tree3 = nltk.Tree('VP', ['chased', tree2])
tree4 = nltk.Tree('S', [tree1, tree3])
print(tree4)

(S (NP Alice) (VP chased (NP the rabbit)))


In [None]:
print(tree4[1])

(VP chased (NP the rabbit))


In [None]:
tree4[1].label()

'VP'

In [None]:
tree4.leaves()

['Alice', 'chased', 'the', 'rabbit']

In [None]:
tree4[1][1][1]

'rabbit'

### Tree Traversal

In [None]:
def traverse(t):
  try:
    t.label()
  except AttributeError:
    print(t, end=' ')
  else:
    # Now we know that t.node is defined
    print('(', t.label(), end=' ')
    for child in t:
      traverse(child)
    print(')', end=' ')

In [None]:
t = nltk.Tree('(S (NP Alice) (VP chased (NP the rabbit)))')
traverse(t)

TypeError: ignored

## 5. Named Entity Recognition

In [None]:
nltk.download('treebank')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
sent = nltk.corpus.treebank.tagged_sents()[22]
print(nltk.ne_chunk(sent, binary=True))

(S
  The/DT
  (NE U.S./NNP)
  is/VBZ
  one/CD
  of/IN
  the/DT
  few/JJ
  industrialized/VBN
  nations/NNS
  that/WDT
  *T*-7/-NONE-
  does/VBZ
  n't/RB
  have/VB
  a/DT
  higher/JJR
  standard/NN
  of/IN
  regulation/NN
  for/IN
  the/DT
  smooth/JJ
  ,/,
  needle-like/JJ
  fibers/NNS
  such/JJ
  as/IN
  crocidolite/NN
  that/WDT
  *T*-1/-NONE-
  are/VBP
  classified/VBN
  *-5/-NONE-
  as/IN
  amphobiles/NNS
  ,/,
  according/VBG
  to/TO
  (NE Brooke/NNP)
  T./NNP
  Mossman/NNP
  ,/,
  a/DT
  professor/NN
  of/IN
  pathlogy/NN
  at/IN
  the/DT
  (NE University/NNP)
  of/IN
  (NE Vermont/NNP College/NNP)
  of/IN
  (NE Medicine/NNP)
  ./.)


In [None]:
print(nltk.ne_chunk(sent))

(S
  The/DT
  (GPE U.S./NNP)
  is/VBZ
  one/CD
  of/IN
  the/DT
  few/JJ
  industrialized/VBN
  nations/NNS
  that/WDT
  *T*-7/-NONE-
  does/VBZ
  n't/RB
  have/VB
  a/DT
  higher/JJR
  standard/NN
  of/IN
  regulation/NN
  for/IN
  the/DT
  smooth/JJ
  ,/,
  needle-like/JJ
  fibers/NNS
  such/JJ
  as/IN
  crocidolite/NN
  that/WDT
  *T*-1/-NONE-
  are/VBP
  classified/VBN
  *-5/-NONE-
  as/IN
  amphobiles/NNS
  ,/,
  according/VBG
  to/TO
  (PERSON Brooke/NNP T./NNP Mossman/NNP)
  ,/,
  a/DT
  professor/NN
  of/IN
  pathlogy/NN
  at/IN
  the/DT
  (ORGANIZATION University/NNP)
  of/IN
  (PERSON Vermont/NNP College/NNP)
  of/IN
  (GPE Medicine/NNP)
  ./.)


## 6. Relation Extraction

In [82]:
nltk.download('ieer')

[nltk_data] Downloading package ieer to /root/nltk_data...
[nltk_data]   Unzipping corpora/ieer.zip.


True

In [83]:
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
  for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern = IN):
    print(nltk.sem.rtuple(rel))

[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
[ORG: 'McGlashan &AMP; Sarrail'] 'firm in' [LOC: 'San Mateo']
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']


In [85]:
nltk.download('conll2002')

[nltk_data] Downloading package conll2002 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2002.zip.


True

In [86]:
from nltk.corpus import conll2002
vnv = """
(
  is/V|       # 3rd sing present and
  was/V|      # past forms of the verb zijn ('be')
  werd/V|     # and also present
  wordt/V|    # past of worden ('become')
)
.*            # followed by anything
van/Prep      # followed by van ('of')
"""

VAN = re.compile(vnv, re.VERBOSE)
for doc in conll2002.chunked_sents('ned.train'):
  for rel in nltk.sem.extract_rels('PER', 'ORG', doc,
                                   corpus='conll2002', pattern=VAN):
    print(nltk.sem.clause(rel, relsym="VAN"))

VAN('marco_pantani', 'mercatone_uno')
VAN('larmuseau', 'abc_containerline')
VAN('horst_köhler', 'imf')
VAN('simonet', 'binnenlandse_zaken')
VAN('guy_quaden', 'nationale_bank')
VAN('de_bauw', 'buitenlandse_zaken')
VAN("cornet_d'elzius", 'buitenlandse_handel')
VAN('rosenfeld', 'abc_containerline')
VAN('carlo_gepts', 'vt4')
VAN('lone_leth_larsen', 'deens_cultureel_centrum')
VAN('johan_rottiers', 'kardinaal_van_roey_instituut')
VAN('jean-louis_peninou', 'international_boundaries_research')
VAN('lieven', 'honda')
VAN('talal_g_shamoon', 'intertrust_technologies_corporation')
VAN('albert_frère', 'tractebel')
VAN('robert_spatz', 'okc-beweging')
VAN('bart_bode', 'broederlijk_delen')
VAN('guido_westerwelle', 'fdp')
VAN('martin_bril', 'vrij_nederland')
VAN('frank_rijkaard', 'vrij_nederland')
VAN('filip', 'telecommunicatie')
VAN('maurice_buckmaster', 'special_operations_executive')
VAN('mukamba', 'commissie-lumumba')
VAN('versnick', 'buitenlandse_zaken')
VAN('mukamba', 'miba')
VAN('bart_bode', 'br

In [87]:
from nltk.corpus import conll2002
vnv = """
(
  is/V|       # 3rd sing present and
  was/V|      # past forms of the verb zijn ('be')
  werd/V|     # and also present
  wordt/V|    # past of worden ('become')
)
.*            # followed by anything
van/Prep      # followed by van ('of')
"""

VAN = re.compile(vnv, re.VERBOSE)
for doc in conll2002.chunked_sents('ned.train'):
  for rel in nltk.sem.extract_rels('PER', 'ORG', doc,
                                   corpus='conll2002', pattern=VAN):
    print(nltk.rtuple(rel, lcon=True, rcon=True))

...'De/Art ploegmaat/N van/Prep')[PER: 'Marco/N Pantani/N'] 'en/Conj kopman/N van/Prep' [ORG: 'Mercatone/N Uno/N']('in/Prep deze/Pron'...
...'In/Prep dezelfde/Pron periode/N was/V')[PER: 'Larmuseau/N'] 'ook/Adv lid/N van/Prep de/Art interkabinettengroep/N rond/Prep' [ORG: 'ABC/N Containerline/N'](',/Punc die/Pron specifiek/Adj was/V opgericht/V'...
...'Dit/Pron heeft/V')[PER: 'Horst/N Köhler/Conj'] ',/Punc de/Art in/Prep mei/N aangetreden/V topman/N van/Prep het/Art' [ORG: 'IMF/N'](',/Punc gisteren/Adv gezegd/V in/Prep'...
...'')[PER: 'Simonet/N'] 'heeft/V de/Art bekommernissen/N overgemaakt/V aan/Prep minister/N van/Prep' [ORG: 'Binnenlandse/N Zaken/N'](''...
...'De/Art woorden/N komen/V van/Prep gouverneur/N')[PER: 'Guy/N Quaden/N'] 'van/Prep de/Art' [ORG: 'Nationale/Adj Bank/N'](',/Punc en/Conj ze/Pron werden/V woensdag/N'...
...'')[PER: 'De/Art Bauw/N'] '(/Punc 34/Num )/Punc was/V vroeger/Adj adjunct-woordvoerder/N van/Prep het/Art ministerie/N van/Prep' [ORG: 'Buitenlandse/N Zaken