<a href="https://colab.research.google.com/github/gamecicn/Note_NLTK/blob/main/NLTK_note7_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from nltk.corpus import conll2000
import nltk

In [7]:
# Download the "conll2000" NLTK data package; the conll2000 corpus reader
# imported above is used in later cells to load the train/test chunk data.
nltk.download("conll2000")

[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.


True

In [5]:
class UnigramChunker(nltk.ChunkParserI):
  """Chunk parser that assigns each token a chunk tag from its POS tag alone.

  A unigram tagger is fitted on (POS tag, chunk tag) pairs, so the word
  itself never influences the prediction.
  """

  def __init__(self, train_sents):
      """Fit the underlying unigram tagger.

      train_sents: iterable of chunk trees; each one is flattened with
      nltk.chunk.tree2conlltags into (word, pos, chunk) triples.
      """
      pos_chunk_pairs = []
      for tree in train_sents:
          triples = nltk.chunk.tree2conlltags(tree)
          # Drop the word: the tagger "tags" POS tags with chunk tags.
          pos_chunk_pairs.append([(pos, chunk) for _word, pos, chunk in triples])
      self.tagger = nltk.UnigramTagger(pos_chunk_pairs)

  def parse(self, sentence):
      """Chunk a POS-tagged sentence and return the resulting tree.

      sentence: sequence of (word, pos) pairs.
      """
      pos_seq = [pos for _word, pos in sentence]
      predicted = self.tagger.tag(pos_seq)
      chunk_seq = [chunk for _pos, chunk in predicted]
      # Re-attach the words so the triples can be turned back into a tree.
      triples = [(word, pos, chunk)
                 for (word, pos), chunk in zip(sentence, chunk_seq)]
      return nltk.chunk.conlltags2tree(triples)

In [8]:
# Load the CoNLL-2000 test and train files as chunked sentences, restricted
# to NP chunks via chunk_types.
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
# Fit the POS-only unigram chunker on the training sentences and print its
# score against the test sentences (this is the baseline for later cells).
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  92.9%%
    Precision:     79.9%%
    Recall:        86.8%%
    F-Measure:     83.2%%


In [9]:
class ConsecutiveNPChunkTagger(nltk.TaggerI):
  """Greedy left-to-right chunk tagger backed by a Naive Bayes classifier.

  Features for each position come from the module-level function
  ``npchunk_features(sentence, i, history)``, which is looked up at call
  time; whichever definition is current when training/tagging runs is used.
  """

  def __init__(self, train_sents):
      """Train the classifier.

      train_sents: iterable of sentences, each a sequence of
      (token, chunk_tag) pairs; ``nltk.tag.untag`` strips the chunk tags to
      obtain the tokens handed to the feature extractor.
      """
      train_set = []
      for tagged_sent in train_sents:
          untagged_sent = nltk.tag.untag(tagged_sent)
          # Gold tags of the tokens to the left of position i.
          history = []
          for i, (_token, tag) in enumerate(tagged_sent):
              featureset = npchunk_features(untagged_sent, i, history)
              train_set.append((featureset, tag))
              history.append(tag)

      # Alternative classifier that could be swapped in here:
      #   nltk.MaxentClassifier.train(train_set, algorithm='GIS', trace=0)
      self.classifier = nltk.NaiveBayesClassifier.train(train_set)

  def tag(self, sentence):
      """Tag a sentence and return a list of (token, chunk_tag) pairs.

      Tags are predicted one position at a time; each prediction is appended
      to ``history`` so the feature extractor can see earlier decisions.
      """
      history = []
      for i, _token in enumerate(sentence):
          featureset = npchunk_features(sentence, i, history)
          history.append(self.classifier.classify(featureset))
      # Materialise the pairs: a bare zip() is a one-shot iterator in
      # Python 3, whereas TaggerI.tag is documented to return a list.
      return list(zip(sentence, history))

class ConsecutiveNPChunker(nltk.ChunkParserI):
  """Chunk parser that delegates tagging to ConsecutiveNPChunkTagger."""

  def __init__(self, train_sents):
      """Convert chunk trees into tagger training data and train the tagger.

      train_sents: iterable of chunk trees; each is flattened with
      nltk.chunk.tree2conlltags and reshaped to ((word, pos), chunk) pairs.
      """
      training_data = []
      for tree in train_sents:
          triples = nltk.chunk.tree2conlltags(tree)
          training_data.append(
              [((word, pos), chunk) for (word, pos, chunk) in triples])
      self.tagger = ConsecutiveNPChunkTagger(training_data)

  def parse(self, sentence):
      """Chunk a sequence of (word, pos) pairs and return the chunk tree."""
      tagged_sentence = self.tagger.tag(sentence)
      triples = []
      for (word, pos), chunk in tagged_sentence:
          triples.append((word, pos, chunk))
      return nltk.chunk.conlltags2tree(triples)

In [10]:
def npchunk_features(sentence, i, history):
    """Return the feature dict for position ``i``: only its POS tag.

    ``sentence[i]`` is unpacked as a (word, pos) pair. ``history`` is
    accepted for interface compatibility but is not consulted.
    """
    _token, pos_tag = sentence[i]
    features = {}
    features["pos"] = pos_tag
    return features

# Train the classifier-based chunker using the POS-only npchunk_features
# defined just above, then print its score on the test sentences.
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  92.9%%
    Precision:     79.9%%
    Recall:        86.8%%
    F-Measure:     83.2%%


In [13]:
def npchunk_features(sentence, i, history):
  """Return features for position ``i``: its POS tag and the previous POS tag.

  ``sentence`` holds (word, pos) pairs. At the first position the previous
  tag is the sentinel "<START>". ``history`` is not consulted.
  """
  _word, pos = sentence[i]
  prevpos = "<START>"
  if i != 0:
    _prevword, prevpos = sentence[i-1]
  features = dict(pos=pos)
  features["prevpos"] = prevpos
  return features
# Retrain with the npchunk_features defined just above (adds the previous
# POS tag) and print the new score on the test sentences.
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  93.6%%
    Precision:     81.9%%
    Recall:        88.6%%
    F-Measure:     85.1%%


In [14]:
def npchunk_features(sentence, i, history):
    """Return features for position ``i``: POS tag, word, and previous POS tag.

    ``sentence`` holds (word, pos) pairs. At the first position the previous
    tag is the sentinel "<START>". ``history`` is not consulted.
    """
    word, pos = sentence[i]
    prevpos = "<START>"
    if i != 0:
        _prevword, prevpos = sentence[i-1]
    features = {"pos": pos, "word": word}
    features["prevpos"] = prevpos
    return features
# Retrain with the npchunk_features defined just above (adds the current
# word) and print the new score on the test sentences.
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  94.4%%
    Precision:     84.1%%
    Recall:        89.8%%
    F-Measure:     86.9%%


In [15]:
def npchunk_features(sentence, i, history):
    """Return the richest feature dict for position ``i``.

    ``sentence`` holds (word, pos) pairs. Features: current POS and word,
    previous and next POS (with "<START>" / "<END>" sentinels at the sentence
    edges), the two adjacent POS pairs joined with "+", and the value of
    ``tags_since_dt(sentence, i)``. ``history`` is not consulted.
    """
    word, pos = sentence[i]

    prevpos = "<START>"
    if i != 0:
        _prevword, prevpos = sentence[i-1]

    nextpos = "<END>"
    if i != len(sentence)-1:
        _nextword, nextpos = sentence[i+1]

    features = {
        "pos": pos,
        "word": word,
        "prevpos": prevpos,
        "nextpos": nextpos,
    }
    # Pair features let the classifier see tag bigrams on either side.
    features["prevpos+pos"] = "%s+%s" % (prevpos, pos)
    features["pos+nextpos"] = "%s+%s" % (pos, nextpos)
    features["tags-since-dt"] = tags_since_dt(sentence, i)
    return features

In [18]:
def tags_since_dt(sentence, i):
  """Summarise the POS tags seen before position ``i`` since the last 'DT'.

  Walks the (word, pos) pairs in ``sentence[:i]``; every 'DT' resets the
  collection. Returns the distinct tags collected, sorted and joined
  with '+', or an empty string when there are none.
  """
  seen = set()
  for _word, tag in sentence[:i]:
      if tag != 'DT':
          seen.add(tag)
      else:
          # A determiner starts a fresh window.
          seen.clear()
  ordered = sorted(seen)
  return '+'.join(ordered)

In [19]:
# Retrain with the most recently defined npchunk_features (the version that
# calls tags_since_dt) and print the new score on the test sentences.
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  95.0%%
    Precision:     85.9%%
    Recall:        90.0%%
    F-Measure:     87.9%%
