<a href="https://colab.research.google.com/github/gamecicn/sample_jupyter/blob/main/Supervised_NE_Chunker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Building a Supervised Classifier-based Chunker

Regular-expression-based chunkers and n-gram chunkers decide what chunks to create entirely based on part-of-speech tags. 

In this exercise, we will leverage additional features to improve the unigram chunker.

In [None]:
import nltk
from nltk.corpus import conll2000

In [None]:
# Load the chunked sentences from the CoNLL-2000 test and training files.
# chunk_types=['NP'] is passed to the corpus reader so that only noun-phrase
# chunks are requested (see the NLTK conll2000 reader's chunk_types parameter).
test_sents  = conll2000.chunked_sents('test.txt',  chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])

---
### Unigram Chunker
The UnigramChunker class uses a unigram tagger to label sentences with chunk tags. The class defines two methods:
1. a constructor, which is called when we build a new UnigramChunker;
2. a parse method, which is used to chunk new sentences.

The goal here is to assign IOB tags to words in a sentence, and then convert those tags to chunks.

In [None]:
class UnigramChunker(nltk.ChunkParserI):
    """Chunker that chooses each token's IOB chunk tag from its POS tag alone."""

    def __init__(self, train_sents):
        """Train a unigram tagger that maps POS tags to chunk tags.

        train_sents: iterable of chunk trees, each convertible with
        nltk.chunk.tree2conlltags into (word, pos, chunk_tag) triples.
        """
        pos_to_chunk_data = []
        for tree in train_sents:
            triples = nltk.chunk.tree2conlltags(tree)
            # The words are discarded: the tagger only ever sees POS tags.
            pos_to_chunk_data.append(
                [(pos, chunk_tag) for _word, pos, chunk_tag in triples]
            )
        self.tagger = nltk.UnigramTagger(pos_to_chunk_data)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        sentence: sequence of (word, pos) pairs.
        Returns the tree built by nltk.chunk.conlltags2tree from the
        predicted (word, pos, chunk_tag) triples.
        """
        pos_sequence = [pos for (_word, pos) in sentence]
        predicted = self.tagger.tag(pos_sequence)
        chunk_sequence = [chunk_tag for (_pos, chunk_tag) in predicted]
        triples = [
            (word, pos, chunk_tag)
            for (word, pos), chunk_tag in zip(sentence, chunk_sequence)
        ]
        return nltk.chunk.conlltags2tree(triples)

In [None]:
# Train the unigram chunker on the CoNLL-2000 training sentences.
unigram_chunker = UnigramChunker(train_sents)

In [None]:
# Score the trained unigram chunker against the test sentences and print the result.
# NOTE(review): recent NLTK releases appear to deprecate evaluate() in favour of
# accuracy() — confirm against the installed NLTK version.
print(unigram_chunker.evaluate(test_sents))

### A classifier-based chunker

The classifier-based chunker also assigns IOB tags to the words in a sentence, 
and then converts those tags to chunks.

The below code defines two classes. 
- The first class calls a feature extractor and then uses an NLTK classifier (e.g. Naive Bayes). 
- The second class is a wrapper around the tagger class that turns it into a chunker. 
  - During training, this second class maps the chunk trees in the training corpus into tag sequences; 
  - in the parse() method, it converts the tag sequence provided by the tagger back into a chunk tree.

In [None]:
class ConsecutiveNPChunkTagger(nltk.TaggerI):
    """Sequential chunk tagger backed by an NLTK Naive Bayes classifier.

    This class inherits from the base class nltk.TaggerI. Features for each
    token come from the module-level ``npchunk_features`` function, which is
    looked up when the methods run, so redefining it and building a new tagger
    changes the features used.
    """

    def __init__(self, train_sents):
        """Build the training set and train the classifier.

        train_sents: sentences of the form
        [[((w, t), c), ...], [((w, t), c), ...], ...] where (w, t) is a
        (word, POS tag) token and c is its chunk tag.
        """
        train_set = []
        for tagged_sent in train_sents:
            # untag() strips the chunk tags, leaving the (word, pos) tokens.
            untagged_sent = nltk.tag.untag(tagged_sent)

            # history holds the chunk tags of the tokens already seen in this
            # sentence, so feature extractors may condition on earlier tags.
            history = []
            for i, (_token, chunk_tag) in enumerate(tagged_sent):
                # Each training instance is a (feature dict, label) pair.
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, chunk_tag))
                history.append(chunk_tag)
        # Alternative classifier: nltk.MaxentClassifier.train(train_set, trace=0)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        """Assign a chunk tag to every token of ``sentence``.

        sentence: sequence of (word, pos) tokens.
        Returns a list of (token, chunk_tag) pairs, one per input token.
        """
        history = []
        for i, _token in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            history.append(self.classifier.classify(featureset))
        # Materialise the pairs: a bare zip() is a one-shot iterator in
        # Python 3, which would be empty on a second pass and cannot be
        # indexed or measured with len().
        return list(zip(sentence, history))

class ConsecutiveNPChunker(nltk.ChunkParserI):
    """Chunk parser that wraps ConsecutiveNPChunkTagger.

    Training converts chunk trees into tag sequences; parsing converts the
    tagger's tag sequence back into a chunk tree.
    """

    def __init__(self, train_sents):
        """Train the wrapped tagger.

        train_sents: iterable of chunk trees. Each tree is flattened with
        nltk.chunk.tree2conlltags and regrouped into
        ((word, pos), chunk_tag) pairs, the shape the tagger expects.
        """
        training_data = []
        for tree in train_sents:
            triples = nltk.chunk.tree2conlltags(tree)
            training_data.append(
                [((word, pos), chunk_tag) for (word, pos, chunk_tag) in triples]
            )
        self.tagger = ConsecutiveNPChunkTagger(training_data)

    def parse(self, sentence):
        """Chunk a sequence of (word, pos) tokens and return the resulting tree."""
        triples = [
            (word, pos, chunk_tag)
            for ((word, pos), chunk_tag) in self.tagger.tag(sentence)
        ]
        return nltk.chunk.conlltags2tree(triples)

In [None]:
# this feature extractor just provides the part-of-speech tag of the current token
# basically a unigram chunker
def npchunk_features(sentence, i, history):
    """Return a feature dict holding only the POS tag of token ``i``.

    sentence: sequence of (word, pos) pairs.
    i: index of the token to describe.
    history: accepted for interface compatibility; not used here.
    """
    _token, pos_tag = sentence[i]
    features = {}
    features["pos"] = pos_tag
    return features

In [None]:
# Train the classifier-based chunker with whichever npchunk_features definition
# is currently in scope, then print its score on the test sentences.
# NOTE(review): recent NLTK releases appear to deprecate evaluate() in favour of
# accuracy() — confirm against the installed NLTK version.
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  92.9%%
    Precision:     79.9%%
    Recall:        86.8%%
    F-Measure:     83.2%%


In [None]:
# Pick one training sentence (index 123) to use as a worked example;
# the bare name on the last line displays it as the cell output.
sent = train_sents[123]
sent

In [None]:
# Flatten the example tree into ((word, pos), chunk_tag) pairs — the same
# shape ConsecutiveNPChunker builds for each training sentence.
tagged_sent = [((w,t),c) for (w,t,c) in nltk.chunk.tree2conlltags(sent)] 
tagged_sent

In [None]:
# Strip the chunk tags, leaving only the (word, pos) tokens that are passed
# to the feature extractor.
untagged_sent = nltk.tag.untag(tagged_sent)
untagged_sent

In [None]:
# Reproduce, for the single example sentence, the training-set construction
# done inside ConsecutiveNPChunkTagger.__init__.
# history collects the chunk tags of the tokens already processed;
# train_set collects one (featureset, tag) pair per token.
history = []
train_set = []
# Walk the tokens of the example sentence, taking each token's index i and
# its gold chunk tag.
for i, (word, tag) in enumerate(tagged_sent):
    # Features come from whichever npchunk_features definition is currently in scope.
    featureset = npchunk_features(untagged_sent, i, history)
    train_set.append( (featureset, tag) )
    history.append(tag)

In [None]:
# this feature extractor includes the part of speech for the previous word
def npchunk_features(sentence, i, history):
    """Return the POS tags of token ``i`` and of the token before it.

    sentence: sequence of (word, pos) pairs.
    i: index of the token to describe.
    history: accepted for interface compatibility; not used here.

    The first token has no predecessor, so its "prevpos" is "<START>".
    """
    _token, current_pos = sentence[i]
    if i != 0:
        _prev_token, previous_pos = sentence[i - 1]
    else:
        previous_pos = "<START>"
    return {"pos": current_pos, "prevpos": previous_pos}

In [None]:
def test_feature_extractor(tagged_sent):
    """Print the (featureset, tag) pairs npchunk_features yields for one sentence.

    tagged_sent: sequence of ((word, pos), chunk_tag) pairs.

    One pair is printed per line, in token order. Nothing is returned.
    """
    # Derive the tokens from the argument itself. Reading the notebook-global
    # untagged_sent would give wrong features (or an IndexError) for any
    # sentence other than the one that global was built from.
    untagged_sent = nltk.tag.untag(tagged_sent)

    history = []
    train_set = []
    # Walk the tokens of this sentence, pairing each token's features with its
    # gold chunk tag and recording the tag in history for later tokens.
    for i, (_token, tag) in enumerate(tagged_sent):
        featureset = npchunk_features(untagged_sent, i, history)
        train_set.append((featureset, tag))
        history.append(tag)

    print(*train_set, sep='\n')

In [None]:
# Display the global train_set built by the earlier top-level loop.
# test_feature_extractor keeps its own local train_set, so re-run that loop
# after redefining npchunk_features for the new features to show up here.
train_set

[({'pos': 'CD', 'prevpos': '<START>'}, 'O'),
 ({'pos': '.', 'prevpos': 'CD'}, 'O'),
 ({'pos': 'RB', 'prevpos': '.'}, 'O'),
 ({'pos': 'CC', 'prevpos': 'RB'}, 'O'),
 ({'pos': 'RB', 'prevpos': 'CC'}, 'O'),
 ({'pos': ',', 'prevpos': 'RB'}, 'O'),
 ({'pos': 'NN', 'prevpos': ','}, 'O'),
 ({'pos': 'DT', 'prevpos': 'NN'}, 'B-NP'),
 ({'pos': 'PRP$', 'prevpos': 'DT'}, 'I-NP'),
 ({'pos': 'NNS', 'prevpos': 'PRP$'}, 'I-NP'),
 ({'pos': 'CC', 'prevpos': 'NNS'}, 'I-NP'),
 ({'pos': 'NNS', 'prevpos': 'CC'}, 'I-NP'),
 ({'pos': 'IN', 'prevpos': 'NNS'}, 'O'),
 ({'pos': 'DT', 'prevpos': 'IN'}, 'B-NP'),
 ({'pos': 'NN', 'prevpos': 'DT'}, 'I-NP'),
 ({'pos': 'POS', 'prevpos': 'NN'}, 'B-NP'),
 ({'pos': 'NN', 'prevpos': 'POS'}, 'I-NP'),
 ({'pos': 'NN', 'prevpos': 'NN'}, 'I-NP'),
 ({'pos': '.', 'prevpos': 'NN'}, 'O')]

In [None]:
# Retrain the chunker so it picks up the redefined npchunk_features
# (current and previous POS tag), then print its score on the test sentences.
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  93.6%%
    Precision:     81.9%%
    Recall:        88.6%%
    F-Measure:     85.1%%


In [None]:
def npchunk_features(sentence, i, history):
    """Return the word and POS tag of token ``i`` plus the previous POS tag.

    sentence: sequence of (word, pos) pairs.
    i: index of the token to describe.
    history: accepted for interface compatibility; not used here.

    The first token has no predecessor, so its "prevpos" is "<START>".
    """
    token, current_pos = sentence[i]
    if i != 0:
        _prev_token, previous_pos = sentence[i - 1]
    else:
        previous_pos = "<START>"
    return {"pos": current_pos, "word": token, "prevpos": previous_pos}


In [None]:
# Print the feature sets that the current npchunk_features produces for the
# example sentence.
test_feature_extractor(tagged_sent)

({'pos': 'CD', 'word': '2', 'prevpos': '<START>'}, 'O')
({'pos': '.', 'word': '.', 'prevpos': 'CD'}, 'O')
({'pos': 'RB', 'word': 'Formally', 'prevpos': '.'}, 'O')
({'pos': 'CC', 'word': 'or', 'prevpos': 'RB'}, 'O')
({'pos': 'RB', 'word': 'informally', 'prevpos': 'CC'}, 'O')
({'pos': ',', 'word': ',', 'prevpos': 'RB'}, 'O')
({'pos': 'NN', 'word': 'train', 'prevpos': ','}, 'O')
({'pos': 'DT', 'word': 'all', 'prevpos': 'NN'}, 'B-NP')
({'pos': 'PRP$', 'word': 'your', 'prevpos': 'DT'}, 'I-NP')
({'pos': 'NNS', 'word': 'managers', 'prevpos': 'PRP$'}, 'I-NP')
({'pos': 'CC', 'word': 'and', 'prevpos': 'NNS'}, 'I-NP')
({'pos': 'NNS', 'word': 'supervisors', 'prevpos': 'CC'}, 'I-NP')
({'pos': 'IN', 'word': 'in', 'prevpos': 'NNS'}, 'O')
({'pos': 'DT', 'word': 'the', 'prevpos': 'IN'}, 'B-NP')
({'pos': 'NN', 'word': 'company', 'prevpos': 'DT'}, 'I-NP')
({'pos': 'POS', 'word': "'s", 'prevpos': 'NN'}, 'B-NP')
({'pos': 'NN', 'word': 'due-process', 'prevpos': 'POS'}, 'I-NP')
({'pos': 'NN', 'word': 'approa

In [None]:
# Retrain the chunker so it picks up the redefined npchunk_features
# (POS tag, word, and previous POS tag), then print its score on the test sentences.
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  94.4%%
    Precision:     84.1%%
    Recall:        89.8%%
    F-Measure:     86.9%%
