# 读取IOB格式和分块语料库

In [2]:
import nltk
# Sample sentence in CoNLL IOB format: one token per line written as
# "word POS-tag chunk-tag". A B- prefix starts a chunk and an I- prefix
# continues it.
text = '''he PRP B-NP
accepted VBD B-VP
the DT B-NP
position NN I-NP
of IN B-PP
vice NN B-NP
chairman NN I-NP
of IN B-PP
Carlyle NNP B-NP
Group NNP I-NP
a DT B-NP
merchant NN I-NP
banking NN I-NP
concern NN I-NP
'''
# chunk_types must be a real tuple: ('NP') is just the string 'NP', which
# only selected the right chunks through substring matching.
nltk.chunk.conllstr2tree(text, chunk_types=('NP',)).draw()

In [3]:
from nltk.corpus import conll2000
# Print the chunked sentence at index 99 of the CoNLL-2000 'train.txt' file,
# requesting only NP chunks.
print (conll2000.chunked_sents('train.txt',chunk_types=['NP'])[99])

(S
  Over/IN
  (NP a/DT cup/NN)
  of/IN
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  told/VBD
  (NP his/PRP$ story/NN)
  ./.)


# 简单评估和基准

我们首先为不创建任何块的平凡块分析器 cp 建立一个基准。

现在让我们尝试一个初级的正则表达式分块器，查找以名词短语标记的特征字母（如 CD、DT 和 JJ）开头的标记。

In [4]:
# Gold-standard test sentences (NP chunks only) used to score both parsers.
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])

# Baseline: a parser built from an empty grammar string.
cp = nltk.RegexpParser("")
print(cp.evaluate(test_sents))

# Naive regexp chunker: one or more consecutive tags starting with
# C, D, J, N or P (e.g. CD, DT, JJ) form an NP chunk.
grammar = "NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  43.4%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%
ChunkParse score:
    IOB Accuracy:  87.7%%
    Precision:     70.6%%
    Recall:        67.8%%
    F-Measure:     69.2%%


In [5]:
# Noun-phrase chunking with an n-gram tagger
class ngramChunker(nltk.ChunkParserI):
    """Chunk parser that predicts IOB chunk tags from part-of-speech tags."""

    def __init__(self, train_sents):
        """Train the tagger on (POS tag, chunk tag) pairs.

        ``nltk.chunk.tree2conlltags`` converts each training tree into a list
        of (word, POS tag, IOB chunk tag) triples (CoNLL IOB format). The
        word is dropped and the remaining pairs are used as tagger training
        data.

        ``BigramTagger(train_data)`` yields a bigram chunker;
        ``UnigramTagger(train_data)`` would yield a unigram chunker instead.
        """
        train_data = []
        for tree in train_sents:
            triples = nltk.chunk.tree2conlltags(tree)
            train_data.append([(pos, iob) for _word, pos, iob in triples])
        self.tagger = nltk.tag.BigramTagger(train_data)

    def parse(self, sentence):
        """Return the chunk tree for a sentence of (word, POS tag) pairs."""
        # The tagger only sees the POS tag sequence.
        pos_tags = [pos for _word, pos in sentence]
        # Keep just the predicted chunk tag for each position.
        iob_tags = [iob for _pos, iob in self.tagger.tag(pos_tags)]
        # Re-attach words and POS tags to form conll triples.
        conlltags = []
        for (word, pos), iob in zip(sentence, iob_tags):
            conlltags.append((word, pos, iob))
        return nltk.chunk.conlltags2tree(conlltags)

# Load the NP-chunked CoNLL-2000 splits, train the n-gram chunker on the
# training file and print its score on the test file.
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
ngram_chunker = ngramChunker(train_sents)
print(ngram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  93.3%%
    Precision:     82.3%%
    Recall:        86.8%%
    F-Measure:     84.5%%


In [16]:
# Show the first (word, POS tag, IOB chunk tag) triple of the first training sentence.
nltk.chunk.tree2conlltags(train_sents[0])[0]

('Confidence', 'NN', 'B-NP')

# 训练基于分类器的分块器

In [25]:
def tags_since_dt(sentence, i):
    """Describe the POS tags seen since the most recent determiner.

    Only the tokens before position ``i`` are considered. The distinct tags
    that follow the last 'DT' (or all of them when there is no 'DT') are
    sorted and joined with '+'.
    """
    prefix_tags = [tag for _word, tag in sentence[:i]]
    # Index just past the last determiner in the prefix (0 when none exists).
    start = 0
    for idx, tag in enumerate(prefix_tags):
        if tag == 'DT':
            start = idx + 1
    return '+'.join(sorted(set(prefix_tags[start:])))

def npchunk_features(sentence, i, history):
    """Build the feature dictionary for the token at position ``i``.

    The features are the token's word and POS tag, the POS tags of the
    previous and next tokens ("<START>" / "<END>" at the sentence edges),
    the two adjacent tag pairs, and the tags seen since the last determiner.
    ``history`` is accepted but not used by this feature extractor.
    """
    word, pos = sentence[i]
    # Neighbouring words are unpacked but only their POS tags are used.
    _, prevpos = ("<START>", "<START>") if i == 0 else sentence[i-1]
    _, nextpos = ('<END>', "<END>") if i == len(sentence)-1 else sentence[i+1]
    features = {
        "pos": pos,
        "word": word,
        "prevpos": prevpos,
        "nextpos": nextpos,
        "prevpos+pos": "{}+{}".format(prevpos, pos),
        "pos+nextpos": "{}+{}".format(pos, nextpos),
        "tags-since-dt": tags_since_dt(sentence, i),
    }
    return features

class ConsecutiveNPChunkTagger(nltk.TaggerI):
    """Classifier-based tagger that assigns a chunk tag to every token."""

    def __init__(self, train_sents):
        """Train a naive Bayes classifier on per-token feature sets.

        Args:
            train_sents: iterable of sentences, each a sequence of
                (token, chunk_tag) pairs; ConsecutiveNPChunker passes
                ((word, pos), chunk_tag) pairs.
        """
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)  # strip the chunk tags
            history = []
            # Walk every position i of the sentence, pairing the features
            # extracted at i with the gold chunk tag for that position.
            for i, (_token, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.classify.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        """Tag a sentence and return a list of (token, chunk_tag) pairs.

        Args:
            sentence: sequence of tokens in the form expected by
                ``npchunk_features`` ((word, pos) pairs).

        Returns:
            A list, so the result can be indexed and iterated more than once
            (a bare ``zip`` object is a one-shot iterator in Python 3).
        """
        history = []
        for i, _token in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            history.append(self.classifier.classify(featureset))
        return list(zip(sentence, history))
    
class ConsecutiveNPChunker(nltk.ChunkParserI):
    """Chunk parser backed by ConsecutiveNPChunkTagger."""

    def __init__(self, train_sents):
        """Convert training trees to ((word, pos), chunk_tag) pairs and train the tagger."""
        tagged_sents = []
        for tree in train_sents:
            triples = nltk.chunk.tree2conlltags(tree)
            tagged_sents.append([((word, pos), iob) for word, pos, iob in triples])
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        """Tag the sentence and convert the tagged tokens back into a tree."""
        conlltags = []
        for (word, pos), iob in self.tagger.tag(sentence):
            conlltags.append((word, pos, iob))
        return nltk.chunk.conlltags2tree(conlltags)
    
# Train the classifier-based chunker on the training sentences and print its
# score on the test sentences.
chunker = ConsecutiveNPChunker(train_sents)
print (chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  95.0%%
    Precision:     85.9%%
    Recall:        90.0%%
    F-Measure:     87.9%%
