# 用级联分块器构建嵌套结构

In [4]:
import nltk
# Example 7-6: patterns for noun phrases (NP), prepositional phrases (PP),
# verb phrases (VP) and clauses (CLAUSE). This is a four-stage chunk grammar
# and can be used to build structures up to four levels deep.
grammar = r"""
NP: {<DT|JJ|NN.*>+}
PP: {<IN><NP>}
VP: {<VB.*><NP|PP|CLAUSE>+$}
CLAUSE: {<NP><VP>}
"""
cp = nltk.RegexpParser(grammar)

# First test sentence, given as (word, POS-tag) pairs.
sentence = [
    ("Mary", "NN"),
    ("saw", "VBD"),
    ("the", "DT"),
    ("cat", "NN"),
    ("sit", "VB"),
    ("on", "IN"),
    ("the", "DT"),
    ("mat", "NN"),
]
print(cp.parse(sentence))

# Second test sentence: the first one embedded under "John thinks ...".
sentence = [
    ("John", "NNP"),
    ("thinks", "VBZ"),
    ("Mary", "NN"),
    ("saw", "VBD"),
    ("the", "DT"),
    ("cat", "NN"),
    ("sit", "VB"),
    ("on", "IN"),
    ("the", "DT"),
    ("mat", "NN"),
]
print(cp.parse(sentence))

# With the default single pass, "saw" is left outside any VP in both parses.
# The fix is to let the chunker loop over its patterns: once every pattern has
# been tried, the whole process is repeated. The optional second argument
# `loop` sets how many times the set of patterns is run.
cp = nltk.RegexpParser(grammar, loop=2)
print(cp.parse(sentence))

(S
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))
(S
  (NP John/NNP)
  thinks/VBZ
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))
(S
  (NP John/NNP)
  thinks/VBZ
  (CLAUSE
    (NP Mary/NN)
    (VP
      saw/VBD
      (CLAUSE
        (NP the/DT cat/NN)
        (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))


# 树

In [13]:
# Build small trees bottom-up: each nltk.Tree is given a node label and a list
# of children, which are either plain strings (leaves) or other trees.
tree1 = nltk.Tree('NP', ['Alice'])
tree2 = nltk.Tree('NP', ['the', 'rabbit'])
tree3 = nltk.Tree('VP', ['chased', tree2])
tree4 = nltk.Tree('S', [tree1, tree3])

# Show both noun phrases and then the full sentence, one tree per line.
print(tree1, tree2, tree4, sep='\n')

# tree4[1] is the sentence's second child, i.e. the VP subtree.
print(tree4[1].label())
# leaves() gives the words of the sentence in order.
print(tree4.leaves())

# Render the VP subtree graphically (NLTK draws this outside the notebook
# output area -- presumably in a separate window; confirm in your setup).
tree3.draw()

(NP Alice)
(NP the rabbit)
(S (NP Alice) (VP chased (NP the rabbit)))
VP
['Alice', 'chased', 'the', 'rabbit']


# 树遍历

In [24]:
def traverse(t):
    """Print a tree depth-first in a space-separated bracketed form.

    Any object exposing a ``label()`` method is treated as an internal node:
    it is printed as ``( LABEL child child ... )``, recursing into each child
    obtained by iterating over the node. Anything without ``label()`` is
    treated as a leaf and printed as-is. Every token is followed by a single
    space and no newline is emitted.

    Args:
        t: A tree node (e.g. an ``nltk.Tree``) or a leaf value.
    """
    try:
        node_label = t.label()
    except AttributeError:
        # No label() method: this is a leaf token, so print it and stop.
        print(t, end=' ')
        return

    print('(', node_label, end=' ')
    for subtree in t:
        traverse(subtree)
    print(')', end=' ')
# Walk tree4 (the sentence tree built in an earlier cell) and print it in bracketed form.
traverse(tree4)

( S ( NP Alice ) ( VP chased ( NP the rabbit ) ) ) 

# 命名实体识别