/
PTB_Tree_eval.py
90 lines (82 loc) · 2.53 KB
/
PTB_Tree_eval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# -*- coding: utf-8 -*-
import sys
import re
def quota_normalize(s):
    '''Replace a string that opens like a dict literal with a placeholder.

    Returns "_{geo}" when `s` starts with "{'lat'", "_{name}" when it starts
    with "{'name'", and `s` itself otherwise. Only the prefix is inspected;
    the rest of the string is never parsed.
    '''
    # (prefix, placeholder) pairs, checked in order; first match wins.
    placeholders = (
        ("{'lat'", "_{geo}"),
        ("{'name'", "_{name}"),
    )
    for prefix, token in placeholders:
        if s.startswith(prefix):
            return token
    return s
class PTB_Tree_eval:
    '''Minimal reader for bracketed (PTB format) trees, used to list productions.

    Each node stores two attributes:

    * ``text``     -- for a node with children, the stripped text between its
      opening bracket and its first child (the label); for a node without
      children, the raw, unstripped text between its brackets
      (e.g. ``"NNP Newspaper"``); ``None`` until something has been parsed.
    * ``subtrees`` -- the child nodes, in left-to-right order.

    Brackets that appear inside a double-quoted span are treated as plain
    text and do not open or close a node.

    The input is the usual single-line PTB bracketing, for example::

        (ROOT (S (NP-SBJ (NNP Ms.) (NNP Haag) ) (VP (VBZ plays) (NP (NNP Elianti) )) (. .) ))

    which is a compressed form of::

        (ROOT (S
            (NP-SBJ (NNP Ms.) (NNP Haag))
            (VP (VBZ plays)
              (NP (NNP Elianti)))
            (. .)))

    >>> tree = PTB_Tree_eval("(ROOT (NP (NNP Newspaper)))")
    >>> tree.text
    'ROOT'
    >>> tree.get_production_list()
    ['( NP (NNP Newspaper) )']
    >>> tree = PTB_Tree_eval()
    >>> tree.set_by_text("(ROOT (S (NP-SBJ (NNP Ms.) (NNP Haag) ) (VP (VBZ plays) (NP (NNP Elianti) )) (. .) ))")
    >>> for prod in tree.get_production_list():
    ...     print(prod)
    ( S (NP-SBJ) (VP) (. .) )
    ( NP-SBJ (NNP Ms.) (NNP Haag) )
    ( VP (VBZ plays) (NP) )
    ( NP (NNP Elianti) )
    '''

    def __init__(self, text=None):
        '''Create an empty node, or parse `text` right away when it is given.'''
        self.subtrees = []
        self.text = None
        if text is not None:
            self.set_by_text(text)

    def set_by_text(self, text, pos=0):
        '''Populate this node from the bracket that opens at ``text[pos]``.

        `text` is the full bracketed string and `pos` is the index of this
        node's opening bracket; scanning starts at ``pos + 1`` and stops at
        the matching closing bracket (or at the end of `text` when there is
        none). Children are parsed recursively from the same `text`.

        Any state left over from an earlier parse is discarded first, so the
        method can be called repeatedly on the same instance.
        '''
        # Reset, otherwise a second call would append to the old children
        # and keep the old label.
        self.subtrees = []
        self.text = None

        depth = 0          # bracket depth relative to this node's own bracket
        in_quotes = False  # True while inside a "..." span
        # Index-based loop: works on both Python 2 and 3 without building a
        # list of indices for every node.
        i = pos + 1
        end = len(text)
        while i < end:
            char = text[i]
            if char == '"':
                in_quotes = not in_quotes
            elif not in_quotes:
                if char == '(':
                    depth += 1
                    if depth == 1:
                        # A direct child starts here; deeper brackets belong
                        # to that child and are handled by its own parse.
                        subtree = PTB_Tree_eval()
                        subtree.set_by_text(text, i)
                        self.subtrees.append(subtree)
                        if self.text is None:
                            # Label = everything before the first child.
                            self.text = text[pos + 1:i].strip()
                elif char == ')':
                    depth -= 1
                    if not self.subtrees:
                        # No children: the whole bracket content is the text.
                        if depth >= 0:
                            # Sanity check; a childless node should only see
                            # its own closing bracket.
                            print('ERR: depth >= 0')
                        self.text = text[pos + 1:i]
                # We've reached the end of the scope for this bracket.
                if depth < 0:
                    break
            i += 1

    def get_production_list(self, depth=0):
        '''Return the productions of this subtree as strings, in pre-order.

        A production is emitted for every node that has children, except the
        node on which the call starts with ``depth == 0`` (the root). Each
        production has the form ``"( LABEL (child1) (child2) ... )"`` where
        every ``childN`` is the child's ``text``. Nodes without children
        contribute nothing of their own.
        '''
        productions = []
        if self.subtrees:
            # Production of this level (skipped for the root).
            if depth > 0:
                children = ['(%s)' % (subtree.text,) for subtree in self.subtrees]
                productions.append(' '.join(['(', self.text] + children + [')']))
            # Then the productions of each child, left to right.
            for subtree in self.subtrees:
                productions.extend(subtree.get_production_list(depth + 1))
        return productions