Skip to content

Commit

Permalink
Add Penn Treebank format conversion routines for parsed tree.
Browse files Browse the repository at this point in the history
  • Loading branch information
emfomy committed May 12, 2020
1 parent ea20039 commit 4072348
Show file tree
Hide file tree
Showing 4 changed files with 151 additions and 22 deletions.
8 changes: 4 additions & 4 deletions ckipnlp/container/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def to_text(self):
@classmethod
@_abstractmethod
def from_dict(cls, data):
"""Construct an instance a from python built-in containers."""
"""Construct an instance from python built-in containers."""
return NotImplemented

@_abstractmethod
Expand All @@ -66,7 +66,7 @@ def to_dict(self):
@classmethod
@_abstractmethod
def from_list(cls, data):
"""Construct an instance a from python built-in containers."""
"""Construct an instance from python built-in containers."""
return NotImplemented

@_abstractmethod
Expand Down Expand Up @@ -187,7 +187,7 @@ def to_text(self):

@classmethod
def from_dict(cls, data):
"""Construct an instance a from python built-in containers.
"""Construct an instance from python built-in containers.
Parameters
----------
Expand All @@ -209,7 +209,7 @@ def to_dict(self):

@classmethod
def from_list(cls, data):
"""Construct an instance a from python built-in containers.
"""Construct an instance from python built-in containers.
Parameters
----------
Expand Down
6 changes: 3 additions & 3 deletions ckipnlp/container/ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ class NerToken(_BaseTuple, _NerToken):

@classmethod
def from_tagger(cls, data):
"""Construct an instance a from CkipTagger format."""
"""Construct an instance from CkipTagger format."""
idx0, idx1, ner, word = data
return cls(word=word, ner=ner, idx=(idx0, idx1,)) # pylint: disable=no-value-for-parameter

Expand Down Expand Up @@ -148,7 +148,7 @@ class NerSentence(_BaseSentence):

@classmethod
def from_tagger(cls, data):
"""Construct an instance a from CkipTagger format."""
"""Construct an instance from CkipTagger format."""
return cls(map(cls.item_class.from_tagger, data))

def to_tagger(self):
Expand Down Expand Up @@ -220,7 +220,7 @@ class NerParagraph(_BaseList):

@classmethod
def from_tagger(cls, data):
"""Construct an instance a from CkipTagger format."""
"""Construct an instance from CkipTagger format."""
return cls(map(cls.item_class.from_tagger, data))

def to_tagger(self):
Expand Down
109 changes: 94 additions & 15 deletions ckipnlp/container/util/parsed_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,17 @@ class ParsedTree(_Base, _Tree):
List format
Not implemented.
Penn Treebank format
Used for :meth:`from_penn` and :meth:`to_penn`.
.. code-block:: python
[
'S',
[ 'Head:Nab', '中文字', ],
[ 'particle:Td', '耶', ],
]
"""

node_class = ParsedNode
Expand All @@ -303,6 +314,8 @@ def normalize_text(tree_text):
def __str__(self):
self.to_text()

########################################################################################################################

@classmethod
def from_text(cls, data, *, normalize=True):
"""Construct an instance from text format.
Expand All @@ -319,33 +332,33 @@ def from_text(cls, data, *, normalize=True):

tree = cls()
node_id = 0
node_queue = [None]
node_stack = [None]
text = ''
ending = True

for char in data:
if char == '(':
node_data = cls.node_class.data_class.from_text(text)
tree.create_node(tag=text, identifier=node_id, parent=node_queue[-1], data=node_data)
tree.create_node(tag=text, identifier=node_id, parent=node_stack[-1], data=node_data)

node_queue.append(node_id)
node_stack.append(node_id)
node_id += 1
text = ''

elif char == ')':
if not ending:
node_data = cls.node_class.data_class.from_text(text)
tree.create_node(tag=text, identifier=node_id, parent=node_queue[-1], data=node_data)
tree.create_node(tag=text, identifier=node_id, parent=node_stack[-1], data=node_data)
node_id += 1

node_queue.pop()
node_stack.pop()
text = ''
ending = True

elif char == '|':
if not ending:
node_data = cls.node_class.data_class.from_text(text)
tree.create_node(tag=text, identifier=node_id, parent=node_queue[-1], data=node_data)
tree.create_node(tag=text, identifier=node_id, parent=node_stack[-1], data=node_data)
node_id += 1

text = ''
Expand Down Expand Up @@ -383,7 +396,7 @@ def to_text(self, node_id=None):

@classmethod
def from_dict(cls, data):
"""Construct an instance a from python built-in containers.
"""Construct an instance from python built-in containers.
Parameters
----------
Expand All @@ -392,22 +405,22 @@ def from_dict(cls, data):
"""
tree = cls()

queue = _deque()
queue.append((data, None,))
node_queue = _deque()
node_queue.append((data, None,))

while queue:
node_dict, parent_id = queue.popleft()
while node_queue:
node_dict, parent_id = node_queue.popleft()
node_id = node_dict['id']
node_data = cls.node_class.data_class.from_dict(node_dict['data'])
tree.create_node(tag=node_data.to_text(), identifier=node_id, parent=parent_id, data=node_data)

for child in node_dict['children']:
queue.append((child, node_id,))
node_queue.append((child, node_id,))

return tree

def to_dict(self, node_id=None):
"""Construct an instance a from python built-in containers.
"""Transform to python built-in containers.
Parameters
----------
Expand All @@ -429,6 +442,68 @@ def to_dict(self, node_id=None):

return tree_dict

@classmethod
def from_penn(cls, data):
    """Construct an instance from Penn Treebank format.

    Parameters
    ----------
    data : list
        A nested list of the form ``[tag, child1, child2, ...]``; a leaf
        may be written as ``[tag, word]``.

    Returns
    -------
    An instance of this tree class.
    """
    tree = cls()

    # Depth-first traversal using a LIFO stack of (penn node, parent id) pairs.
    pending = [(data, None,)]
    next_id = 0

    while pending:
        item, parent_id = pending.pop()

        if not item:
            raise SyntaxError(f'Empty node #{next_id}')

        head = item[0]
        if not isinstance(head, str):
            raise SyntaxError(f'First element of a node must be string, got {type(head)}')

        # Collapse a leaf of the form [tag, word] into a single 'tag:word' entry.
        if len(item) == 2 and isinstance(item[-1], str):
            item = (':'.join(item),)
            head = item[0]

        node_data = cls.node_class.data_class.from_text(head)
        tree.create_node(tag=node_data.to_text(), identifier=next_id, parent=parent_id, data=node_data)

        # Push children right-to-left so the leftmost child is popped first,
        # keeping identifiers in pre-order sequence.
        for child in reversed(item[1:]):
            pending.append((child, next_id,))
        next_id += 1

    return tree

def to_penn(self, node_id=None, with_role=True):
    """Transform to Penn Treebank format.

    Parameters
    ----------
    node_id : int
        Output the Penn Treebank format for the subtree under **node_id**.
    with_role : bool
        Contains role-tag or not.

    Returns
    -------
    list
    """
    if node_id is None:
        node_id = self.root

    node = self[node_id]

    # The tag is 'role:pos' when a role exists and is requested, bare 'pos' otherwise.
    penn_data = [f'{node.data.role}:{node.data.pos}' if with_role and node.data.role else node.data.pos,]

    if node.data.word:
        penn_data.append(node.data.word)

    for child in self.children(node_id):
        # BUG FIX: propagate *with_role* into the recursion — previously the
        # flag only affected the root node's tag and every descendant kept
        # its role-tag even when with_role=False.
        penn_data.append(self.to_penn(child.identifier, with_role=with_role))

    return penn_data

########################################################################################################################

def show(self, *,
key=lambda node: node.identifier,
idhidden=False,
Expand Down Expand Up @@ -541,10 +616,14 @@ def get_relations(self, root_id=None, *, semantic=True):
for tail in children:
if tail.data.role != 'Head' and tail not in head_children:
if tail.is_leaf():
yield ParsedRelation(head=head_node, tail=tail, relation=tail) # pylint: disable=no-value-for-parameter
yield ParsedRelation( # pylint: disable=no-value-for-parameter
head=head_node, tail=tail, relation=tail,
)
else:
for node in self.get_heads(tail.identifier, semantic=semantic):
yield ParsedRelation(head=head_node, tail=node, relation=tail) # pylint: disable=no-value-for-parameter
yield ParsedRelation( # pylint: disable=no-value-for-parameter
head=head_node, tail=node, relation=tail,
)

# Recursion
for child in children:
Expand Down
50 changes: 50 additions & 0 deletions test/container/util_parsed_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,48 @@ class TestParsedTree(unittest.TestCase, _TestCaseBase):
],
}

# Penn Treebank representation of the reference test tree; each node is
# [tag, child...] and each leaf is [tag, word].  Mirrors the other fixtures
# (text_in, etc.) so from_penn/to_penn round-trips can be checked against
# the shared _assertEqual expectations.
penn_in = [
    'S',
    [
        'goal:NP',
        [
            'possessor:N‧的',
            [ 'head:Nhaa', '我', ],
            [ 'Head:DE', '的', ],
        ],
        [
            'Head:Nab',
            [
                'DUMMY1:Nab',
                [ 'DUMMY1:Nab', '早餐', ],
                [ 'Head:Caa', '、', ],
                [ 'DUMMY2:Naa', '午餐', ],
            ],
            [ 'Head:Caa', '和' ],
            [ 'DUMMY2:Nab', '晚餐' ],
        ],
    ],
    [ 'quantity:Dab', '都', ],
    [
        'condition:PP',
        [ 'Head:P21', '在', ],
        [
            'DUMMY:GP',
            [
                'DUMMY:NP',
                [ 'Head:Nac', '比賽' ],
            ],
            [ 'Head:Ng', '中', ],
        ],
    ],
    [
        'agent:PP',
        [ 'Head:P02', '被', ],
    ],
    [ 'Head:VC31', '吃掉', ],
    [ 'aspect:Di', '了', ],
]

def _assertEqual(self, obj):
self.assertEqual(len(obj), 23)
self._assertEqualNode(obj, 0, None, None, 'S', None)
Expand Down Expand Up @@ -239,6 +281,14 @@ def _assertEqualNode(self, obj, node_id, parent_id, role, pos, word):
self.assertEqual(node_data.pos, pos)
self.assertEqual(node_data.word, word)

def test_io_penn(self):
    """Round-trip check: from_penn followed by to_penn reproduces the input."""
    tree = self.obj_class.from_penn(self.penn_in)
    self._assertEqual(tree)
    self.assertSequenceEqual(tree.to_penn(), self.penn_in)

def test_normalize_text(self):
text_orig = '#1:1.[0] ' + self.text_in + '#'
text_out = self.obj_class.normalize_text(text_orig)
Expand Down

0 comments on commit 4072348

Please sign in to comment.