Merge tag '0.8.0' into develop
emfomy committed Apr 27, 2020
2 parents 2655e14 + f8f993b commit 062ce79
Showing 20 changed files with 278 additions and 114 deletions.
6 changes: 3 additions & 3 deletions Makefile
@@ -7,9 +7,9 @@ TWINE = twine

all: dist check test

dist: sdist
dist: sdist bdist_wheel

sdist:
sdist bdist_wheel:
$(PY) setup.py $@

check:
@@ -29,7 +29,7 @@ doc:

upload: dist check
ls dist/*.tar.gz
$(TWINE) upload --repository-url https://test.pypi.org/legacy/ dist/*.tar.gz --verbose
$(TWINE) upload --repository-url https://test.pypi.org/legacy/ dist/* --verbose

clean:
( cd docs ; make clean )
219 changes: 179 additions & 40 deletions README.rst
@@ -101,16 +101,16 @@ Requirements
Tool Requirements
-----------------

================================ ======== ========== ===========
Tool                             Built-in CkipTagger CkipClassic
================================ ======== ========== ===========
Sentence Segmentation            ✔
Word Segmentation†                        ✔          ✔
Part-of-Speech Tagging†                   ✔          ✔
Sentence Parsing                                     ✔
Named-Entity Recognition                  ✔
Co-Reference Detection‡          ✔
================================ ======== ========== ===========

- † These tools require any one of the backends.
- ‡ The co-reference implementation itself requires no backend, but it consumes the results of word segmentation, part-of-speech tagging, sentence parsing, and named-entity recognition.
@@ -119,35 +119,28 @@ Installation via Pip
--------------------

- No backend (not recommended): ``pip install ckipnlp``.
- With the CkipTagger backend (recommended): ``pip install ckipnlp[tagger]``.
- With the CkipClassic backend: please refer to https://ckip-classic.readthedocs.io/en/latest/src/readme.html#installation for the CkipClassic installation guide.
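Which optional backend is actually importable can be checked at runtime with the standard library. A small sketch, assuming the backend packages expose the module names ``ckiptagger`` and ``ckip_classic`` (these module names are an assumption, not taken from this README):

```python
import importlib.util

def available_backends():
    """Return the ckipnlp extras whose backend package is importable."""
    # The module names below are assumptions; adjust to the installed packages.
    backends = {'tagger': 'ckiptagger', 'classic': 'ckip_classic'}
    return sorted(name for name, module in backends.items()
                  if importlib.util.find_spec(module) is not None)

print(available_backends())
```

Probing with ``find_spec`` avoids importing (and therefore initializing) a heavy backend just to see whether it is present.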

Usage
=====

See https://ckipnlp.readthedocs.io/en/latest/_api/ckipnlp.html for API details.

Pipelines
---------

Core Pipeline
^^^^^^^^^^^^^

.. image:: _static/image/pipeline.svg

.. code-block:: python

   import ckipnlp
   print(ckipnlp.__name__, ckipnlp.__version__)

   ################################################################

   from ckipnlp.pipeline import CkipPipeline, CkipDocument

   pipeline = CkipPipeline()
   doc = CkipDocument(raw='中文字喔,啊哈哈哈')

   # Word Segmentation
   pipeline.get_ws(doc)

@@ -161,47 +154,156 @@ Pipeline

.. code-block:: python

   for line in doc.pos:
       print(line.to_text())

   # Named-Entity Recognition
   pipeline.get_ner(doc)
   print(doc.ner)

   # Sentence Parsing
   pipeline.get_parsed(doc)
   print(doc.parsed)

   ################################################################

   from ckipnlp.container.util.wspos import WsPosParagraph

   # Word Segmentation & Part-of-Speech Tagging
   for line in WsPosParagraph.to_text(doc.ws, doc.pos):
       print(line)

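The ``get_*`` calls above can be invoked in any order: each driver fills its field of the document only if that field is still missing, so intermediate results are computed once and shared. A minimal stdlib sketch of that compute-on-demand design (the class and field names here are illustrative stand-ins, not the ckipnlp API):

```python
class Document:
    """Toy result bag: fields start empty and are filled on demand."""
    def __init__(self, raw):
        self.raw = raw
        self.ws = None    # word-segmentation result
        self.pos = None   # part-of-speech result

class Pipeline:
    def get_ws(self, doc):
        if doc.ws is None:                 # compute once, then reuse
            doc.ws = doc.raw.split()       # stand-in for a real segmenter
        return doc.ws

    def get_pos(self, doc):
        self.get_ws(doc)                   # ensure the prerequisite field
        if doc.pos is None:
            doc.pos = ['X'] * len(doc.ws)  # stand-in for a real tagger
        return doc.pos

doc = Document('中文 字 喔')
Pipeline().get_pos(doc)                    # fills doc.ws and doc.pos
print(doc.ws, doc.pos)
```

The benefit of this pattern is that requesting a late stage (here POS) transparently triggers the earlier stages it depends on.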
Co-Reference Pipeline
^^^^^^^^^^^^^^^^^^^^^

.. image:: _static/image/coref_pipeline.svg

.. code-block:: python

   import ckipnlp
   print(ckipnlp.__name__, ckipnlp.__version__)

   ################################################################

   from ckipnlp.pipeline import CkipCorefPipeline, CkipDocument

   pipeline = CkipCorefPipeline()
   doc = CkipDocument(raw='畢卡索他想,完蛋了')

   # Co-Reference
   corefdoc = pipeline(doc)
   print(corefdoc.coref)
   for line in corefdoc.coref:
       print(line.to_text())

Containers
----------

The container objects provide the following methods:

- |from_text|, |to_text| for plain-text format conversions;
- |from_dict|, |to_dict| for dictionary-like format conversions;
- |from_list|, |to_list| for list-like format conversions;
- |from_json|, |to_json| for JSON format conversions (based on the dictionary-like format conversions).

The following are the interfaces, where ``CONTAINER_CLASS`` refers to the container class.

.. code-block:: python

   obj = CONTAINER_CLASS.from_text(plain_text)
   plain_text = obj.to_text()

   obj = CONTAINER_CLASS.from_dict({ key: value })
   dict_obj = obj.to_dict()

   obj = CONTAINER_CLASS.from_list([ value1, value2 ])
   list_obj = obj.to_list()

   obj = CONTAINER_CLASS.from_json(json_str)
   json_str = obj.to_json()

Note that not all containers provide all of the above methods. The following table lists the implemented methods; please refer to each container's documentation for the detailed formats.

======================== ======================== ============ ========================
Container                Item                     from/to text from/to dict, list, json
======================== ======================== ============ ========================
|TextParagraph|          |str|                    ✔            ✔
|SegSentence|            |str|                    ✔            ✔
|SegParagraph|           |SegSentence|            ✔            ✔
|NerToken|               ✘                                     ✔
|NerSentence|            |NerToken|                            ✔
|NerParagraph|           |NerSentence|                         ✔
|ParsedParagraph|        |str|                    ✔            ✔
|CorefToken|             ✘                        only to      ✔
|CorefSentence|          |CorefToken|             only to      ✔
|CorefParagraph|         |CorefSentence|          only to      ✔
======================== ======================== ============ ========================
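The ``from_*``/``to_*`` convention in the table can be pictured with a toy container. This is a stdlib sketch of the pattern only, not the ckipnlp classes; note how the JSON methods are layered on the dict methods, as described above:

```python
import json
from typing import NamedTuple

class Token(NamedTuple):
    """Toy stand-in for a container item with word and POS fields."""
    word: str
    pos: str

    @classmethod
    def from_dict(cls, data):
        return cls(word=data['word'], pos=data['pos'])

    def to_dict(self):
        return {'word': self.word, 'pos': self.pos}

    @classmethod
    def from_list(cls, data):
        return cls(*data)

    def to_list(self):
        return list(self)

    @classmethod
    def from_json(cls, text):
        return cls.from_dict(json.loads(text))  # JSON rides on the dict form

    def to_json(self):
        return json.dumps(self.to_dict(), ensure_ascii=False)

obj = Token.from_json('{"word": "中文字", "pos": "Na"}')
print(obj.to_list())
```

Because every format round-trips through the same tuple, a container can be serialized in one format and restored from another without loss.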

WS with POS
^^^^^^^^^^^

There are also conversion routines for word-segmentation and POS containers jointly. For example, |WsPosToken| provides routines for a word (|str|) with POS-tag (|str|):

.. code-block:: python

   ws_obj, pos_obj = WsPosToken.from_text('中文字(Na)')
   plain_text = WsPosToken.to_text(ws_obj, pos_obj)

   ws_obj, pos_obj = WsPosToken.from_dict({ 'word': '中文字', 'pos': 'Na', })
   dict_obj = WsPosToken.to_dict(ws_obj, pos_obj)

   ws_obj, pos_obj = WsPosToken.from_list([ '中文字', 'Na' ])
   list_obj = WsPosToken.to_list(ws_obj, pos_obj)

   ws_obj, pos_obj = WsPosToken.from_json(json_str)
   json_str = WsPosToken.to_json(ws_obj, pos_obj)

Similarly, |WsPosSentence| and |WsPosParagraph| provide joint routines for a word-segmented and POS-tagged sentence or paragraph (|SegSentence|/|SegParagraph|), respectively.
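As the example above shows, the plain-text form writes a token as the word followed by its POS-tag in parentheses, e.g. ``中文字(Na)``. A hedged stdlib sketch of pulling the two fields apart (``WsPosToken`` does the real parsing; this only illustrates the notation):

```python
import re

# A token is "word(POS)": the trailing parenthesised tag is the POS,
# everything before it is the word (the word itself may contain parentheses).
_TOKEN = re.compile(r'^(?P<word>.+)\((?P<pos>[^()]+)\)$')

def parse_token(text):
    match = _TOKEN.match(text)
    if match is None:
        raise ValueError('not in word(POS) form: %r' % text)
    return match.group('word'), match.group('pos')

print(parse_token('中文字(Na)'))
```

The greedy ``.+`` ensures that only the final parenthesized group is taken as the tag, which matters when the word itself contains parentheses.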

Parsed Tree
^^^^^^^^^^^

In addition to |ParsedParagraph|, we have implemented tree utilities based on `TreeLib <https://treelib.readthedocs.io>`_.

|ParsedTree| is the tree structure of a parsed sentence. One may use |from_text| and |to_text| for plain-text conversion; |from_dict|, |to_dict| for dictionary-like object conversion; and also |from_json|, |to_json| for JSON string conversion.

The |ParsedTree| is a `TreeLib <https://treelib.readthedocs.io>`_ tree with |ParsedNode| as its nodes. The data of each node is stored in a |ParsedNodeData| (accessed via ``node.data``), a tuple of ``role`` (the semantic role), ``pos`` (the part-of-speech tag), and ``word``.

|ParsedTree| provides useful methods: |get_heads| finds the head words of the sentence; |get_relations| extracts all relations in the sentence; |get_subjects| returns the subjects of the sentence.

.. code-block:: python

   from ckipnlp.container import ParsedTree

   # 我的早餐、午餐和晚餐都在那場比賽中被吃掉了
   tree_text = 'S(goal:NP(possessor:N‧的(head:Nhaa:我|Head:DE:的)|Head:Nab(DUMMY1:Nab(DUMMY1:Nab:早餐|Head:Caa:、|DUMMY2:Naa:午餐)|Head:Caa:和|DUMMY2:Nab:晚餐))|quantity:Dab:都|condition:PP(Head:P21:在|DUMMY:GP(DUMMY:NP(Head:Nac:比賽)|Head:Ng:中))|agent:PP(Head:P02:被)|Head:VC31:吃掉|aspect:Di:了)'
   tree = ParsedTree.from_text(tree_text, normalize=False)

   print('Show Tree')
   tree.show()

   print('Get Heads of {}'.format(tree[5]))
   print('-- Semantic --')
   for head in tree.get_heads(5, semantic=True):
       print(repr(head))
   print('-- Syntactic --')
   for head in tree.get_heads(5, semantic=False):
       print(repr(head))
   print()

   print('Get Relations of {}'.format(tree[0]))
   print('-- Semantic --')
   for rel in tree.get_relations(0, semantic=True):
       print(repr(rel))
   print('-- Syntactic --')
   for rel in tree.get_relations(0, semantic=False):
       print(repr(rel))
   print()

   # 我和食物真的都很不開心
   tree_text = 'S(theme:NP(DUMMY1:NP(Head:Nhaa:我)|Head:Caa:和|DUMMY2:NP(Head:Naa:食物))|evaluation:Dbb:真的|quantity:Dab:都|degree:Dfa:很|negation:Dc:不|Head:VH21:開心)'
   tree = ParsedTree.from_text(tree_text, normalize=False)

   print('Show Tree')
   tree.show()

   print('Get Subjects of {}'.format(tree[0]))
   print('-- Semantic --')
   for subject in tree.get_subjects(0, semantic=True):
       print(repr(subject))
   print('-- Syntactic --')
   for subject in tree.get_subjects(0, semantic=False):
       print(repr(subject))
   print()

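The tree text fed to ``ParsedTree.from_text`` above is a nested notation: an internal node is written as ``role:pos(child|child|…)`` (the role is absent at the root), a leaf as ``role:pos:word``, with ``|`` separating siblings. A rough stdlib sketch of reading that notation, ignoring normalization and assuming words contain no ``(``, ``)``, or ``|`` (``ParsedTree.from_text`` is the real implementation):

```python
def _split_siblings(text):
    """Split on '|' at parenthesis depth zero."""
    parts, depth, start = [], 0, 0
    for i, ch in enumerate(text):
        if ch == '(':
            depth += 1
        elif ch == ')':
            depth -= 1
        elif ch == '|' and depth == 0:
            parts.append(text[start:i])
            start = i + 1
    parts.append(text[start:])
    return parts

def parse_tree(text):
    """Parse the bracketed notation into nested dicts."""
    if text.endswith(')'):                   # internal node: head(children)
        i = text.index('(')
        head, body = text[:i], text[i + 1:-1]
        role, _, pos = head.rpartition(':')  # role is empty at the root
        children = [parse_tree(part) for part in _split_siblings(body)]
        return {'role': role or None, 'pos': pos, 'word': None,
                'children': children}
    role, pos, word = text.split(':', 2)     # leaf: role:pos:word
    return {'role': role, 'pos': pos, 'word': word, 'children': []}

tree = parse_tree('S(Head:Nab:中文字|particle:Td:耶)')
print(tree['pos'], [child['word'] for child in tree['children']])
```

Tracking parenthesis depth while splitting is what keeps nested subtrees such as ``goal:NP(…)`` intact as single siblings.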
License
=======

@@ -211,3 +313,40 @@ Copyright (c) 2018-2020 `CKIP Lab <https://ckip.iis.sinica.edu.tw>`_ under the `

.. |CC BY-NC-SA 4.0| image:: https://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png
:target: http://creativecommons.org/licenses/by-nc-sa/4.0/



.. |from_text| replace:: ``from_text()``
.. |to_text| replace:: ``to_text()``
.. |from_dict| replace:: ``from_dict()``
.. |to_dict| replace:: ``to_dict()``
.. |from_list| replace:: ``from_list()``
.. |to_list| replace:: ``to_list()``
.. |from_json| replace:: ``from_json()``
.. |to_json| replace:: ``to_json()``

.. |get_heads| replace:: ``get_heads()``
.. |get_relations| replace:: ``get_relations()``
.. |get_subjects| replace:: ``get_subjects()``

.. |str| replace:: ``str``

.. |TextParagraph| replace:: ``TextParagraph``
.. |SegSentence| replace:: ``SegSentence``
.. |SegParagraph| replace:: ``SegParagraph``
.. |NerToken| replace:: ``NerToken``
.. |NerSentence| replace:: ``NerSentence``
.. |NerParagraph| replace:: ``NerParagraph``
.. |ParsedParagraph| replace:: ``ParsedParagraph``
.. |CorefToken| replace:: ``CorefToken``
.. |CorefSentence| replace:: ``CorefSentence``
.. |CorefParagraph| replace:: ``CorefParagraph``

.. |WsPosToken| replace:: ``WsPosToken``
.. |WsPosSentence| replace:: ``WsPosSentence``
.. |WsPosParagraph| replace:: ``WsPosParagraph``

.. |ParsedNodeData| replace:: ``ParsedNodeData``
.. |ParsedNode| replace:: ``ParsedNode``
.. |ParsedRelation| replace:: ``ParsedRelation``
.. |ParsedTree| replace:: ``ParsedTree``
2 changes: 1 addition & 1 deletion ckipnlp/__init__.py
@@ -10,7 +10,7 @@
__copyright__ = '2018-2020 CKIP Lab'

__title__ = 'CKIPNLP'
__version__ = '0.8.0dev'
__version__ = '0.8.0'
__description__ = 'CKIP CoreNLP'
__license__ = 'CC BY-NC-SA 4.0'

6 changes: 3 additions & 3 deletions ckipnlp/container/__init__.py
@@ -11,9 +11,9 @@

from .text import *
from .seg import *
from .parsed import *
from .ner import *
from .parsed import *
from .coref import *

from .wspos import *
from .tree.parsed import *
from .util.wspos import *
from .util.parsed_tree import *
@@ -2,7 +2,7 @@
# -*- coding:utf-8 -*-

"""
This module implements specialized tree containers for CKIPNLP.
This module implements specialized utilities for CKIPNLP containers.
"""

__author__ = 'Mu Yang <http://muyang.pro>'
@@ -47,7 +47,7 @@ class ParsedNodeData(_BaseTuple, _ParsedNodeData):
Attributes
----------
role : str
the role.
the semantic role.
pos : str
the POS-tag.
word : str
@@ -176,7 +176,7 @@ class ParsedRelation(_Base, _ParsedRelation):
tail : :class:`ParsedNode`
the tail node.
relation : :class:`ParsedNode`
the relation node. (the role of this node is the relation.)
the relation node. (the semantic role of this node is the relation.)
Notes
-----
@@ -245,7 +245,8 @@ class ParsedTree(_Base, _Tree):
'S(Head:Nab:中文字|particle:Td:耶)'
Dict format
Used for :meth:`to_dict`. A dictionary such as ``{ 'id': 0, 'data': { ... }, 'children': [ ... ] }``,
Used for :meth:`from_dict` and :meth:`to_dict`.
A dictionary such as ``{ 'id': 0, 'data': { ... }, 'children': [ ... ] }``,
where ``'data'`` is a dictionary with the same format as :meth:`ParsedNodeData.to_dict`,
and ``'children'`` is a list of dictionaries of subtrees with the same format as this tree.
@@ -18,11 +18,11 @@
NamedTuple as _NamedTuple,
)

from .base import (
from ..base import (
BaseTuple as _BaseTuple,
)

from .seg import (
from ..seg import (
SegSentence as _SegSentence,
SegParagraph as _SegParagraph,
)
2 changes: 1 addition & 1 deletion ckipnlp/driver/__init__.py
@@ -2,7 +2,7 @@
# -*- coding:utf-8 -*-

"""
This module implements drivers for CKIPNLP.
This module implements CKIPNLP drivers.
"""

__author__ = 'Mu Yang <http://muyang.pro>'
2 changes: 1 addition & 1 deletion ckipnlp/driver/base.py
@@ -30,8 +30,8 @@ class DriverType(_IntEnum):
SENTENCE_SEGMENTER = _enum_auto() #: Sentence segmentation
WORD_SEGMENTER = _enum_auto() #: Word segmentation
POS_TAGGER = _enum_auto() #: Part-of-speech tagging
SENTENCE_PARSER = _enum_auto() #: Sentence parsing
NER_CHUNKER = _enum_auto() #: Named-entity recognition
SENTENCE_PARSER = _enum_auto() #: Sentence parsing
COREF_CHUNKER = _enum_auto() #: Co-reference detection

class DriverKind(_IntEnum):