Japanese model: add user_dict entries and small refactor (#5573)
* user_dict fields: add inflections, reading_forms, sub_tokens; remove unidic_tags
improve code readability around the token alignment procedure

* add test cases, replace fugashi with sudachipy in conftest

* move bunsetu.py to spaCy Universe as a pipeline component BunsetuRecognizer

* space-token check changed from "tag is space" to "both surface and tag are spaces"

* handle the len(text) == 0 case
hiroshi-matsuda-rit committed Jun 22, 2020
1 parent c344207 commit 150a39c
Showing 3 changed files with 153 additions and 249 deletions.
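The diff below replaces the single unidic_tags entry in Doc.user_data with three entries (inflections, reading_forms, sub_tokens) set by the Japanese tokenizer. A minimal usage sketch of the new entries, assuming SudachiPy and its dictionary are installed; the sample sentence and printed fields are illustrative:

from spacy.lang.ja import Japanese

nlp = Japanese()  # uses the SudachiPy-based JapaneseTokenizer (split mode "A" by default)
doc = nlp("私は先生です。")

for token, inf, reading in zip(
    doc,
    doc.user_data["inflections"],    # inflection details per token ("" if none)
    doc.user_data["reading_forms"],  # katakana reading per token
):
    print(token.text, token.tag_, token.pos_, inf, reading)

# With the default split mode "A" there are no finer splits, so every
# entry in doc.user_data["sub_tokens"] is None.
print(doc.user_data["sub_tokens"])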
205 changes: 101 additions & 104 deletions spacy/lang/ja/__init__.py
@@ -20,12 +20,7 @@


# Hold the attributes we need with convenient names
DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"])

# Handling for multiple spaces in a row is somewhat awkward, this simplifies
# the flow by creating a dummy with the same interface.
DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"])
DummySpace = DummyNode(" ", " ", " ")
DetailedToken = namedtuple("DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"])


def try_sudachi_import(split_mode="A"):
@@ -53,7 +48,7 @@ def try_sudachi_import(split_mode="A"):
)


def resolve_pos(orth, pos, next_pos):
def resolve_pos(orth, tag, next_tag):
"""If necessary, add a field to the POS tag for UD mapping.
Under Universal Dependencies, sometimes the same Unidic POS tag can
be mapped differently depending on the literal token or its context
@@ -64,124 +59,77 @@ def resolve_pos(orth, pos, next_pos):
# Some tokens have their UD tag decided based on the POS of the following
# token.

# orth based rules
if pos[0] in TAG_ORTH_MAP:
orth_map = TAG_ORTH_MAP[pos[0]]
# apply orth based mapping
if tag in TAG_ORTH_MAP:
orth_map = TAG_ORTH_MAP[tag]
if orth in orth_map:
return orth_map[orth], None
return orth_map[orth], None # current_pos, next_pos

# tag bi-gram mapping
if next_pos:
tag_bigram = pos[0], next_pos[0]
# apply tag bi-gram mapping
if next_tag:
tag_bigram = tag, next_tag
if tag_bigram in TAG_BIGRAM_MAP:
bipos = TAG_BIGRAM_MAP[tag_bigram]
if bipos[0] is None:
return TAG_MAP[pos[0]][POS], bipos[1]
current_pos, next_pos = TAG_BIGRAM_MAP[tag_bigram]
if current_pos is None: # apply tag uni-gram mapping for current_pos
return TAG_MAP[tag][POS], next_pos # only next_pos is identified by tag bi-gram mapping
else:
return bipos

return TAG_MAP[pos[0]][POS], None


# Use a mapping of paired punctuation to avoid splitting quoted sentences.
pairpunct = {'「':'」', '『': '』', '【': '】'}

return current_pos, next_pos

def separate_sentences(doc):
"""Given a doc, mark tokens that start sentences based on Unidic tags.
"""

stack = [] # save paired punctuation

for i, token in enumerate(doc[:-2]):
# Set all tokens after the first to false by default. This is necessary
# for the doc code to be aware we've done sentencization, see
# `is_sentenced`.
token.sent_start = (i == 0)
if token.tag_:
if token.tag_ == "補助記号-括弧開":
ts = str(token)
if ts in pairpunct:
stack.append(pairpunct[ts])
elif stack and ts == stack[-1]:
stack.pop()

if token.tag_ == "補助記号-句点":
next_token = doc[i+1]
if next_token.tag_ != token.tag_ and not stack:
next_token.sent_start = True


def get_dtokens(tokenizer, text):
tokens = tokenizer.tokenize(text)
words = []
for ti, token in enumerate(tokens):
tag = '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*'])
inf = '-'.join([xx for xx in token.part_of_speech()[4:] if xx != '*'])
dtoken = DetailedToken(
token.surface(),
(tag, inf),
token.dictionary_form())
if ti > 0 and words[-1].pos[0] == '空白' and tag == '空白':
# don't add multiple space tokens in a row
continue
words.append(dtoken)
# apply tag uni-gram mapping
return TAG_MAP[tag][POS], None

# remove empty tokens. These can be produced with characters like … that
# Sudachi normalizes internally.
words = [ww for ww in words if len(ww.surface) > 0]
return words
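For readers following the new resolve_pos flow, here is a self-contained toy version of the same three-tier lookup (orth map, then tag bi-gram map, then tag uni-gram fallback). The mapping tables are made-up stand-ins for illustration, not the contents of spaCy's TAG_ORTH_MAP, TAG_BIGRAM_MAP, or TAG_MAP:

# Toy stand-ins for the real mapping tables (illustrative only).
TOY_ORTH_MAP = {"連体詞": {"この": "DET"}}
TOY_BIGRAM_MAP = {("名詞-普通名詞-サ変可能", "動詞-非自立可能"): (None, "AUX")}
TOY_UNIGRAM_MAP = {"連体詞": "ADJ", "名詞-普通名詞-サ変可能": "NOUN", "動詞-非自立可能": "VERB"}

def toy_resolve_pos(orth, tag, next_tag):
    # 1) orth-based mapping: the literal token decides the UD POS
    if tag in TOY_ORTH_MAP and orth in TOY_ORTH_MAP[tag]:
        return TOY_ORTH_MAP[tag][orth], None
    # 2) tag bi-gram mapping: the following tag decides one or both POSs
    if next_tag and (tag, next_tag) in TOY_BIGRAM_MAP:
        current_pos, next_pos = TOY_BIGRAM_MAP[(tag, next_tag)]
        if current_pos is None:  # fall back to uni-gram mapping for the current token
            return TOY_UNIGRAM_MAP[tag], next_pos
        return current_pos, next_pos
    # 3) tag uni-gram mapping: fall back to the tag alone
    return TOY_UNIGRAM_MAP[tag], None

print(toy_resolve_pos("この", "連体詞", None))                                  # ('DET', None)
print(toy_resolve_pos("勉強", "名詞-普通名詞-サ変可能", "動詞-非自立可能"))    # ('NOUN', 'AUX')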


def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")):
def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
# Compare the content of tokens and text, first
words = [x.surface for x in dtokens]
if "".join("".join(words).split()) != "".join(text.split()):
raise ValueError(Errors.E194.format(text=text, words=words))
text_words = []
text_lemmas = []
text_tags = []

text_dtokens = []
text_spaces = []
text_pos = 0
# handle empty and whitespace-only texts
if len(words) == 0:
return text_words, text_lemmas, text_tags, text_spaces
return text_dtokens, text_spaces
elif len([word for word in words if not word.isspace()]) == 0:
assert text.isspace()
text_words = [text]
text_lemmas = [text]
text_tags = [gap_tag]
text_dtokens = [DetailedToken(text, gap_tag, '', text, None, None)]
text_spaces = [False]
return text_words, text_lemmas, text_tags, text_spaces
# normalize words to remove all whitespace tokens
norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()])
# align words with text
for word, dtoken in zip(norm_words, norm_dtokens):
return text_dtokens, text_spaces

# align words and dtokens by referring text, and insert gap tokens for the space char spans
for word, dtoken in zip(words, dtokens):
# skip all space tokens
if word.isspace():
continue
try:
word_start = text[text_pos:].index(word)
except ValueError:
raise ValueError(Errors.E194.format(text=text, words=words))

# space token
if word_start > 0:
w = text[text_pos:text_pos + word_start]
text_words.append(w)
text_lemmas.append(w)
text_tags.append(gap_tag)
text_dtokens.append(DetailedToken(w, gap_tag, '', w, None, None))
text_spaces.append(False)
text_pos += word_start
text_words.append(word)
text_lemmas.append(dtoken.lemma)
text_tags.append(dtoken.pos)

# content word
text_dtokens.append(dtoken)
text_spaces.append(False)
text_pos += len(word)
# poll a space char after the word
if text_pos < len(text) and text[text_pos] == " ":
text_spaces[-1] = True
text_pos += 1

# trailing space token
if text_pos < len(text):
w = text[text_pos:]
text_words.append(w)
text_lemmas.append(w)
text_tags.append(gap_tag)
text_dtokens.append(DetailedToken(w, gap_tag, '', w, None, None))
text_spaces.append(False)
return text_words, text_lemmas, text_tags, text_spaces

return text_dtokens, text_spaces
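
The alignment performed above can be pictured with a small standalone helper. This is a simplified illustration of the idea only (plain strings instead of DetailedToken gap entries), not the spaCy function itself:

def align_spaces(surfaces, text):
    """Simplified illustration: align token surfaces with the original
    text and record whether each token is followed by a single space."""
    words, spaces, pos = [], [], 0
    for surface in surfaces:
        start = text.index(surface, pos)
        if start > pos:                      # gap: whitespace chunk before the token
            words.append(text[pos:start])
            spaces.append(False)
        words.append(surface)
        spaces.append(False)
        pos = start + len(surface)
        if pos < len(text) and text[pos] == " ":
            spaces[-1] = True                # fold one trailing space into the token
            pos += 1
    if pos < len(text):                      # trailing whitespace chunk
        words.append(text[pos:])
        spaces.append(False)
    return words, spaces

print(align_spaces(["これ", "は", "テスト"], "これ は  テスト"))
# (['これ', 'は', ' ', 'テスト'], [True, True, False, False])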


class JapaneseTokenizer(DummyTokenizer):
@@ -191,29 +139,78 @@ def __init__(self, cls, nlp=None, config={}):
self.tokenizer = try_sudachi_import(self.split_mode)

def __call__(self, text):
dtokens = get_dtokens(self.tokenizer, text)

words, lemmas, unidic_tags, spaces = get_words_lemmas_tags_spaces(dtokens, text)
# convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
sudachipy_tokens = self.tokenizer.tokenize(text)
dtokens = self._get_dtokens(sudachipy_tokens)
dtokens, spaces = get_dtokens_and_spaces(dtokens, text)

# create Doc with tag bi-gram based part-of-speech identification rules
words, tags, inflections, lemmas, readings, sub_tokens_list = zip(*dtokens) if dtokens else [[]] * 6
sub_tokens_list = list(sub_tokens_list)
doc = Doc(self.vocab, words=words, spaces=spaces)
next_pos = None
for idx, (token, lemma, unidic_tag) in enumerate(zip(doc, lemmas, unidic_tags)):
token.tag_ = unidic_tag[0]
if next_pos:
next_pos = None # for bi-gram rules
for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
token.tag_ = dtoken.tag
if next_pos: # already identified in previous iteration
token.pos = next_pos
next_pos = None
else:
token.pos, next_pos = resolve_pos(
token.orth_,
unidic_tag,
unidic_tags[idx + 1] if idx + 1 < len(unidic_tags) else None
dtoken.tag,
tags[idx + 1] if idx + 1 < len(tags) else None
)

# if there's no lemma info (it's an unk) just use the surface
token.lemma_ = lemma
doc.user_data["unidic_tags"] = unidic_tags
token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface

doc.user_data["inflections"] = inflections
doc.user_data["reading_forms"] = readings
doc.user_data["sub_tokens"] = sub_tokens_list

return doc

def _get_dtokens(self, sudachipy_tokens, need_sub_tokens=True):
sub_tokens_list = self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
dtokens = [
DetailedToken(
token.surface(), # orth
'-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*']), # tag
','.join([xx for xx in token.part_of_speech()[4:] if xx != '*']), # inf
token.dictionary_form(), # lemma
token.reading_form(), # user_data['reading_forms']
sub_tokens_list[idx] if sub_tokens_list else None, # user_data['sub_tokens']
) for idx, token in enumerate(sudachipy_tokens) if len(token.surface()) > 0
# remove empty tokens which can be produced with characters like … that
]
# Sudachi normalizes internally and outputs each space char as a token.
# This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
return [
t for idx, t in enumerate(dtokens) if
idx == 0 or
not t.surface.isspace() or t.tag != '空白' or
not dtokens[idx - 1].surface.isspace() or dtokens[idx - 1].tag != '空白'
]

def _get_sub_tokens(self, sudachipy_tokens):
if self.split_mode is None or self.split_mode == "A": # do nothing for default split mode
return None

sub_tokens_list = [] # list of (list of list of DetailedToken | None)
for token in sudachipy_tokens:
sub_a = token.split(self.tokenizer.SplitMode.A)
if len(sub_a) == 1: # no sub tokens
sub_tokens_list.append(None)
elif self.split_mode == "B":
sub_tokens_list.append([self._get_dtokens(sub_a, False)])
else: # "C"
sub_b = token.split(self.tokenizer.SplitMode.B)
if len(sub_a) == len(sub_b):
dtokens = self._get_dtokens(sub_a, False)
sub_tokens_list.append([dtokens, dtokens])
else:
sub_tokens_list.append([self._get_dtokens(sub_a, False), self._get_dtokens(sub_b, False)])
return sub_tokens_list
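
The per-token sub_tokens lists end up in Doc.user_data and can be inspected as below. The split-mode configuration shown follows the spaCy v2.3 meta-based tokenizer config; treat the exact config plumbing and the example sentence as assumptions that may differ in other versions:

from spacy.lang.ja import Japanese

# Split mode "C" yields the coarsest tokens; each gets its mode-A and
# mode-B analyses attached (or None when no finer split exists).
nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})
doc = nlp("選挙管理委員会")

for token, sub_tokens in zip(doc, doc.user_data["sub_tokens"]):
    if sub_tokens is None:
        print(token.text, "(no finer split)")
        continue
    for mode, dtokens in zip(("A", "B"), sub_tokens):
        print(token.text, mode, [dt.surface for dt in dtokens])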

def _get_config(self):
config = OrderedDict(
(
