Japanese model: add user_dict entries and small refactor (#5573)
* user_dict fields: add inflections, reading_forms, sub_tokens; remove unidic_tags
improve code readability around the token alignment procedure

* add test cases, replace fugashi with sudachipy in conftest

* move bunsetu.py to spaCy Universe as a pipeline component BunsetuRecognizer

* space-token check changed from "tag is space" to "both surface and tag are spaces"

* handle the len(text) == 0 case
hiroshi-matsuda-rit committed Jun 22, 2020
1 parent c344207 commit 150a39c
Showing 3 changed files with 153 additions and 249 deletions.
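The diff below replaces the single unidic_tags entry in Doc.user_data with three entries (inflections, reading_forms, sub_tokens) set by the Japanese tokenizer. A minimal usage sketch of the new entries, assuming SudachiPy and its dictionary are installed; the sample sentence and printed fields are illustrative:

from spacy.lang.ja import Japanese

nlp = Japanese()  # uses the SudachiPy-based JapaneseTokenizer (split mode "A" by default)
doc = nlp("私は先生です。")

for token, inf, reading in zip(
    doc,
    doc.user_data["inflections"],    # inflection details per token ("" if none)
    doc.user_data["reading_forms"],  # katakana reading per token
):
    print(token.text, token.tag_, token.pos_, inf, reading)

# With the default split mode "A" there are no finer splits, so every
# entry in doc.user_data["sub_tokens"] is None.
print(doc.user_data["sub_tokens"])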
205 changes: 101 additions & 104 deletions spacy/lang/ja/__init__.py
@@ -20,12 +20,7 @@


# Hold the attributes we need with convenient names
DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"])

# Handling for multiple spaces in a row is somewhat awkward, this simplifies
# the flow by creating a dummy with the same interface.
DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"])
DummySpace = DummyNode(" ", " ", " ")
DetailedToken = namedtuple("DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"])


def try_sudachi_import(split_mode="A"):
@@ -53,7 +48,7 @@ def try_sudachi_import(split_mode="A"):
)


def resolve_pos(orth, pos, next_pos):
def resolve_pos(orth, tag, next_tag):
"""If necessary, add a field to the POS tag for UD mapping.
Under Universal Dependencies, sometimes the same Unidic POS tag can
be mapped differently depending on the literal token or its context
@@ -64,124 +59,77 @@ def resolve_pos(orth, pos, next_pos):
# Some tokens have their UD tag decided based on the POS of the following
# token.

# orth based rules
if pos[0] in TAG_ORTH_MAP:
orth_map = TAG_ORTH_MAP[pos[0]]
# apply orth based mapping
if tag in TAG_ORTH_MAP:
orth_map = TAG_ORTH_MAP[tag]
if orth in orth_map:
return orth_map[orth], None
return orth_map[orth], None # current_pos, next_pos

# tag bi-gram mapping
if next_pos:
tag_bigram = pos[0], next_pos[0]
# apply tag bi-gram mapping
if next_tag:
tag_bigram = tag, next_tag
if tag_bigram in TAG_BIGRAM_MAP:
bipos = TAG_BIGRAM_MAP[tag_bigram]
if bipos[0] is None:
return TAG_MAP[pos[0]][POS], bipos[1]
current_pos, next_pos = TAG_BIGRAM_MAP[tag_bigram]
if current_pos is None: # apply tag uni-gram mapping for current_pos
return TAG_MAP[tag][POS], next_pos # only next_pos is identified by tag bi-gram mapping
else:
return bipos

return TAG_MAP[pos[0]][POS], None


# Use a mapping of paired punctuation to avoid splitting quoted sentences.
pairpunct = {'「':'」', '『': '』', '【': '】'}

return current_pos, next_pos

def separate_sentences(doc):
"""Given a doc, mark tokens that start sentences based on Unidic tags.
"""

stack = [] # save paired punctuation

for i, token in enumerate(doc[:-2]):
# Set all tokens after the first to false by default. This is necessary
# for the doc code to be aware we've done sentencization, see
# `is_sentenced`.
token.sent_start = (i == 0)
if token.tag_:
if token.tag_ == "補助記号-括弧開":
ts = str(token)
if ts in pairpunct:
stack.append(pairpunct[ts])
elif stack and ts == stack[-1]:
stack.pop()

if token.tag_ == "補助記号-句点":
next_token = doc[i+1]
if next_token.tag_ != token.tag_ and not stack:
next_token.sent_start = True


def get_dtokens(tokenizer, text):
tokens = tokenizer.tokenize(text)
words = []
for ti, token in enumerate(tokens):
tag = '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*'])
inf = '-'.join([xx for xx in token.part_of_speech()[4:] if xx != '*'])
dtoken = DetailedToken(
token.surface(),
(tag, inf),
token.dictionary_form())
if ti > 0 and words[-1].pos[0] == '空白' and tag == '空白':
# don't add multiple space tokens in a row
continue
words.append(dtoken)
# apply tag uni-gram mapping
return TAG_MAP[tag][POS], None

# remove empty tokens. These can be produced with characters like … that
# Sudachi normalizes internally.
words = [ww for ww in words if len(ww.surface) > 0]
return words
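For readers following the new resolve_pos flow, here is a self-contained toy version of the same three-tier lookup (orth map, then tag bi-gram map, then tag uni-gram fallback). The mapping tables are made-up stand-ins for illustration, not the contents of spaCy's TAG_ORTH_MAP, TAG_BIGRAM_MAP, or TAG_MAP:

# Toy stand-ins for the real mapping tables (illustrative only).
TOY_ORTH_MAP = {"連体詞": {"この": "DET"}}
TOY_BIGRAM_MAP = {("名詞-普通名詞-サ変可能", "動詞-非自立可能"): (None, "AUX")}
TOY_UNIGRAM_MAP = {"連体詞": "ADJ", "名詞-普通名詞-サ変可能": "NOUN", "動詞-非自立可能": "VERB"}

def toy_resolve_pos(orth, tag, next_tag):
    # 1) orth-based mapping: the literal token decides the UD POS
    if tag in TOY_ORTH_MAP and orth in TOY_ORTH_MAP[tag]:
        return TOY_ORTH_MAP[tag][orth], None
    # 2) tag bi-gram mapping: the following tag decides one or both POSs
    if next_tag and (tag, next_tag) in TOY_BIGRAM_MAP:
        current_pos, next_pos = TOY_BIGRAM_MAP[(tag, next_tag)]
        if current_pos is None:  # fall back to uni-gram mapping for the current token
            return TOY_UNIGRAM_MAP[tag], next_pos
        return current_pos, next_pos
    # 3) tag uni-gram mapping: fall back to the tag alone
    return TOY_UNIGRAM_MAP[tag], None

print(toy_resolve_pos("この", "連体詞", None))                                  # ('DET', None)
print(toy_resolve_pos("勉強", "名詞-普通名詞-サ変可能", "動詞-非自立可能"))    # ('NOUN', 'AUX')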


def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")):
def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"):
# Compare the content of tokens and text, first
words = [x.surface for x in dtokens]
if "".join("".join(words).split()) != "".join(text.split()):
raise ValueError(Errors.E194.format(text=text, words=words))
text_words = []
text_lemmas = []
text_tags = []

text_dtokens = []
text_spaces = []
text_pos = 0
# handle empty and whitespace-only texts
if len(words) == 0:
return text_words, text_lemmas, text_tags, text_spaces
return text_dtokens, text_spaces
elif len([word for word in words if not word.isspace()]) == 0:
assert text.isspace()
text_words = [text]
text_lemmas = [text]
text_tags = [gap_tag]
text_dtokens = [DetailedToken(text, gap_tag, '', text, None, None)]
text_spaces = [False]
return text_words, text_lemmas, text_tags, text_spaces
# normalize words to remove all whitespace tokens
norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()])
# align words with text
for word, dtoken in zip(norm_words, norm_dtokens):
return text_dtokens, text_spaces

# align words and dtokens by referring text, and insert gap tokens for the space char spans
for word, dtoken in zip(words, dtokens):
# skip all space tokens
if word.isspace():
continue
try:
word_start = text[text_pos:].index(word)
except ValueError:
raise ValueError(Errors.E194.format(text=text, words=words))

# space token
if word_start > 0:
w = text[text_pos:text_pos + word_start]
text_words.append(w)
text_lemmas.append(w)
text_tags.append(gap_tag)
text_dtokens.append(DetailedToken(w, gap_tag, '', w, None, None))
text_spaces.append(False)
text_pos += word_start
text_words.append(word)
text_lemmas.append(dtoken.lemma)
text_tags.append(dtoken.pos)

# content word
text_dtokens.append(dtoken)
text_spaces.append(False)
text_pos += len(word)
# poll a space char after the word
if text_pos < len(text) and text[text_pos] == " ":
text_spaces[-1] = True
text_pos += 1

# trailing space token
if text_pos < len(text):
w = text[text_pos:]
text_words.append(w)
text_lemmas.append(w)
text_tags.append(gap_tag)
text_dtokens.append(DetailedToken(w, gap_tag, '', w, None, None))
text_spaces.append(False)
return text_words, text_lemmas, text_tags, text_spaces

return text_dtokens, text_spaces
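
The alignment performed above can be pictured with a small standalone helper. This is a simplified illustration of the idea only (plain strings instead of DetailedToken gap entries), not the spaCy function itself:

def align_spaces(surfaces, text):
    """Simplified illustration: align token surfaces with the original
    text and record whether each token is followed by a single space."""
    words, spaces, pos = [], [], 0
    for surface in surfaces:
        start = text.index(surface, pos)
        if start > pos:                      # gap: whitespace chunk before the token
            words.append(text[pos:start])
            spaces.append(False)
        words.append(surface)
        spaces.append(False)
        pos = start + len(surface)
        if pos < len(text) and text[pos] == " ":
            spaces[-1] = True                # fold one trailing space into the token
            pos += 1
    if pos < len(text):                      # trailing whitespace chunk
        words.append(text[pos:])
        spaces.append(False)
    return words, spaces

print(align_spaces(["これ", "は", "テスト"], "これ は  テスト"))
# (['これ', 'は', ' ', 'テスト'], [True, True, False, False])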


class JapaneseTokenizer(DummyTokenizer):
@@ -191,29 +139,78 @@ def __init__(self, cls, nlp=None, config={}):
self.tokenizer = try_sudachi_import(self.split_mode)

def __call__(self, text):
dtokens = get_dtokens(self.tokenizer, text)

words, lemmas, unidic_tags, spaces = get_words_lemmas_tags_spaces(dtokens, text)
# convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces
sudachipy_tokens = self.tokenizer.tokenize(text)
dtokens = self._get_dtokens(sudachipy_tokens)
dtokens, spaces = get_dtokens_and_spaces(dtokens, text)

# create Doc with tag bi-gram based part-of-speech identification rules
words, tags, inflections, lemmas, readings, sub_tokens_list = zip(*dtokens) if dtokens else [[]] * 6
sub_tokens_list = list(sub_tokens_list)
doc = Doc(self.vocab, words=words, spaces=spaces)
next_pos = None
for idx, (token, lemma, unidic_tag) in enumerate(zip(doc, lemmas, unidic_tags)):
token.tag_ = unidic_tag[0]
if next_pos:
next_pos = None # for bi-gram rules
for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
token.tag_ = dtoken.tag
if next_pos: # already identified in previous iteration
token.pos = next_pos
next_pos = None
else:
token.pos, next_pos = resolve_pos(
token.orth_,
unidic_tag,
unidic_tags[idx + 1] if idx + 1 < len(unidic_tags) else None
dtoken.tag,
tags[idx + 1] if idx + 1 < len(tags) else None
)

# if there's no lemma info (it's an unk) just use the surface
token.lemma_ = lemma
doc.user_data["unidic_tags"] = unidic_tags
token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface

doc.user_data["inflections"] = inflections
doc.user_data["reading_forms"] = readings
doc.user_data["sub_tokens"] = sub_tokens_list

return doc

def _get_dtokens(self, sudachipy_tokens, need_sub_tokens=True):
sub_tokens_list = self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None
dtokens = [
DetailedToken(
token.surface(), # orth
'-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*']), # tag
','.join([xx for xx in token.part_of_speech()[4:] if xx != '*']), # inf
token.dictionary_form(), # lemma
token.reading_form(), # user_data['reading_forms']
sub_tokens_list[idx] if sub_tokens_list else None, # user_data['sub_tokens']
) for idx, token in enumerate(sudachipy_tokens) if len(token.surface()) > 0
# remove empty tokens which can be produced with characters like … that
]
# Sudachi normalizes internally and outputs each space char as a token.
# This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens
return [
t for idx, t in enumerate(dtokens) if
idx == 0 or
not t.surface.isspace() or t.tag != '空白' or
not dtokens[idx - 1].surface.isspace() or dtokens[idx - 1].tag != '空白'
]

def _get_sub_tokens(self, sudachipy_tokens):
if self.split_mode is None or self.split_mode == "A": # do nothing for default split mode
return None

sub_tokens_list = [] # list of (list of list of DetailedToken | None)
for token in sudachipy_tokens:
sub_a = token.split(self.tokenizer.SplitMode.A)
if len(sub_a) == 1: # no sub tokens
sub_tokens_list.append(None)
elif self.split_mode == "B":
sub_tokens_list.append([self._get_dtokens(sub_a, False)])
else: # "C"
sub_b = token.split(self.tokenizer.SplitMode.B)
if len(sub_a) == len(sub_b):
dtokens = self._get_dtokens(sub_a, False)
sub_tokens_list.append([dtokens, dtokens])
else:
sub_tokens_list.append([self._get_dtokens(sub_a, False), self._get_dtokens(sub_b, False)])
return sub_tokens_list
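
The per-token sub_tokens lists end up in Doc.user_data and can be inspected as below. The split-mode configuration shown follows the spaCy v2.3 meta-based tokenizer config; treat the exact config plumbing and the example sentence as assumptions that may differ in other versions:

from spacy.lang.ja import Japanese

# Split mode "C" yields the coarsest tokens; each gets its mode-A and
# mode-B analyses attached (or None when no finer split exists).
nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})
doc = nlp("選挙管理委員会")

for token, sub_tokens in zip(doc, doc.user_data["sub_tokens"]):
    if sub_tokens is None:
        print(token.text, "(no finer split)")
        continue
    for mode, dtokens in zip(("A", "B"), sub_tokens):
        print(token.text, mode, [dt.surface for dt in dtokens])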

def _get_config(self):
config = OrderedDict(
(
