Skip to content

Commit

Permalink
Add return_pos argument to true_lemmatise (+ black)
Browse files (browse the repository at this point in the history)
  • Loading branch information
frankier committed Nov 1, 2020
1 parent 7926062 commit f4f9630
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 11 deletions.
28 changes: 20 additions & 8 deletions finntk/omor/anlys.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,8 @@ def dict_to_analysis(d):


def chunk_subwords(it):

def is_cmp_bound(kv):
return (kv[0] == "BOUNDARY" and kv[1] == "COMPOUND")
return kv[0] == "BOUNDARY" and kv[1] == "COMPOUND"

return split_at(it, is_cmp_bound)

Expand Down Expand Up @@ -130,9 +129,13 @@ def default_return():
upos = subword_dict.get("upos")
if upos not in ("VERB", "AUX", "NOUN", "PROPN", "ADJ", "PRON"):
if strict:
assert upos is not None, "no upos found in subword_dict passed to true_lemmatise"
assert (
upos is not None
), "no upos found in subword_dict passed to true_lemmatise"
# As far as I know only verb, noun and adj can have drv
assert "drv" in subword_dict, "true_lemmatise in strict mode found drv in subword for unsupported UPOS"
assert (
"drv" in subword_dict
), "true_lemmatise in strict mode found drv in subword for unsupported UPOS"
return default_return()
new_subword_dict = {}
ending = None
Expand All @@ -159,7 +162,9 @@ def default_return():
new_subword_dict[k] = v
if ending is None:
if strict:
assert False, "true_lemmatise in strict mode couldn't determine which ending to add"
assert (
False
), "true_lemmatise in strict mode couldn't determine which ending to add"
else:
return default_return()
elif ending == "blacklisted":
Expand All @@ -186,7 +191,10 @@ def default_return():


def lemmas_of_subword_dicts(
subword_dicts, lemmatise_func=default_lemmatise, return_feats=False
subword_dicts,
lemmatise_func=default_lemmatise,
return_feats=False,
return_pos=False,
):
subword_dicts = list(subword_dicts)
res = {} if return_feats else set()
Expand All @@ -203,7 +211,10 @@ def form_lemma(lemma):
else:
for lemma in lemmatise_func(subword_dicts[-1]):
res.add(form_lemma(lemma))
return res
if return_pos:
return res, subword_dicts[-1].get("upos")
else:
return res


EXTRA_WORD_ID = re.compile(r"_\d+$")
Expand All @@ -212,7 +223,7 @@ def form_lemma(lemma):
def norm_word_id(word_id):
extra_match = EXTRA_WORD_ID.search(word_id)
if extra_match:
word_id = word_id[:extra_match.start()]
word_id = word_id[: extra_match.start()]
return word_id.lower()


Expand Down Expand Up @@ -274,6 +285,7 @@ def ud_to_omor(lemma, pos, feats=None):
PART_FORM_MAP,
INF_FORM_MAP,
)

pos = pos.upper()
if feats is None:
feats = {}
Expand Down
16 changes: 13 additions & 3 deletions finntk/omor/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ def _extract_lemmas(
lemmatise_func=default_lemmatise,
norm_func=iden_func,
return_feats=False,
return_pos=False,
):
assert not return_pos or return_feats # return_pos => return_feats
omorfi = get_omorfi()
analyses = omorfi.analyse(word_form)
res = {} if return_feats else set()
Expand All @@ -41,9 +43,16 @@ def _extract_lemmas(
lemma_feats = lemmas_of_subword_dicts(
analysis_slice,
lemmatise_func=lemmatise_func,
**({"return_feats": True} if return_feats else {})
return_feats=return_feats,
return_pos=return_pos,
)
if return_feats:
if return_pos:
lemma_feats_inner, upos = lemma_feats
for lemma, feats in lemma_feats_inner.items():
ext_lemma_feats(
res, norm_func(lemma), ((upos, feat) for feat in feats)
)
elif return_feats:
for lemma, feats in lemma_feats.items():
ext_lemma_feats(res, norm_func(lemma), feats)
else:
Expand Down Expand Up @@ -71,7 +80,7 @@ def extract_lemmas_span(word_form):
return _extract_lemmas(word_form, lambda analysis_dicts: [analysis_dicts])


def extract_true_lemmas_span(word_form, norm_func=iden_func):
def extract_true_lemmas_span(word_form, norm_func=iden_func, return_pos=False):
"""
Works like `extract_lemmas_span`, but uses `true_lemmatise`. It also
returns some of the features associated with each lemma.
Expand All @@ -82,6 +91,7 @@ def extract_true_lemmas_span(word_form, norm_func=iden_func):
lemmatise_func=true_lemmatise,
norm_func=norm_func,
return_feats=True,
return_pos=return_pos,
)


Expand Down

0 comments on commit f4f9630

Please sign in to comment.