Skip to content

Commit

Permalink
Fixed a bug where building declension on disambiguated lemma would fa…
Browse files Browse the repository at this point in the history
…il (#963)

* (Issue #962) Fixed a bug where building the declension of a disambiguated lemma root such as aggero2 would not remove the 2
when building forms (aggeroo instead of aggero)

* (CollatinusDecliner) Fixed a small issue that led to multiple lemmas not being used

* (Collatinus) Somehow, ablative seems to be gone...

* (Collatinus Decliner) Fixed a few bugs in the Collatinus decliner, mostly tied to data from latin_models_cltk. This had an impact on the tests, so the tests were fixed as well.

* (Collatinus) Moved doctests to unit tests

The doctests would not run because they had no access to cltk-latin_model_data

Co-authored-by: Kyle P. Johnson <kyle@kyle-p-johnson.com>
  • Loading branch information
PonteIneptique and kylepjohnson committed Mar 11, 2020
1 parent baadb7f commit a0463a8
Show file tree
Hide file tree
Showing 2 changed files with 195 additions and 95 deletions.
71 changes: 51 additions & 20 deletions cltk/stem/latin/declension.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@

import os
import json
import re
from cltk.exceptions import UnknownLemma



class CollatinusDecliner:
""" Latin Decliner based on Collatinus data and approach to declining words for Latin
Expand All @@ -35,37 +37,49 @@ class CollatinusDecliner:
"""

_dism = re.compile(r"(\d+)")

def __init__(self):
path = os.path.join(get_cltk_data_dir(),
'latin', 'model', 'latin_models_cltk',
'lemmata', 'collatinus', 'collected.json')
path = os.path.expanduser(path)
with open(path) as data_file:
self.__data__ = json.load(data_file)
self._data = json.load(data_file)

self.__models__ = self.__data__["models"]
self.__lemmas__ = self.__data__["lemmas"]
self._models = self._data["models"]
self._lemmas = self._data["lemmas"]
self._mapped = self._data["maps"]

def __getPOS(self, key):
""" Get POS tag for key
:param key: Key Index of Collatinus Morphos
:return: Part-Of-Speech tag
"""
return self.__data__["pos"][str(key)]
return self._data["pos"][str(key)]

def _remove_disambiguation(self, root):
"""Remove disambiguation index from lemma root
def __getRoots(self, lemma, model=None):
:param root: Root in Collatinus
:return: Cleaned root
"""
return CollatinusDecliner._dism.sub("", root)

def _getRoots(self, lemma, model=None):
""" Retrieve the known roots of a lemma
:param lemma: Canonical form of the word (lemma)
:type lemma: str
:param model_roots: Model data from the loaded self.__data__. Can be passed by decline()
:type model_roots: dict
:param model: Model data from the loaded self.__data__. Can be passed by decline()
:type model: dict
:return: Dictionary of roots with their root identifier as key
:rtype: dict
"""

if lemma not in self.__lemmas__:
if lemma not in self._lemmas:
raise UnknownLemma("%s is unknown" % lemma)

ROOT_IDS = {
Expand All @@ -74,7 +88,12 @@ def __getRoots(self, lemma, model=None):
"2": "perf"
}

lemma_entry = self.__lemmas__[lemma]
lemma_entry = self._lemmas[lemma]
if "quantity" in lemma_entry and lemma_entry["quantity"]:
lemma_in_lemma_entry = lemma_entry["quantity"]
else:
lemma_in_lemma_entry = self._remove_disambiguation(lemma_entry["lemma"])

original_roots = {
root_id: lemma_entry[root_name].split(",")
for root_id, root_name in ROOT_IDS.items()
Expand All @@ -83,14 +102,14 @@ def __getRoots(self, lemma, model=None):
returned_roots = {}

if not model:
model = self.__models__[lemma_entry["model"]]
model = self._models[lemma_entry["model"]]

# For each registered root in the model,
for model_root_id, model_root_data in model["R"].items():

# If we have K, it's equivalent to canonical form
if model_root_data[0] == "K":
returned_roots[model_root_id] = [lemma_entry["lemma"]]
returned_roots[model_root_id] = lemma_in_lemma_entry.split(",")
# Otherwise we have deletion number and addition char
else:
deletion, addition = int(model_root_data[0]), model_root_data[1] or ""
Expand All @@ -100,14 +119,18 @@ def __getRoots(self, lemma, model=None):
if model_root_id != "1" and model_root_id in returned_roots:
lemma_roots = returned_roots[model_root_id]
else:
lemma_roots = lemma_entry["lemma"].split(",")
lemma_roots = lemma_in_lemma_entry.split(",")
# We construct the roots
returned_roots[model_root_id] = [
lemma_root[:-deletion] + addition
for lemma_root in lemma_roots
]

if model_root_id in original_roots:
returned_roots[model_root_id].extend(original_roots[model_root_id])
returned_roots[model_root_id] = list(set(returned_roots[model_root_id]))
original_roots.update(returned_roots)

return original_roots

def decline(self, lemma, flatten=False, collatinus_dict=False):
Expand All @@ -126,18 +149,22 @@ def decline(self, lemma, flatten=False, collatinus_dict=False):
:type collatinus_dict: bool
:return: List of tuple where first value is the form and second the pos, ie [("sum", "v1ppip---")]
:rtype: list or dict
"""

if lemma not in self.__lemmas__:
if lemma in self._lemmas:
# Get data information
lemma_entry = self._lemmas[lemma]
elif lemma in self._mapped and self._mapped[lemma] in self._lemmas:
# Get data information
lemma = self._mapped[lemma]
lemma_entry = self._lemmas[self._mapped[lemma]]
else:
raise UnknownLemma("%s is unknown" % lemma)

# Get data information
lemma_entry = self.__lemmas__[lemma]
model = self.__models__[lemma_entry["model"]]
model = self._models[lemma_entry["model"]]

# Get the roots
roots = self.__getRoots(lemma, model=model)

roots = self._getRoots(lemma, model=model)
# Get the known forms in order
keys = sorted([int(key) for key in model["des"].keys()])
forms_data = [(key, model["des"][str(key)]) for key in keys]
Expand Down Expand Up @@ -186,4 +213,8 @@ def decline(self, lemma, flatten=False, collatinus_dict=False):
else:
return list(
[(form, self.__getPOS(key)) for key, case_forms in forms.items() for form in case_forms]
)
)

@property
def lemmas(self):
return self._lemmas

0 comments on commit a0463a8

Please sign in to comment.