Skip to content

Commit

Permalink
Merge pull request #239 from dkpro/bugfix/238-Error-parsing-FSList-in…
Browse files Browse the repository at this point in the history
…-cTAKES-XMI

#238 - Error parsing FSList in cTAKES xmi
  • Loading branch information
reckart committed Dec 12, 2021
2 parents 9aab091 + 7ce6586 commit 0b802b3
Show file tree
Hide file tree
Showing 7 changed files with 393 additions and 19 deletions.
23 changes: 16 additions & 7 deletions cassis/cas.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
from attr import validators
from sortedcontainers import SortedKeyList

from cassis.typesystem import TYPE_NAME_SOFA, FeatureStructure, TypeCheckError, TypeSystem
from cassis.typesystem import TYPE_NAME_SOFA, FeatureStructure, TypeCheckError, TypeSystem, TYPE_NAME_FS_LIST, \
TYPE_NAME_FS_ARRAY, FEATURE_BASE_NAME_HEAD

# attrs validator that accepts either `None` or a `str` instance
_validator_optional_string = validators.optional(validators.instance_of(str))

Expand Down Expand Up @@ -590,7 +591,7 @@ def typecheck(self) -> List[TypeCheckError]:
def _find_all_fs(
self,
generate_missing_ids: bool = True,
include_inlinable_arrays: bool = False,
include_inlinable_arrays_and_lists: bool = False,
seeds: Iterable = None,
) -> Iterable[FeatureStructure]:
"""This function traverses the whole CAS in order to find all directly and indirectly referenced
Expand Down Expand Up @@ -656,21 +657,29 @@ def _find_all_fs(
continue

if (
not include_inlinable_arrays
not include_inlinable_arrays_and_lists
and not feature.multipleReferencesAllowed
and ts.is_array(feature.rangeType)
and (ts.is_array(feature.rangeType) or ts.is_list(feature.rangeType))
):
# For inlined FSArrays, we still need to scan their members
if feature.rangeType.name == "uima.cas.FSArray" and feature_value.elements:
# For inlined FSArrays / FSList, we still need to scan their members
if feature.rangeType.name == TYPE_NAME_FS_ARRAY and feature_value.elements:
for ref in feature_value.elements:
if not ref or ref.xmiID in all_fs:
continue
openlist.append(ref)
elif feature.rangeType.name == TYPE_NAME_FS_LIST and hasattr(feature_value, FEATURE_BASE_NAME_HEAD):
v = feature_value
while hasattr(v, FEATURE_BASE_NAME_HEAD):
if not v.head or v.head.xmiID in all_fs:
continue
openlist.append(v.head)
v = v.tail
# For primitive arrays / lists, we do not need to handle the elements
continue

if not hasattr(feature_value, "xmiID"):
raise AttributeError(
f"Feature [{feature_name}] should point to a [{feature.rangeType.name}] but the feature value is a [{type(feature_value)}] with the value [{feature_value}]"
f"Feature [{feature.domainType.name}:{feature_name}] should point to a [{feature.rangeType.name}] but the feature value is a [{type(feature_value)}] with the value [{feature_value}]"
)

if feature_value.xmiID in all_fs:
Expand Down
73 changes: 72 additions & 1 deletion cassis/typesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,21 @@
# Type names built by appending the simple type name to UIMA_CAS_PREFIX.
# Each list type (FS, Integer, Float, String) comes with names for its Empty* and
# NonEmpty* variants.
TYPE_NAME_DOUBLE = UIMA_CAS_PREFIX + "Double"
TYPE_NAME_ARRAY_BASE = UIMA_CAS_PREFIX + "ArrayBase"
TYPE_NAME_FS_ARRAY = UIMA_CAS_PREFIX + "FSArray"
TYPE_NAME_FS_LIST = UIMA_CAS_PREFIX + "FSList"
TYPE_NAME_EMPTY_FS_LIST = UIMA_CAS_PREFIX + "EmptyFSList"
TYPE_NAME_NON_EMPTY_FS_LIST = UIMA_CAS_PREFIX + "NonEmptyFSList"
TYPE_NAME_INTEGER_ARRAY = UIMA_CAS_PREFIX + "IntegerArray"
TYPE_NAME_INTEGER_LIST = UIMA_CAS_PREFIX + "IntegerList"
TYPE_NAME_EMPTY_INTEGER_LIST = UIMA_CAS_PREFIX + "EmptyIntegerList"
TYPE_NAME_NON_EMPTY_INTEGER_LIST = UIMA_CAS_PREFIX + "NonEmptyIntegerList"
TYPE_NAME_FLOAT_ARRAY = UIMA_CAS_PREFIX + "FloatArray"
TYPE_NAME_FLOAT_LIST = UIMA_CAS_PREFIX + "FloatList"
TYPE_NAME_EMPTY_FLOAT_LIST = UIMA_CAS_PREFIX + "EmptyFloatList"
TYPE_NAME_NON_EMPTY_FLOAT_LIST = UIMA_CAS_PREFIX + "NonEmptyFloatList"
TYPE_NAME_STRING_ARRAY = UIMA_CAS_PREFIX + "StringArray"
TYPE_NAME_STRING_LIST = UIMA_CAS_PREFIX + "StringList"
TYPE_NAME_EMPTY_STRING_LIST = UIMA_CAS_PREFIX + "EmptyStringList"
TYPE_NAME_NON_EMPTY_STRING_LIST = UIMA_CAS_PREFIX + "NonEmptyStringList"
TYPE_NAME_BOOLEAN_ARRAY = UIMA_CAS_PREFIX + "BooleanArray"
TYPE_NAME_BYTE_ARRAY = UIMA_CAS_PREFIX + "ByteArray"
TYPE_NAME_SHORT_ARRAY = UIMA_CAS_PREFIX + "ShortArray"
Expand All @@ -56,6 +68,8 @@
# Unqualified feature names. `head` and `tail` are the features read from list nodes
# when a list is walked element by element.
FEATURE_BASE_NAME_BEGIN = "begin"
FEATURE_BASE_NAME_END = "end"
FEATURE_BASE_NAME_LANGUAGE = "language"
FEATURE_BASE_NAME_HEAD = "head"
FEATURE_BASE_NAME_TAIL = "tail"

# Name of the document annotation type
_DOCUMENT_ANNOTATION_TYPE = "uima.tcas.DocumentAnnotation"

Expand Down Expand Up @@ -166,10 +180,13 @@
"uima.cas.StringArray",
}

# Names of the list types whose elements are primitives (integer, float, string).
# Only the base list names are included, not their Empty*/NonEmpty* variants.
_PRIMITIVE_LIST_TYPES = {TYPE_NAME_INTEGER_LIST, TYPE_NAME_FLOAT_LIST, TYPE_NAME_STRING_LIST}

# Only the primitive array types are recorded as inheritance-final here
_INHERITANCE_FINAL_TYPES = _PRIMITIVE_ARRAY_TYPES

# All array types: the primitive arrays plus the feature structure array
_ARRAY_TYPES = _PRIMITIVE_ARRAY_TYPES | {"uima.cas.FSArray"}
_ARRAY_TYPES = _PRIMITIVE_ARRAY_TYPES | {TYPE_NAME_FS_ARRAY}

# All list types: the primitive lists plus the feature structure list
_LIST_TYPES = _PRIMITIVE_LIST_TYPES | {TYPE_NAME_FS_LIST}

def _string_to_valid_classname(name: str):
return re.sub("[^a-zA-Z0-9_]", "_", name)
Expand Down Expand Up @@ -245,6 +262,23 @@ def is_primitive_array(type_: Union[str, "Type"]) -> bool:
return type_name in _PRIMITIVE_ARRAY_TYPES


def is_primitive_list(type_: Union[str, "Type"]) -> bool:
    """Tell whether `type_` names a primitive list type, i.e. a list of primitives.

    Args:
        type_: Type to query for (`Type` or name as string)

    Returns:
        `True` if the name of `type_` is one of the primitive list type names, else `False`
    """
    name = type_.name if not isinstance(type_, str) else type_

    # Lookup is by exact name only; the inheritance hierarchy is not consulted.
    # NOTE(review): subtypes of the primitive lists are therefore not matched - confirm intended
    return name != TOP_TYPE_NAME and name in _PRIMITIVE_LIST_TYPES


def is_array(type_: Union[str, "Type"]) -> bool:
"""Checks if the type identified by `type` is an array.
Expand All @@ -262,6 +296,23 @@ def is_array(type_: Union[str, "Type"]) -> bool:
return type_name in _ARRAY_TYPES


def is_list(type_: Union[str, "Type"]) -> bool:
    """Tell whether `type_` names a list type.

    Args:
        type_: Type to query for (`Type` or name as string)

    Returns:
        `True` if the name of `type_` is one of the list type names, else `False`
    """
    name = type_.name if not isinstance(type_, str) else type_

    # Lookup is by exact name only; the inheritance hierarchy is not consulted.
    # NOTE(review): the Empty*/NonEmpty* list types are therefore not matched - confirm intended
    return name != TOP_TYPE_NAME and name in _LIST_TYPES


@attr.s
class TypeCheckError(Exception):
xmiID: int = attr.ib() # xmiID of the feature structure with type error
Expand Down Expand Up @@ -811,6 +862,16 @@ def is_primitive_array(self, type_: Union[str, Type]) -> bool:
"""
return is_primitive_array(type_)

def is_primitive_list(self, type_: Union[str, Type]) -> bool:
    """Tell whether `type_` names a primitive list type, i.e. a list of primitives.

    Delegates to the module-level `is_primitive_list` function.

    Args:
        type_: Type to query for (`Type` or name as string)

    Returns:
        `True` if `type_` identifies a primitive list type, else `False`
    """
    outcome = is_primitive_list(type_)
    return outcome

def is_array(self, type_: Union[str, Type]) -> bool:
"""Checks if the type identified by `type` is an array.
Expand All @@ -821,6 +882,16 @@ def is_array(self, type_: Union[str, Type]) -> bool:
"""
return is_array(type_)

def is_list(self, type_: Union[str, Type]) -> bool:
    """Tell whether `type_` names a list type.

    Delegates to the module-level `is_list` function.

    Args:
        type_: Type to query for (`Type` or name as string)

    Returns:
        `True` if `type_` identifies a list type, else `False`
    """
    outcome = is_list(type_)
    return outcome

def subsumes(self, parent: Union[str, Type], child: Union[str, Type]) -> bool:
"""Determines if the type `child` is a child of `parent`.
Expand Down
Loading

0 comments on commit 0b802b3

Please sign in to comment.