feat(excel2json): add optional column "subject" to properties.xlsx (DEV-3253) (#777)
dasch-swiss · Jan 31, 2024 · cf491e9 · cf491e9
1 parent 3676c35
commit cf491e9
Show file tree

Hide file tree

Showing 8 changed files with 83 additions and 65 deletions.
diff --git a/docs/assets/data_model_templates/rosetta (rosetta)/properties.xlsx b/docs/assets/data_model_templates/rosetta (rosetta)/properties.xlsx
diff --git a/docs/file-formats/excel2json.md b/docs/file-formats/excel2json.md
@@ -147,6 +147,9 @@ The expected columns are:
       e.g. `TextValue`, `ListValue`, or `IntValue`. 
     - If the property is derived from `hasLinkTo`, 
       the `object` specifies the resource class that this property refers to.
+- [`subject`](./json-project/ontologies.md#property-subject)
+  (optional): The subject defines the resource class the property can be used on. 
+  It has to be provided as the prefixed name of the resource class (e.g. `:Person`).
 - [`gui_element`](./json-project/ontologies.md#property-object-gui_element-gui_attributes) 
   (mandatory): The graphic component, defines how this property should be displayed.
   Depends on the value of `object`: 

diff --git a/src/dsp_tools/commands/excel2json/properties.py b/src/dsp_tools/commands/excel2json/properties.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import importlib.resources
 import json
 import warnings
@@ -18,7 +20,6 @@
     Problem,
 )
 from dsp_tools.commands.excel2json.utils import (
-    add_optional_columns,
     check_column_for_duplicate,
     check_contains_required_columns,
     check_required_values,
@@ -63,21 +64,7 @@ def excel2properties(
     _do_property_excel_compliance(df=property_df, excelfile=excelfile)
 
     # Not all columns have to be filled, users may delete some for ease of use, but it would generate an error later
-    property_df = add_optional_columns(
-        df=property_df,
-        optional_col_set={
-            "label_en",
-            "label_de",
-            "label_fr",
-            "label_it",
-            "label_rm",
-            "comment_en",
-            "comment_de",
-            "comment_fr",
-            "comment_it",
-            "comment_rm",
-        },
-    )
+    property_df = _add_optional_columns(df=property_df)
 
     # transform every row into a property
     props = [
@@ -165,6 +152,28 @@ def _do_property_excel_compliance(df: pd.DataFrame, excelfile: str) -> None:
         raise InputError("\n\n".join(msg))
 
 
+def _add_optional_columns(df: pd.DataFrame) -> pd.DataFrame:
+    optional_col_set = {
+        "label_en",
+        "label_de",
+        "label_fr",
+        "label_it",
+        "label_rm",
+        "comment_en",
+        "comment_de",
+        "comment_fr",
+        "comment_it",
+        "comment_rm",
+        "subject",
+    }
+    in_df_cols = set(df.columns)
+    if not optional_col_set.issubset(in_df_cols):
+        additional_col = list(optional_col_set.difference(in_df_cols))
+        additional_df = pd.DataFrame(columns=additional_col, index=df.index, data=pd.NA)
+        df = pd.concat(objs=[df, additional_df], axis=1)
+    return df
+
+
 def _check_missing_values_in_row(df: pd.DataFrame) -> None | list[MissingValuesInRowProblem]:
     required_values = ["name", "super", "object", "gui_element"]
     missing_dict = check_required_values(df=df, required_values_columns=required_values)
@@ -215,6 +224,8 @@ def _row2prop(df_row: pd.Series, row_num: int, excelfile: str) -> dict[str, Any]
         "labels": get_labels(df_row=df_row),
         "super": [s.strip() for s in df_row["super"].split(",")],
     }
+    if not pd.isna(df_row["subject"]):
+        _property["subject"] = df_row["subject"]
 
     gui_attrib = _get_gui_attribute(df_row=df_row, row_num=row_num)
     match gui_attrib:

diff --git a/src/dsp_tools/commands/excel2json/utils.py b/src/dsp_tools/commands/excel2json/utils.py
@@ -314,25 +314,3 @@ def col_must_or_not_empty_based_on_other_col(
     # If both are True logical_and returns True otherwise False
     combined_array = np.logical_and(na_series, substring_array)
     return pd.Series(combined_array) if any(combined_array) else None
-
-
-def add_optional_columns(df: pd.DataFrame, optional_col_set: set[str]) -> pd.DataFrame:
-    """
-    This function takes a df and a set of columns which may not be in the df,
-    but whose absence could cause errors in the code following.
-    The columns are added, without any values in the rows.
-
-    Args:
-        df: Original df
-        optional_col_set: set of columns that may not be in the df, if they are not, they will be added.
-
-    Returns:
-        The df with the added columns.
-        If all are already there, the df is returned unchanged.
-    """
-    in_df_cols = set(df.columns)
-    if not optional_col_set.issubset(in_df_cols):
-        additional_col = list(optional_col_set.difference(in_df_cols))
-        additional_df = pd.DataFrame(columns=additional_col, index=df.index, data=pd.NA)
-        df = pd.concat(objs=[df, additional_df], axis=1)
-    return df
diff --git a/test/unittests/commands/excel2json/test_properties.py b/test/unittests/commands/excel2json/test_properties.py
@@ -504,6 +504,7 @@ def test_row2prop(self) -> None:
         expected_dict = {
             "name": "name_1",
             "object": "object_1",
+            "subject": "subject_1",
             "gui_element": "Simple",
             "labels": {
                 "en": "label_en_1",
@@ -531,6 +532,7 @@ def test_row2prop(self) -> None:
             "labels": {"en": "label_en_2"},
             "name": "name_2",
             "object": "object_2",
+            "subject": "subject_2",
             "super": ["super_2.1", "super_2.2"],
         }
         self.assertDictEqual(expected_dict, returned_dict)
@@ -626,6 +628,54 @@ def test_invalid_gui_attrib_values(self) -> None:
                 path_to_output_file="",
             )
 
+    def test_add_optional_columns_with_missing_cols(self) -> None:
+        original_df = pd.DataFrame(
+            {
+                "comment_en": ["text_en", pd.NA],
+                "comment_it": ["text_it", pd.NA],
+                "comment_rm": [pd.NA, pd.NA],
+            }
+        )
+        expected_df = pd.DataFrame(
+            {
+                "comment_de": [pd.NA, pd.NA],
+                "comment_en": ["text_en", pd.NA],
+                "comment_fr": [pd.NA, pd.NA],
+                "comment_it": ["text_it", pd.NA],
+                "comment_rm": [pd.NA, pd.NA],
+                "label_de": [pd.NA, pd.NA],
+                "label_en": [pd.NA, pd.NA],
+                "label_fr": [pd.NA, pd.NA],
+                "label_it": [pd.NA, pd.NA],
+                "label_rm": [pd.NA, pd.NA],
+                "subject": [pd.NA, pd.NA],
+            }
+        )
+        returned_df = e2j._add_optional_columns(df=original_df)
+        # as the columns are extracted via a set, they are not sorted and may appear in any order,
+        # this would cause the validation to fail
+        returned_df = returned_df.sort_index(axis=1)
+        assert_frame_equal(expected_df, returned_df)
+
+    def test_add_optional_columns_no_missing_cols(self) -> None:
+        expected_df = pd.DataFrame(
+            {
+                "comment_de": [pd.NA, pd.NA],
+                "comment_en": ["text_en", pd.NA],
+                "comment_fr": [pd.NA, pd.NA],
+                "comment_it": ["text_it", pd.NA],
+                "comment_rm": [pd.NA, pd.NA],
+                "label_de": [pd.NA, pd.NA],
+                "label_en": [pd.NA, pd.NA],
+                "label_fr": [pd.NA, pd.NA],
+                "label_it": [pd.NA, pd.NA],
+                "label_rm": [pd.NA, pd.NA],
+                "subject": [pd.NA, pd.NA],
+            }
+        )
+        unchanged_df = e2j._add_optional_columns(df=expected_df)
+        assert_frame_equal(expected_df, unchanged_df)
+
 
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/test/unittests/commands/excel2json/test_utils.py b/test/unittests/commands/excel2json/test_utils.py
@@ -176,33 +176,6 @@ def test_get_comments(self) -> None:
 
         assert not utl.get_comments(original_df.loc[1, :])
 
-    def test_add_optional_columns(self) -> None:
-        original_df = pd.DataFrame(
-            {
-                "comment_en": ["text_en", pd.NA],
-                "comment_it": ["text_it", pd.NA],
-                "comment_rm": [pd.NA, pd.NA],
-            }
-        )
-        optional_cols = {"comment_en", "comment_de", "comment_fr", "comment_it", "comment_rm"}
-        expected_df = pd.DataFrame(
-            {
-                "comment_de": [pd.NA, pd.NA],
-                "comment_en": ["text_en", pd.NA],
-                "comment_fr": [pd.NA, pd.NA],
-                "comment_it": ["text_it", pd.NA],
-                "comment_rm": [pd.NA, pd.NA],
-            }
-        )
-        returned_df = utl.add_optional_columns(df=original_df, optional_col_set=optional_cols)
-        # as the columns are extracted via a set, they are not sorted and may appear in any order,
-        # this would cause the validation to fail
-        returned_df = returned_df.sort_index(axis=1)
-        assert_frame_equal(expected_df, returned_df)
-        # if all columns exist, the df should be returned unchanged
-        unchanged_df = utl.add_optional_columns(df=expected_df, optional_col_set=optional_cols)
-        assert_frame_equal(expected_df, unchanged_df)
-
 
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/testdata/excel2json/excel2json-expected-output.json b/testdata/excel2json/excel2json-expected-output.json
@@ -162,6 +162,7 @@
                         "super": [
                             "hasValue"
                         ],
+                        "subject": ":Person",
                         "object": "ListValue",
                         "labels": {
                             "fr": "only French"
@@ -241,6 +242,7 @@
                         "super": [
                             "hasValue"
                         ],
+                        "subject": ":Person",
                         "object": "IntValue",
                         "labels": {
                             "en": "age",
@@ -458,6 +460,7 @@
                         "super": [
                             "isPartOf"
                         ],
+                        "subject": ":Image",
                         "object": ":Documents",
                         "labels": {
                             "en": "is part of a document",

diff --git a/testdata/excel2json/excel2json_files/test-name (test_label)/properties.xlsx b/testdata/excel2json/excel2json_files/test-name (test_label)/properties.xlsx