refactor(excel2json): new excel reader (DEV-3049) (#665)

Co-authored-by: Johannes Nussbaum <39048939+jnussbaum@users.noreply.github.com>
dasch-swiss · Dec 4, 2023 · a0d5776 · a0d5776
1 parent fdbc545
commit a0d5776
Show file tree

Hide file tree

Showing 4 changed files with 158 additions and 63 deletions.
diff --git a/src/dsp_tools/commands/excel2json/input_error.py b/src/dsp_tools/commands/excel2json/input_error.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Protocol
+from typing import Any, Protocol
 
 # pylint: disable=too-few-public-methods
 
@@ -114,6 +114,60 @@ def execute_error_protocol(self) -> str:
         )
 
 
+@dataclass(frozen=True)
+class InvalidSheetNameProblem:
+    """This class contains information if the excel sheet names are not strings."""
+
+    # path of the Excel file, quoted in the message
+    excelfile: str
+    # all sheet names of the file; only the ones that are not strings are reported
+    excel_sheet_names: list[Any]
+
+    def execute_error_protocol(self) -> str:
+        """
+        This function composes the message that lists every sheet name that is not a string, along with its type.
+
+        Returns:
+            message for the error
+        """
+        sheet_types = [f"Name: {x} | Type {type(x)}" for x in self.excel_sheet_names if not isinstance(x, str)]
+        return (
+            f"The names of the sheets in the excel '{self.excelfile}' are not all valid.\n"
+            "They must be of type string. The following names are problematic:\n"
+            f"{list_separator}{list_separator.join(sheet_types)}\n"
+            "Please rename them."
+        )
+
+
+@dataclass(frozen=True)
+class ResourcesSheetsNotAsExpected:
+    """This class contains information if the excel sheet names do not match the expected ones."""
+
+    # entries of the column 'name' of the sheet 'classes'
+    names_classes: set[str]
+    # names of the sheets that were found in the file (without the sheet 'classes')
+    names_sheets: set[str]
+
+    def execute_error_protocol(self) -> str:
+        """
+        This function composes the message that lists the missing sheets
+        and the sheets without an entry in the column 'name', each in sorted order.
+
+        Returns:
+            message for the error
+        """
+        msg = (
+            "The excel file 'resources.xlsx' has problems.\n"
+            "The names of the excel sheets must be 'classes' "
+            "plus all the entries in the column 'name' from the sheet 'classes'.\n"
+        )
+        # sets have no stable iteration order, so sort to keep the message reproducible
+        details = []
+        if missing := sorted(self.names_classes - self.names_sheets):
+            details.append(f"The following sheet(s) are missing:{list_separator}" + list_separator.join(missing))
+        if unlisted := sorted(self.names_sheets - self.names_classes):
+            details.append(
+                "The following sheet(s) do not have an entry in the 'name' column "
+                f"of the sheet 'classes':{list_separator}" + list_separator.join(unlisted)
+            )
+        # put the two sections on separate lines instead of running them together
+        return msg + "\n".join(details)
+
+
 @dataclass(frozen=True)
 class JsonValidationPropertyProblem:
     """This class contains information about a JSON property section that fails its validation against the schema."""

diff --git a/src/dsp_tools/commands/excel2json/resources.py b/src/dsp_tools/commands/excel2json/resources.py
@@ -8,25 +8,25 @@
 import pandas as pd
 import regex
 
-from dsp_tools.commands.excel2json.input_error import JsonValidationResourceProblem, PositionInExcel
-from dsp_tools.commands.excel2json.utils import read_and_clean_excel_file
+from dsp_tools.commands.excel2json.input_error import (
+    JsonValidationResourceProblem,
+    PositionInExcel,
+    ResourcesSheetsNotAsExpected,
+)
+from dsp_tools.commands.excel2json.utils import check_column_for_duplicate, read_and_clean_all_sheets
 from dsp_tools.models.exceptions import InputError, UserError
 from dsp_tools.utils.shared import check_notna, prepare_dataframe
 
 languages = ["en", "de", "fr", "it", "rm"]
 
 
-def _validate_resources(
-    resources_list: list[dict[str, Any]],
-    excelfile: str,
-) -> None:
+def _validate_resources(resources_list: list[dict[str, Any]]) -> None:
     """
     This function checks if the "resources" section of a JSON project file is valid according to the JSON schema,
     and if the resource names are unique.
 
     Args:
         resources_list: the "resources" section of a JSON project as a list of dicts
-        excelfile: path to the Excel file containing the resources
 
     Raises:
         InputError: if the validation fails
@@ -38,11 +38,11 @@ def _validate_resources(
     try:
         jsonschema.validate(instance=resources_list, schema=resources_schema)
     except jsonschema.ValidationError as err:
-        err_msg = _find_validation_problem(
+        validation_problem = _find_validation_problem(
             validation_error=err,
             resources_list=resources_list,
         )
-        msg = f"\nThe Excel file '{excelfile}' did not pass validation." + err_msg.execute_error_protocol()
+        msg = "\nThe Excel file 'resources.xlsx' did not pass validation." + validation_problem.execute_error_protocol()
         raise InputError(msg) from None
 
 
@@ -92,7 +92,7 @@ def _find_validation_problem(
 
 def _row2resource(
     df_row: pd.Series,
-    excelfile: str,
+    details_df: pd.DataFrame,
 ) -> dict[str, Any]:
     """
     Method that reads one row from the "classes" DataFrame,
@@ -101,7 +101,7 @@ def _row2resource(
 
     Args:
         df_row: row from the "classes" DataFrame
-        excelfile: Excel file where the data comes from
+        details_df: Excel sheet of the individual class
 
     Raises:
         UserError: if the row or the details sheet contains invalid data
@@ -117,16 +117,10 @@ def _row2resource(
     comments = {lang: df_row[f"comment_{lang}"] for lang in languages if df_row.get(f"comment_{lang}")}
     supers = [s.strip() for s in df_row["super"].split(",")]
 
-    # load the cardinalities of this resource
-    # if the excel sheet does not exist, pandas raises a ValueError
-    try:
-        details_df = read_and_clean_excel_file(excelfile=excelfile, sheetname=name)
-    except ValueError as err:
-        raise UserError(str(err)) from None
     details_df = prepare_dataframe(
         df=details_df,
         required_columns=["Property", "Cardinality"],
-        location_of_sheet=f"Sheet '{name}' in file '{excelfile}'",
+        location_of_sheet=f"Sheet '{name}' in file 'resources.xlsx'",
     )
 
     # validation
@@ -150,7 +144,7 @@ def _row2resource(
         validation_passed = False
     if not validation_passed:
         raise UserError(
-            f"Sheet '{name}' in file '{excelfile}' has invalid content in column 'gui_order': "
+            f"Sheet '{name}' in file 'resources.xlsx' has invalid content in column 'gui_order': "
             f"only positive integers allowed (or leave column empty altogether)"
         )
 
@@ -197,47 +191,52 @@ def excel2resources(
             and the success status (True if everything went well)
     """
 
-    # load file
-    all_classes_df = read_and_clean_excel_file(excelfile=excelfile)
-    all_classes_df = prepare_dataframe(
-        df=all_classes_df,
+    resource_dfs = read_and_clean_all_sheets(excelfile)
+    classes_df = resource_dfs.pop("classes")
+    classes_df = prepare_dataframe(
+        df=classes_df,
         required_columns=["name"],
         location_of_sheet=f"Sheet 'classes' in file '{excelfile}'",
     )
 
-    # validation
-    for index, row in all_classes_df.iterrows():
-        index = int(str(index))  # index is a label/index/hashable, but we need an int
-        if not check_notna(row["super"]):
-            raise UserError(f"Sheet 'classes' of '{excelfile}' has a missing value in row {index + 2}, column 'super'")
-    if any(all_classes_df.get(lang) is not None for lang in languages):
-        warnings.warn(
-            f"The file {excelfile} uses {languages} as column titles, which is deprecated. "
-            f"Please use {[f'label_{lang}' for lang in languages]}"
-        )
+    if validation_problem := _validate_excel_file(classes_df, resource_dfs):
+        err_msg = validation_problem.execute_error_protocol()
+        raise InputError(err_msg)
 
     # transform every row into a resource
-    resources = [_row2resource(row, excelfile) for i, row in all_classes_df.iterrows()]
-
-    # check if resource names are unique
-    all_names = [r["name"] for r in resources]
-    if duplicates := {
-        index + 2: resdef["name"] for index, resdef in enumerate(resources) if all_names.count(resdef["name"]) > 1
-    }:
-        err_msg = (
-            f"Resource names must be unique inside every ontology, "
-            f"but your Excel file '{excelfile}' contains duplicates:\n"
-        )
-        for row_no, resname in duplicates.items():
-            err_msg += f" - Row {row_no}: {resname}\n"
-        raise UserError(err_msg)
+    resources = [_row2resource(row, resource_dfs[row["name"]]) for i, row in classes_df.iterrows()]
 
     # write final "resources" section into a JSON file
-    _validate_resources(resources_list=resources, excelfile=excelfile)
+    _validate_resources(resources_list=resources)
 
     if path_to_output_file:
         with open(file=path_to_output_file, mode="w", encoding="utf-8") as file:
             json.dump(resources, file, indent=4, ensure_ascii=False)
             print(f"resources section was created successfully and written to file '{path_to_output_file}'")
 
     return resources, True
+
+
+def _validate_excel_file(
+    classes_df: pd.DataFrame, df_dict: dict[str, pd.DataFrame]
+) -> ResourcesSheetsNotAsExpected | None:
+    """
+    This function validates the sheet 'classes' and compares its column 'name' with the keys of df_dict.
+    It emits a warning if one of the language codes is used as column title.
+
+    Args:
+        classes_df: content of the sheet 'classes'
+        df_dict: the sheets to compare against, in the form {sheet_name: dataframe}
+
+    Returns:
+        a problem if the column 'name' and the keys of df_dict are not the same set, else None
+
+    Raises:
+        UserError: if a row has no value in the column 'super'
+        InputError: if the duplicate check of the column 'name' reports a problem
+    """
+    for row_label, class_row in classes_df.iterrows():
+        row_idx = int(str(row_label))  # the label is a generic hashable, but an int is needed for the row number
+        if not check_notna(class_row["super"]):
+            raise UserError(
+                f"Sheet 'classes' of 'resources.xlsx' has a missing value in row {row_idx + 2}, column 'super'"
+            )
+    deprecated_titles_present = any(classes_df.get(lang) is not None for lang in languages)
+    if deprecated_titles_present:
+        warnings.warn(
+            f"The file 'resources.xlsx' uses {languages} as column titles, which is deprecated. "
+            f"Please use {[f'label_{lang}' for lang in languages]}"
+        )
+    if duplicate_problem := check_column_for_duplicate(classes_df, "name"):
+        intro = "The excel file 'resources.xlsx', sheet 'classes' has a problem.\n"
+        raise InputError(intro + duplicate_problem.execute_error_protocol())
+    # every sheet needs an entry in the column 'name', and vice versa
+    class_names = set(classes_df["name"].tolist())
+    sheet_names = set(df_dict.keys())
+    if class_names == sheet_names:
+        return None
+    return ResourcesSheetsNotAsExpected(class_names, sheet_names)
diff --git a/src/dsp_tools/commands/excel2json/utils.py b/src/dsp_tools/commands/excel2json/utils.py
@@ -7,7 +7,12 @@
 import pandas as pd
 import regex
 
-from dsp_tools.commands.excel2json.input_error import DuplicatesInColumnProblem, RequiredColumnMissingProblem
+from dsp_tools.commands.excel2json.input_error import (
+    DuplicatesInColumnProblem,
+    InvalidSheetNameProblem,
+    RequiredColumnMissingProblem,
+)
+from dsp_tools.models.exceptions import InputError
 
 languages = ["en", "de", "fr", "it", "rm"]
 
@@ -39,6 +44,39 @@ def read_and_clean_excel_file(excelfile: str, sheetname: str | int = 0) -> pd.Da
     return read_df
 
 
+def read_and_clean_all_sheets(excelfile: str) -> dict[str, pd.DataFrame]:
+    """
+    This function reads an Excel file with all its sheets.
+    If there is a ValueError, it patches the openpyxl part that causes the error
+    and opens it with that patch.
+    It cleans the dataframes and then returns them in the form {sheet_name: dataframe},
+    with leading and trailing whitespace removed from the sheet names.
+
+    Args:
+        excelfile: path to the Excel file
+
+    Returns:
+        All sheets of the excel file, in the form of a dictionary {sheet_name: dataframe}
+
+    Raises:
+        InputError: If the sheets are not correctly named
+    """
+    try:
+        df_dict = pd.read_excel(excelfile, sheet_name=None)
+    except ValueError:
+        # Pandas relies on openpyxl to parse XLSX files.
+        # A strange behavior of openpyxl prevents pandas from opening files with some formatting properties
+        # (unclear which formatting properties exactly).
+        # Apparently, the excel2json test files have one of the unsupported formatting properties.
+        # Credits: https://stackoverflow.com/a/70537454/14414188
+        with mock.patch("openpyxl.styles.fonts.Font.family.max", new=100):
+            df_dict = pd.read_excel(excelfile, sheet_name=None)
+    # Check the names explicitly instead of catching AttributeError around the whole comprehension,
+    # so that an AttributeError raised while cleaning a dataframe is not misreported as a sheet name problem.
+    if not all(isinstance(name, str) for name in df_dict):
+        msg = InvalidSheetNameProblem(excelfile, list(df_dict.keys())).execute_error_protocol()
+        raise InputError(msg)
+    # strip() without argument removes whitespace; strip("") would remove nothing
+    return {name.strip(): clean_data_frame(df) for name, df in df_dict.items()}
+
+
 def clean_data_frame(df: pd.DataFrame) -> pd.DataFrame:
     """
     This function takes a pd.DataFrame and removes:

diff --git a/test/unittests/commands/excel2json/test_resources.py b/test/unittests/commands/excel2json/test_resources.py
@@ -181,10 +181,10 @@ def test_cardinalities(self) -> None:
 class TestValidateWithSchema:
     # it is not possible to call the method to be tested directly.
     # So let's make a reference to it, so that it can be found by the usage search
-    lambda x: e2j._validate_resources([], "file")  # pylint: disable=expression-not-assigned,protected-access
+    lambda x: e2j._validate_resources([])  # pylint: disable=expression-not-assigned,protected-access
 
     def test_invalid_super(self) -> None:
-        expected_msg = re.escape(
+        expected_msg = (
             "\nThe Excel file 'testdata/invalid-testdata/excel2json/resources-invalid-super.xlsx' "
             "did not pass validation.\n"
             "    Section of the problem: 'Resources'\n"
@@ -197,7 +197,7 @@ def test_invalid_super(self) -> None:
             e2j.excel2resources("testdata/invalid-testdata/excel2json/resources-invalid-super.xlsx", "")
 
     def test_sheet_invalid_cardinality(self) -> None:
-        expected_msg = re.escape(
+        expected_msg = (
             "\nThe Excel file 'testdata/invalid-testdata/excel2json/resources-invalid-cardinality.xlsx' "
             "did not pass validation.\n"
             "    Section of the problem: 'Resources'\n"
@@ -209,7 +209,7 @@ def test_sheet_invalid_cardinality(self) -> None:
             e2j.excel2resources("testdata/invalid-testdata/excel2json/resources-invalid-cardinality.xlsx", "")
 
     def test_invalid_property(self) -> None:
-        expected_msg = re.escape(
+        expected_msg = (
             "\nThe Excel file 'testdata/invalid-testdata/excel2json/resources-invalid-property.xlsx' "
             "did not pass validation.\n"
             "    Section of the problem: 'Resources'\n"
@@ -221,20 +221,24 @@ def test_invalid_property(self) -> None:
             e2j.excel2resources("testdata/invalid-testdata/excel2json/resources-invalid-property.xlsx", "")
 
     def test_duplicate_name(self) -> None:
-        expected_msg = re.escape(
-            (
-                "Resource names must be unique inside every ontology, but your Excel file "
-                "'testdata/invalid-testdata/excel2json/resources-duplicate-name.xlsx' contains duplicates:\n"
-                " - Row 3: MentionedPerson\n"
-                " - Row 4: MentionedPerson"
-            )
+        expected_msg = (
+            "The excel file 'resources.xlsx', sheet 'classes' has a problem.\n"
+            "No duplicates are allowed in the column 'name'\n"
+            "The following values appear several times:\n"
+            "    - MentionedPerson"
         )
         with pytest.raises(BaseError, match=expected_msg):
             e2j.excel2resources("testdata/invalid-testdata/excel2json/resources-duplicate-name.xlsx", "")
 
     def test_missing_sheet(self) -> None:
-        expected_msg = re.escape("Worksheet named 'GenericAnthroponym' not found")
-        with pytest.raises(BaseError, match=expected_msg):
+        expected_msg = re.escape(
+            "The excel file 'resources.xlsx' has problems.\n"
+            "The names of the excel sheets must be 'classes' "
+            "plus all the entries in the column 'name' from the sheet 'classes'.\n"
+            "The following sheet(s) are missing:\n"
+            "    - GenericAnthroponym"
+        )
+        with pytest.raises(InputError, match=expected_msg):
             e2j.excel2resources("testdata/invalid-testdata/excel2json/resources-invalid-missing-sheet.xlsx", "")