diff --git a/src/dsp_tools/commands/excel2json/input_error.py b/src/dsp_tools/commands/excel2json/input_error.py
index a07fecb78..150be32ec 100644
--- a/src/dsp_tools/commands/excel2json/input_error.py
+++ b/src/dsp_tools/commands/excel2json/input_error.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Protocol
+from typing import Any, Protocol
 
 # pylint: disable=too-few-public-methods
 
@@ -114,6 +114,61 @@ def execute_error_protocol(self) -> str:
         )
 
 
+@dataclass(frozen=True)
+class InvalidSheetNameProblem:
+    """This class contains information if the excel sheet names are not strings."""
+
+    excelfile: str
+    excel_sheet_names: list[Any]
+
+    def execute_error_protocol(self) -> str:
+        """
+        This function initiates all the steps for successful problem communication with the user.
+
+        Returns:
+            message for the error
+        """
+        sheet_types = [f"Name: {x} | Type {type(x)}" for x in self.excel_sheet_names if not isinstance(x, str)]
+        return (
+            f"The names of the sheets in the excel '{self.excelfile}' are not all valid.\n"
+            f"They must be of type string. The following names are problematic:\n"
+            f"{list_separator}{list_separator.join(sheet_types)}\n"
+            f"Please rename them."
+        )
+
+
+@dataclass(frozen=True)
+class ResourcesSheetsNotAsExpected:
+    """This class contains information if the excel sheet names are not a subset of the expected ones."""
+
+    names_classes: set[str]
+    names_sheets: set[str]
+
+    def execute_error_protocol(self) -> str:
+        """
+        This function initiates all the steps for successful problem communication with the user.
+
+        Returns:
+            message for the error
+        """
+        msg = (
+            "The excel file 'resources.xlsx' has problems.\n"
+            "The names of the excel sheets must be 'classes' "
+            "plus all the entries in the column 'name' from the sheet 'classes'.\n"
+        )
+        missing_sheets = sorted(self.names_classes - self.names_sheets)
+        if missing_sheets:
+            msg += f"The following sheet(s) are missing:{list_separator}" + list_separator.join(missing_sheets)
+        missing_names = sorted(self.names_sheets - self.names_classes)
+        if missing_names:
+            msg += "\n" if missing_sheets else ""  # keep the two findings on separate lines
+            msg += (
+                f"The following sheet(s) do not have an entry in the 'name' column "
+                f"of the sheet 'classes':{list_separator}"
+            ) + list_separator.join(missing_names)
+        return msg
+
+
 @dataclass(frozen=True)
 class JsonValidationPropertyProblem:
     """This class contains information about a JSON property section that fails its validation against the schema."""
diff --git a/src/dsp_tools/commands/excel2json/resources.py b/src/dsp_tools/commands/excel2json/resources.py
index 703b03afc..f59c7ad66 100644
--- a/src/dsp_tools/commands/excel2json/resources.py
+++ b/src/dsp_tools/commands/excel2json/resources.py
@@ -8,25 +8,25 @@
 import pandas as pd
 import regex
 
-from dsp_tools.commands.excel2json.input_error import JsonValidationResourceProblem, PositionInExcel
-from dsp_tools.commands.excel2json.utils import read_and_clean_excel_file
+from dsp_tools.commands.excel2json.input_error import (
+    JsonValidationResourceProblem,
+    PositionInExcel,
+    ResourcesSheetsNotAsExpected,
+)
+from dsp_tools.commands.excel2json.utils import check_column_for_duplicate, read_and_clean_all_sheets
 from dsp_tools.models.exceptions import InputError, UserError
 from dsp_tools.utils.shared import check_notna, prepare_dataframe
 
 languages = ["en", "de", "fr", "it", "rm"]
 
 
-def _validate_resources(
-    resources_list: list[dict[str, Any]],
-    excelfile: str,
-) -> None:
+def _validate_resources(resources_list: list[dict[str, Any]]) -> None:
     """
     This function checks if the "resources" section of a JSON project file is valid according to the JSON schema,
     and if the resource names are unique.
 
     Args:
         resources_list: the "resources" section of a JSON project as a list of dicts
-        excelfile: path to the Excel file containing the resources
 
     Raises:
         InputError: if the validation fails
@@ -38,11 +38,11 @@ def _validate_resources(
     try:
         jsonschema.validate(instance=resources_list, schema=resources_schema)
     except jsonschema.ValidationError as err:
-        err_msg = _find_validation_problem(
+        validation_problem = _find_validation_problem(
             validation_error=err,
             resources_list=resources_list,
         )
-        msg = f"\nThe Excel file '{excelfile}' did not pass validation." + err_msg.execute_error_protocol()
+        msg = "\nThe Excel file 'resources.xlsx' did not pass validation." + validation_problem.execute_error_protocol()
         raise InputError(msg) from None
 
 
@@ -92,7 +92,7 @@ def _find_validation_problem(
 
 def _row2resource(
     df_row: pd.Series,
-    excelfile: str,
+    details_df: pd.DataFrame,
 ) -> dict[str, Any]:
     """
     Method that reads one row from the "classes" DataFrame,
@@ -101,7 +101,7 @@ def _row2resource(
 
     Args:
         df_row: row from the "classes" DataFrame
-        excelfile: Excel file where the data comes from
+        details_df: Excel sheet of the individual class
 
     Raises:
         UserError: if the row or the details sheet contains invalid data
@@ -117,16 +117,10 @@ def _row2resource(
     comments = {lang: df_row[f"comment_{lang}"] for lang in languages if df_row.get(f"comment_{lang}")}
     supers = [s.strip() for s in df_row["super"].split(",")]
 
-    # load the cardinalities of this resource
-    # if the excel sheet does not exist, pandas raises a ValueError
-    try:
-        details_df = read_and_clean_excel_file(excelfile=excelfile, sheetname=name)
-    except ValueError as err:
-        raise UserError(str(err)) from None
     details_df = prepare_dataframe(
         df=details_df,
         required_columns=["Property", "Cardinality"],
-        location_of_sheet=f"Sheet '{name}' in file '{excelfile}'",
+        location_of_sheet=f"Sheet '{name}' in file 'resources.xlsx'",
     )
 
     # validation
@@ -150,7 +144,7 @@ def _row2resource(
         validation_passed = False
     if not validation_passed:
         raise UserError(
-            f"Sheet '{name}' in file '{excelfile}' has invalid content in column 'gui_order': "
+            f"Sheet '{name}' in file 'resources.xlsx' has invalid content in column 'gui_order': "
             f"only positive integers allowed (or leave column empty altogether)"
         )
 
@@ -197,43 +191,26 @@ def excel2resources(
         and the success status (True if everything went well)
     """
 
-    # load file
-    all_classes_df = read_and_clean_excel_file(excelfile=excelfile)
-    all_classes_df = prepare_dataframe(
-        df=all_classes_df,
+    resource_dfs = read_and_clean_all_sheets(excelfile)
+    try:
+        classes_df = resource_dfs.pop("classes")
+    except KeyError:
+        raise InputError(f"The Excel file '{excelfile}' must contain a sheet named 'classes'.") from None
+    classes_df = prepare_dataframe(
+        df=classes_df,
         required_columns=["name"],
         location_of_sheet=f"Sheet 'classes' in file '{excelfile}'",
     )
 
-    # validation
-    for index, row in all_classes_df.iterrows():
-        index = int(str(index))  # index is a label/index/hashable, but we need an int
-        if not check_notna(row["super"]):
-            raise UserError(f"Sheet 'classes' of '{excelfile}' has a missing value in row {index + 2}, column 'super'")
-    if any(all_classes_df.get(lang) is not None for lang in languages):
-        warnings.warn(
-            f"The file {excelfile} uses {languages} as column titles, which is deprecated. "
-            f"Please use {[f'label_{lang}' for lang in languages]}"
-        )
+    if validation_problem := _validate_excel_file(classes_df, resource_dfs):
+        err_msg = validation_problem.execute_error_protocol()
+        raise InputError(err_msg)
 
     # transform every row into a resource
-    resources = [_row2resource(row, excelfile) for i, row in all_classes_df.iterrows()]
-
-    # check if resource names are unique
-    all_names = [r["name"] for r in resources]
-    if duplicates := {
-        index + 2: resdef["name"] for index, resdef in enumerate(resources) if all_names.count(resdef["name"]) > 1
-    }:
-        err_msg = (
-            f"Resource names must be unique inside every ontology, "
-            f"but your Excel file '{excelfile}' contains duplicates:\n"
-        )
-        for row_no, resname in duplicates.items():
-            err_msg += f" - Row {row_no}: {resname}\n"
-        raise UserError(err_msg)
+    resources = [_row2resource(row, resource_dfs[row["name"]]) for _, row in classes_df.iterrows()]
 
     # write final "resources" section into a JSON file
-    _validate_resources(resources_list=resources, excelfile=excelfile)
+    _validate_resources(resources_list=resources)
 
     if path_to_output_file:
         with open(file=path_to_output_file, mode="w", encoding="utf-8") as file:
@@ -241,3 +218,43 @@ def excel2resources(
             print(f"resources section was created successfully and written to file '{path_to_output_file}'")
 
     return resources, True
+
+
+def _validate_excel_file(
+    classes_df: pd.DataFrame, df_dict: dict[str, pd.DataFrame]
+) -> ResourcesSheetsNotAsExpected | None:
+    """
+    This function validates the sheet "classes"
+    and checks that the names of the other sheets are identical to the class names.
+
+    Args:
+        classes_df: content of the sheet "classes"
+        df_dict: all other sheets of the Excel file, in the form {sheet_name: dataframe}
+
+    Raises:
+        UserError: if a row of the sheet "classes" has no value in the column "super"
+        InputError: if check_column_for_duplicate reports a problem for the column "name"
+
+    Returns:
+        a problem object if the sheet names and the class names differ, otherwise None
+    """
+    for index, row in classes_df.iterrows():
+        index = int(str(index))  # index is a label/index/hashable, but we need an int
+        if not check_notna(row["super"]):
+            raise UserError(
+                f"Sheet 'classes' of 'resources.xlsx' has a missing value in row {index + 2}, column 'super'"
+            )
+    if any(classes_df.get(lang) is not None for lang in languages):
+        warnings.warn(
+            f"The file 'resources.xlsx' uses {languages} as column titles, which is deprecated. "
+            f"Please use {[f'label_{lang}' for lang in languages]}"
+        )
+    duplicate_check = check_column_for_duplicate(classes_df, "name")
+    if duplicate_check:
+        msg = "The excel file 'resources.xlsx', sheet 'classes' has a problem.\n"
+        msg += duplicate_check.execute_error_protocol()
+        raise InputError(msg)
+    # check that all the sheets have an entry in the names column and vice versa
+    if (all_names := set(classes_df["name"].tolist())) != (all_sheets := set(df_dict.keys())):
+        return ResourcesSheetsNotAsExpected(all_names, all_sheets)
+    return None
diff --git a/src/dsp_tools/commands/excel2json/utils.py b/src/dsp_tools/commands/excel2json/utils.py
index 2247cbe60..ab78926eb 100644
--- a/src/dsp_tools/commands/excel2json/utils.py
+++ b/src/dsp_tools/commands/excel2json/utils.py
@@ -7,7 +7,12 @@
 import pandas as pd
 import regex
 
-from dsp_tools.commands.excel2json.input_error import DuplicatesInColumnProblem, RequiredColumnMissingProblem
+from dsp_tools.commands.excel2json.input_error import (
+    DuplicatesInColumnProblem,
+    InvalidSheetNameProblem,
+    RequiredColumnMissingProblem,
+)
+from dsp_tools.models.exceptions import InputError
 
 languages = ["en", "de", "fr", "it", "rm"]
 
@@ -39,6 +44,39 @@ def read_and_clean_excel_file(excelfile: str, sheetname: str | int = 0) -> pd.Da
     return read_df
 
 
+def read_and_clean_all_sheets(excelfile: str) -> dict[str, pd.DataFrame]:
+    """
+    This function reads an Excel file with all its sheets.
+    If there is a ValueError, it patches the openpyxl part that causes the error
+    and opens it with that patch.
+    It cleans the dataframes and then returns them in the form {sheet_name: dataframe}.
+
+    Args:
+        excelfile: path to the Excel file
+
+    Returns:
+        All sheets of the excel file, in the form of a dictionary {sheet_name: dataframe}
+
+    Raises:
+        InputError: If the sheets are not correctly named
+    """
+    try:
+        df_dict = pd.read_excel(excelfile, sheet_name=None)
+    except ValueError:
+        # Pandas relies on openpyxl to parse XLSX files.
+        # A strange behavior of openpyxl prevents pandas from opening files with some formatting properties
+        # (unclear which formatting properties exactly).
+        # Apparently, the excel2json test files have one of the unsupported formatting properties.
+        # Credits: https://stackoverflow.com/a/70537454/14414188
+        with mock.patch("openpyxl.styles.fonts.Font.family.max", new=100):
+            df_dict = pd.read_excel(excelfile, sheet_name=None)
+    try:
+        return {name.strip(): clean_data_frame(df) for name, df in df_dict.items()}
+    except AttributeError:
+        msg = InvalidSheetNameProblem(excelfile, list(df_dict.keys())).execute_error_protocol()
+        raise InputError(msg) from None
+
+
 def clean_data_frame(df: pd.DataFrame) -> pd.DataFrame:
     """
     This function takes a pd.DataFrame and removes:
diff --git a/test/unittests/commands/excel2json/test_resources.py b/test/unittests/commands/excel2json/test_resources.py
index 3c014c5c4..8e6a4b66e 100644
--- a/test/unittests/commands/excel2json/test_resources.py
+++ b/test/unittests/commands/excel2json/test_resources.py
@@ -181,10 +181,10 @@ def test_cardinalities(self) -> None:
 class TestValidateWithSchema:
     # it is not possible to call the method to be tested directly.
     # So let's make a reference to it, so that it can be found by the usage search
-    lambda x: e2j._validate_resources([], "file")  # pylint: disable=expression-not-assigned,protected-access
+    lambda x: e2j._validate_resources([])  # pylint: disable=expression-not-assigned,protected-access
 
     def test_invalid_super(self) -> None:
-        expected_msg = re.escape(
+        expected_msg = (
             "\nThe Excel file 'testdata/invalid-testdata/excel2json/resources-invalid-super.xlsx' "
             "did not pass validation.\n"
             " Section of the problem: 'Resources'\n"
@@ -197,7 +197,7 @@ def test_invalid_super(self) -> None:
             e2j.excel2resources("testdata/invalid-testdata/excel2json/resources-invalid-super.xlsx", "")
 
     def test_sheet_invalid_cardinality(self) -> None:
-        expected_msg = re.escape(
+        expected_msg = (
             "\nThe Excel file 'testdata/invalid-testdata/excel2json/resources-invalid-cardinality.xlsx' "
             "did not pass validation.\n"
             " Section of the problem: 'Resources'\n"
@@ -209,7 +209,7 @@ def test_sheet_invalid_cardinality(self) -> None:
             e2j.excel2resources("testdata/invalid-testdata/excel2json/resources-invalid-cardinality.xlsx", "")
 
     def test_invalid_property(self) -> None:
-        expected_msg = re.escape(
+        expected_msg = (
             "\nThe Excel file 'testdata/invalid-testdata/excel2json/resources-invalid-property.xlsx' "
             "did not pass validation.\n"
             " Section of the problem: 'Resources'\n"
@@ -221,20 +221,24 @@ def test_invalid_property(self) -> None:
             e2j.excel2resources("testdata/invalid-testdata/excel2json/resources-invalid-property.xlsx", "")
 
     def test_duplicate_name(self) -> None:
-        expected_msg = re.escape(
-            (
-                "Resource names must be unique inside every ontology, but your Excel file "
-                "'testdata/invalid-testdata/excel2json/resources-duplicate-name.xlsx' contains duplicates:\n"
-                " - Row 3: MentionedPerson\n"
-                " - Row 4: MentionedPerson"
-            )
+        expected_msg = re.escape(
+            "The excel file 'resources.xlsx', sheet 'classes' has a problem.\n"
+            "No duplicates are allowed in the column 'name'\n"
+            "The following values appear several times:\n"
+            " - MentionedPerson"
         )
         with pytest.raises(BaseError, match=expected_msg):
             e2j.excel2resources("testdata/invalid-testdata/excel2json/resources-duplicate-name.xlsx", "")
 
     def test_missing_sheet(self) -> None:
-        expected_msg = re.escape("Worksheet named 'GenericAnthroponym' not found")
-        with pytest.raises(BaseError, match=expected_msg):
+        expected_msg = re.escape(
+            "The excel file 'resources.xlsx' has problems.\n"
+            "The names of the excel sheets must be 'classes' "
+            "plus all the entries in the column 'name' from the sheet 'classes'.\n"
+            "The following sheet(s) are missing:\n"
+            " - GenericAnthroponym"
+        )
+        with pytest.raises(InputError, match=expected_msg):
             e2j.excel2resources("testdata/invalid-testdata/excel2json/resources-invalid-missing-sheet.xlsx", "")
 
 