Skip to content

Commit

Permalink
refactor(excel2json): new excel reader (DEV-3049) (#665)
Browse files Browse the repository at this point in the history
Co-authored-by: Johannes Nussbaum <39048939+jnussbaum@users.noreply.github.com>
  • Loading branch information
Nora-Olivia-Ammann and jnussbaum committed Dec 4, 2023
1 parent fdbc545 commit a0d5776
Show file tree
Hide file tree
Showing 4 changed files with 158 additions and 63 deletions.
56 changes: 55 additions & 1 deletion src/dsp_tools/commands/excel2json/input_error.py
@@ -1,5 +1,5 @@
from dataclasses import dataclass
from typing import Protocol
from typing import Any, Protocol

# pylint: disable=too-few-public-methods

Expand Down Expand Up @@ -114,6 +114,60 @@ def execute_error_protocol(self) -> str:
)


@dataclass(frozen=True)
class InvalidSheetNameProblem:
    """Problem describing an Excel file in which at least one sheet name is not a string."""

    # path of the Excel file, as it should appear in the message
    excelfile: str
    # all the sheet names of the file, valid and invalid ones alike
    excel_sheet_names: list[Any]

    def execute_error_protocol(self) -> str:
        """
        This function composes the message that tells the user which sheet names are invalid.

        Returns:
            message for the error
        """
        # only the names that are not strings are reported, together with their actual type
        offenders = []
        for sheet_name in self.excel_sheet_names:
            if isinstance(sheet_name, str):
                continue
            offenders.append(f"Name: {sheet_name} | Type {type(sheet_name)}")
        # list_separator is defined outside this block; it introduces every entry, including the first one
        bullet_list = list_separator + list_separator.join(offenders)
        intro = f"The names sheets in the excel '{self.excelfile}' are not all valid.\n"
        explanation = "They must be of type string. The following names are problematic:\n"
        return intro + explanation + bullet_list + "\n" + "Please rename them."


@dataclass(frozen=True)
class ResourcesSheetsNotAsExpected:
    """
    This class contains information if the names of the excel sheets
    and the entries in the column 'name' of the sheet 'classes' are not identical.
    """

    # entries of the column 'name' in the sheet 'classes'
    names_classes: set[str]
    # names of the sheets that are compared against the class names
    names_sheets: set[str]

    def execute_error_protocol(self) -> str:
        """
        This function initiates all the steps for successful problem communication with the user.

        The message consists of a fixed introduction,
        followed by up to two sections: one listing the sheets that are missing,
        and one listing the sheets that have no corresponding entry in the sheet 'classes'.

        Returns:
            message for the error
        """
        intro = (
            "The excel file 'resources.xlsx' has problems.\n"
            "The names of the excel sheets must be 'classes' "
            "plus all the entries in the column 'name' from the sheet 'classes'.\n"
        )
        sections: list[str] = []
        # sets have no stable iteration order, so the entries are sorted to get a reproducible message
        missing_sheets = sorted(self.names_classes - self.names_sheets)
        if missing_sheets:
            sections.append(f"The following sheet(s) are missing:{list_separator}" + list_separator.join(missing_sheets))
        missing_names = sorted(self.names_sheets - self.names_classes)
        if missing_names:
            sections.append(
                (
                    f"The following sheet(s) do not have an entry in the 'name' column "
                    f"of the sheet 'classes':{list_separator}"
                )
                + list_separator.join(missing_names)
            )
        # the sections must be separated by a line break,
        # otherwise the second one is glued to the last entry of the first one
        return intro + "\n".join(sections)


@dataclass(frozen=True)
class JsonValidationPropertyProblem:
"""This class contains information about a JSON property section that fails its validation against the schema."""
Expand Down
95 changes: 47 additions & 48 deletions src/dsp_tools/commands/excel2json/resources.py
Expand Up @@ -8,25 +8,25 @@
import pandas as pd
import regex

from dsp_tools.commands.excel2json.input_error import JsonValidationResourceProblem, PositionInExcel
from dsp_tools.commands.excel2json.utils import read_and_clean_excel_file
from dsp_tools.commands.excel2json.input_error import (
JsonValidationResourceProblem,
PositionInExcel,
ResourcesSheetsNotAsExpected,
)
from dsp_tools.commands.excel2json.utils import check_column_for_duplicate, read_and_clean_all_sheets
from dsp_tools.models.exceptions import InputError, UserError
from dsp_tools.utils.shared import check_notna, prepare_dataframe

languages = ["en", "de", "fr", "it", "rm"]


def _validate_resources(
resources_list: list[dict[str, Any]],
excelfile: str,
) -> None:
def _validate_resources(resources_list: list[dict[str, Any]]) -> None:
"""
This function checks if the "resources" section of a JSON project file is valid according to the JSON schema,
and if the resource names are unique.
Args:
resources_list: the "resources" section of a JSON project as a list of dicts
excelfile: path to the Excel file containing the resources
Raises:
InputError: if the validation fails
Expand All @@ -38,11 +38,11 @@ def _validate_resources(
try:
jsonschema.validate(instance=resources_list, schema=resources_schema)
except jsonschema.ValidationError as err:
err_msg = _find_validation_problem(
validation_problem = _find_validation_problem(
validation_error=err,
resources_list=resources_list,
)
msg = f"\nThe Excel file '{excelfile}' did not pass validation." + err_msg.execute_error_protocol()
msg = "\nThe Excel file 'resources.xlsx' did not pass validation." + validation_problem.execute_error_protocol()
raise InputError(msg) from None


Expand Down Expand Up @@ -92,7 +92,7 @@ def _find_validation_problem(

def _row2resource(
df_row: pd.Series,
excelfile: str,
details_df: pd.DataFrame,
) -> dict[str, Any]:
"""
Method that reads one row from the "classes" DataFrame,
Expand All @@ -101,7 +101,7 @@ def _row2resource(
Args:
df_row: row from the "classes" DataFrame
excelfile: Excel file where the data comes from
details_df: Excel sheet of the individual class
Raises:
UserError: if the row or the details sheet contains invalid data
Expand All @@ -117,16 +117,10 @@ def _row2resource(
comments = {lang: df_row[f"comment_{lang}"] for lang in languages if df_row.get(f"comment_{lang}")}
supers = [s.strip() for s in df_row["super"].split(",")]

# load the cardinalities of this resource
# if the excel sheet does not exist, pandas raises a ValueError
try:
details_df = read_and_clean_excel_file(excelfile=excelfile, sheetname=name)
except ValueError as err:
raise UserError(str(err)) from None
details_df = prepare_dataframe(
df=details_df,
required_columns=["Property", "Cardinality"],
location_of_sheet=f"Sheet '{name}' in file '{excelfile}'",
location_of_sheet=f"Sheet '{name}' in file 'resources.xlsx'",
)

# validation
Expand All @@ -150,7 +144,7 @@ def _row2resource(
validation_passed = False
if not validation_passed:
raise UserError(
f"Sheet '{name}' in file '{excelfile}' has invalid content in column 'gui_order': "
f"Sheet '{name}' in file 'resources.xlsx' has invalid content in column 'gui_order': "
f"only positive integers allowed (or leave column empty altogether)"
)

Expand Down Expand Up @@ -197,47 +191,52 @@ def excel2resources(
and the success status (True if everything went well)
"""

# load file
all_classes_df = read_and_clean_excel_file(excelfile=excelfile)
all_classes_df = prepare_dataframe(
df=all_classes_df,
resource_dfs = read_and_clean_all_sheets(excelfile)
classes_df = resource_dfs.pop("classes")
classes_df = prepare_dataframe(
df=classes_df,
required_columns=["name"],
location_of_sheet=f"Sheet 'classes' in file '{excelfile}'",
)

# validation
for index, row in all_classes_df.iterrows():
index = int(str(index)) # index is a label/index/hashable, but we need an int
if not check_notna(row["super"]):
raise UserError(f"Sheet 'classes' of '{excelfile}' has a missing value in row {index + 2}, column 'super'")
if any(all_classes_df.get(lang) is not None for lang in languages):
warnings.warn(
f"The file {excelfile} uses {languages} as column titles, which is deprecated. "
f"Please use {[f'label_{lang}' for lang in languages]}"
)
if validation_problem := _validate_excel_file(classes_df, resource_dfs):
err_msg = validation_problem.execute_error_protocol()
raise InputError(err_msg)

# transform every row into a resource
resources = [_row2resource(row, excelfile) for i, row in all_classes_df.iterrows()]

# check if resource names are unique
all_names = [r["name"] for r in resources]
if duplicates := {
index + 2: resdef["name"] for index, resdef in enumerate(resources) if all_names.count(resdef["name"]) > 1
}:
err_msg = (
f"Resource names must be unique inside every ontology, "
f"but your Excel file '{excelfile}' contains duplicates:\n"
)
for row_no, resname in duplicates.items():
err_msg += f" - Row {row_no}: {resname}\n"
raise UserError(err_msg)
resources = [_row2resource(row, resource_dfs[row["name"]]) for i, row in classes_df.iterrows()]

# write final "resources" section into a JSON file
_validate_resources(resources_list=resources, excelfile=excelfile)
_validate_resources(resources_list=resources)

if path_to_output_file:
with open(file=path_to_output_file, mode="w", encoding="utf-8") as file:
json.dump(resources, file, indent=4, ensure_ascii=False)
print(f"resources section was created successfully and written to file '{path_to_output_file}'")

return resources, True


def _validate_excel_file(
    classes_df: pd.DataFrame, df_dict: dict[str, pd.DataFrame]
) -> ResourcesSheetsNotAsExpected | None:
    """
    This function checks the content of the sheet 'classes'
    and compares its column 'name' with the names of the other sheets.

    Args:
        classes_df: dataframe of the sheet 'classes'
        df_dict: the other sheets, in the form {sheet_name: dataframe}

    Raises:
        UserError: if a row of the sheet 'classes' has no value in the column 'super'
        InputError: if the column 'name' of the sheet 'classes' contains duplicates

    Returns:
        a problem object if the sheet names and the class names are not identical, None otherwise
    """
    for row_label, class_row in classes_df.iterrows():
        row_position = int(str(row_label))  # the label is a hashable, but an int is needed for the row number
        if check_notna(class_row["super"]):
            continue
        raise UserError(
            f"Sheet 'classes' of 'resources.xlsx' has a missing value in row {row_position + 2}, column 'super'"
        )

    # bare language codes as column titles are still accepted, but the user is told to switch
    only_new_titles = all(classes_df.get(lang) is None for lang in languages)
    if not only_new_titles:
        new_titles = [f"label_{lang}" for lang in languages]
        warnings.warn(
            f"The file 'resources.xlsx' uses {languages} as column titles, which is deprecated. "
            f"Please use {new_titles}"
        )

    if duplicate_problem := check_column_for_duplicate(classes_df, "name"):
        raise InputError(
            "The excel file 'resources.xlsx', sheet 'classes' has a problem.\n"
            + duplicate_problem.execute_error_protocol()
        )

    # every sheet needs an entry in the column 'name', and every entry needs a sheet
    class_names = set(classes_df["name"].tolist())
    sheet_names = set(df_dict.keys())
    if class_names == sheet_names:
        return None
    return ResourcesSheetsNotAsExpected(class_names, sheet_names)
40 changes: 39 additions & 1 deletion src/dsp_tools/commands/excel2json/utils.py
Expand Up @@ -7,7 +7,12 @@
import pandas as pd
import regex

from dsp_tools.commands.excel2json.input_error import DuplicatesInColumnProblem, RequiredColumnMissingProblem
from dsp_tools.commands.excel2json.input_error import (
DuplicatesInColumnProblem,
InvalidSheetNameProblem,
RequiredColumnMissingProblem,
)
from dsp_tools.models.exceptions import InputError

languages = ["en", "de", "fr", "it", "rm"]

Expand Down Expand Up @@ -39,6 +44,39 @@ def read_and_clean_excel_file(excelfile: str, sheetname: str | int = 0) -> pd.Da
return read_df


def read_and_clean_all_sheets(excelfile: str) -> dict[str, pd.DataFrame]:
    """
    This function reads an Excel file with all its sheets.
    If there is a ValueError, it patches the openpyxl part that causes the error
    and opens it with that patch.
    It cleans the dataframes and then returns them in the form {sheet_name: dataframe}.
    Leading and trailing whitespace is removed from the sheet names.

    Args:
        excelfile: path to the Excel file

    Returns:
        All sheets of the excel file, in the form of a dictionary {sheet_name: dataframe}

    Raises:
        InputError: If the sheets are not correctly named
    """
    try:
        df_dict = pd.read_excel(excelfile, sheet_name=None)
    except ValueError:
        # Pandas relies on openpyxl to parse XLSX files.
        # A strange behavior of openpyxl prevents pandas from opening files with some formatting properties
        # (unclear which formatting properties exactly).
        # Apparently, the excel2json test files have one of the unsupported formatting properties.
        # Credits: https://stackoverflow.com/a/70537454/14414188
        with mock.patch("openpyxl.styles.fonts.Font.family.max", new=100):
            df_dict = pd.read_excel(excelfile, sheet_name=None)
    # The names are checked before the dataframes are cleaned,
    # so that an AttributeError raised during the cleaning is not misreported as an invalid sheet name.
    if not all(isinstance(name, str) for name in df_dict):
        msg = InvalidSheetNameProblem(excelfile, list(df_dict.keys())).execute_error_protocol()
        raise InputError(msg)
    # strip() without argument removes whitespace; strip("") would remove nothing at all
    return {name.strip(): clean_data_frame(df) for name, df in df_dict.items()}


def clean_data_frame(df: pd.DataFrame) -> pd.DataFrame:
"""
This function takes a pd.DataFrame and removes:
Expand Down
30 changes: 17 additions & 13 deletions test/unittests/commands/excel2json/test_resources.py
Expand Up @@ -181,10 +181,10 @@ def test_cardinalities(self) -> None:
class TestValidateWithSchema:
# it is not possible to call the method to be tested directly.
# So let's make a reference to it, so that it can be found by the usage search
lambda x: e2j._validate_resources([], "file") # pylint: disable=expression-not-assigned,protected-access
lambda x: e2j._validate_resources([]) # pylint: disable=expression-not-assigned,protected-access

def test_invalid_super(self) -> None:
expected_msg = re.escape(
expected_msg = (
"\nThe Excel file 'testdata/invalid-testdata/excel2json/resources-invalid-super.xlsx' "
"did not pass validation.\n"
" Section of the problem: 'Resources'\n"
Expand All @@ -197,7 +197,7 @@ def test_invalid_super(self) -> None:
e2j.excel2resources("testdata/invalid-testdata/excel2json/resources-invalid-super.xlsx", "")

def test_sheet_invalid_cardinality(self) -> None:
expected_msg = re.escape(
expected_msg = (
"\nThe Excel file 'testdata/invalid-testdata/excel2json/resources-invalid-cardinality.xlsx' "
"did not pass validation.\n"
" Section of the problem: 'Resources'\n"
Expand All @@ -209,7 +209,7 @@ def test_sheet_invalid_cardinality(self) -> None:
e2j.excel2resources("testdata/invalid-testdata/excel2json/resources-invalid-cardinality.xlsx", "")

def test_invalid_property(self) -> None:
expected_msg = re.escape(
expected_msg = (
"\nThe Excel file 'testdata/invalid-testdata/excel2json/resources-invalid-property.xlsx' "
"did not pass validation.\n"
" Section of the problem: 'Resources'\n"
Expand All @@ -221,20 +221,24 @@ def test_invalid_property(self) -> None:
e2j.excel2resources("testdata/invalid-testdata/excel2json/resources-invalid-property.xlsx", "")

def test_duplicate_name(self) -> None:
expected_msg = re.escape(
(
"Resource names must be unique inside every ontology, but your Excel file "
"'testdata/invalid-testdata/excel2json/resources-duplicate-name.xlsx' contains duplicates:\n"
" - Row 3: MentionedPerson\n"
" - Row 4: MentionedPerson"
)
expected_msg = (
"The excel file 'resources.xlsx', sheet 'classes' has a problem.\n"
"No duplicates are allowed in the column 'name'\n"
"The following values appear several times:\n"
" - MentionedPerson"
)
with pytest.raises(BaseError, match=expected_msg):
e2j.excel2resources("testdata/invalid-testdata/excel2json/resources-duplicate-name.xlsx", "")

def test_missing_sheet(self) -> None:
expected_msg = re.escape("Worksheet named 'GenericAnthroponym' not found")
with pytest.raises(BaseError, match=expected_msg):
expected_msg = re.escape(
"The excel file 'resources.xlsx' has problems.\n"
"The names of the excel sheets must be 'classes' "
"plus all the entries in the column 'name' from the sheet 'classes'.\n"
"The following sheet(s) are missing:\n"
" - GenericAnthroponym"
)
with pytest.raises(InputError, match=expected_msg):
e2j.excel2resources("testdata/invalid-testdata/excel2json/resources-invalid-missing-sheet.xlsx", "")


Expand Down

0 comments on commit a0d5776

Please sign in to comment.