Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(excel2json): support uppercase classes sheet in resources.xlsx (DEV-3109) #683

Merged
merged 7 commits into from
Dec 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/dsp_tools/commands/excel2json/input_error.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ class MoreThanOneSheetProblem:
excelname: str
sheet_names: list[str]

def __str__(self) -> str:
def execute_error_protocol(self) -> str:
msg = [
f"\nIn the '{self.excelname}' file only one sheet is allowed.",
f"The excel used contains the following sheets:{list_separator}{list_separator.join(self.sheet_names)}",
Expand Down
2 changes: 1 addition & 1 deletion src/dsp_tools/commands/excel2json/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ def _make_json_lists_from_excel(
def _read_and_check_workbook(excelpath: Path) -> Worksheet:
    """
    Load an Excel workbook and return its only worksheet.

    Args:
        excelpath: path to the Excel file

    Raises:
        InputError: if the workbook does not contain exactly one worksheet

    Returns:
        the single worksheet of the workbook
    """
    all_worksheets = load_workbook(excelpath, read_only=True).worksheets
    if len(all_worksheets) != 1:
        # the user-facing message is built by execute_error_protocol(), not by str()
        problem = MoreThanOneSheetProblem(excelpath.name, [x.title for x in all_worksheets])
        raise InputError(problem.execute_error_protocol())
    return all_worksheets[0]

Expand Down
2 changes: 1 addition & 1 deletion src/dsp_tools/commands/excel2json/properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,6 @@ def excel2properties(
def _read_check_property_df(excelfile: str) -> pd.DataFrame | None:
    """
    Read all sheets of the properties Excel file and return the only one as a dataframe.

    Args:
        excelfile: path to the Excel file, passed on to read_and_clean_all_sheets()

    Raises:
        InputError: if the file does not contain exactly one sheet

    Returns:
        the dataframe of the single sheet
    """
    sheets_df_dict = read_and_clean_all_sheets(excelfile=excelfile)
    if len(sheets_df_dict) != 1:
        # the user-facing message is built by execute_error_protocol(), not by str()
        problem = MoreThanOneSheetProblem("properties.xlsx", list(sheets_df_dict))
        raise InputError(problem.execute_error_protocol())
    return next(iter(sheets_df_dict.values()))
71 changes: 44 additions & 27 deletions src/dsp_tools/commands/excel2json/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@

from dsp_tools.commands.excel2json.input_error import (
JsonValidationResourceProblem,
MissingValuesInRowProblem,
PositionInExcel,
Problem,
ResourcesSheetsNotAsExpected,
)
from dsp_tools.commands.excel2json.utils import check_column_for_duplicate, read_and_clean_all_sheets
Expand Down Expand Up @@ -191,17 +193,14 @@ def excel2resources(
and the success status (True if everything went well)
"""

resource_dfs = read_and_clean_all_sheets(excelfile)
classes_df = resource_dfs.pop("classes")
classes_df = prepare_dataframe(
df=classes_df,
required_columns=["name"],
location_of_sheet=f"Sheet 'classes' in file '{excelfile}'",
)
all_dfs = read_and_clean_all_sheets(excelfile)
classes_df, resource_dfs = _prepare_classes_df(all_dfs)

if validation_problem := _validate_excel_file(classes_df, resource_dfs):
err_msg = validation_problem.execute_error_protocol()
raise InputError(err_msg)
if validation_problems := _validate_excel_file(classes_df, resource_dfs):
msg = "The excel file 'resources.xlsx', sheet 'classes' has a problem.\n" + "\n\n".join(
(x.execute_error_protocol() for x in validation_problems)
)
raise InputError(msg)

# transform every row into a resource
resources = [_row2resource(row, resource_dfs[row["name"]]) for i, row in classes_df.iterrows()]
Expand All @@ -217,26 +216,44 @@ def excel2resources(
return resources, True


def _validate_excel_file(
classes_df: pd.DataFrame, df_dict: dict[str, pd.DataFrame]
) -> ResourcesSheetsNotAsExpected | None:
for i, row in classes_df.iterrows():
index = int(str(i)) # index is a label/index/hashable, but we need an int
if not check_notna(row["super"]):
raise UserError(
f"Sheet 'classes' of 'resources.xlsx' has a missing value in row {index + 2}, column 'super'"
)
def _prepare_classes_df(resource_dfs: dict[str, pd.DataFrame]) -> tuple[pd.DataFrame, dict[str, pd.DataFrame]]:
    """
    Separate the 'classes' sheet from the other sheets and prepare it for further processing.

    Surrounding whitespace is stripped from all sheet names,
    and the name 'classes' is recognised regardless of upper/lower case.

    Args:
        resource_dfs: all sheets of the Excel file, keyed by sheet name

    Raises:
        InputError: if there is no sheet called 'classes', or if there is more than one

    Returns:
        the prepared classes dataframe, and the remaining sheets keyed by their stripped names
    """
    resource_dfs = {k.strip(): v for k, v in resource_dfs.items()}
    # Compare the whole name and keep the real dict key:
    # a substring match would also catch sheets like "subclasses",
    # and the matched text alone is not necessarily a key that can be popped.
    cls_sheet_names = [name for name in resource_dfs if name.lower() == "classes"]
    if not cls_sheet_names:
        msg = ResourcesSheetsNotAsExpected(set(), names_sheets={"classes"}).execute_error_protocol()
        raise InputError(msg)
    if len(cls_sheet_names) > 1:
        msg = (
            "The excel file 'resources.xlsx' has some problems.\n"
            "There is more than one excel sheet called 'classes'.\n"
            "This is a protected name and cannot be used for other sheets."
        )
        raise InputError(msg)
    classes_df = prepare_dataframe(
        df=resource_dfs.pop(cls_sheet_names[0]),
        required_columns=["name"],
        location_of_sheet="Sheet 'classes' in file 'resources.xlsx'",
    )
    return classes_df, resource_dfs


def _validate_excel_file(classes_df: pd.DataFrame, df_dict: dict[str, pd.DataFrame]) -> list[Problem]:
    """
    Check the 'classes' sheet and the resource sheets, and collect every problem found.

    Problems are collected instead of raised one by one, so that the caller can report all of them at once.
    A deprecation warning is emitted if the bare language codes are used as column titles.

    Args:
        classes_df: dataframe of the 'classes' sheet
        df_dict: the remaining sheets, keyed by sheet name

    Returns:
        a list of the problems found, which is empty if there are none
    """
    if any(classes_df.get(lang) is not None for lang in languages):
        warnings.warn(
            f"The file 'resources.xlsx' uses {languages} as column titles, which is deprecated. "
            f"Please use {[f'label_{lang}' for lang in languages]}"
        )
    problems: list[Problem] = []
    # NOTE(review): the +2 presumably maps the dataframe index to the Excel row number (header row, 1-based) - confirm
    if missing_super_rows := [int(index) + 2 for index, row in classes_df.iterrows() if not check_notna(row["super"])]:
        problems.append(MissingValuesInRowProblem(column="super", row_numbers=missing_super_rows))
    if duplicate_check := check_column_for_duplicate(classes_df, "name"):
        problems.append(duplicate_check)
    # check that all the sheets have an entry in the names column and vice versa
    if (all_names := set(classes_df["name"].tolist())) != (all_sheets := set(df_dict)):
        problems.append(ResourcesSheetsNotAsExpected(all_names, all_sheets))
    return problems
2 changes: 1 addition & 1 deletion src/dsp_tools/commands/excel2xml/excel2xml_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ def find_date_in_string(string: str) -> Optional[str]:


def prepare_value(
value: Union[PropertyElement, str, int, float, bool, Iterable[Union[PropertyElement, str, int, float, bool]]]
value: Union[PropertyElement, str, int, float, bool, Iterable[Union[PropertyElement, str, int, float, bool]]],
) -> list[PropertyElement]:
"""
This method transforms the parameter "value" from a make_*_prop() method into a list of PropertyElements. "value" is
Expand Down