From 504a4ec0d52c1199d42db5fa7eba70f35c0f4ac6 Mon Sep 17 00:00:00 2001 From: Nora-Olivia-Ammann <103038637+Nora-Olivia-Ammann@users.noreply.github.com> Date: Wed, 30 Aug 2023 15:27:09 +0200 Subject: [PATCH] refactor: excel2json (DEV-2547) (#487) * Creation of excel_to_json folder Creation of folder and changes of import paths. * Update test_shared.py Cleaning Datatype * Update test_shared.py Removal of unnecessary statements. * Excel_to_json_properties compliance The compliance checks for the files were changed and testing added. To this end several generic functions were created and tested. The creation and validation of the json string remains wip. * Cleaning of names * Documentation * Style Fix * Style Fix * Update test_utils.py * linting corrections * mock excel upload cleaning * Linting Fixes * Linting Fixes * Fix Testing * Update Test Data * Rename Files * Rename Test files * Shortening docstring in properties * Formatting * Changes according to review * Update src/dsp_tools/utils/excel_to_json/properties.py Co-authored-by: Johannes Nussbaum <39048939+jnussbaum@users.noreply.github.com> * Changes PR Review * Consistency Naming Convention * Naming changes * Cleaning up docstrings --------- Co-authored-by: Johannes Nussbaum <39048939+jnussbaum@users.noreply.github.com> --- src/dsp_tools/cli.py | 8 +- src/dsp_tools/fast_xmlupload/process_files.py | 1 - src/dsp_tools/fast_xmlupload/upload_xml.py | 3 +- src/dsp_tools/models/sipi.py | 4 +- src/dsp_tools/models/xmlbitstream.py | 2 +- src/dsp_tools/models/xmlvalue.py | 2 +- src/dsp_tools/utils/excel_to_json/__init__.py | 0 .../lists.py} | 0 .../project.py} | 6 +- .../utils/excel_to_json/properties.py | 463 ++++++++++++++ .../resources.py} | 14 +- src/dsp_tools/utils/excel_to_json/utils.py | 286 +++++++++ .../utils/excel_to_json_properties.py | 218 ------- src/dsp_tools/utils/id_to_iri.py | 3 +- src/dsp_tools/utils/project_create.py | 2 +- src/dsp_tools/utils/project_create_lists.py | 2 +- src/dsp_tools/utils/project_validate.py | 2 +- src/dsp_tools/utils/shared.py | 2 +- src/dsp_tools/utils/stack_handling.py | 3 +- src/dsp_tools/utils/xml_upload.py | 2 +- test/e2e/test_cli.py | 4 +- test/unittests/test_excel2xml.py | 2 +- test/unittests/test_excel_to_json/__init__.py | 0 .../test_lists.py} | 2 +- .../test_excel_to_json/test_properties.py | 592 ++++++++++++++++++ .../test_resources.py} | 2 +- .../test_excel_to_json/test_utils.py | 185 ++++++ .../test_excel_to_json_properties.py | 358 ----------- test/unittests/test_id_to_iri.py | 2 +- test/unittests/test_shared.py | 10 +- test/unittests/test_xmlupload.py | 2 +- .../excel2json/properties-invalid-super.xlsx | Bin 7053 -> 9254 bytes 32 files changed, 1563 insertions(+), 619 deletions(-) create mode 100644 src/dsp_tools/utils/excel_to_json/__init__.py rename src/dsp_tools/utils/{excel_to_json_lists.py => excel_to_json/lists.py} (100%) rename src/dsp_tools/utils/{excel_to_json_project.py => excel_to_json/project.py} (96%) create mode 100644 src/dsp_tools/utils/excel_to_json/properties.py rename src/dsp_tools/utils/{excel_to_json_resources.py => excel_to_json/resources.py} (95%) create mode 100644 src/dsp_tools/utils/excel_to_json/utils.py delete mode 100644 src/dsp_tools/utils/excel_to_json_properties.py create mode 100644 test/unittests/test_excel_to_json/__init__.py rename test/unittests/{test_excel_to_json_lists.py => test_excel_to_json/test_lists.py} (99%) create mode 100644 test/unittests/test_excel_to_json/test_properties.py rename test/unittests/{test_excel_to_json_resources.py => test_excel_to_json/test_resources.py} (99%) create mode 100644 test/unittests/test_excel_to_json/test_utils.py delete mode 100644 test/unittests/test_excel_to_json_properties.py diff --git a/src/dsp_tools/cli.py b/src/dsp_tools/cli.py index 81a5a10e6..099e55a43 100644 --- a/src/dsp_tools/cli.py +++ b/src/dsp_tools/cli.py @@ -13,10 +13,10 @@ from dsp_tools.fast_xmlupload.upload_files import upload_files from dsp_tools.fast_xmlupload.upload_xml import fast_xmlupload from dsp_tools.models.exceptions import UserError -from dsp_tools.utils.excel_to_json_lists import excel2lists, validate_lists_section_with_schema -from dsp_tools.utils.excel_to_json_project import excel2json -from dsp_tools.utils.excel_to_json_properties import excel2properties -from dsp_tools.utils.excel_to_json_resources import excel2resources +from dsp_tools.utils.excel_to_json.lists import excel2lists, validate_lists_section_with_schema +from dsp_tools.utils.excel_to_json.project import excel2json +from dsp_tools.utils.excel_to_json.properties import excel2properties +from dsp_tools.utils.excel_to_json.resources import excel2resources from dsp_tools.utils.generate_templates import generate_template_repo from dsp_tools.utils.id_to_iri import id_to_iri from dsp_tools.utils.logging import get_logger diff --git a/src/dsp_tools/fast_xmlupload/process_files.py b/src/dsp_tools/fast_xmlupload/process_files.py index 5bc49ef32..01505acfe 100644 --- a/src/dsp_tools/fast_xmlupload/process_files.py +++ b/src/dsp_tools/fast_xmlupload/process_files.py @@ -19,7 +19,6 @@ from dsp_tools.models.exceptions import UserError from dsp_tools.utils.logging import get_logger - from dsp_tools.utils.shared import http_call_with_retry logger = get_logger(__name__, filesize_mb=100, backupcount=36) diff --git a/src/dsp_tools/fast_xmlupload/upload_xml.py b/src/dsp_tools/fast_xmlupload/upload_xml.py index b1c023d10..bc770f392 100644 --- a/src/dsp_tools/fast_xmlupload/upload_xml.py +++ b/src/dsp_tools/fast_xmlupload/upload_xml.py @@ -5,12 +5,11 @@ from lxml import etree +from dsp_tools.fast_xmlupload.upload_files import get_pkl_files from dsp_tools.models.exceptions import UserError from dsp_tools.utils.logging import get_logger from dsp_tools.utils.xml_upload import xml_upload -from dsp_tools.fast_xmlupload.upload_files import get_pkl_files - logger = get_logger(__name__) diff --git a/src/dsp_tools/models/sipi.py b/src/dsp_tools/models/sipi.py index 927e3df40..7b2e67077 100644 --- a/src/dsp_tools/models/sipi.py +++ b/src/dsp_tools/models/sipi.py @@ -1,7 +1,7 @@ -from dataclasses import dataclass -from datetime import datetime import json import os +from dataclasses import dataclass +from datetime import datetime from pathlib import Path from typing import Any diff --git a/src/dsp_tools/models/xmlbitstream.py b/src/dsp_tools/models/xmlbitstream.py index 221f748ba..10d394f77 100644 --- a/src/dsp_tools/models/xmlbitstream.py +++ b/src/dsp_tools/models/xmlbitstream.py @@ -1,4 +1,4 @@ -from typing import cast, Optional +from typing import Optional, cast from lxml import etree diff --git a/src/dsp_tools/models/xmlvalue.py b/src/dsp_tools/models/xmlvalue.py index 9c9d78137..39ad1e492 100644 --- a/src/dsp_tools/models/xmlvalue.py +++ b/src/dsp_tools/models/xmlvalue.py @@ -1,7 +1,7 @@ from typing import Optional, Union, cast -from lxml import etree import regex +from lxml import etree from dsp_tools.models.value import KnoraStandoffXml diff --git a/src/dsp_tools/utils/excel_to_json/__init__.py b/src/dsp_tools/utils/excel_to_json/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/dsp_tools/utils/excel_to_json_lists.py b/src/dsp_tools/utils/excel_to_json/lists.py similarity index 100% rename from src/dsp_tools/utils/excel_to_json_lists.py rename to src/dsp_tools/utils/excel_to_json/lists.py diff --git a/src/dsp_tools/utils/excel_to_json_project.py b/src/dsp_tools/utils/excel_to_json/project.py similarity index 96% rename from src/dsp_tools/utils/excel_to_json_project.py rename to src/dsp_tools/utils/excel_to_json/project.py index d260111fb..7d3b92731 100644 --- a/src/dsp_tools/utils/excel_to_json_project.py +++ b/src/dsp_tools/utils/excel_to_json/project.py @@ -4,9 +4,9 @@ import regex from dsp_tools.models.exceptions import BaseError -from dsp_tools.utils.excel_to_json_lists import excel2lists -from dsp_tools.utils.excel_to_json_properties import excel2properties -from dsp_tools.utils.excel_to_json_resources import excel2resources +from dsp_tools.utils.excel_to_json.lists import excel2lists +from dsp_tools.utils.excel_to_json.properties import excel2properties +from dsp_tools.utils.excel_to_json.resources import excel2resources def excel2json( diff --git a/src/dsp_tools/utils/excel_to_json/properties.py b/src/dsp_tools/utils/excel_to_json/properties.py new file mode 100644 index 000000000..2c60f26e1 --- /dev/null +++ b/src/dsp_tools/utils/excel_to_json/properties.py @@ -0,0 +1,463 @@ +import importlib.resources +import json +import warnings +from typing import Any, Optional + +import jsonpath_ng.ext +import jsonschema +import numpy as np +import pandas as pd +import regex + +import dsp_tools.utils.excel_to_json.utils as utl +from dsp_tools.models.exceptions import UserError + +languages = ["en", "de", "fr", "it", "rm"] +language_label_col = ["label_en", "label_de", "label_fr", "label_it", "label_rm"] +mandatory_properties = ["name", "object", "gui_element"] + + +def _search_json_validation_error_get_err_msg_str( + properties_list: list[dict[str, Any]], + excelfile: str, + validation_error: jsonschema.ValidationError, +) -> str: + """ + This function takes a list of properties, which were transformed from an Excel to a json. + The validation raised an error. + This function searches for the exact location in the Excel where the error was caused. + It returns a string with a user-friendly version of the original json validation error message. + + Args: + properties_list: List of properties + excelfile: Name of the Excel file + validation_error: The error from the calling function + + Returns: + A string which is used in the Error message that contains detailed information about the problem + """ + err_msg_list = [f"The 'properties' section defined in the Excel file '{excelfile}' did not pass validation."] + json_path_to_property = regex.search(r"^\$\[(\d+)\]", validation_error.json_path) + if json_path_to_property: + # fmt: off + wrong_property_name = ( + jsonpath_ng.ext.parse(json_path_to_property.group(0)) + .find(properties_list)[0] + .value["name"] + ) + # fmt: on + excel_row = int(json_path_to_property.group(1)) + 2 + err_msg_list.append(f"The problematic property is '{wrong_property_name}' in Excel row {excel_row}.") + affected_field = regex.search( + r"name|labels|comments|super|subject|object|gui_element|gui_attributes", + validation_error.json_path, + ) + if affected_field: + err_msg_list.append( + f"The problem is that the column '{affected_field.group(0)}' has an invalid value: " + f"{validation_error.message}" + ) + else: + err_msg_list.append( + f"The error message is: {validation_error.message}\n" f"The error occurred at {validation_error.json_path}" + ) + return "\n".join(err_msg_list) + + +def _validate_properties( + properties_list: list[dict[str, Any]], + excelfile: str, +) -> bool: + """ + This function checks if the "properties" section of a JSON project file is valid, according to the JSON schema. + + Args: + properties_list: the "properties" section of a JSON project as a list of dicts + excelfile: path to the Excel file containing the properties + + Raises: + UserError: if the validation fails + + Returns: + True if the "properties" section passed validation + """ + with importlib.resources.files("dsp_tools").joinpath("resources/schema/properties-only.json").open( + encoding="utf-8" + ) as schema_file: + properties_schema = json.load(schema_file) + try: + jsonschema.validate(instance=properties_list, schema=properties_schema) + except jsonschema.ValidationError as err: + err_msg = _search_json_validation_error_get_err_msg_str( + properties_list=properties_list, excelfile=excelfile, validation_error=err + ) + raise UserError(err_msg) from None + return True + + +def _search_convert_numbers(value_str: str) -> str | int | float: + """ + This function takes a string and searches if the string contains a float or an integer. + In those cases, it converts the string to the corresponding data type. + If it is not a float or integer, it returns the string as is. + + Args: + value_str: The value which is checked and may be converted + + Returns: + A int if the string was an integer, float if the string was a float or str if it was neither + """ + if regex.search(r"^\d+$", value_str): + return int(value_str) + elif regex.search(r"^\d+\.\d+$", value_str): + return float(value_str) + else: + return value_str + + +def _unpack_gui_attributes(attribute_str: str) -> dict[str, str]: + """ + This function takes a string which contains the gui_attributes if the string is not formatted correctly, + this raises an IndexError. + Errors regarding the content will be diagnosed when the json is validated. + + Args: + attribute_str: A string containing the gui_attributes + + Returns: + A dictionary with the gui_attribute name as key and the attribute as value. + + Raises: + IndexError: if the sub-lists do not contain each two items + """ + # Create a list with several attributes + gui_list = [x.strip() for x in attribute_str.split(",") if not x.strip() == ""] + # create a sub list with the kex value pair of the attribute if it is an empty string we exclude it. + # this error will be detected when checking for the length of the lists + sub_gui_list = [[sub.strip() for sub in x.split(":") if sub.strip() != ""] for x in gui_list] + # if not all sublist contain two items, something is wrong with the attribute + if not all(len(sub) == 2 for sub in sub_gui_list): + raise IndexError + return {sub[0]: sub[1] for sub in sub_gui_list} + + +def _format_gui_attribute(attribute_str: str) -> dict[str, str | int | float]: + """ + This function takes a string containing the information about the gui_attributes and formats it correctly. + + Args: + attribute_str: A string containing the attributes + + Returns: + A dictionary with the attribute name as a key and the attribute as value. + + Raises: + IndexError: if the attributes are not formatted correctly + """ + attribute_dict = _unpack_gui_attributes(attribute_str=attribute_str) + return {attrib: _search_convert_numbers(value_str=val) for attrib, val in attribute_dict.items()} + + +def _get_gui_attribute(df_row: pd.Series, row_num: int, excelfile: str) -> dict[str, int | str | float] | None: + """ + This function checks if the cell "gui_attributes" is empty. + If it is, it returns None. + If there is information, it extracts and formats it correctly. + + Args: + df_row: Row of a pd.DataFrame + row_num: The number of the row (index + 2) + excelfile: The name of the Excel file. + + Returns: + A gui_attribute dictionary or None if there are no attributes + + Raises: + UserError: if there is a formatting error of the string + """ + if pd.isnull(df_row["gui_attributes"]): + return None + # If the attribute is not in the correct format, a called function may raise an IndexError + try: + return _format_gui_attribute(attribute_str=df_row["gui_attributes"]) + except IndexError: + raise UserError( + f"Row {row_num} of Excel file {excelfile} contains invalid data in column 'gui_attributes'.\n" + "The expected format is '[attribute: value, attribute: value]'." + ) from None + + +def _row2prop(df_row: pd.Series, row_num: int, excelfile: str) -> dict[str, Any]: + """ + Takes a row from a pd.DataFrame, reads its content, and returns a dict object of the property. + + Args: + df_row: row from a pd.DataFrame that defines a property + row_num: row number of Excel file + excelfile: name of the original Excel file + + Returns: + dict object of the property + + Raises: + UserError if there are any formal mistakes in the "gui_attributes" column + """ + _property = {x: df_row[x] for x in mandatory_properties} + # These are also mandatory but require formatting + _property.update( + {"labels": utl.get_labels(df_row=df_row), "super": [s.strip() for s in df_row["super"].split(",")]} + ) + non_mandatory = { + "comments": utl.get_comments(df_row=df_row), + "gui_attributes": _get_gui_attribute(df_row=df_row, row_num=row_num, excelfile=excelfile), + } + # These functions may return None, this is checked before the update + _property = utl.update_dict_if_not_value_none(additional_dict=non_mandatory, to_update_dict=_property) + return _property + + +def _check_compliance_gui_attributes(df: pd.DataFrame) -> dict[str, pd.Series] | None: + """ + This function takes a pd.DataFrame and checks if the "gui_attributes" column is filled correctly. + If any or all of the checks fail, + it creates a dictionary with a pd.Series as value which contains True for all rows where + there is a problem otherwise, it returns None. + + Args: + df: pd.DataFrame that should be checked + + Returns: + A dictionary with a pd.Series that contains the information where there is a problem or None if all the + checks passed. + + Raises: + UserError if any of the checks fail + """ + mandatory_attributes = ["Spinbox", "List"] + mandatory_check = utl.col_must_or_not_empty_based_on_other_col( + df=df, + substring_list=mandatory_attributes, + substring_colname="gui_element", + check_empty_colname="gui_attributes", + must_have_value=True, + ) + no_attributes = ["Checkbox", "Date", "Geonames", "Richtext", "TimeStamp"] + no_attribute_check = utl.col_must_or_not_empty_based_on_other_col( + df=df, + substring_list=no_attributes, + substring_colname="gui_element", + check_empty_colname="gui_attributes", + must_have_value=False, + ) + # If neither has a problem, we return None + if mandatory_check is None and no_attribute_check is None: + return None + # If both have problems, we combine the series + elif mandatory_check is not None and no_attribute_check is not None: + final_series = pd.Series(np.logical_or(mandatory_check, no_attribute_check)) + elif mandatory_check is not None: + final_series = mandatory_check + else: + final_series = no_attribute_check + # The boolean series is returned + return {"wrong gui_attributes": final_series} + + +def _check_missing_values_in_row_raise_error(df: pd.DataFrame, excelfile: str) -> None: + """ + This function checks if all the required values are in the df. + If all the checks are ok, the function ends without any effect. + If any of the checks fail, a UserError is raised which contains the information in which column and row there + are problems. + + Args: + df: pd.DataFrame that is to be checked + excelfile: Name of the original Excel file + + Raises: + UserError: if any of the checks are failed + """ + # Every row in these columns must have a value + required_values = ["name", "super", "object", "gui_element"] + # If there are no problems, it returns an empty dict + missing_dict = utl.check_required_values(df=df, required_values_columns=required_values) + # This checks if the label columns have at least one value per row + missing_labels = utl.find_one_full_cell_in_cols(df=df, required_columns=language_label_col) + # If everything is ok, we get None, otherwise we update the dict + if missing_labels is not None: + missing_dict.update({"label": missing_labels}) + # Some gui_element require a gui_attributes and others must not have one + missing_gui_attributes = _check_compliance_gui_attributes(df=df) + if missing_gui_attributes is not None: + missing_dict.update(missing_gui_attributes) + if missing_dict: + # Get the row numbers from the boolean series + missing_dict = utl.get_wrong_row_numbers(wrong_row_dict=missing_dict, true_remains=True) + error_str = "\n".join([f" - Column Name: {k} Row Number: {v}" for k, v in missing_dict.items()]) + raise UserError(f"The file '{excelfile}' is missing values in the following rows:\n" f"{error_str}") + + +def _do_property_excel_compliance(df: pd.DataFrame, excelfile: str) -> None: + """ + This function calls three separate functions which each checks if the pd.DataFrame is as we expect it. + Each of these functions raises a UserError if there is a problem. + If the checks do not fail, this function ends without an effect. + + Args: + df: The pd.DataFrame that is checked + excelfile: The name of the original Excel file + + Raises: + UserError if any of the checks fail + """ + # If it does not pass any one of the tests, the function stops + required_columns = { + "name", + "label_en", + "label_de", + "label_fr", + "label_it", + "label_rm", + "comment_en", + "comment_de", + "comment_fr", + "comment_it", + "comment_rm", + "super", + "object", + "gui_element", + "gui_attributes", + } + utl.check_contains_required_columns_else_raise_error(df=df, required_columns=required_columns) + utl.check_column_for_duplicate_else_raise_error(df=df, to_check_column="name") + _check_missing_values_in_row_raise_error(df=df, excelfile=excelfile) + + +def _rename_deprecated_hlist(df: pd.DataFrame, excelfile: str) -> pd.DataFrame: + """ + This function deals with Excel files that do conform to a previous format. + If the old column names are not in the pd.DataFrame, then it returns it as was. + + Args: + df: The pd.DataFrame which is checked and renamed + excelfile: Name of the original Excel file. + + Returns: + Renamed pd.DataFrame or the original one + + Warnings: + A warning for the user that the Excel file is not compliant with the new specifications + """ + # If the deprecated feature is not in the df, then return the df + if "hlist" not in df.columns: + return df + warnings.warn( + f"The file '{excelfile}' has a column 'hlist', which is deprecated. " + f"Please use the column 'gui_attributes' for the attribute 'hlist'." + ) + # Reformat the string according to the new convention + df["hlist"] = df["hlist"].apply(lambda x: f"hlist:{x}" if isinstance(x, str) else x) + # If gui_attributes already exists we have to merge the columns + if "gui_attributes" in df.columns: + # In case there is a hlist, it is the only valid value in gui_attributes and has precedence + df["hlist"] = df["hlist"].fillna(df["gui_attributes"]) + df.pop("gui_attributes") + df.rename(columns={"hlist": "gui_attributes"}, inplace=True) + return df + + +def _rename_deprecated_lang_cols(df: pd.DataFrame, excelfile: str) -> pd.DataFrame: + """ + This function takes a pd.DataFrame and checks if the columns with the language label are named according to the old + specifications. + If they are, it renames them and informs the user that an old format is used. + Otherwise, it returns the pd.Dataframe as was. + + Args: + df: pd.DataFrame, which is to be checked + excelfile: Name of the Excel file + + Returns: + pd.DataFrame which has the columns renamed according to the new format + + Warnings: + A warning for the user that the Excel file is not compliant with the new specifications + """ + # If the columns are named correctly, return the df + if set(language_label_col).issubset(set(df.columns)): + return df + if set(languages).issubset(set(df.columns)): + warnings.warn( + f"The file '{excelfile}' uses {languages} as column titles, which is deprecated. " + f"Please use {[f'label_{lang}' for lang in languages]}" + ) + rename_dict = dict(zip(languages, language_label_col)) + df.rename(columns=rename_dict, inplace=True) + return df + + +def _rename_deprecated_columnnames(df: pd.DataFrame, excelfile: str) -> pd.DataFrame: + """ + This function calls two other functions that check and rename a deprecated Excel format. + Afterward, the pd.DataFrame is compliant with the current format. + In case the pd.DataFrame was already in the current format, the function passes without an effect. + + Args: + df: pd.DataFrame that is checked and renamed + excelfile: Name of the original Excel + + Returns: + pd.DataFrame that is renamed + + Warnings: + Two user warnings if the pd.DataFrame is not according to the current specifications + """ + df = _rename_deprecated_lang_cols(df=df, excelfile=excelfile) + df = _rename_deprecated_hlist(df=df, excelfile=excelfile) + return df + + +def excel2properties( + excelfile: str, + path_to_output_file: Optional[str] = None, +) -> tuple[list[dict[str, Any]], bool]: + """ + Converts properties described in an Excel file into a "properties" section which can be inserted into a JSON + project file. + + Args: + excelfile: path to the Excel file containing the properties + path_to_output_file: if provided, the output is written into this JSON file + + Raises: + UserError: if something went wrong + + Returns: + a tuple consisting of the "properties" section as a Python list, + and the success status (True if everything went well) + """ + property_df = utl.read_and_clean_excel_file(excelfile=excelfile) + + property_df = _rename_deprecated_columnnames(df=property_df, excelfile=excelfile) + + _do_property_excel_compliance(df=property_df, excelfile=excelfile) + + # transform every row into a property + props: list[dict[str, Any]] = [] + for index, row in property_df.iterrows(): + props.append( + _row2prop( + df_row=row, + row_num=int(str(index)), # index is a label/index/hashable, but we need an int + excelfile=excelfile, + ) + ) + + # write final JSON file + _validate_properties(properties_list=props, excelfile=excelfile) + if path_to_output_file: + with open(file=path_to_output_file, mode="w", encoding="utf-8") as file: + json.dump(props, file, indent=4, ensure_ascii=False) + print('"properties" section was created successfully and written to file:', path_to_output_file) + + return props, True diff --git a/src/dsp_tools/utils/excel_to_json_resources.py b/src/dsp_tools/utils/excel_to_json/resources.py similarity index 95% rename from src/dsp_tools/utils/excel_to_json_resources.py rename to src/dsp_tools/utils/excel_to_json/resources.py index 458046b25..3422ba352 100644 --- a/src/dsp_tools/utils/excel_to_json_resources.py +++ b/src/dsp_tools/utils/excel_to_json/resources.py @@ -91,7 +91,7 @@ def _validate_resources( def _row2resource( - row: pd.Series, + df_row: pd.Series, excelfile: str, ) -> dict[str, Any]: """ @@ -100,7 +100,7 @@ def _row2resource( and builds a dict object of the resource. Args: - row: row from the "classes" DataFrame + df_row: row from the "classes" DataFrame excelfile: Excel file where the data comes from Raises: @@ -110,12 +110,12 @@ def _row2resource( dict object of the resource """ - name = row["name"] - labels = {lang: row[f"label_{lang}"] for lang in languages if row.get(f"label_{lang}")} + name = df_row["name"] + labels = {lang: df_row[f"label_{lang}"] for lang in languages if df_row.get(f"label_{lang}")} if not labels: - labels = {lang: row[lang] for lang in languages if row.get(lang)} - comments = {lang: row[f"comment_{lang}"] for lang in languages if row.get(f"comment_{lang}")} - supers = [s.strip() for s in row["super"].split(",")] + labels = {lang: df_row[lang] for lang in languages if df_row.get(lang)} + comments = {lang: df_row[f"comment_{lang}"] for lang in languages if df_row.get(f"comment_{lang}")} + supers = [s.strip() for s in df_row["super"].split(",")] # load the cardinalities of this resource try: diff --git a/src/dsp_tools/utils/excel_to_json/utils.py b/src/dsp_tools/utils/excel_to_json/utils.py new file mode 100644 index 000000000..f78743313 --- /dev/null +++ b/src/dsp_tools/utils/excel_to_json/utils.py @@ -0,0 +1,286 @@ +from __future__ import annotations + +from typing import Any +from unittest import mock + +import numpy as np +import pandas as pd +import regex + +from dsp_tools.models.exceptions import UserError + +languages = ["en", "de", "fr", "it", "rm"] + + +def read_and_clean_excel_file(excelfile: str) -> pd.DataFrame: + """ + This function reads an Excel file, if there is a ValueError then it patches the openpyxl part that creates the + error and opens it with that patch. + It cleans and then returns the pd.DataFrame. + + Args: + excelfile: The name of the Excel file + + Returns: + A pd.DataFrame + """ + try: + read_df: pd.DataFrame = pd.read_excel(excelfile) + except ValueError: + # Pandas relies on openpyxl to parse XLSX files. + # A strange behavior of openpyxl prevents pandas from opening files with some formatting properties + # (unclear which formatting properties exactly). + # Apparently, the excel2json test files have one of the unsupported formatting properties. + # Credits: https://stackoverflow.com/a/70537454/14414188 + with mock.patch("openpyxl.styles.fonts.Font.family.max", new=100): + read_df = pd.read_excel(excelfile) + read_df = clean_data_frame(df=read_df) + return read_df + + +def clean_data_frame(df: pd.DataFrame) -> pd.DataFrame: + """ + This function takes a pd.DataFrame and removes: + - Leading and trailing spaces from the column names + - Leading and trailing spaces from each cell and any characters in the cells that are not part of any known + language, for example, linebreaks and replaces it with a pd.NA. + - Removes all rows that are empty in all columns + + Args: + df: The pd.DataFrame that is to be cleaned + + Returns: + pd.DataFrame which has the above-mentioned removed + """ + # Remove leading and trailing blanks in column names and make them lower case + df = df.rename(columns=lambda x: x.strip().lower()) + # Remove the values of all cells that do not at least contain one character of any known language and removes + # leading and trailing spaces. + df = df.applymap( + lambda x: str(x).strip() if pd.notna(x) and regex.search(r"[\w\p{L}]", str(x), flags=regex.U) else pd.NA + ) + # drop all the rows that are entirely empty + df.dropna(axis=0, how="all", inplace=True) + return df + + +def check_contains_required_columns_else_raise_error(df: pd.DataFrame, required_columns: set[str]) -> None: + """ + This function takes a pd.DataFrame and a set of required column names. + It checks if all the columns from the set are in the pd.DataFrame. + Additional columns to the ones in the set are allowed. + It raises an error if any columns are missing. + + Args: + df: pd.DataFrame that is checked + required_columns: set of column names + + Raises: + UserError: if there are required columns missing + """ + if not required_columns.issubset(set(df.columns)): + raise UserError( + f"The following columns are missing in the excel:\n" f"{required_columns.difference(set(df.columns))}" + ) + + +def check_column_for_duplicate_else_raise_error(df: pd.DataFrame, to_check_column: str) -> None: + """ + This function checks if a specified column contains duplicate values. + Empty cells (pd.NA) also count as duplicates. + If there are any duplicate values, it creates a string with the duplicates which are displayed in the error message. + + Args: + df: pd.DataFrame that is checked for duplicates + to_check_column: Name of the column that must not contain duplicates + + Raises: + UserError: if there are duplicates in the column + """ + if df[to_check_column].duplicated().any(): + # If it does, it creates a string with all the duplicate values and raises an error. + duplicate_values = ",".join(df[to_check_column][df[to_check_column].duplicated()].tolist()) + raise UserError( + f"The column '{to_check_column}' may not contain any duplicate values. " + f"The following values appeared multiple times '{duplicate_values}'." + ) + + +def check_required_values(df: pd.DataFrame, required_values_columns: list[str]) -> dict[str, pd.Series]: + """ + If there are any empty cells in the column, it adds the column name and a boolean pd.Series to the dictionary. + If there are no empty cells, then it is not included in the dictionary. + If no column has any empty cells, then it returns an empty dictionary. + + Args: + df: pd.DataFrame that is checked + required_values_columns: a list of column names that may not contain empty cells + + Returns: + a dictionary with the column names as key and pd.Series as values if there are any empty cells + """ + # It checks if any of the values in a specified column are empty. If they are, they are added to the dictionary + # with the column name as key and a boolean series as value that contain true for every pd.NA + res_dict = {col: df[col].isnull() for col in required_values_columns if df[col].isnull().any()} + # If all the columns are filled, then it returns an empty dictionary. + return res_dict + + +def turn_bool_array_into_index_numbers(series: pd.Series[bool], true_remains: bool = True) -> list[int]: + """ + This function takes a pd.Series containing boolean values. + By default, this method extracts the index numbers of the True values. + If the index numbers of the False values are required, the parameter "true_remains" should be turned to False. + + Args: + series: pd.Series, which only contains True and False values + true_remains: True if the index numbers of True are required, likewise with False + + Returns: + A list of index numbers + """ + # If the False are required, we need to invert the array. + if not true_remains: + series = ~series + return list(series[series].index) + + +def get_wrong_row_numbers(wrong_row_dict: dict[str, pd.Series], true_remains: bool = True) -> dict[str, list[int]]: + """ + From the boolean pd.Series the index numbers of the True values are extracted. + The resulting list is the new value of the dictionary. + This new dictionary is taken and to each index number 2 is added, so that it corresponds to the Excel row number. + The result is intended to be used to communicate the exact location of a problem in an error message. + + Args: + wrong_row_dict: The dictionary which contains column names and a boolean pd.Series + true_remains: If True then the index of True is taken, if False then the index of False values is taken + + Returns: + Dictionary with the column name as key and the row number as a list. + """ + wrong_row_dict = { + k: turn_bool_array_into_index_numbers(series=v, true_remains=true_remains) for k, v in wrong_row_dict.items() + } + return {k: [x + 2 for x in v] for k, v in wrong_row_dict.items()} + + +def update_dict_if_not_value_none(additional_dict: dict[Any, Any], to_update_dict: dict[Any, Any]) -> dict[Any, Any]: + """ + This function takes two dictionaries. + The "to_update_dict" should be updated with the information from the "additional_dict" + only if the value of a particular key is not None or pd.NA. + + Args: + additional_dict: The dictionary which contains information that may be transferred + to_update_dict: The dictionary to which the new information should be transferred + + Returns: + The "to_update_dict" which the additional information + """ + additional_dict = {k: v for k, v in additional_dict.items() if v is not None and v is not pd.NA} + to_update_dict.update(additional_dict) + return to_update_dict + + +def get_labels(df_row: pd.Series) -> dict[str, str]: + """ + This function takes a pd.Series which has "label_[language tag]" in the index. + If the value of the index is not pd.NA, the language tag and the value are added to a dictionary. + If it is empty, it is omitted from the dictionary. + + Args: + df_row: a pd.Series (usually a row of a pd.DataFrame) from which the content of the columns containing the + label is extracted + + Returns: + A dictionary with the language tag and the content of the cell + """ + return {lang: df_row[f"label_{lang}"] for lang in languages if df_row[f"label_{lang}"] is not pd.NA} + + +def get_comments(df_row: pd.Series) -> dict[str, str] | None: + """ + This function takes a pd.Series which has "comment_[language tag]" in the index. + If the value of the index is not pd.NA, the language tag and the value are added to a dictionary. + If it is empty, it is omitted from the dictionary. + + Args: + df_row: a pd.Series (usually a row of a pd.DataFrame) from which the content of the columns containing the + comment is extracted + + Returns: + A dictionary with the language tag and the content of the cell + """ + comments = {lang: df_row[f"comment_{lang}"] for lang in languages if df_row[f"comment_{lang}"] is not pd.NA} + if comments == {}: + return None + else: + return comments + + +def find_one_full_cell_in_cols(df: pd.DataFrame, required_columns: list[str]) -> pd.Series | None: + """ + This function takes a pd.DataFrame and a list of column names where at least one cell must have a value per row. + A pd.Series with boolean values is returned, True if any rows do not have a value in at least one column + + Args: + df: The pd.DataFrame which should be checked + required_columns: A list of column names where at least one cell per row must have a value + + Returns: + None if there is no problem or a pd.Series if there is a problem in a row + """ + # The series has True if the cell is empty + # In order to combine more than two arrays, we need to reduce the arrays, which takes a tuple + result_arrays = tuple(df[col].isnull() for col in required_columns) + # If all are True logical_and returns True otherwise False + combined_array = np.logical_and.reduce(result_arrays) + # if any of the values are True, it is turned into a pd.Series + if any(combined_array): + return pd.Series(combined_array) + else: + return None + + +def col_must_or_not_empty_based_on_other_col( + df: pd.DataFrame, + substring_list: list[str], + substring_colname: str, + check_empty_colname: str, + must_have_value: bool, +) -> pd.Series | None: + """ + It is presumed that the column "substring_colname" has no empty cells. + Based on the string content of the individual rows, which is specified in the "substring_list", + the cell in the column "check_empty_colname" is checked whether it is empty or not. + The "substring_list" contains the different possibilities regarding the content of the cell. + If the parameter "must_have_value" is True, then the cell in the "check_empty_colname" column must not be empty. + If the parameter is set to False, then it must be empty. + + Args: + df: The pd.DataFrame which is checked + substring_list: A list of possible information that could be in the column "substring_colname" + substring_colname: The name of the column that may contain any of the sub-strings + check_empty_colname: The name of the column which is checked if it is empty or not + must_have_value: True if the "check_empty_colname" should have a value or the reverse. + + Returns: + None if all rows are correctly filled or empty. + A series which contains True values for the rows, where it does + not comply with the specifications. + """ + na_series = df[check_empty_colname].isna() + # If the cells have to be empty, we need to reverse the series + if not must_have_value: + na_series = ~na_series + # This returns True if it finds the substring in the cell, they are joined in a RegEx "|" which denotes "or". + # If it does not match any of the sub-strings, then the RegEx returns False, + # which means that the value in the column "check_empty_colname" is not relevant. + substring_array = df[substring_colname].str.contains("|".join(substring_list), na=False, regex=True) + # If both are True logical_and returns True otherwise False + combined_array = np.logical_and(na_series, substring_array) + if any(combined_array): + return pd.Series(combined_array) + else: + return None diff --git a/src/dsp_tools/utils/excel_to_json_properties.py b/src/dsp_tools/utils/excel_to_json_properties.py deleted file mode 100644 index 0f9e67ca4..000000000 --- a/src/dsp_tools/utils/excel_to_json_properties.py +++ /dev/null @@ -1,218 +0,0 @@ -import importlib.resources -import json -import warnings -from typing import Any, Optional - -import jsonpath_ng.ext -import jsonschema -import pandas as pd -import regex - -from dsp_tools.models.exceptions import BaseError -from dsp_tools.utils.shared import check_notna, prepare_dataframe - -languages = ["en", "de", "fr", "it", "rm"] - - -def _validate_properties( - properties_list: list[dict[str, Any]], - excelfile: str, -) -> bool: - """ - This function checks if the "properties" section of a JSON project file is valid according to the JSON schema, - and if the property names are unique. - - Args: - properties_list: the "properties" section of a JSON project as a list of dicts - excelfile: path to the Excel file containing the properties - - Raises: - BaseError: if the validation fails - - Returns: - True if the "properties" section passed validation - """ - with importlib.resources.files("dsp_tools").joinpath("resources/schema/properties-only.json").open( - encoding="utf-8" - ) as schema_file: - properties_schema = json.load(schema_file) - try: - jsonschema.validate(instance=properties_list, schema=properties_schema) - except jsonschema.ValidationError as err: - err_msg = f"The 'properties' section defined in the Excel file '{excelfile}' did not pass validation. " - json_path_to_property = regex.search(r"^\$\[(\d+)\]", err.json_path) - if json_path_to_property: - # fmt: off - wrong_property_name = ( - jsonpath_ng.ext.parse(json_path_to_property.group(0)) - .find(properties_list)[0] - .value["name"] - ) - # fmt: on - excel_row = int(json_path_to_property.group(1)) + 2 - err_msg += f"The problematic property is '{wrong_property_name}' in Excel row {excel_row}. " - affected_field = regex.search( - r"name|labels|comments|super|subject|object|gui_element|gui_attributes", - err.json_path, - ) - if affected_field: - err_msg += ( - f"The problem is that the column '{affected_field.group(0)}' has an invalid value: {err.message}" - ) - else: - err_msg += f"The error message is: {err.message}\nThe error occurred at {err.json_path}" - raise BaseError(err_msg) from None - - # check if property names are unique - all_names = [p["name"] for p in properties_list] - duplicates: dict[int, str] = dict() - for index, propdef in enumerate(properties_list): - if all_names.count(propdef["name"]) > 1: - duplicates[index + 2] = propdef["name"] - if duplicates: - err_msg = f"Property names must be unique inside every ontology, but '{excelfile}' contains duplicates:\n" - for row_no, propname in duplicates.items(): - err_msg += f" - Row {row_no}: {propname}\n" - raise BaseError(err_msg) - - return True - - -def _row2prop( - row: pd.Series, - row_count: int, - excelfile: str, -) -> dict[str, Any]: - """ - Takes a row from a pandas DataFrame, reads its content, and returns a dict object of the property - - Args: - row: row from a pandas DataFrame that defines a property - row_count: row number of Excel file - excelfile: name of the original Excel file - - Raises: - BaseError: if the row contains invalid data - - Returns: - dict object of the property - """ - - # extract the elements that are necessary to build the property - name = row["name"] - supers = [s.strip() for s in row["super"].split(",")] - _object = row["object"] - labels = {lang: row[f"label_{lang}"] for lang in languages if row.get(f"label_{lang}")} - if not labels: - labels = {lang: row[lang] for lang in languages if row.get(lang)} - comments = {lang: row[f"comment_{lang}"] for lang in languages if row.get(f"comment_{lang}")} - gui_element = row["gui_element"] - gui_attributes = dict() - if row.get("hlist"): - gui_attributes["hlist"] = row["hlist"] - if row.get("gui_attributes"): - pairs = row["gui_attributes"].split(",") - for pair in pairs: - if pair.count(":") != 1: - raise BaseError( - f"Row {row_count} of Excel file {excelfile} contains invalid data in column 'gui_attributes'. " - "The expected format is 'attribute: value[, attribute: value]'." - ) - attr, val = [x.strip() for x in pair.split(":")] - if regex.search(r"^\d+\.\d+$", val): - val = float(val) - elif regex.search(r"^\d+$", val): - val = int(val) - gui_attributes[attr] = val - - # build the dict structure of this property - _property = {"name": name, "super": supers, "object": _object, "labels": labels} - if comments: - _property["comments"] = comments - _property["gui_element"] = gui_element - if gui_attributes: - _property["gui_attributes"] = gui_attributes - - return _property - - -def excel2properties( - excelfile: str, - path_to_output_file: Optional[str] = None, -) -> tuple[list[dict[str, Any]], bool]: - """ - Converts properties described in an Excel file into a "properties" section which can be inserted into a JSON - project file. - - Args: - excelfile: path to the Excel file containing the properties - path_to_output_file: if provided, the output is written into this JSON file - - Raises: - BaseError: if something went wrong - - Returns: - a tuple consisting of the "properties" section as Python list, - and the success status (True if everything went well) - """ - - # load file - try: - df: pd.DataFrame = pd.read_excel(excelfile) - except ValueError: - # Pandas relies on openpyxl to parse XLSX files. - # A strange behaviour of openpyxl prevents pandas from opening files with some formatting properties - # (unclear which formatting properties exactly). - # Apparently, the excel2json test files have one of the unsupported formatting properties. - # The following two lines of code help out. - # Credits: https://stackoverflow.com/a/70537454/14414188 - # pylint: disable-next=import-outside-toplevel - from unittest import mock - - p = mock.patch("openpyxl.styles.fonts.Font.family.max", new=100) - p.start() - df = pd.read_excel(excelfile) - p.stop() - df = prepare_dataframe( - df=df, - required_columns=["name"], - location_of_sheet=f"File '{excelfile}'", - ) - - # validation of input - required = ["super", "object", "gui_element"] - for index, row in df.iterrows(): - index = int(str(index)) # index is a label/index/hashable, but we need an int - for req in required: - if not check_notna(row[req]): - raise BaseError(f"'{excelfile}' has a missing value in row {index + 2}, column '{req}'") - if any(df.get(lang) is not None for lang in languages): - warnings.warn( - f"The file '{excelfile}' uses {languages} as column titles, which is deprecated. " - f"Please use {[f'label_{lang}' for lang in languages]}" - ) - if df.get("hlist"): - warnings.warn( - f"The file '{excelfile}' has a column 'hlist', which is deprecated. " - f"Please use the column 'gui_attributes' for the attribute 'hlist'." - ) - - # transform every row into a property - props: list[dict[str, Any]] = [] - for index, row in df.iterrows(): - props.append( - _row2prop( - row=row, - row_count=int(str(index)), # index is a label/index/hashable, but we need an int - excelfile=excelfile, - ) - ) - - # write final JSON file - _validate_properties(properties_list=props, excelfile=excelfile) - if path_to_output_file: - with open(file=path_to_output_file, mode="w", encoding="utf-8") as file: - json.dump(props, file, indent=4, ensure_ascii=False) - print('"properties" section was created successfully and written to file:', path_to_output_file) - - return props, True diff --git a/src/dsp_tools/utils/id_to_iri.py b/src/dsp_tools/utils/id_to_iri.py index 0708ea0da..41984b75d 100644 --- a/src/dsp_tools/utils/id_to_iri.py +++ b/src/dsp_tools/utils/id_to_iri.py @@ -2,14 +2,13 @@ from datetime import datetime from pathlib import Path -from lxml import etree import regex +from lxml import etree from dsp_tools.models.exceptions import UserError from dsp_tools.utils.logging import get_logger from dsp_tools.utils.xml_upload import parse_xml_file - logger = get_logger(__name__) diff --git a/src/dsp_tools/utils/project_create.py b/src/dsp_tools/utils/project_create.py index 068ff59ed..ce8f32f0f 100644 --- a/src/dsp_tools/utils/project_create.py +++ b/src/dsp_tools/utils/project_create.py @@ -15,7 +15,7 @@ from dsp_tools.models.propertyclass import PropertyClass from dsp_tools.models.resourceclass import ResourceClass from dsp_tools.models.user import User -from dsp_tools.utils.excel_to_json_lists import expand_lists_from_excel +from dsp_tools.utils.excel_to_json.lists import expand_lists_from_excel from dsp_tools.utils.logging import get_logger from dsp_tools.utils.project_create_lists import create_lists_on_server from dsp_tools.utils.project_validate import validate_project diff --git a/src/dsp_tools/utils/project_create_lists.py b/src/dsp_tools/utils/project_create_lists.py index 366abd7bf..d8ee459b8 100644 --- a/src/dsp_tools/utils/project_create_lists.py +++ b/src/dsp_tools/utils/project_create_lists.py @@ -4,7 +4,7 @@ from dsp_tools.models.exceptions import BaseError, UserError from dsp_tools.models.listnode import ListNode from dsp_tools.models.project import Project -from dsp_tools.utils.excel_to_json_lists import expand_lists_from_excel +from dsp_tools.utils.excel_to_json.lists import expand_lists_from_excel from dsp_tools.utils.logging import get_logger from dsp_tools.utils.project_validate import validate_project from dsp_tools.utils.shared import login, parse_json_input, try_network_action diff --git a/src/dsp_tools/utils/project_validate.py b/src/dsp_tools/utils/project_validate.py index 84329448f..12eb92c41 100644 --- a/src/dsp_tools/utils/project_validate.py +++ b/src/dsp_tools/utils/project_validate.py @@ -10,7 +10,7 @@ import regex from dsp_tools.models.exceptions import BaseError -from dsp_tools.utils.excel_to_json_lists import expand_lists_from_excel +from dsp_tools.utils.excel_to_json.lists import expand_lists_from_excel def _check_for_duplicate_names(project_definition: dict[str, Any]) -> bool: diff --git a/src/dsp_tools/utils/shared.py b/src/dsp_tools/utils/shared.py index 302a54a6e..5ba54216d 100644 --- a/src/dsp_tools/utils/shared.py +++ b/src/dsp_tools/utils/shared.py @@ -12,9 +12,9 @@ import pandas as pd import regex +import requests from lxml import etree from requests import ReadTimeout, RequestException -import requests from urllib3.exceptions import ReadTimeoutError from dsp_tools.models.connection import Connection diff --git a/src/dsp_tools/utils/stack_handling.py b/src/dsp_tools/utils/stack_handling.py index 55d016f75..1ebd6c48e 100644 --- a/src/dsp_tools/utils/stack_handling.py +++ b/src/dsp_tools/utils/stack_handling.py @@ -1,8 +1,8 @@ -from dataclasses import dataclass import importlib.resources import shutil import subprocess import time +from dataclasses import dataclass from pathlib import Path from typing import Optional @@ -12,7 +12,6 @@ from dsp_tools.models.exceptions import UserError from dsp_tools.utils.logging import get_logger - from dsp_tools.utils.shared import http_call_with_retry logger = get_logger(__name__) diff --git a/src/dsp_tools/utils/xml_upload.py b/src/dsp_tools/utils/xml_upload.py index 9c101bc5e..b6d8cd669 100644 --- a/src/dsp_tools/utils/xml_upload.py +++ b/src/dsp_tools/utils/xml_upload.py @@ -15,9 +15,9 @@ from typing import Any, Optional, Union, cast from urllib.parse import quote_plus -from lxml import etree import pandas as pd import regex +from lxml import etree from dsp_tools.models.connection import Connection from dsp_tools.models.exceptions import BaseError, UserError diff --git a/test/e2e/test_cli.py b/test/e2e/test_cli.py index c6a32a2b4..51b9942d0 100644 --- a/test/e2e/test_cli.py +++ b/test/e2e/test_cli.py @@ -8,11 +8,11 @@ import copy import json -from pathlib import Path import shutil import subprocess -from typing import Any, Optional, cast import unittest +from pathlib import Path +from typing import Any, Optional, cast import jsonpath_ng import jsonpath_ng.ext diff --git a/test/unittests/test_excel2xml.py b/test/unittests/test_excel2xml.py index 145563ea9..292dc1c3d 100644 --- a/test/unittests/test_excel2xml.py +++ b/test/unittests/test_excel2xml.py @@ -5,11 +5,11 @@ from pathlib import Path from typing import Any, Callable, Optional, Sequence, Union -from lxml import etree import numpy as np import pandas as pd import pytest import regex +from lxml import etree from dsp_tools import excel2xml from dsp_tools.models.exceptions import BaseError diff --git a/test/unittests/test_excel_to_json/__init__.py b/test/unittests/test_excel_to_json/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/unittests/test_excel_to_json_lists.py b/test/unittests/test_excel_to_json/test_lists.py similarity index 99% rename from test/unittests/test_excel_to_json_lists.py rename to test/unittests/test_excel_to_json/test_lists.py index 479745f1c..cd1e2e9ca 100644 --- a/test/unittests/test_excel_to_json_lists.py +++ b/test/unittests/test_excel_to_json/test_lists.py @@ -15,7 +15,7 @@ import regex from dsp_tools.models.exceptions import BaseError -from dsp_tools.utils import excel_to_json_lists as e2l +from dsp_tools.utils.excel_to_json import lists as e2l class TestExcelToJSONList(unittest.TestCase): diff --git a/test/unittests/test_excel_to_json/test_properties.py b/test/unittests/test_excel_to_json/test_properties.py new file mode 100644 index 000000000..b7ab2e83a --- /dev/null +++ b/test/unittests/test_excel_to_json/test_properties.py @@ -0,0 +1,592 @@ +"""unit tests for excel to properties""" + +# pylint: disable=missing-class-docstring,missing-function-docstring,protected-access, +# disable=wrong-import-order mypy: allow_untyped_calls + +import json +import os +import unittest +from typing import Any, cast + +import jsonpath_ng +import jsonpath_ng.ext +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +from dsp_tools.models.exceptions import BaseError, UserError +from dsp_tools.utils.excel_to_json import properties as e2j + + +class TestExcelToProperties(unittest.TestCase): + outfile = "testdata/tmp/_out_properties.json" + + @classmethod + def setUpClass(cls) -> None: + """Is executed once before the methods of this class are run""" + os.makedirs("testdata/tmp", exist_ok=True) + + @classmethod + def tearDownClass(cls) -> None: + """Is executed after the methods of this class have all run through""" + for file in os.listdir("testdata/tmp"): + os.remove("testdata/tmp/" + file) + os.rmdir("testdata/tmp") + + def test_excel2properties(self) -> None: + excelfile = "testdata/excel2json/excel2json_files/test-name (test_label)/properties.xlsx" + output_from_method, _ = e2j.excel2properties(excelfile, self.outfile) + + # define the expected values from the excel file + excel_names = [ + "correspondsToGenericAnthroponym", + "hasAnthroponym", + "hasGender", + "isDesignatedAs", + "hasTitle", + "hasStatus", + "hasLifeYearAmount", + "hasBirthDate", + "hasRepresentation", + "hasRemarks", + "hasTerminusPostQuem", + "hasGND", + "hasColor", + "hasDecimal", + "hasTime", + "hasInterval", + "hasBoolean", + "hasGeoname", + "partOfDocument", + "linkstoRegion", + "hasLinkToImage", + "hasLinkToResource", + "hasLinkToArchiveRepresentation", + "hasLinkToMovingImageRepesentation", + "hasLinkToAudioRepesentation", + ] + excel_supers = [ + ["hasLinkTo"], + ["hasValue", "dcterms:creator"], + ["hasValue"], + ["hasValue"], + ["hasLinkTo"], + ["hasValue"], + ["hasValue"], + ["hasValue"], + ["hasRepresentation"], + ["hasValue", "dcterms:description"], + ["hasValue"], + ["hasValue"], + ["hasColor"], + ["hasValue"], + ["hasValue"], + ["hasSequenceBounds"], + ["hasValue"], + ["hasValue"], + ["isPartOf"], + ["hasLinkTo"], + ["hasLinkTo"], + ["hasLinkTo"], + ["hasLinkTo"], + ["hasLinkTo"], + ["hasLinkTo"], + ] + excel_objects = [ + ":GenericAnthroponym", + "TextValue", + "ListValue", + "ListValue", + ":Titles", + "ListValue", + "IntValue", + "DateValue", + "Representation", + "TextValue", + "DateValue", + "UriValue", + "ColorValue", + "DecimalValue", + "TimeValue", + "IntervalValue", + "BooleanValue", + "GeonameValue", + ":Documents", + "Region", + "StillImageRepresentation", + "Resource", + "ArchiveRepresentation", + "MovingImageRepresentation", + "AudioRepresentation", + ] + + excel_labels = dict() + # there are also labels in other languages, but they are not tested + excel_labels["de"] = [ + "", + "only German", + "", + "", + "", + "", + "", + "", + "hat eine Multimediadatei", + "", + "", + "GND", + "Farbe", + "Dezimalzahl", + "Zeit", + "Zeitintervall", + "Bool'sche Variable", + "Link zu Geonames", + "ist Teil eines Dokuments", + "", + "", + "", + "", + "", + "", + ] + excel_labels["it"] = [ + "", + "", + "", + "only Italian", + "", + "", + "", + "", + "", + "", + "", + "GND", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ] + + excel_comments = dict() + # there are also comments in other languages, but they are not tested + excel_comments["comment_fr"] = [ + "J'avais déjà examiné plusieurs propriétés quand, un jour, le notaire, qui me " + "donnait des indications nécessaires pour une de mes explorations, me dit :", + "Un étrange hasard m'a mis en possession de ce journal.", + "Je n'en sais rien du tout ; mais si vous voulez la voir, monsieur, voici les " + "indications précises pour la trouver.", + "Vous devrez arranger l'affaire avec le curé du village de --.\"", + "Un étrange hasard m'a mis en possession de ce journal.", + "", + "", + "only French", + "", + "", + "J'avais déjà examiné plusieurs propriétés quand, un jour, le notaire, qui me " + "donnait des indications nécessaires pour une de mes explorations, me dit :", + "Gemeinsame Normdatei", + "", + "Chiffre décimale", + "Temps", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ] + excel_comments["comment_it"] = [ + "Avevo già visto diverse proprietà quando un giorno il notaio,", + "Uno strano caso mi mise in possesso di questo diario.", + "Non ne so nulla; ma se volete vederla, signore, eccovi le indicazioni precise per trovarla.", + "Dovrete organizzare l'affare con il curato del villaggio di --\".", + "Uno strano caso mi mise in possesso di questo diario.", + "", + "", + "", + "", + "", + "Avevo già visto diverse proprietà quando un giorno il notaio,", + "Gemeinsame Normdatei", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ] + + excel_gui_elements = [ + "Searchbox", + "Richtext", + "List", + "Radio", + "Searchbox", + "List", + "Spinbox", + "Date", + "Searchbox", + "Textarea", + "Date", + "SimpleText", + "Colorpicker", + "Spinbox", + "TimeStamp", + "Interval", + "Checkbox", + "Geonames", + "Searchbox", + "Searchbox", + "Searchbox", + "Searchbox", + "Searchbox", + "Searchbox", + "Searchbox", + ] + + excel_gui_attributes_hasGender = {"hlist": "gender"} + excel_gui_attributes_hasGND = {"size": 100} + excel_gui_attributes_hasDecimal = {"min": 0.0, "max": 100.0} + + # read json file + with open(self.outfile, encoding="utf-8") as f: + output_from_file: list[dict[str, Any]] = json.load(f) + + # check that output from file and from method are equal + self.assertListEqual(output_from_file, output_from_method) + + # extract infos from json file + json_names = [match.value for match in jsonpath_ng.parse("$[*].name").find(output_from_file)] + json_supers = [match.value for match in jsonpath_ng.parse("$[*].super").find(output_from_file)] + json_objects = [match.value for match in jsonpath_ng.parse("$[*].object").find(output_from_file)] + + json_labels_all = [match.value for match in jsonpath_ng.parse("$[*].labels").find(output_from_file)] + json_labels: dict[str, list[str]] = dict() + for lang in ["de", "it"]: + json_labels[lang] = [label.get(lang, "").strip() for label in json_labels_all] + + json_comments: dict[str, list[str]] = dict() + for lang in ["fr", "it"]: + json_comments[f"comment_{lang}"] = [ + resource.get("comments", {}).get(lang, "").strip() for resource in output_from_file + ] + + json_gui_elements = [match.value for match in jsonpath_ng.parse("$[*].gui_element").find(output_from_file)] + + json_gui_attributes_hasGender = ( + jsonpath_ng.ext.parse("$[?name='hasGender'].gui_attributes").find(output_from_file)[0].value + ) + json_gui_attributes_hasGND = ( + jsonpath_ng.ext.parse("$[?name='hasGND'].gui_attributes").find(output_from_file)[0].value + ) + json_gui_attributes_hasDecimal = ( + jsonpath_ng.ext.parse("$[?name='hasDecimal'].gui_attributes").find(output_from_file)[0].value + ) + + # make checks + self.assertListEqual(excel_names, json_names) + self.assertListEqual(excel_supers, json_supers) + self.assertListEqual(excel_objects, json_objects) + self.assertDictEqual(excel_labels, json_labels) + self.assertDictEqual(excel_comments, json_comments) + self.assertListEqual(excel_gui_elements, json_gui_elements) + self.assertDictEqual(excel_gui_attributes_hasGND, json_gui_attributes_hasGND) + self.assertDictEqual(excel_gui_attributes_hasDecimal, json_gui_attributes_hasDecimal) + self.assertDictEqual(excel_gui_attributes_hasGender, json_gui_attributes_hasGender) + + def test_validate_properties(self) -> None: + # it is not possible to call the method to be tested directly. + # So let's make a reference to it, so that it can be found by the usage search + lambda x: e2j._validate_properties([], "file") # pylint: disable=expression-not-assigned,protected-access + + testcases = [ + ( + "testdata/invalid-testdata/excel2json/properties-invalid-super.xlsx", + "did not pass validation.\n" + "The problematic property is 'hasGeoname' in Excel row 3.\n" + "The problem is that the column 'super' has an invalid value: " + "'GeonameValue' is not valid under any of the given schemas", + ), + ( + "testdata/invalid-testdata/excel2json/properties-invalid-object.xlsx", + "did not pass validation.\n" + "The problematic property is 'hasBoolean' in Excel row 2.\n" + "The problem is that the column 'object' has an invalid value: " + "'hasValue' is not valid under any of the given schemas", + ), + ( + "testdata/invalid-testdata/excel2json/properties-invalid-gui_element.xlsx", + "did not pass validation.\n" + "The problematic property is 'hasInterval' in Excel row 4.\n" + r"The problem is that the column 'gui_element' has an invalid value: " + r"'Interval' was expected", + ), + ( + "testdata/invalid-testdata/excel2json/properties-invalid-gui_attribute.xlsx", + "did not pass validation.\n" + "The problematic property is 'hasInteger' in Excel row 4.\n" + r"The problem is that the column 'gui_attributes' has an invalid value: " + r"Additional properties are not allowed \('rows' was unexpected\)", + ), + ] + + for file, message in testcases: + with self.assertRaisesRegex(UserError, message): + e2j.excel2properties(file, self.outfile) + + def test__rename_deprecated_lang_cols(self) -> None: + original_df = pd.DataFrame( + {"en": [1, 2, 3], "de": [1, 2, 3], "fr": [1, 2, 3], "it": [1, 2, 3], "rm": [1, 2, 3]} + ) + expected_df = pd.DataFrame( + { + "label_en": [1, 2, 3], + "label_de": [1, 2, 3], + "label_fr": [1, 2, 3], + "label_it": [1, 2, 3], + "label_rm": [1, 2, 3], + } + ) + returned_df = e2j._rename_deprecated_lang_cols(df=original_df, excelfile="Test") + assert_frame_equal(original_df, returned_df) + returned_df = e2j._rename_deprecated_lang_cols(df=expected_df, excelfile="Test") + assert_frame_equal(original_df, returned_df) + + def test__do_property_excel_compliance(self) -> None: + original_df = pd.DataFrame( + { + "name": ["name_1", "name_2", "name_3", "name_4", "name_5", "name_6"], + "label_en": ["label_en_1", "label_en_2", pd.NA, pd.NA, "label_en_5", pd.NA], + "label_de": ["label_de_1", pd.NA, "label_de_3", pd.NA, pd.NA, pd.NA], + "label_fr": ["label_fr_1", pd.NA, pd.NA, "label_fr_4", pd.NA, pd.NA], + "label_it": ["label_it_1", pd.NA, pd.NA, pd.NA, "label_it_5", pd.NA], + "label_rm": ["label_rm_1", pd.NA, pd.NA, pd.NA, pd.NA, "label_rm_6"], + "comment_en": ["comment_en_1", "comment_en_2", pd.NA, pd.NA, "comment_en_5", pd.NA], + "comment_de": ["comment_de_1", pd.NA, "comment_de_3", pd.NA, pd.NA, pd.NA], + "comment_fr": ["comment_fr_1", pd.NA, pd.NA, "comment_fr_4", pd.NA, pd.NA], + "comment_it": ["comment_it_1", pd.NA, pd.NA, pd.NA, "comment_it_5", pd.NA], + "comment_rm": ["comment_rm_1", pd.NA, pd.NA, pd.NA, pd.NA, pd.NA], + "super": ["super_1", "super_2", "super_3", "super_4.1, super_4.2, super_4.3", "super_5", "super_6"], + "subject": ["subject_1", "subject_2", "subject_3", "subject_4", "subject_5", "subject_6"], + "object": ["object_1", "object_2", "object_3", "object_4", "object_5", "object_6"], + "gui_element": ["Simple", "Searchbox", "Date", "Searchbox", "List", "Searchbox"], + "gui_attributes": ["size: 32, maxlength: 128", pd.NA, pd.NA, pd.NA, "hlist: languages", pd.NA], + } + ) + e2j._do_property_excel_compliance(df=original_df, excelfile="Test") + + original_df = pd.DataFrame( + { + "name": ["name_1", "name_2", "name_3", "name_4", "name_5", "name_6", "name_7", pd.NA], + "label_en": ["label_en_1", pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA], + "label_de": [pd.NA, pd.NA, "label_de_3", pd.NA, pd.NA, pd.NA, pd.NA, "label_de_8"], + "label_fr": [pd.NA, pd.NA, pd.NA, "label_fr_4", pd.NA, pd.NA, pd.NA, pd.NA], + "label_it": [pd.NA, pd.NA, pd.NA, pd.NA, "label_it_5", pd.NA, pd.NA, pd.NA], + "label_rm": [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, "label_rm_6", pd.NA, pd.NA], + "comment_en": ["comment_en_1", pd.NA, "comment_en_3", pd.NA, pd.NA, pd.NA, pd.NA, pd.NA], + "comment_de": [pd.NA, pd.NA, pd.NA, "comment_de_4", pd.NA, pd.NA, pd.NA, pd.NA], + "comment_fr": [pd.NA, pd.NA, pd.NA, pd.NA, "comment_fr_5", pd.NA, pd.NA, pd.NA], + "comment_it": [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, "comment_it_6", pd.NA, pd.NA], + "comment_rm": [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, "comment_rm_7", pd.NA], + "super": [ + pd.NA, + "super_2", + pd.NA, + "super_4.1, super_4.2, super_4.3", + "super_5", + "super_6", + "super_7", + pd.NA, + ], + "subject": [ + "subject_1", + "subject_2", + "subject_3", + "subject_4", + "subject_5", + "subject_6", + "subject_7", + pd.NA, + ], + "object": ["object_1", "object_2", "object_3", pd.NA, "object_5", "object_6", "object_7", pd.NA], + "gui_element": ["Simple", "Searchbox", "Date", "Date", pd.NA, "List", pd.NA, pd.NA], + "gui_attributes": ["size: 32, maxlength: 128", pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA], + } + ) + with self.assertRaises(BaseError) as context: + e2j._do_property_excel_compliance(df=original_df, excelfile="Test") + self.assertEqual( + context, + "The file '{excel_filename}' is missing values in some rows. See below for more information:\n" + "{error_str}", + ) + + def test__rename_deprecated_hlist(self) -> None: + original_df = pd.DataFrame({"hlist": [pd.NA, pd.NA, "languages"]}) + expected_df = pd.DataFrame({"gui_attributes": [pd.NA, pd.NA, "hlist:languages"]}) + returned_df = e2j._rename_deprecated_hlist(df=original_df, excelfile="Test") + assert_frame_equal(expected_df, returned_df) + + original_df = pd.DataFrame( + {"hlist": [pd.NA, pd.NA, "languages"], "gui_attributes": [pd.NA, "attribute_1", pd.NA]} + ) + expected_df = pd.DataFrame({"gui_attributes": [pd.NA, "attribute_1", "hlist:languages"]}) + returned_df = e2j._rename_deprecated_hlist(df=original_df, excelfile="Test") + assert_frame_equal(expected_df, returned_df) + + def test__unpack_gui_attributes(self) -> None: + test_dict = { + "maxlength:1, size:32": {"maxlength": "1", "size": "32"}, + "hlist: languages": {"hlist": "languages"}, + } + for original, expected in test_dict.items(): + self.assertDictEqual(e2j._unpack_gui_attributes(attribute_str=original), expected) + + def test__search_convert_numbers(self) -> None: + test_dict = {"1": 1, "string": "string", "1.453": 1.453, "sdf.asdf": "sdf.asdf"} + for original, expected in test_dict.items(): + self.assertEqual(e2j._search_convert_numbers(value_str=original), expected) + + def test__get_gui_attribute(self) -> None: + original_df = pd.DataFrame( + {"gui_attributes": [pd.NA, "max:1.4 / min:1.2", "hlist:", "234345", "hlist: languages,"]} + ) + self.assertIsNone(e2j._get_gui_attribute(df_row=original_df.loc[0, :], row_num=2, excelfile="Test")) + with self.assertRaises(UserError) as context: + e2j._get_gui_attribute(df_row=original_df.loc[1, :], row_num=3, excelfile="Test") + self.assertEqual( + "Row {row_num} of Excel file {excel_filename} contains invalid data in column 'gui_attributes'. " + "The expected format is '[attribute: value, attribute: value]'.", + context, + ) + with self.assertRaises(UserError) as context: + e2j._get_gui_attribute(df_row=original_df.loc[2, :], row_num=4, excelfile="Test") + self.assertEqual( + "Row {row_num} of Excel file {excel_filename} contains invalid data in column 'gui_attributes'. " + "The expected format is '[attribute: value, attribute: value]'.", + context, + ) + with self.assertRaises(UserError) as context: + e2j._get_gui_attribute(df_row=original_df.loc[3, :], row_num=5, excelfile="Test") + self.assertEqual( + "Row {row_num} of Excel file {excel_filename} contains invalid data in column 'gui_attributes'. " + "The expected format is '[attribute: value, attribute: value]'.", + context, + ) + expected_dict = {"hlist": "languages"} + returned_dict = e2j._get_gui_attribute(df_row=original_df.loc[4, :], row_num=6, excelfile="Test") + self.assertDictEqual(expected_dict, cast(dict[str, str], returned_dict)) + + def test__check_compliance_gui_attributes(self) -> None: + original_df = pd.DataFrame( + { + "gui_element": ["Spinbox", "List", "Searchbox", "Date", "Geonames", "Richtext", "TimeStamp"], + "gui_attributes": ["Spinbox_attr", "List_attr", pd.NA, pd.NA, pd.NA, pd.NA, pd.NA], + } + ) + returned_value = e2j._check_compliance_gui_attributes(df=original_df) + self.assertIsNone(cast(None, returned_value)) + original_df = pd.DataFrame( + { + "gui_element": ["Spinbox", "List", "Searchbox", "Date", "Geonames", "Richtext", "TimeStamp"], + "gui_attributes": ["Spinbox_attr", pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, "TimeStamp_attr"], + } + ) + expected_dict = {"wrong gui_attributes": [False, True, False, False, False, False, True]} + returned_dict = e2j._check_compliance_gui_attributes(df=original_df) + returned_dict = cast(dict[str, list[pd.Series]], returned_dict) + casted_dict: dict[str, Any] = {"wrong gui_attributes": list(returned_dict["wrong gui_attributes"])} + self.assertDictEqual(expected_dict, casted_dict) + + def test__row2prop(self) -> None: + original_df = pd.DataFrame( + { + "name": ["name_1", "name_2", "name_3"], + "label_en": ["label_en_1", "label_en_2", pd.NA], + "label_de": ["label_de_1", pd.NA, "label_de_3"], + "label_fr": ["label_fr_1", pd.NA, pd.NA], + "label_it": ["label_it_1", pd.NA, pd.NA], + "label_rm": ["label_rm_1", pd.NA, pd.NA], + "comment_en": ["comment_en_1", "comment_en_2", pd.NA], + "comment_de": ["comment_de_1", pd.NA, "comment_de_3"], + "comment_fr": ["comment_fr_1", pd.NA, pd.NA], + "comment_it": ["comment_it_1", pd.NA, pd.NA], + "comment_rm": ["comment_rm_1", pd.NA, pd.NA], + "super": ["super_1", "super_2.1, super_2.2", "super_3"], + "subject": ["subject_1", "subject_2", pd.NA], + "object": ["object_1", "object_2", "object_3"], + "gui_element": ["Simple", "Date", "List"], + "gui_attributes": ["size: 32, maxlength: 128", pd.NA, "hlist: languages"], + } + ) + returned_dict = e2j._row2prop(df_row=original_df.loc[0, :], row_num=0, excelfile="Test") + expected_dict = { + "name": "name_1", + "object": "object_1", + "gui_element": "Simple", + "labels": { + "en": "label_en_1", + "de": "label_de_1", + "fr": "label_fr_1", + "it": "label_it_1", + "rm": "label_rm_1", + }, + "super": ["super_1"], + "comments": { + "en": "comment_en_1", + "de": "comment_de_1", + "fr": "comment_fr_1", + "it": "comment_it_1", + "rm": "comment_rm_1", + }, + "gui_attributes": {"size": 32, "maxlength": 128}, + } + self.assertDictEqual(expected_dict, returned_dict) + + returned_dict = e2j._row2prop(df_row=original_df.loc[1, :], row_num=1, excelfile="Test") + expected_dict = { + "comments": {"en": "comment_en_2"}, + "gui_element": "Date", + "labels": {"en": "label_en_2"}, + "name": "name_2", + "object": "object_2", + "super": ["super_2.1", "super_2.2"], + } + self.assertDictEqual(expected_dict, returned_dict) + + returned_dict = e2j._row2prop(df_row=original_df.loc[2, :], row_num=2, excelfile="Test") + expected_dict = { + "comments": {"de": "comment_de_3"}, + "gui_attributes": {"hlist": "languages"}, + "gui_element": "List", + "labels": {"de": "label_de_3"}, + "name": "name_3", + "object": "object_3", + "super": ["super_3"], + } + self.assertDictEqual(expected_dict, returned_dict) + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/test/unittests/test_excel_to_json_resources.py b/test/unittests/test_excel_to_json/test_resources.py similarity index 99% rename from test/unittests/test_excel_to_json_resources.py rename to test/unittests/test_excel_to_json/test_resources.py index 1910c6beb..20c097b36 100644 --- a/test/unittests/test_excel_to_json_resources.py +++ b/test/unittests/test_excel_to_json/test_resources.py @@ -12,7 +12,7 @@ import pytest from dsp_tools.models.exceptions import BaseError -from dsp_tools.utils import excel_to_json_resources as e2j +from dsp_tools.utils.excel_to_json import resources as e2j class TestExcelToResource(unittest.TestCase): diff --git a/test/unittests/test_excel_to_json/test_utils.py b/test/unittests/test_excel_to_json/test_utils.py new file mode 100644 index 000000000..aaebca4df --- /dev/null +++ b/test/unittests/test_excel_to_json/test_utils.py @@ -0,0 +1,185 @@ +# pylint: disable=f-string-without-interpolation,missing-class-docstring,missing-function-docstring +# mypy: allow_untyped_calls + +import unittest +from typing import cast + +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal, assert_series_equal + +import dsp_tools.utils.excel_to_json.utils as utl +from dsp_tools.models.exceptions import BaseError + + +class TestUtils(unittest.TestCase): + def test_clean_data_frame(self) -> None: + original_df = pd.DataFrame( + { + " TitLE of Column 1 ": [1.54, " 0-1 ", "1-n ", " "], + " Title of Column 2 ": ["1", 1, " t ext ", "text"], + "Title of Column 3": ["", pd.NA, None, "text"], + } + ) + expected_df = pd.DataFrame( + { + "title of column 1": ["1.54", "0-1", "1-n", pd.NA], + "title of column 2": ["1", "1", "t ext", "text"], + "title of column 3": [pd.NA, pd.NA, pd.NA, "text"], + } + ) + returned_df = utl.clean_data_frame(df=original_df) + assert_frame_equal(expected_df, returned_df) + + def test_check_contains_required_columns_else_raise_error(self) -> None: + original_df = pd.DataFrame(columns=["col1", "col2", "col3", "extra_col"]) + required = {"col1", "col2", "col3"} + utl.check_contains_required_columns_else_raise_error(df=original_df, required_columns=required) + required = {"col1", "col2", "col3", "col4"} + with self.assertRaises(BaseError) as context: + utl.check_contains_required_columns_else_raise_error(df=original_df, required_columns=required) + self.assertEqual( + context, + "The following columns are missing in the excel: " + "{required_columns.difference(set(check_df.columns))}", + ) + + def test_check_column_for_duplicate_else_raise_error(self) -> None: + original_df = pd.DataFrame( + { + "col_1": ["1.54", "0-1", "1-n", "0-1", "1.54"], + "col_2": ["1.54", "0-1", "1-n", "text", "neu"], + } + ) + utl.check_column_for_duplicate_else_raise_error(df=original_df, to_check_column="col_2") + with self.assertRaises(BaseError) as context: + utl.check_column_for_duplicate_else_raise_error(df=original_df, to_check_column="col_1") + self.assertEqual( + context, + "The column '{duplicate_column}' may not contain any duplicate values. " + "The following values appeared multiple times '{duplicate_values}'.", + ) + + def test_check_required_values(self) -> None: + original_df = pd.DataFrame( + { + "col_1": ["1.54", "0-1", "1-n", pd.NA], + "col_2": ["1", "1", pd.NA, "text"], + "col_3": ["1", "1", "1", "text"], + } + ) + expected_dict = {"col_1": [False, False, False, True]} + returned_dict = utl.check_required_values(df=original_df, required_values_columns=["col_1", "col_3"]) + self.assertListEqual(list(expected_dict.keys()), list(returned_dict.keys())) + for key, expected_list in expected_dict.items(): + self.assertListEqual(list(returned_dict[key]), expected_list) + + def test_turn_bool_array_into_index_numbers(self) -> None: + original_series = pd.Series([False, True, False, True]) + expected_list = [1, 3] + returned_list = utl.turn_bool_array_into_index_numbers(series=original_series, true_remains=True) + self.assertListEqual(expected_list, returned_list) + expected_list = [0, 2] + returned_list = utl.turn_bool_array_into_index_numbers(series=original_series, true_remains=False) + self.assertListEqual(expected_list, returned_list) + + def test_get_wrong_row_numbers(self) -> None: + original_dict = { + "col_1": pd.Series([False, True, False, True]), + "col_2": pd.Series([False, False, True, False]), + } + expected_dict = {"col_1": [3, 5], "col_2": [4]} + returned_dict = utl.get_wrong_row_numbers(wrong_row_dict=original_dict, true_remains=True) + self.assertDictEqual(expected_dict, returned_dict) + + def test_update_dict_if_not_value_none(self) -> None: + original_dict = {0: 0} + original_update_dict = {1: 1, 2: 2, 3: None, 4: pd.NA, 5: "5"} + expected_dict = {0: 0, 1: 1, 2: 2, 5: "5"} + returned_dict = utl.update_dict_if_not_value_none( + additional_dict=original_update_dict, to_update_dict=original_dict + ) + self.assertDictEqual(expected_dict, returned_dict) + + def test_find_one_full_cell_in_cols(self) -> None: + required_cols = ["label_en", "label_de", "label_fr", "label_it", "label_rm"] + original_df = pd.DataFrame( + { + "label_en": [1, pd.NA, pd.NA, 4], + "label_de": [1, pd.NA, 3, 4], + "label_fr": [1, pd.NA, 3, pd.NA], + "label_it": [1, pd.NA, 3, 4], + "label_rm": [pd.NA, pd.NA, 3, 4], + } + ) + expected_array = pd.Series([False, True, False, False]) + returned_array = utl.find_one_full_cell_in_cols(df=original_df, required_columns=required_cols) + assert_series_equal(expected_array, returned_array) + original_df = pd.DataFrame( + { + "label_en": [1, 2, 3, 4], + "label_de": [1, pd.NA, 3, 4], + "label_fr": [1, pd.NA, 3, pd.NA], + "label_it": [1, pd.NA, 3, 4], + "label_rm": [pd.NA, pd.NA, 3, 4], + } + ) + returned_array = utl.find_one_full_cell_in_cols(df=original_df, required_columns=required_cols) + self.assertIsNone(returned_array) + + def test_col_must_or_not_empty_based_on_other_col(self) -> None: + original_df = pd.DataFrame({"substring": ["1", "2", "3", "4", "5", "6"], "check": [1, pd.NA, 3, 4, pd.NA, 6]}) + returned_value = utl.col_must_or_not_empty_based_on_other_col( + df=original_df, + substring_list=["1", "3", "6"], + substring_colname="substring", + check_empty_colname="check", + must_have_value=True, + ) + self.assertIsNone(returned_value) + expected_series = pd.Series([True, False, False, False, False, False]) + returned_series = utl.col_must_or_not_empty_based_on_other_col( + df=original_df, + substring_list=["1", "2"], + substring_colname="substring", + check_empty_colname="check", + must_have_value=False, + ) + assert_series_equal(expected_series, returned_series) + + def test__get_labels(self) -> None: + original_df = pd.DataFrame( + { + "label_en": ["text_en", "text_en"], + "label_de": ["text_de", pd.NA], + "label_fr": ["text_fr", pd.NA], + "label_it": ["text_it", pd.NA], + "label_rm": ["text_rm", pd.NA], + } + ) + expected_dict = {"de": "text_de", "en": "text_en", "fr": "text_fr", "it": "text_it", "rm": "text_rm"} + returned_dict = utl.get_labels(original_df.loc[0, :]) + self.assertDictEqual(expected_dict, returned_dict) + expected_dict = {"en": "text_en"} + returned_dict = utl.get_labels(original_df.loc[1, :]) + self.assertDictEqual(expected_dict, returned_dict) + + def test_get_comments(self) -> None: + original_df = pd.DataFrame( + { + "comment_en": ["text_en", pd.NA], + "comment_de": ["text_de", pd.NA], + "comment_fr": ["text_fr", pd.NA], + "comment_it": ["text_it", pd.NA], + "comment_rm": ["text_rm", pd.NA], + } + ) + expected_dict = {"de": "text_de", "en": "text_en", "fr": "text_fr", "it": "text_it", "rm": "text_rm"} + returned_dict = utl.get_comments(original_df.loc[0, :]) + self.assertDictEqual(expected_dict, cast(dict[str, str], returned_dict)) + returned_none = utl.get_comments(original_df.loc[1, :]) + self.assertIsNone(cast(None, returned_none)) + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/test/unittests/test_excel_to_json_properties.py b/test/unittests/test_excel_to_json_properties.py deleted file mode 100644 index 2985eac5c..000000000 --- a/test/unittests/test_excel_to_json_properties.py +++ /dev/null @@ -1,358 +0,0 @@ -"""unit tests for excel to properties""" - -# pylint: disable=missing-class-docstring,missing-function-docstring - -import json -import os -import unittest -from typing import Any - -import jsonpath_ng -import jsonpath_ng.ext -import pytest - -from dsp_tools.models.exceptions import BaseError -from dsp_tools.utils import excel_to_json_properties as e2j - - -class TestExcelToProperties(unittest.TestCase): - outfile = "testdata/tmp/_out_properties.json" - - @classmethod - def setUpClass(cls) -> None: - """Is executed once before the methods of this class are run""" - os.makedirs("testdata/tmp", exist_ok=True) - - @classmethod - def tearDownClass(cls) -> None: - """Is executed after the methods of this class have all run through""" - for file in os.listdir("testdata/tmp"): - os.remove("testdata/tmp/" + file) - os.rmdir("testdata/tmp") - - def test_excel2properties(self) -> None: - excelfile = "testdata/excel2json/excel2json_files/test-name (test_label)/properties.xlsx" - output_from_method, _ = e2j.excel2properties(excelfile, self.outfile) - - # define the expected values from the excel file - excel_names = [ - "correspondsToGenericAnthroponym", - "hasAnthroponym", - "hasGender", - "isDesignatedAs", - "hasTitle", - "hasStatus", - "hasLifeYearAmount", - "hasBirthDate", - "hasRepresentation", - "hasRemarks", - "hasTerminusPostQuem", - "hasGND", - "hasColor", - "hasDecimal", - "hasTime", - "hasInterval", - "hasBoolean", - "hasGeoname", - "partOfDocument", - "linkstoRegion", - "hasLinkToImage", - "hasLinkToResource", - "hasLinkToArchiveRepresentation", - "hasLinkToMovingImageRepesentation", - "hasLinkToAudioRepesentation", - ] - excel_supers = [ - ["hasLinkTo"], - ["hasValue", "dcterms:creator"], - ["hasValue"], - ["hasValue"], - ["hasLinkTo"], - ["hasValue"], - ["hasValue"], - ["hasValue"], - ["hasRepresentation"], - ["hasValue", "dcterms:description"], - ["hasValue"], - ["hasValue"], - ["hasColor"], - ["hasValue"], - ["hasValue"], - ["hasSequenceBounds"], - ["hasValue"], - ["hasValue"], - ["isPartOf"], - ["hasLinkTo"], - ["hasLinkTo"], - ["hasLinkTo"], - ["hasLinkTo"], - ["hasLinkTo"], - ["hasLinkTo"], - ] - excel_objects = [ - ":GenericAnthroponym", - "TextValue", - "ListValue", - "ListValue", - ":Titles", - "ListValue", - "IntValue", - "DateValue", - "Representation", - "TextValue", - "DateValue", - "UriValue", - "ColorValue", - "DecimalValue", - "TimeValue", - "IntervalValue", - "BooleanValue", - "GeonameValue", - ":Documents", - "Region", - "StillImageRepresentation", - "Resource", - "ArchiveRepresentation", - "MovingImageRepresentation", - "AudioRepresentation", - ] - - excel_labels = dict() - # there are also labels in other languages, but they are not tested - excel_labels["de"] = [ - "", - "only German", - "", - "", - "", - "", - "", - "", - "hat eine Multimediadatei", - "", - "", - "GND", - "Farbe", - "Dezimalzahl", - "Zeit", - "Zeitintervall", - "Bool'sche Variable", - "Link zu Geonames", - "ist Teil eines Dokuments", - "", - "", - "", - "", - "", - "", - ] - excel_labels["it"] = [ - "", - "", - "", - "only Italian", - "", - "", - "", - "", - "", - "", - "", - "GND", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - ] - - excel_comments = dict() - # there are also comments in other languages, but they are not tested - excel_comments["comment_fr"] = [ - "J'avais déjà examiné plusieurs propriétés quand, un jour, le notaire, qui me " - "donnait des indications nécessaires pour une de mes explorations, me dit :", - "Un étrange hasard m'a mis en possession de ce journal.", - "Je n'en sais rien du tout ; mais si vous voulez la voir, monsieur, voici les " - "indications précises pour la trouver.", - "Vous devrez arranger l'affaire avec le curé du village de --.\"", - "Un étrange hasard m'a mis en possession de ce journal.", - "", - "", - "only French", - "", - "", - "J'avais déjà examiné plusieurs propriétés quand, un jour, le notaire, qui me " - "donnait des indications nécessaires pour une de mes explorations, me dit :", - "Gemeinsame Normdatei", - "", - "Chiffre décimale", - "Temps", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - ] - excel_comments["comment_it"] = [ - "Avevo già visto diverse proprietà quando un giorno il notaio,", - "Uno strano caso mi mise in possesso di questo diario.", - "Non ne so nulla; ma se volete vederla, signore, eccovi le indicazioni precise per trovarla.", - "Dovrete organizzare l'affare con il curato del villaggio di --\".", - "Uno strano caso mi mise in possesso di questo diario.", - "", - "", - "", - "", - "", - "Avevo già visto diverse proprietà quando un giorno il notaio,", - "Gemeinsame Normdatei", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - ] - - excel_gui_elements = [ - "Searchbox", - "Richtext", - "List", - "Radio", - "Searchbox", - "List", - "Spinbox", - "Date", - "Searchbox", - "Textarea", - "Date", - "SimpleText", - "Colorpicker", - "Spinbox", - "TimeStamp", - "Interval", - "Checkbox", - "Geonames", - "Searchbox", - "Searchbox", - "Searchbox", - "Searchbox", - "Searchbox", - "Searchbox", - "Searchbox", - ] - - excel_gui_attributes_hasGender = {"hlist": "gender"} - excel_gui_attributes_hasGND = {"size": 100} - excel_gui_attributes_hasDecimal = {"min": 0.0, "max": 100.0} - - # read json file - with open(self.outfile, encoding="utf-8") as f: - output_from_file: list[dict[str, Any]] = json.load(f) - - # check that output from file and from method are equal - self.assertListEqual(output_from_file, output_from_method) - - # extract infos from json file - json_names = [match.value for match in jsonpath_ng.parse("$[*].name").find(output_from_file)] - json_supers = [match.value for match in jsonpath_ng.parse("$[*].super").find(output_from_file)] - json_objects = [match.value for match in jsonpath_ng.parse("$[*].object").find(output_from_file)] - - json_labels_all = [match.value for match in jsonpath_ng.parse("$[*].labels").find(output_from_file)] - json_labels: dict[str, list[str]] = dict() - for lang in ["de", "it"]: - json_labels[lang] = [label.get(lang, "").strip() for label in json_labels_all] - - json_comments: dict[str, list[str]] = dict() - for lang in ["fr", "it"]: - json_comments[f"comment_{lang}"] = [ - resource.get("comments", {}).get(lang, "").strip() for resource in output_from_file - ] - - json_gui_elements = [match.value for match in jsonpath_ng.parse("$[*].gui_element").find(output_from_file)] - - json_gui_attributes_hasGender = ( - jsonpath_ng.ext.parse("$[?name='hasGender'].gui_attributes").find(output_from_file)[0].value - ) - json_gui_attributes_hasGND = ( - jsonpath_ng.ext.parse("$[?name='hasGND'].gui_attributes").find(output_from_file)[0].value - ) - json_gui_attributes_hasDecimal = ( - jsonpath_ng.ext.parse("$[?name='hasDecimal'].gui_attributes").find(output_from_file)[0].value - ) - - # make checks - self.assertListEqual(excel_names, json_names) - self.assertListEqual(excel_supers, json_supers) - self.assertListEqual(excel_objects, json_objects) - self.assertDictEqual(excel_labels, json_labels) - self.assertDictEqual(excel_comments, json_comments) - self.assertListEqual(excel_gui_elements, json_gui_elements) - self.assertDictEqual(excel_gui_attributes_hasGND, json_gui_attributes_hasGND) - self.assertDictEqual(excel_gui_attributes_hasDecimal, json_gui_attributes_hasDecimal) - self.assertDictEqual(excel_gui_attributes_hasGender, json_gui_attributes_hasGender) - - def test_validate_properties(self) -> None: - # it is not possible to call the method to be tested directly. - # So let's make a reference to it, so that it can be found by the usage search - lambda x: e2j._validate_properties([], "file") # pylint: disable=expression-not-assigned,protected-access - - testcases = [ - ( - "testdata/invalid-testdata/excel2json/properties-invalid-super.xlsx", - "did not pass validation. The problematic property is 'hasGeoname' in Excel row 3. " - "The problem is that the column 'super' has an invalid value: " - "'GeonameValue' is not valid under any of the given schemas", - ), - ( - "testdata/invalid-testdata/excel2json/properties-invalid-object.xlsx", - "did not pass validation. The problematic property is 'hasBoolean' in Excel row 2. " - "The problem is that the column 'object' has an invalid value: " - "'hasValue' is not valid under any of the given schemas", - ), - ( - "testdata/invalid-testdata/excel2json/properties-invalid-gui_element.xlsx", - "did not pass validation. The problematic property is 'hasInterval' in Excel row 4. " - r"The problem is that the column 'gui_element' has an invalid value: " - r"'Interval' was expected", - ), - ( - "testdata/invalid-testdata/excel2json/properties-invalid-gui_attribute.xlsx", - "did not pass validation. The problematic property is 'hasInteger' in Excel row 4. " - r"The problem is that the column 'gui_attributes' has an invalid value: " - r"Additional properties are not allowed \('rows' was unexpected\)", - ), - ( - "testdata/invalid-testdata/excel2json/properties-duplicate-name.xlsx", - "Property names must be unique inside every ontology, but '.+' contains duplicates:\n" - r" - Row 3: hasGender\n - Row 4: hasGender\n - Row 5: isDesignatedAs\n - Row 6: isDesignatedAs", - ), - ] - - for file, message in testcases: - with self.assertRaisesRegex(BaseError, message): - e2j.excel2properties(file, self.outfile) - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/test/unittests/test_id_to_iri.py b/test/unittests/test_id_to_iri.py index 545051ae5..dc9387d33 100644 --- a/test/unittests/test_id_to_iri.py +++ b/test/unittests/test_id_to_iri.py @@ -1,8 +1,8 @@ # pylint: disable=missing-class-docstring,missing-function-docstring -from pathlib import Path import shutil import unittest +from pathlib import Path import pytest import regex diff --git a/test/unittests/test_shared.py b/test/unittests/test_shared.py index f1466e6e8..2326ad6d2 100644 --- a/test/unittests/test_shared.py +++ b/test/unittests/test_shared.py @@ -100,9 +100,9 @@ def test_validate_xml_tags_in_text_properties(self) -> None: def test_prepare_dataframe(self) -> None: original_df = pd.DataFrame( { - " TitLE of Column 1 ": ["1", " 0-1 ", "1-n ", pd.NA, " ", " ", "", " 0-n ", np.nan], + " TitLE of Column 1 ": ["1", " 0-1 ", "1-n ", pd.NA, " ", " ", "", " 0-n ", pd.NA], " Title of Column 2 ": [None, "1", 1, "text", "text", "text", "text", "text", "text"], - "Title of Column 3": ["", pd.NA, None, "text", "text", "text", "text", np.nan, "text"], + "Title of Column 3": ["", pd.NA, None, "text", "text", "text", "text", pd.NA, "text"], } ) expected_df = pd.DataFrame( @@ -115,16 +115,14 @@ def test_prepare_dataframe(self) -> None: returned_df = shared.prepare_dataframe( df=original_df, required_columns=[" TitLE of Column 1 ", " Title of Column 2 "], location_of_sheet="" ) - for expected, returned in zip(expected_df.iterrows(), returned_df.iterrows()): - i, expected_row = expected - _, returned_row = returned + for (i, expected_row), (_, returned_row) in zip(expected_df.iterrows(), returned_df.iterrows()): self.assertListEqual(list(expected_row), list(returned_row), msg=f"Failed in row {i}") def test_check_notna(self) -> None: na_values = [ None, pd.NA, - np.nan, + pd.NA, "", " ", "-", diff --git a/test/unittests/test_xmlupload.py b/test/unittests/test_xmlupload.py index d7c149580..75be9eb5c 100644 --- a/test/unittests/test_xmlupload.py +++ b/test/unittests/test_xmlupload.py @@ -13,9 +13,9 @@ from dsp_tools.utils.xml_upload import ( _convert_ark_v0_to_resource_iri, _determine_save_location_of_diagnostic_info, - parse_xml_file, _remove_circular_references, _transform_server_url_to_foldername, + parse_xml_file, ) diff --git a/testdata/invalid-testdata/excel2json/properties-invalid-super.xlsx b/testdata/invalid-testdata/excel2json/properties-invalid-super.xlsx index 05e57d86409525d5e07d03875f6f09d39389bd90..47a2bf43e37b6b3fa1fae234f2725ad20a2822e7 100644 GIT binary patch literal 9254 zcmeHtg;yNe_H`qTySoMt?!kh)1Wm96ce-iZoe&5T9D=)(;I6^lU4jG>TpLZk&di(N z%w*>K3*M_{A-b2#5p#BmfEk0H6h!9%Wb?!vO%XhyVaS00rJa z8sz9=;pp;G+Y@ZzY{cQ=U{8~a2+y1ifQOC$-|=5O1LX1}qUmFlM` z{D(pPc&yLFJ3qzum6{o3+SvTad~=5*l|yJN_z`b>#hd4J)VkWyp*|?QuSF9hGH{?% z$CzA@v$y|%p@Wz-4x<0!IGcd{DL3iMmr<7Q0Vz%mJ(_$npCy&b$*c*9!~?c6mil#Z z*ZZ0AY8n+>8q}XrF7bm})jb zZj+VUotYbXXJ47YuW^ONwqdP~ts}Lx3sR}_kyV#OT$Jkf#oDoT_!#xK8;Bg%H{Ukw{r!sufO|yE(@EM=t(>l3%b7oAexf%(b z!{d#D{!AHlVL{d)zoLP`MxjpiLo*5=$HGbEZA97Y9|TvpUz^W6rEms2X&?E%KHM1i zD&l=Gv11T?bei#JbHPYs(KF>M%b~!{p$hf>}o8Il(M^|LJO_7Ux4BYc3o87Bl6V_CJG#FvS0*x z|2Dt-q2*Q4sBZ(bSDSq0k$40m^!4r)Vac~nuE?y6kR&;$^3R`e-DWRmuTtfeJel3v zVpz)>i*u9**XiY^Ph~#hjC1Ibq2d=%gb|6Q2I}{zzF0H8u7sPF(mJdN`_Lem{Vjg{ zz5jet;a5zFaAB2$sZ_!NXH%=iN}mCH+RGa<9W85Nn-8X$kSBDWFU{>c&t=|s;6M1V ztECTU((&P4^Nh>)Gh|=->(}y~4rh3N5=8AO8#oyZili=HgE7=UgCtNqA^99OBh9cN zAp@YmdDwIQ9w+V~h@B}2WcQQ3{)ibk7=^*S{C6ManySj5xNzFgzrW^kfA2vyqE&f0ABd`_7Y0TkwWJlgLD<;U7Yz`yKbD~`ZJ^uo7476F1r z&qonZOb;ulg5-htcn9D1%MQ`8(wy;|#N!4YXA%f~xn^b{eVtcGCv(v>6~<>f#^uUm z>pVd@1jEd5?RLOzh}Jm|a)iEyfN-nKD%=znD*$3(e~Rd1`-_IOm<*)E1rN&n{ATLo6N3XOD+3 zjxDzb`)Ai-EB#N%Ea@P~e1O440G4&JVIyFW`72oRwT;x5xd?o-YJcF|hzY4JYtU&T zCPXVOYdiRe(#<=OWA~5Q7g0;yLyd}5(Jw+J6BgRr4^Nw*nfucZ}v-(E`X>A~r>+ zw~=b2rHNdaYGy#H=4=2yQhKw+tOQmJl_C$BX+@SEw+uSXv2-0wTn-@NC_%Wb$4PXU zRJx*Sj7#fpMoZ;j3rIm@S!srol^mQpHMEf~!s|AEXZ(TZgz@AZN?4;9sv)v`Vi2jr z)E6l^z0 zCJ9&e)$Wv2Uh!74zHsbP(yf)n?5meVWHJ=@oBnntLKj#HDBXeYsuR=>Lz?M|qegJr z>u{*Ac=9b26T2%e*xN5KQzd-xkD4<#S9Eix_4N;E=F*=;gz9UsoPis<^xh6%$IPO2 z;Bbl^B>afwv=H|u;i-RPTjTd$3}r08LWXG!@dFihhjgmnG>`x@{~gHN>!57PQ)485 z`*R8JfiVTsstE~oydw#=$*jb2-)C9K+HrqgzB50WxHK1|VrpR>TfNuG1nD=aQ*1+t zz0%8A<_g*NqEiBgtch_@*c|i#;Yx5lV?((`FIq5owoRkS*pm4uT?S{ix$bUNJY~4g zn-ijEUhEwAsin=1;|oxOv26h)73pD1>LqK_n8O#ubkW16bPX)}5(jh~r6?}scHB!) z9z9ULA~GhC*hZMth(=#_*ZaeMuyd^BLD!n7ytm~ z@8EWJ@wB&a{+S0ybd?>Kxo}pA_b5>+mqK&ZANLD90yqUc z$|chBFc~V6ZShx}IGtUoYNzrS5i2WU!lS(C^{`P9c3NdM(3C7i(9uLyM=Jnwf0#vY zVC843MX=Ht!0k3|V8E zF|UDmx=d(Xp;W5ABK{UVjja|}Q{1=rDu?vNCw**hO37VRg`WEyFg`16oEmugmBu+r z7EjV3Myq?#kau4fr|Y_zA9-#<+k1>6_FRHY zIuVj%o@Hdg1Al4AaT5>HC)zg5q7F6a^VM;f@d0w{;8E)eri_3Y9Xf zBdJ;nNT-qQ7MY|}LoQ@nFq~AvVs9jxUa(Pz`^3i~1gR=Y<9Z1N^9ONG zVPp=-dlRfoR<9a;AYt`ZV$miNs_)Qly29ld^B7=hd>qnO6J>s|J)ag7X*JB~(E^iPTN&y}nz#uoe|jMzGpe zl^@%P4I2|mA4>;=pJa?qeI)iMlJGiT<}GZnTtTcjL5`2;d0QmN+2Yw7nZe%IrmEcW zjx2k1R~##BO>xg#&1>&w?3IGtn6(GV6PrEcn$NGihzd<&yc>5DcAMJQ$0Cz_gt%2S z#?1A~Dy4kc2vF-|T0@p~x>rZf20{NCd1DdLDY`I9aVP&-MgK-CF4h(f7M#DGxqrsr z!3$*&1wTax{V8R6Q^R}TFFB5-=G2(AHhJa-GkNZFB3qSKqSbAR(6A+XJiJ^Q4uusV znruxrCzdQm4Mf>*dK(~bH}~|*LPb>8&_d1l)BAhqkB1Y(CNBw-y;u}Q&n1kZWIcG9 zsaR}lGXCJ@wxfbNfMV!0oTVtk=qo96u?F+wW8Ks~bL4 z+4-IM-h_1(n~o7Y^LMkZ(5>RHx4gg%D3LX^s+->9v%J6nyrxyK^>nhS)YY$}FaRt{ zh}ABa9ojB-5%1{{d#si2`(v#IJbqY$Z(nu@B%*AEd*R~?_j28M-Ld5Yb6}Gk{dzXB z!p&he2lSLX@2fRj1?JM>mpxMRfyHRzcBGR!mUCwUkDig13tZ{UQj7Ma4R1pixa>d- zwlq5a!2z7t8@=r57r@>$>Q+hc=O`nzo$fes^g)dY75-psUJP9BBIwy7lru|A6807M zJGr>-Qt!v6rjodxoPr1(ZSb)=`kWqG`TIR^-);>Ga#Hzz?jNT6K2I%vv$H$?@H9MI zSbn_2=YDy*f4t-N0(wm4xnTegjXa@J4VMud#9i7&9k3+Wf~ehY`iOXO7mi_3AUI%u zjgFw)7)7-dUlDLhbeDg6b^=!wbu8RU^~toncz1i>X%pZsY_)9YK?pX=(wP8*(&fJQ z^frbs_g_!AyP+3F+lXvoRDYRapf=3 zymcEQIxxS}pRzN`+5gmb|IL~-s#t^Fc zJn|MYkquq%WWD0{{r)xozRb~Tv#x=*)ikozyx%AYd z1sJ^pkp^>Sm%q3)XXH-g`0%+l#tW}CyZ2)vFP1#8SxocV zUV@ZCut2y4_kF555Q_tE;XpE|ET%XD+caHr-B`fP{E_q2I~mesF3vOy-ibK;+R<>O z99O|X?C##|V&BF@olh36=s-U)G@JVOpAJ>)7`nl>jeTekAbMhD6RWlRK-lnSSG}j^VQ};k8)U-nm?FU3S!WVr}&hI3xy-g7mxRu zQU_nw6$ObRuzohp%QaJ%=wwfytM`OBgQ=W)TR4Mj_ERjl$7)KW$8AUq8c{b%vA+yL z>=qCfTU*ILuaz^)3VzRelWT{%!l#YA1W#)SkN(!BRRDoQdn8A|oEx(R{-|H&8vqpe z-p~Wn4^eOOO9Rd8+lDA%<(?3JjJLK~ata#7vaCcQ2a>uxYfv=PomR&eagUkIMN2y? zM;7j=ele1?nYnpG7ToDS&9xR8Jf$7K8Ed#C!_1)BwX<_x8_TM$JSC1|(& zm`O4--nI>m(=SHcUz>8bBpTDRjv{|dl&SPE5-Je9PnZ8A_?x`!z=0K}k(##;wI*9o z6+wNC)W%G*{dAy(do57#&lwCVTK#v0zmrqDvjU?30|A2lb!zo0%qlL>| zNt?7^y+{AI>L!Lm_G>Uq$nAiYHYC5((9i0|+1kRw#hLTB%Wqtio?rx8;vxt;W;&FR zX$RpUE3wdN8wLeL8s6d^o-lH7*|4C*j8C6f87N6 z+N-wBa)aOCPb{Pf4Sy})08K7WT4SYrr=Iq%or!C&sC#H|eP|rL@$z7tYk6`*hNC zvzER9cAb{U#b1+@eXG(~Fkn`0#SSKE*aXgbJZ4C()>UjcCLh?;%$a4=HjeLDvT+^x zfh`z|bi3Dl+%S$@(rc?DzB$Fh=r`^m3wbl$sq?1blNC+q-Dsz-39`gd%iY=sUaqTl zc9M0{a>b%j3x0y~%%a9A!31+MHLp;5mOWneu8-3^5*a)WEJF1OWjOoXnh2Q}WZ;85 zbv5%jDX=~5IsPNVy2-uFN{p0Yg@<_uu4p4Jv9_===O}hC4|va>Y&?BOE+VI`1w3UE z$Gh$FJvX>(k900Oqi4l@!dXfxqiJu0gDmUB(a1_5loCTOeiD=T=I8+(7;{DQxfU0F>>a)P&OMsG>W&^y2_;^E|Fg5(=!q&_p3Eye>;x3m&3Aa{@dID; zmE)PM`!WbFE3)(By-h$hudDWu*SA9tY84>bW}FLVHM&E*%k$MA z&wla4E@5c#418~6{p^h#&5b@upg^KNPHpK*hGwId(r}~oRw7W6t%_wcFHw-}1q7RF zenBMR72Yvx|HV_~V#OrNB^GAJrO$BH{7Z`KQ<}rCH&Uj^5S0dF0lo%ReoX8!cPhApR!EO?grQu_7fp2c@g}cxSXv` zAQt9YE)W|>tKSM(7cEs~1n%FotJp}U)wu|T#Z@Wvh>g*u;N-YbwU&>A=A-f@-@R&T zBR>pMov*2^y*Dv#7h+M2epNp4eB$eB2;S35yTH33zw#_l7W2nwlhJ%R<~{7^&CFF* z5IaiLHf!`Hq4}|zkuxNM8uw-(#)ukuuc1epucZM(c|B%;7Eeo-3KefJqGorUyxzoX z76z)`hS#&CDWle@2G71!$`sO!8>U*V-n~`S4P1dQJoU==vzof>E^x{V6Pj0oYYu^o ze_9g5FN3O-uk(IH*f$98o0ug&u?}w|Hi(@dr5c-jHX=;VdfbV%7`9k2hxfd89bF|_ z<<@$T(sP;;Zjp=rIYVB=$*E;IZ!trY4#K_0SFP@C^t~A;!&h`bsxTuE9DPTop>6IY zr^nRbbM=&Ogp?^C5^`4bnQ2Tz?h!wGaeTwbYL^~BfIJUVn0YsC%wyyFNu7A1$$=DG z66;NE7i>cl>%aUW-1X+S==-%RkvM7^rv_UlOPGR&3lm(;L1r2d5ZIa13cjd4tMO4f;0@Y;;v6y!FTUx5-i6_Y>VcP5M7k98Zt$OEWXLdNFgh?}S&U9LRPw)?YRsGyZYs zFsG)xKjp1E!%Ggdu?d@(X~w3?=0 zIc3CXU@nAcwi0cNb#t||r=%nt*v#fJQorEoI0IA(dk2LeJRkH9N20LkloVhvvzN{- z_Pf1K*OyV!H1E=;=xK7JdXH@@j0OR-P+%~|t~AP&W4ut2NMo0|MLvb7iSnql;;RAG zt=ZLz#!&l}cWb_b*8_sZ`c{PG?bd-{`AdGOPFE_Z$kpiAP=cbcd&d=LFO@e2*_ct>BSj|7K|MGr@hVtJX{JnSlhvCm_GAv~N(nExG|%df`2ir9Y|kHBgknDL*I_OGVDs&Idr%E1aEnCY*Y+^-IP zt)KpMFiQA;fBYZS)URHC-OT*yC5-I%C-`-1^Q(uy^X#AY0027$0Pqji{nh;M8S<~@ e)l`2m|4+WuPzJ(Q7XUzmeF9*_9!vZ4?*9Qn5&m@m literal 7053 zcmaKR1z6Ng_x8fFNG;tROCu#EE#1bvaEsdCWv^iexH?>6!+jjaR7^cO^abuH z#_9Cw=MfphJjsRRnE>dWlP7n{j|J? zqOUW!V-4~VyIp5V*ZTW%RWgFcKm$r^O^ADarOnCkK_>jiM?LnF^=B1P*uo%Cq|^kr zOT3|sTF4_Yx{O`IWimf^{C)xkU0r>mr!PvUq0hcJb0CPO>6KdqO$yTC?7+Va?WxY; zQ}N?Po;K%bDZ=1X?M(hw5G zc&v;|1Zh5yJfK28`mX=L=am^M6QlJ`I7-wdhR#W(R_+0X9Q&rKnk-nlP`78e3KucelW*?o6Fk>mhd>HHAiCWQ<&M-^eqP5o7HZb)4BUxjo@P zLE~%*iQGq`2-(detkT^cAC0JsguHxnREVU!;gd&fd@AJ~d*%DX1{PXKj z!6ggsg~8s_p)*!myrxI2u>zAA=6Qy){p(LMYLMyD_DO;wT04^YU1jno=6LeoHhlbC z-v)}g>lR|(7H}@oupyi!d5$oB9*TaME8Y=9_M{-`M@_q4t+DP$>muXsp{SD}PPgLq z$PWpYILS-pya>kpGO0=D7kqy6diuzpQ8N~4x~AyWL7{Nb4-|!_mWK6e*;GboGg!;z zW;d}C1IA{lijuO!08vxqC!9_-n#`ke@*JD5lbiF*sh7%!(n#!!%Cq1ePHAHAWH8N4 z!`}e`i3TD)WEHTK7AYs^!3(b>5m`A&TuuD8-rPehJ{x4X#^f9K0jhjW{62ZKz$P@RuEVw3SInu*`Xp8mX#M(YB&nGw{fEg$G(MjiMS&MA8 zGSX?DZo#`>{Fq12x8+_r8EMTQy~Dv5+yzMu67NFwiv7fK!hu`ppdAcVeM2tnRff7` zTKm%H%$kGT6KR95pJY%J1GgUAiGM>v(N_Wm#9rbVeQP~Tm1IR&^D(!48CkQPLW;#B z7haS3-9FcFoaI3DMdQ?q*}^U2^B#APz=_hCrHpoN23$+|O|+p}>C$6J;fMgMlW)rmMsF}DK@s%8_Q?RxU5w!4P$(bHEMpH|LuS}=;TiPSi~VvYdO8Et(ZQ78vk zwJR^dzqsTkxht1-sZqcI0DLfN#eyaG-&|s1?__qJ+6&@z<$Kr(Lw?8&`ka4TatW#j zChFhE%wSjyUY*CcL5Gan{a7rY=TB|Hal%=oA9I%xnlEKNmHJ#T4rY9n0V!_6QmrI8 zXwE27i~rtPYnVAc6HE4l)7sjOkrzEXvw9;yB3mNnIAK1-Myv|A^AOCmUL&(X_%c$B zrMFBs(Cp%Q1=f&Y2PoJd8J}<`xP&ZfwHVi1eU9xt7`_WmK)pCHl$v>I1kq*4z=s{u zAw%0U`bpymH~MIL7;EQTc)JWP>XKl;-Qx+7lw!f6eK?J!br+-i>i|b~mNFAzmQR{m z*nOAi$DMD-{KevWQI6h@F?yDflP5dBOY+ny@D!Xda}G=kE#CC*MGz0)Z8wNY!U$#G8esXJp z_I!LEJLV-`_(@_*ae-Y$^CFv?mrpcdA~7Xy*h-^82bCXCvEJ0&Ab}L4NWiB#f9A&p zG2#%aQt`t)f2&#-jjlLpbYB;=p_k0ES(`*Ah5P3; z{K{jIeL{-)i$tN%>SZEnt_I^ERL0(^8C=Blgk-&nMmed?i_>DO(pH6U<$aPH2K7MR;yC2ad)uL zW`x`|AZSvQ+&$IA`2LI)?p*4n^G^0?No-HzyY6H32INBWhGhAAg=*Ot1oc*?T#B$9 zswa3~5*AdSdOUx02^4Tvuy>s+G)gBMz^lBV`670z$^Md_3tSy8o(K(ej8sQ{%; zO?7E{S@Wx*xx4!IS%&n-853G^$6gTeDY_deL!H91nyZwjVT zj5QK6i1VoY7(bKZPz`Y@9KqmCP(e$iN@wz|_111oVHW4n7F==9(t)~Bhl{%g5E38b zI9(2$mO#WPaat)FN~st5FCseSi40T^L8C@Pd`h zu&pTu=ei)vU>`i?aW1$;b!Z?oX?y>Kl~WS8g%p%>Z-_xD75)Q}JVR%etdwO0BaRQy zpAo@41~^7>=aJ|vErKSdwtP7J6HA(0Kl8b1Mmmc^eJRI2%X1IGx10clHH@ znfr*e^dfn3ofOs+PKVibvYLjDZzw$a@ZJw*1x8Gzij1v5^uLY^Vt0p25^r8rzTW?% zFdy!VkX(l4?J3xu2z--4oLxL^%$%?D_JZbu-2^+XFW#`0!}$F-bG8*}N-p{ZvOuZ| znGelj&!LA8SNZbEKGk`^J&Ii6vs6tC>q1mGq81hw`FQG0oSiXvDuv=QpwLwp?(l=5 z^I3^Dy`CA?Rh8b+C_5921~MMB8p&SAf2`KA#78imRfJ4owHA550dkGhX;Q)z=Vs){KF}@q7q()vN{sM~# ziqM0-eInt*DFio2~ki*zSN zfxbkuVjUNW@8j2sciEHOsW~rZydJ#S8z;ld*=%{OjcOYUdNOj~UrJ@LbpV9tZN?|X z3QW|qX%lv_n~JlN9`r9z!_Mv&(aK?vr8kfaTe_3eC^GL>0j=*)t&-=1`So^jV1u zU(>sQ+!SQT73ji%%fOKShGMukZ=;+Zjp)m~C?x3;NsJX<^p1%2YJn;mj27ALC7}z} zcfi5d`Q7J)p^)p^tJCylRx%d-S)F?>a*o9=bw_4SAxfT?_=k z$xi?V{KMI;SODrMSUVz#VL>*m8B|#njtozuIPwU~{t!x6z-t9Qc*5Bf$B!5wO6y8$ zEbQecWlM&4(&H(ukD3@?1I3z;Z!uJo8D{Hd3fwlSsza{i6UB}3GA9}`!VSI%?zS{; z_}og+o?-*O7=+pomvu?Bealca8p=Vd2dwH(jU{UFpqVE#P1GEWFr6s*_O^P)II8I@ zbf|5-7a2UE9P&-enz-|I#gAi(oUp$JB}-3`jC3jG%1=+ZNrPyF+03rQ5%>{!wYD;Y z$8XM{X3q7TtRyXyl}K_HlvU%j0e$Uvn#iX{QD^oedqf28zqro_J5vP1aHoPP445$7 zf7g#j4i49#XR3}Xt^5RiOnQMjNO_GgGaNluRI4OckZrW^>+(!eThMX)Rfak;Tww6i(=4AAJVqXg{CotrGS>#yL7UtDe)F>lHqHh0S;=w2=yWTjzbBRfI z2t`4nO9bE;`c~f>%rC(Rh1M6p4&DJ#yD_ho2j_ID^1rxBpL2G5K{npbBdi)b#@gCX z9GAC5Gn1^gS36DMG>)U{D7Ts)r=AUE;UbFcDC0&r>v#NdwB+7JM~>?8zy!AWm-naX zze!hB^>h|!H}wY!y`Bet23WlruAlS*4JD@2VH+ zS<9+WN_0nQ*IpGVpTQk{Nd3U%Fz%T!ia6VM>E<<&UrY9Pk@}D9^S?wlB<_z?E3g%9 zgfR^94ZGZ!U01Zm_V(7-%lnT8H*D2Qi7Wb4a<9EALUQN9$*cX0BZZdZJ)B`3g325K%a5zJ@Ne)M${pP zo8QF((~m~Ev~a@a`wiLTWyN2 z4niy}-s~g|i9A%tBSJ3CqP~+CBcszpl@JW*f<;eFq_Uit<=u8u4-!2x7+4n*m^_3`>(O z>HT~rZy_9REY%$rpRw|B_`#!RVN^nI@Z` zn+(dihDjz# zf#|u!f#|0l|K3et*x?;)_a!mZxVVybKWVxFO5q1$j}yd3yqHUux}q$5KJ*hJv!uOB z9!wp?1kZJ)@8QG>)dXm8mWr-iPAb+azOn^gsHe(zZxbnOIUjMxiDmci;Bg<2$Qa;6 zEp87g748|*v&{weNfYH~b_(|*TmY8u;6LE<0e-mi#wd_kMY=@X^wm_DZ*#DIQ<9^e z$p@KGZc%x$t6@&P?E{Seu1jRQ%`6{mU65c~7UO1J{=HeFREF)kVYvv;`U|ZrBLgLx zssWSglv9oa&({!bzIrnOgsb-SoZ{UI`mxBi8iDP;EmdnbDT)Mc(pM@dXo^M!oYLq9 z^Y2?owV!p%2X-m!idIi$Fx+_$kViye+)jU_dmo16ab3@VX#m4fhD z<+Uj=tVaXD%Jv+}7`wu4h zwX5tTMJ9%G^@N`y)UjmKV=1qcmp_CSGp0sa(VZUa2M^RM~+l4EY& z{~q?W;&9Wh=lKTd=RE(TKm2#>e<~2aya9mIzkF}?h}#j|OyJL3$ZZq>*rf!9^zEJG zPuD+-wp&+e*bM$h3HN7!KdX=1039$@<@e_Jrxv;O{u{ghZY?KR2?-nd|K7@f660;B zssDEScjozHXWsz#CyU(1sfCfsHE@4&%IyShBK(=}ZX+n*|6crGRIMxzy8eF+c1|$? N{jeiZO>q6~{{bV-kM95g