diff --git a/src/dsp_tools/cli.py b/src/dsp_tools/cli.py index 81a5a10e6..099e55a43 100644 --- a/src/dsp_tools/cli.py +++ b/src/dsp_tools/cli.py @@ -13,10 +13,10 @@ from dsp_tools.fast_xmlupload.upload_files import upload_files from dsp_tools.fast_xmlupload.upload_xml import fast_xmlupload from dsp_tools.models.exceptions import UserError -from dsp_tools.utils.excel_to_json_lists import excel2lists, validate_lists_section_with_schema -from dsp_tools.utils.excel_to_json_project import excel2json -from dsp_tools.utils.excel_to_json_properties import excel2properties -from dsp_tools.utils.excel_to_json_resources import excel2resources +from dsp_tools.utils.excel_to_json.lists import excel2lists, validate_lists_section_with_schema +from dsp_tools.utils.excel_to_json.project import excel2json +from dsp_tools.utils.excel_to_json.properties import excel2properties +from dsp_tools.utils.excel_to_json.resources import excel2resources from dsp_tools.utils.generate_templates import generate_template_repo from dsp_tools.utils.id_to_iri import id_to_iri from dsp_tools.utils.logging import get_logger diff --git a/src/dsp_tools/fast_xmlupload/process_files.py b/src/dsp_tools/fast_xmlupload/process_files.py index 5bc49ef32..01505acfe 100644 --- a/src/dsp_tools/fast_xmlupload/process_files.py +++ b/src/dsp_tools/fast_xmlupload/process_files.py @@ -19,7 +19,6 @@ from dsp_tools.models.exceptions import UserError from dsp_tools.utils.logging import get_logger - from dsp_tools.utils.shared import http_call_with_retry logger = get_logger(__name__, filesize_mb=100, backupcount=36) diff --git a/src/dsp_tools/fast_xmlupload/upload_xml.py b/src/dsp_tools/fast_xmlupload/upload_xml.py index b1c023d10..bc770f392 100644 --- a/src/dsp_tools/fast_xmlupload/upload_xml.py +++ b/src/dsp_tools/fast_xmlupload/upload_xml.py @@ -5,12 +5,11 @@ from lxml import etree +from dsp_tools.fast_xmlupload.upload_files import get_pkl_files from dsp_tools.models.exceptions import UserError from 
dsp_tools.utils.logging import get_logger from dsp_tools.utils.xml_upload import xml_upload -from dsp_tools.fast_xmlupload.upload_files import get_pkl_files - logger = get_logger(__name__) diff --git a/src/dsp_tools/models/sipi.py b/src/dsp_tools/models/sipi.py index 927e3df40..7b2e67077 100644 --- a/src/dsp_tools/models/sipi.py +++ b/src/dsp_tools/models/sipi.py @@ -1,7 +1,7 @@ -from dataclasses import dataclass -from datetime import datetime import json import os +from dataclasses import dataclass +from datetime import datetime from pathlib import Path from typing import Any diff --git a/src/dsp_tools/models/xmlbitstream.py b/src/dsp_tools/models/xmlbitstream.py index 221f748ba..10d394f77 100644 --- a/src/dsp_tools/models/xmlbitstream.py +++ b/src/dsp_tools/models/xmlbitstream.py @@ -1,4 +1,4 @@ -from typing import cast, Optional +from typing import Optional, cast from lxml import etree diff --git a/src/dsp_tools/models/xmlvalue.py b/src/dsp_tools/models/xmlvalue.py index 9c9d78137..39ad1e492 100644 --- a/src/dsp_tools/models/xmlvalue.py +++ b/src/dsp_tools/models/xmlvalue.py @@ -1,7 +1,7 @@ from typing import Optional, Union, cast -from lxml import etree import regex +from lxml import etree from dsp_tools.models.value import KnoraStandoffXml diff --git a/src/dsp_tools/utils/excel_to_json/__init__.py b/src/dsp_tools/utils/excel_to_json/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/dsp_tools/utils/excel_to_json_lists.py b/src/dsp_tools/utils/excel_to_json/lists.py similarity index 100% rename from src/dsp_tools/utils/excel_to_json_lists.py rename to src/dsp_tools/utils/excel_to_json/lists.py diff --git a/src/dsp_tools/utils/excel_to_json_project.py b/src/dsp_tools/utils/excel_to_json/project.py similarity index 96% rename from src/dsp_tools/utils/excel_to_json_project.py rename to src/dsp_tools/utils/excel_to_json/project.py index d260111fb..7d3b92731 100644 --- a/src/dsp_tools/utils/excel_to_json_project.py +++ 
b/src/dsp_tools/utils/excel_to_json/project.py @@ -4,9 +4,9 @@ import regex from dsp_tools.models.exceptions import BaseError -from dsp_tools.utils.excel_to_json_lists import excel2lists -from dsp_tools.utils.excel_to_json_properties import excel2properties -from dsp_tools.utils.excel_to_json_resources import excel2resources +from dsp_tools.utils.excel_to_json.lists import excel2lists +from dsp_tools.utils.excel_to_json.properties import excel2properties +from dsp_tools.utils.excel_to_json.resources import excel2resources def excel2json( diff --git a/src/dsp_tools/utils/excel_to_json/properties.py b/src/dsp_tools/utils/excel_to_json/properties.py new file mode 100644 index 000000000..2c60f26e1 --- /dev/null +++ b/src/dsp_tools/utils/excel_to_json/properties.py @@ -0,0 +1,463 @@ +import importlib.resources +import json +import warnings +from typing import Any, Optional + +import jsonpath_ng.ext +import jsonschema +import numpy as np +import pandas as pd +import regex + +import dsp_tools.utils.excel_to_json.utils as utl +from dsp_tools.models.exceptions import UserError + +languages = ["en", "de", "fr", "it", "rm"] +language_label_col = ["label_en", "label_de", "label_fr", "label_it", "label_rm"] +mandatory_properties = ["name", "object", "gui_element"] + + +def _search_json_validation_error_get_err_msg_str( + properties_list: list[dict[str, Any]], + excelfile: str, + validation_error: jsonschema.ValidationError, +) -> str: + """ + This function takes a list of properties, which were transformed from an Excel to a json. + The validation raised an error. + This function searches for the exact location in the Excel where the error was caused. + It returns a string with a user-friendly version of the original json validation error message. 
+ + Args: + properties_list: List of properties + excelfile: Name of the Excel file + validation_error: The error from the calling function + + Returns: + A string which is used in the Error message that contains detailed information about the problem + """ + err_msg_list = [f"The 'properties' section defined in the Excel file '{excelfile}' did not pass validation."] + json_path_to_property = regex.search(r"^\$\[(\d+)\]", validation_error.json_path) + if json_path_to_property: + # fmt: off + wrong_property_name = ( + jsonpath_ng.ext.parse(json_path_to_property.group(0)) + .find(properties_list)[0] + .value["name"] + ) + # fmt: on + excel_row = int(json_path_to_property.group(1)) + 2 + err_msg_list.append(f"The problematic property is '{wrong_property_name}' in Excel row {excel_row}.") + affected_field = regex.search( + r"name|labels|comments|super|subject|object|gui_element|gui_attributes", + validation_error.json_path, + ) + if affected_field: + err_msg_list.append( + f"The problem is that the column '{affected_field.group(0)}' has an invalid value: " + f"{validation_error.message}" + ) + else: + err_msg_list.append( + f"The error message is: {validation_error.message}\n" f"The error occurred at {validation_error.json_path}" + ) + return "\n".join(err_msg_list) + + +def _validate_properties( + properties_list: list[dict[str, Any]], + excelfile: str, +) -> bool: + """ + This function checks if the "properties" section of a JSON project file is valid, according to the JSON schema. 
+ + Args: + properties_list: the "properties" section of a JSON project as a list of dicts + excelfile: path to the Excel file containing the properties + + Raises: + UserError: if the validation fails + + Returns: + True if the "properties" section passed validation + """ + with importlib.resources.files("dsp_tools").joinpath("resources/schema/properties-only.json").open( + encoding="utf-8" + ) as schema_file: + properties_schema = json.load(schema_file) + try: + jsonschema.validate(instance=properties_list, schema=properties_schema) + except jsonschema.ValidationError as err: + err_msg = _search_json_validation_error_get_err_msg_str( + properties_list=properties_list, excelfile=excelfile, validation_error=err + ) + raise UserError(err_msg) from None + return True + + +def _search_convert_numbers(value_str: str) -> str | int | float: + """ + This function takes a string and searches if the string contains a float or an integer. + In those cases, it converts the string to the corresponding data type. + If it is not a float or integer, it returns the string as is. + + Args: + value_str: The value which is checked and may be converted + + Returns: + A int if the string was an integer, float if the string was a float or str if it was neither + """ + if regex.search(r"^\d+$", value_str): + return int(value_str) + elif regex.search(r"^\d+\.\d+$", value_str): + return float(value_str) + else: + return value_str + + +def _unpack_gui_attributes(attribute_str: str) -> dict[str, str]: + """ + This function takes a string which contains the gui_attributes if the string is not formatted correctly, + this raises an IndexError. + Errors regarding the content will be diagnosed when the json is validated. + + Args: + attribute_str: A string containing the gui_attributes + + Returns: + A dictionary with the gui_attribute name as key and the attribute as value. 
+ + Raises: + IndexError: if the sub-lists do not contain each two items + """ + # Create a list with several attributes + gui_list = [x.strip() for x in attribute_str.split(",") if not x.strip() == ""] + # create a sub list with the kex value pair of the attribute if it is an empty string we exclude it. + # this error will be detected when checking for the length of the lists + sub_gui_list = [[sub.strip() for sub in x.split(":") if sub.strip() != ""] for x in gui_list] + # if not all sublist contain two items, something is wrong with the attribute + if not all(len(sub) == 2 for sub in sub_gui_list): + raise IndexError + return {sub[0]: sub[1] for sub in sub_gui_list} + + +def _format_gui_attribute(attribute_str: str) -> dict[str, str | int | float]: + """ + This function takes a string containing the information about the gui_attributes and formats it correctly. + + Args: + attribute_str: A string containing the attributes + + Returns: + A dictionary with the attribute name as a key and the attribute as value. + + Raises: + IndexError: if the attributes are not formatted correctly + """ + attribute_dict = _unpack_gui_attributes(attribute_str=attribute_str) + return {attrib: _search_convert_numbers(value_str=val) for attrib, val in attribute_dict.items()} + + +def _get_gui_attribute(df_row: pd.Series, row_num: int, excelfile: str) -> dict[str, int | str | float] | None: + """ + This function checks if the cell "gui_attributes" is empty. + If it is, it returns None. + If there is information, it extracts and formats it correctly. + + Args: + df_row: Row of a pd.DataFrame + row_num: The number of the row (index + 2) + excelfile: The name of the Excel file. 
+ + Returns: + A gui_attribute dictionary or None if there are no attributes + + Raises: + UserError: if there is a formatting error of the string + """ + if pd.isnull(df_row["gui_attributes"]): + return None + # If the attribute is not in the correct format, a called function may raise an IndexError + try: + return _format_gui_attribute(attribute_str=df_row["gui_attributes"]) + except IndexError: + raise UserError( + f"Row {row_num} of Excel file {excelfile} contains invalid data in column 'gui_attributes'.\n" + "The expected format is '[attribute: value, attribute: value]'." + ) from None + + +def _row2prop(df_row: pd.Series, row_num: int, excelfile: str) -> dict[str, Any]: + """ + Takes a row from a pd.DataFrame, reads its content, and returns a dict object of the property. + + Args: + df_row: row from a pd.DataFrame that defines a property + row_num: row number of Excel file + excelfile: name of the original Excel file + + Returns: + dict object of the property + + Raises: + UserError if there are any formal mistakes in the "gui_attributes" column + """ + _property = {x: df_row[x] for x in mandatory_properties} + # These are also mandatory but require formatting + _property.update( + {"labels": utl.get_labels(df_row=df_row), "super": [s.strip() for s in df_row["super"].split(",")]} + ) + non_mandatory = { + "comments": utl.get_comments(df_row=df_row), + "gui_attributes": _get_gui_attribute(df_row=df_row, row_num=row_num, excelfile=excelfile), + } + # These functions may return None, this is checked before the update + _property = utl.update_dict_if_not_value_none(additional_dict=non_mandatory, to_update_dict=_property) + return _property + + +def _check_compliance_gui_attributes(df: pd.DataFrame) -> dict[str, pd.Series] | None: + """ + This function takes a pd.DataFrame and checks if the "gui_attributes" column is filled correctly. 
+ If any or all of the checks fail, + it creates a dictionary with a pd.Series as value which contains True for all rows where + there is a problem otherwise, it returns None. + + Args: + df: pd.DataFrame that should be checked + + Returns: + A dictionary with a pd.Series that contains the information where there is a problem or None if all the + checks passed. + + Raises: + UserError if any of the checks fail + """ + mandatory_attributes = ["Spinbox", "List"] + mandatory_check = utl.col_must_or_not_empty_based_on_other_col( + df=df, + substring_list=mandatory_attributes, + substring_colname="gui_element", + check_empty_colname="gui_attributes", + must_have_value=True, + ) + no_attributes = ["Checkbox", "Date", "Geonames", "Richtext", "TimeStamp"] + no_attribute_check = utl.col_must_or_not_empty_based_on_other_col( + df=df, + substring_list=no_attributes, + substring_colname="gui_element", + check_empty_colname="gui_attributes", + must_have_value=False, + ) + # If neither has a problem, we return None + if mandatory_check is None and no_attribute_check is None: + return None + # If both have problems, we combine the series + elif mandatory_check is not None and no_attribute_check is not None: + final_series = pd.Series(np.logical_or(mandatory_check, no_attribute_check)) + elif mandatory_check is not None: + final_series = mandatory_check + else: + final_series = no_attribute_check + # The boolean series is returned + return {"wrong gui_attributes": final_series} + + +def _check_missing_values_in_row_raise_error(df: pd.DataFrame, excelfile: str) -> None: + """ + This function checks if all the required values are in the df. + If all the checks are ok, the function ends without any effect. + If any of the checks fail, a UserError is raised which contains the information in which column and row there + are problems. 
+ + Args: + df: pd.DataFrame that is to be checked + excelfile: Name of the original Excel file + + Raises: + UserError: if any of the checks are failed + """ + # Every row in these columns must have a value + required_values = ["name", "super", "object", "gui_element"] + # If there are no problems, it returns an empty dict + missing_dict = utl.check_required_values(df=df, required_values_columns=required_values) + # This checks if the label columns have at least one value per row + missing_labels = utl.find_one_full_cell_in_cols(df=df, required_columns=language_label_col) + # If everything is ok, we get None, otherwise we update the dict + if missing_labels is not None: + missing_dict.update({"label": missing_labels}) + # Some gui_element require a gui_attributes and others must not have one + missing_gui_attributes = _check_compliance_gui_attributes(df=df) + if missing_gui_attributes is not None: + missing_dict.update(missing_gui_attributes) + if missing_dict: + # Get the row numbers from the boolean series + missing_dict = utl.get_wrong_row_numbers(wrong_row_dict=missing_dict, true_remains=True) + error_str = "\n".join([f" - Column Name: {k} Row Number: {v}" for k, v in missing_dict.items()]) + raise UserError(f"The file '{excelfile}' is missing values in the following rows:\n" f"{error_str}") + + +def _do_property_excel_compliance(df: pd.DataFrame, excelfile: str) -> None: + """ + This function calls three separate functions which each checks if the pd.DataFrame is as we expect it. + Each of these functions raises a UserError if there is a problem. + If the checks do not fail, this function ends without an effect. 
+ + Args: + df: The pd.DataFrame that is checked + excelfile: The name of the original Excel file + + Raises: + UserError if any of the checks fail + """ + # If it does not pass any one of the tests, the function stops + required_columns = { + "name", + "label_en", + "label_de", + "label_fr", + "label_it", + "label_rm", + "comment_en", + "comment_de", + "comment_fr", + "comment_it", + "comment_rm", + "super", + "object", + "gui_element", + "gui_attributes", + } + utl.check_contains_required_columns_else_raise_error(df=df, required_columns=required_columns) + utl.check_column_for_duplicate_else_raise_error(df=df, to_check_column="name") + _check_missing_values_in_row_raise_error(df=df, excelfile=excelfile) + + +def _rename_deprecated_hlist(df: pd.DataFrame, excelfile: str) -> pd.DataFrame: + """ + This function deals with Excel files that do conform to a previous format. + If the old column names are not in the pd.DataFrame, then it returns it as was. + + Args: + df: The pd.DataFrame which is checked and renamed + excelfile: Name of the original Excel file. + + Returns: + Renamed pd.DataFrame or the original one + + Warnings: + A warning for the user that the Excel file is not compliant with the new specifications + """ + # If the deprecated feature is not in the df, then return the df + if "hlist" not in df.columns: + return df + warnings.warn( + f"The file '{excelfile}' has a column 'hlist', which is deprecated. " + f"Please use the column 'gui_attributes' for the attribute 'hlist'." 
+ ) + # Reformat the string according to the new convention + df["hlist"] = df["hlist"].apply(lambda x: f"hlist:{x}" if isinstance(x, str) else x) + # If gui_attributes already exists we have to merge the columns + if "gui_attributes" in df.columns: + # In case there is a hlist, it is the only valid value in gui_attributes and has precedence + df["hlist"] = df["hlist"].fillna(df["gui_attributes"]) + df.pop("gui_attributes") + df.rename(columns={"hlist": "gui_attributes"}, inplace=True) + return df + + +def _rename_deprecated_lang_cols(df: pd.DataFrame, excelfile: str) -> pd.DataFrame: + """ + This function takes a pd.DataFrame and checks if the columns with the language label are named according to the old + specifications. + If they are, it renames them and informs the user that an old format is used. + Otherwise, it returns the pd.Dataframe as was. + + Args: + df: pd.DataFrame, which is to be checked + excelfile: Name of the Excel file + + Returns: + pd.DataFrame which has the columns renamed according to the new format + + Warnings: + A warning for the user that the Excel file is not compliant with the new specifications + """ + # If the columns are named correctly, return the df + if set(language_label_col).issubset(set(df.columns)): + return df + if set(languages).issubset(set(df.columns)): + warnings.warn( + f"The file '{excelfile}' uses {languages} as column titles, which is deprecated. " + f"Please use {[f'label_{lang}' for lang in languages]}" + ) + rename_dict = dict(zip(languages, language_label_col)) + df.rename(columns=rename_dict, inplace=True) + return df + + +def _rename_deprecated_columnnames(df: pd.DataFrame, excelfile: str) -> pd.DataFrame: + """ + This function calls two other functions that check and rename a deprecated Excel format. + Afterward, the pd.DataFrame is compliant with the current format. + In case the pd.DataFrame was already in the current format, the function passes without an effect. 
+ + Args: + df: pd.DataFrame that is checked and renamed + excelfile: Name of the original Excel + + Returns: + pd.DataFrame that is renamed + + Warnings: + Two user warnings if the pd.DataFrame is not according to the current specifications + """ + df = _rename_deprecated_lang_cols(df=df, excelfile=excelfile) + df = _rename_deprecated_hlist(df=df, excelfile=excelfile) + return df + + +def excel2properties( + excelfile: str, + path_to_output_file: Optional[str] = None, +) -> tuple[list[dict[str, Any]], bool]: + """ + Converts properties described in an Excel file into a "properties" section which can be inserted into a JSON + project file. + + Args: + excelfile: path to the Excel file containing the properties + path_to_output_file: if provided, the output is written into this JSON file + + Raises: + UserError: if something went wrong + + Returns: + a tuple consisting of the "properties" section as a Python list, + and the success status (True if everything went well) + """ + property_df = utl.read_and_clean_excel_file(excelfile=excelfile) + + property_df = _rename_deprecated_columnnames(df=property_df, excelfile=excelfile) + + _do_property_excel_compliance(df=property_df, excelfile=excelfile) + + # transform every row into a property + props: list[dict[str, Any]] = [] + for index, row in property_df.iterrows(): + props.append( + _row2prop( + df_row=row, + row_num=int(str(index)), # index is a label/index/hashable, but we need an int + excelfile=excelfile, + ) + ) + + # write final JSON file + _validate_properties(properties_list=props, excelfile=excelfile) + if path_to_output_file: + with open(file=path_to_output_file, mode="w", encoding="utf-8") as file: + json.dump(props, file, indent=4, ensure_ascii=False) + print('"properties" section was created successfully and written to file:', path_to_output_file) + + return props, True diff --git a/src/dsp_tools/utils/excel_to_json_resources.py b/src/dsp_tools/utils/excel_to_json/resources.py similarity index 95% rename 
from src/dsp_tools/utils/excel_to_json_resources.py rename to src/dsp_tools/utils/excel_to_json/resources.py index 458046b25..3422ba352 100644 --- a/src/dsp_tools/utils/excel_to_json_resources.py +++ b/src/dsp_tools/utils/excel_to_json/resources.py @@ -91,7 +91,7 @@ def _validate_resources( def _row2resource( - row: pd.Series, + df_row: pd.Series, excelfile: str, ) -> dict[str, Any]: """ @@ -100,7 +100,7 @@ def _row2resource( and builds a dict object of the resource. Args: - row: row from the "classes" DataFrame + df_row: row from the "classes" DataFrame excelfile: Excel file where the data comes from Raises: @@ -110,12 +110,12 @@ def _row2resource( dict object of the resource """ - name = row["name"] - labels = {lang: row[f"label_{lang}"] for lang in languages if row.get(f"label_{lang}")} + name = df_row["name"] + labels = {lang: df_row[f"label_{lang}"] for lang in languages if df_row.get(f"label_{lang}")} if not labels: - labels = {lang: row[lang] for lang in languages if row.get(lang)} - comments = {lang: row[f"comment_{lang}"] for lang in languages if row.get(f"comment_{lang}")} - supers = [s.strip() for s in row["super"].split(",")] + labels = {lang: df_row[lang] for lang in languages if df_row.get(lang)} + comments = {lang: df_row[f"comment_{lang}"] for lang in languages if df_row.get(f"comment_{lang}")} + supers = [s.strip() for s in df_row["super"].split(",")] # load the cardinalities of this resource try: diff --git a/src/dsp_tools/utils/excel_to_json/utils.py b/src/dsp_tools/utils/excel_to_json/utils.py new file mode 100644 index 000000000..f78743313 --- /dev/null +++ b/src/dsp_tools/utils/excel_to_json/utils.py @@ -0,0 +1,286 @@ +from __future__ import annotations + +from typing import Any +from unittest import mock + +import numpy as np +import pandas as pd +import regex + +from dsp_tools.models.exceptions import UserError + +languages = ["en", "de", "fr", "it", "rm"] + + +def read_and_clean_excel_file(excelfile: str) -> pd.DataFrame: + """ + This 
function reads an Excel file, if there is a ValueError then it patches the openpyxl part that creates the + error and opens it with that patch. + It cleans and then returns the pd.DataFrame. + + Args: + excelfile: The name of the Excel file + + Returns: + A pd.DataFrame + """ + try: + read_df: pd.DataFrame = pd.read_excel(excelfile) + except ValueError: + # Pandas relies on openpyxl to parse XLSX files. + # A strange behavior of openpyxl prevents pandas from opening files with some formatting properties + # (unclear which formatting properties exactly). + # Apparently, the excel2json test files have one of the unsupported formatting properties. + # Credits: https://stackoverflow.com/a/70537454/14414188 + with mock.patch("openpyxl.styles.fonts.Font.family.max", new=100): + read_df = pd.read_excel(excelfile) + read_df = clean_data_frame(df=read_df) + return read_df + + +def clean_data_frame(df: pd.DataFrame) -> pd.DataFrame: + """ + This function takes a pd.DataFrame and removes: + - Leading and trailing spaces from the column names + - Leading and trailing spaces from each cell and any characters in the cells that are not part of any known + language, for example, linebreaks and replaces it with a pd.NA. + - Removes all rows that are empty in all columns + + Args: + df: The pd.DataFrame that is to be cleaned + + Returns: + pd.DataFrame which has the above-mentioned removed + """ + # Remove leading and trailing blanks in column names and make them lower case + df = df.rename(columns=lambda x: x.strip().lower()) + # Remove the values of all cells that do not at least contain one character of any known language and removes + # leading and trailing spaces. 
+ df = df.applymap( + lambda x: str(x).strip() if pd.notna(x) and regex.search(r"[\w\p{L}]", str(x), flags=regex.U) else pd.NA + ) + # drop all the rows that are entirely empty + df.dropna(axis=0, how="all", inplace=True) + return df + + +def check_contains_required_columns_else_raise_error(df: pd.DataFrame, required_columns: set[str]) -> None: + """ + This function takes a pd.DataFrame and a set of required column names. + It checks if all the columns from the set are in the pd.DataFrame. + Additional columns to the ones in the set are allowed. + It raises an error if any columns are missing. + + Args: + df: pd.DataFrame that is checked + required_columns: set of column names + + Raises: + UserError: if there are required columns missing + """ + if not required_columns.issubset(set(df.columns)): + raise UserError( + f"The following columns are missing in the excel:\n" f"{required_columns.difference(set(df.columns))}" + ) + + +def check_column_for_duplicate_else_raise_error(df: pd.DataFrame, to_check_column: str) -> None: + """ + This function checks if a specified column contains duplicate values. + Empty cells (pd.NA) also count as duplicates. + If there are any duplicate values, it creates a string with the duplicates which are displayed in the error message. + + Args: + df: pd.DataFrame that is checked for duplicates + to_check_column: Name of the column that must not contain duplicates + + Raises: + UserError: if there are duplicates in the column + """ + if df[to_check_column].duplicated().any(): + # If it does, it creates a string with all the duplicate values and raises an error. + duplicate_values = ",".join(df[to_check_column][df[to_check_column].duplicated()].tolist()) + raise UserError( + f"The column '{to_check_column}' may not contain any duplicate values. " + f"The following values appeared multiple times '{duplicate_values}'." 
+ ) + + +def check_required_values(df: pd.DataFrame, required_values_columns: list[str]) -> dict[str, pd.Series]: + """ + If there are any empty cells in the column, it adds the column name and a boolean pd.Series to the dictionary. + If there are no empty cells, then it is not included in the dictionary. + If no column has any empty cells, then it returns an empty dictionary. + + Args: + df: pd.DataFrame that is checked + required_values_columns: a list of column names that may not contain empty cells + + Returns: + a dictionary with the column names as key and pd.Series as values if there are any empty cells + """ + # It checks if any of the values in a specified column are empty. If they are, they are added to the dictionary + # with the column name as key and a boolean series as value that contain true for every pd.NA + res_dict = {col: df[col].isnull() for col in required_values_columns if df[col].isnull().any()} + # If all the columns are filled, then it returns an empty dictionary. + return res_dict + + +def turn_bool_array_into_index_numbers(series: pd.Series[bool], true_remains: bool = True) -> list[int]: + """ + This function takes a pd.Series containing boolean values. + By default, this method extracts the index numbers of the True values. + If the index numbers of the False values are required, the parameter "true_remains" should be turned to False. + + Args: + series: pd.Series, which only contains True and False values + true_remains: True if the index numbers of True are required, likewise with False + + Returns: + A list of index numbers + """ + # If the False are required, we need to invert the array. + if not true_remains: + series = ~series + return list(series[series].index) + + +def get_wrong_row_numbers(wrong_row_dict: dict[str, pd.Series], true_remains: bool = True) -> dict[str, list[int]]: + """ + From the boolean pd.Series the index numbers of the True values are extracted. + The resulting list is the new value of the dictionary. 
+ This new dictionary is taken and to each index number 2 is added, so that it corresponds to the Excel row number. + The result is intended to be used to communicate the exact location of a problem in an error message. + + Args: + wrong_row_dict: The dictionary which contains column names and a boolean pd.Series + true_remains: If True then the index of True is taken, if False then the index of False values is taken + + Returns: + Dictionary with the column name as key and the row number as a list. + """ + wrong_row_dict = { + k: turn_bool_array_into_index_numbers(series=v, true_remains=true_remains) for k, v in wrong_row_dict.items() + } + return {k: [x + 2 for x in v] for k, v in wrong_row_dict.items()} + + +def update_dict_if_not_value_none(additional_dict: dict[Any, Any], to_update_dict: dict[Any, Any]) -> dict[Any, Any]: + """ + This function takes two dictionaries. + The "to_update_dict" should be updated with the information from the "additional_dict" + only if the value of a particular key is not None or pd.NA. + + Args: + additional_dict: The dictionary which contains information that may be transferred + to_update_dict: The dictionary to which the new information should be transferred + + Returns: + The "to_update_dict" which the additional information + """ + additional_dict = {k: v for k, v in additional_dict.items() if v is not None and v is not pd.NA} + to_update_dict.update(additional_dict) + return to_update_dict + + +def get_labels(df_row: pd.Series) -> dict[str, str]: + """ + This function takes a pd.Series which has "label_[language tag]" in the index. + If the value of the index is not pd.NA, the language tag and the value are added to a dictionary. + If it is empty, it is omitted from the dictionary. 
+ + Args: + df_row: a pd.Series (usually a row of a pd.DataFrame) from which the content of the columns containing the + label is extracted + + Returns: + A dictionary with the language tag and the content of the cell + """ + return {lang: df_row[f"label_{lang}"] for lang in languages if df_row[f"label_{lang}"] is not pd.NA} + + +def get_comments(df_row: pd.Series) -> dict[str, str] | None: + """ + This function takes a pd.Series which has "comment_[language tag]" in the index. + If the value of the index is not pd.NA, the language tag and the value are added to a dictionary. + If it is empty, it is omitted from the dictionary. + + Args: + df_row: a pd.Series (usually a row of a pd.DataFrame) from which the content of the columns containing the + comment is extracted + + Returns: + A dictionary with the language tag and the content of the cell + """ + comments = {lang: df_row[f"comment_{lang}"] for lang in languages if df_row[f"comment_{lang}"] is not pd.NA} + if comments == {}: + return None + else: + return comments + + +def find_one_full_cell_in_cols(df: pd.DataFrame, required_columns: list[str]) -> pd.Series | None: + """ + This function takes a pd.DataFrame and a list of column names where at least one cell must have a value per row. 
+ A boolean pd.Series is returned: True for each row that has no value in any of the required columns + + Args: + df: The pd.DataFrame which should be checked + required_columns: A list of column names where at least one cell per row must have a value + + Returns: + None if there is no problem or a pd.Series if there is a problem in a row + """ + # The series has True if the cell is empty + # In order to combine more than two arrays, we need to reduce the arrays, which takes a tuple + result_arrays = tuple(df[col].isnull() for col in required_columns) + # If all are True logical_and returns True otherwise False + combined_array = np.logical_and.reduce(result_arrays) + # if any of the values are True, it is turned into a pd.Series + if any(combined_array): + return pd.Series(combined_array) + else: + return None + + +def col_must_or_not_empty_based_on_other_col( + df: pd.DataFrame, + substring_list: list[str], + substring_colname: str, + check_empty_colname: str, + must_have_value: bool, +) -> pd.Series | None: + """ + It is presumed that the column "substring_colname" has no empty cells. + Based on the string content of the individual rows, which is specified in the "substring_list", + the cell in the column "check_empty_colname" is checked whether it is empty or not. + The "substring_list" contains the different possibilities regarding the content of the cell. + If the parameter "must_have_value" is True, then the cell in the "check_empty_colname" column must not be empty. + If the parameter is set to False, then it must be empty. + + Args: + df: The pd.DataFrame which is checked + substring_list: A list of possible information that could be in the column "substring_colname" + substring_colname: The name of the column that may contain any of the sub-strings + check_empty_colname: The name of the column which is checked if it is empty or not + must_have_value: True if the column "check_empty_colname" must have a value; False if it must be empty. 
+ + Returns: + None if all rows are correctly filled or empty. + A series which contains True values for the rows, where it does + not comply with the specifications. + """ + na_series = df[check_empty_colname].isna() + # If the cells have to be empty, we need to reverse the series + if not must_have_value: + na_series = ~na_series + # This returns True if it finds the substring in the cell, they are joined in a RegEx "|" which denotes "or". + # If it does not match any of the sub-strings, then the RegEx returns False, + # which means that the value in the column "check_empty_colname" is not relevant. + substring_array = df[substring_colname].str.contains("|".join(substring_list), na=False, regex=True) + # If both are True logical_and returns True otherwise False + combined_array = np.logical_and(na_series, substring_array) + if any(combined_array): + return pd.Series(combined_array) + else: + return None diff --git a/src/dsp_tools/utils/excel_to_json_properties.py b/src/dsp_tools/utils/excel_to_json_properties.py deleted file mode 100644 index 0f9e67ca4..000000000 --- a/src/dsp_tools/utils/excel_to_json_properties.py +++ /dev/null @@ -1,218 +0,0 @@ -import importlib.resources -import json -import warnings -from typing import Any, Optional - -import jsonpath_ng.ext -import jsonschema -import pandas as pd -import regex - -from dsp_tools.models.exceptions import BaseError -from dsp_tools.utils.shared import check_notna, prepare_dataframe - -languages = ["en", "de", "fr", "it", "rm"] - - -def _validate_properties( - properties_list: list[dict[str, Any]], - excelfile: str, -) -> bool: - """ - This function checks if the "properties" section of a JSON project file is valid according to the JSON schema, - and if the property names are unique. 
- - Args: - properties_list: the "properties" section of a JSON project as a list of dicts - excelfile: path to the Excel file containing the properties - - Raises: - BaseError: if the validation fails - - Returns: - True if the "properties" section passed validation - """ - with importlib.resources.files("dsp_tools").joinpath("resources/schema/properties-only.json").open( - encoding="utf-8" - ) as schema_file: - properties_schema = json.load(schema_file) - try: - jsonschema.validate(instance=properties_list, schema=properties_schema) - except jsonschema.ValidationError as err: - err_msg = f"The 'properties' section defined in the Excel file '{excelfile}' did not pass validation. " - json_path_to_property = regex.search(r"^\$\[(\d+)\]", err.json_path) - if json_path_to_property: - # fmt: off - wrong_property_name = ( - jsonpath_ng.ext.parse(json_path_to_property.group(0)) - .find(properties_list)[0] - .value["name"] - ) - # fmt: on - excel_row = int(json_path_to_property.group(1)) + 2 - err_msg += f"The problematic property is '{wrong_property_name}' in Excel row {excel_row}. 
" - affected_field = regex.search( - r"name|labels|comments|super|subject|object|gui_element|gui_attributes", - err.json_path, - ) - if affected_field: - err_msg += ( - f"The problem is that the column '{affected_field.group(0)}' has an invalid value: {err.message}" - ) - else: - err_msg += f"The error message is: {err.message}\nThe error occurred at {err.json_path}" - raise BaseError(err_msg) from None - - # check if property names are unique - all_names = [p["name"] for p in properties_list] - duplicates: dict[int, str] = dict() - for index, propdef in enumerate(properties_list): - if all_names.count(propdef["name"]) > 1: - duplicates[index + 2] = propdef["name"] - if duplicates: - err_msg = f"Property names must be unique inside every ontology, but '{excelfile}' contains duplicates:\n" - for row_no, propname in duplicates.items(): - err_msg += f" - Row {row_no}: {propname}\n" - raise BaseError(err_msg) - - return True - - -def _row2prop( - row: pd.Series, - row_count: int, - excelfile: str, -) -> dict[str, Any]: - """ - Takes a row from a pandas DataFrame, reads its content, and returns a dict object of the property - - Args: - row: row from a pandas DataFrame that defines a property - row_count: row number of Excel file - excelfile: name of the original Excel file - - Raises: - BaseError: if the row contains invalid data - - Returns: - dict object of the property - """ - - # extract the elements that are necessary to build the property - name = row["name"] - supers = [s.strip() for s in row["super"].split(",")] - _object = row["object"] - labels = {lang: row[f"label_{lang}"] for lang in languages if row.get(f"label_{lang}")} - if not labels: - labels = {lang: row[lang] for lang in languages if row.get(lang)} - comments = {lang: row[f"comment_{lang}"] for lang in languages if row.get(f"comment_{lang}")} - gui_element = row["gui_element"] - gui_attributes = dict() - if row.get("hlist"): - gui_attributes["hlist"] = row["hlist"] - if row.get("gui_attributes"): - 
pairs = row["gui_attributes"].split(",") - for pair in pairs: - if pair.count(":") != 1: - raise BaseError( - f"Row {row_count} of Excel file {excelfile} contains invalid data in column 'gui_attributes'. " - "The expected format is 'attribute: value[, attribute: value]'." - ) - attr, val = [x.strip() for x in pair.split(":")] - if regex.search(r"^\d+\.\d+$", val): - val = float(val) - elif regex.search(r"^\d+$", val): - val = int(val) - gui_attributes[attr] = val - - # build the dict structure of this property - _property = {"name": name, "super": supers, "object": _object, "labels": labels} - if comments: - _property["comments"] = comments - _property["gui_element"] = gui_element - if gui_attributes: - _property["gui_attributes"] = gui_attributes - - return _property - - -def excel2properties( - excelfile: str, - path_to_output_file: Optional[str] = None, -) -> tuple[list[dict[str, Any]], bool]: - """ - Converts properties described in an Excel file into a "properties" section which can be inserted into a JSON - project file. - - Args: - excelfile: path to the Excel file containing the properties - path_to_output_file: if provided, the output is written into this JSON file - - Raises: - BaseError: if something went wrong - - Returns: - a tuple consisting of the "properties" section as Python list, - and the success status (True if everything went well) - """ - - # load file - try: - df: pd.DataFrame = pd.read_excel(excelfile) - except ValueError: - # Pandas relies on openpyxl to parse XLSX files. - # A strange behaviour of openpyxl prevents pandas from opening files with some formatting properties - # (unclear which formatting properties exactly). - # Apparently, the excel2json test files have one of the unsupported formatting properties. - # The following two lines of code help out. 
- # Credits: https://stackoverflow.com/a/70537454/14414188 - # pylint: disable-next=import-outside-toplevel - from unittest import mock - - p = mock.patch("openpyxl.styles.fonts.Font.family.max", new=100) - p.start() - df = pd.read_excel(excelfile) - p.stop() - df = prepare_dataframe( - df=df, - required_columns=["name"], - location_of_sheet=f"File '{excelfile}'", - ) - - # validation of input - required = ["super", "object", "gui_element"] - for index, row in df.iterrows(): - index = int(str(index)) # index is a label/index/hashable, but we need an int - for req in required: - if not check_notna(row[req]): - raise BaseError(f"'{excelfile}' has a missing value in row {index + 2}, column '{req}'") - if any(df.get(lang) is not None for lang in languages): - warnings.warn( - f"The file '{excelfile}' uses {languages} as column titles, which is deprecated. " - f"Please use {[f'label_{lang}' for lang in languages]}" - ) - if df.get("hlist"): - warnings.warn( - f"The file '{excelfile}' has a column 'hlist', which is deprecated. " - f"Please use the column 'gui_attributes' for the attribute 'hlist'." 
- ) - - # transform every row into a property - props: list[dict[str, Any]] = [] - for index, row in df.iterrows(): - props.append( - _row2prop( - row=row, - row_count=int(str(index)), # index is a label/index/hashable, but we need an int - excelfile=excelfile, - ) - ) - - # write final JSON file - _validate_properties(properties_list=props, excelfile=excelfile) - if path_to_output_file: - with open(file=path_to_output_file, mode="w", encoding="utf-8") as file: - json.dump(props, file, indent=4, ensure_ascii=False) - print('"properties" section was created successfully and written to file:', path_to_output_file) - - return props, True diff --git a/src/dsp_tools/utils/id_to_iri.py b/src/dsp_tools/utils/id_to_iri.py index 0708ea0da..41984b75d 100644 --- a/src/dsp_tools/utils/id_to_iri.py +++ b/src/dsp_tools/utils/id_to_iri.py @@ -2,14 +2,13 @@ from datetime import datetime from pathlib import Path -from lxml import etree import regex +from lxml import etree from dsp_tools.models.exceptions import UserError from dsp_tools.utils.logging import get_logger from dsp_tools.utils.xml_upload import parse_xml_file - logger = get_logger(__name__) diff --git a/src/dsp_tools/utils/project_create.py b/src/dsp_tools/utils/project_create.py index 068ff59ed..ce8f32f0f 100644 --- a/src/dsp_tools/utils/project_create.py +++ b/src/dsp_tools/utils/project_create.py @@ -15,7 +15,7 @@ from dsp_tools.models.propertyclass import PropertyClass from dsp_tools.models.resourceclass import ResourceClass from dsp_tools.models.user import User -from dsp_tools.utils.excel_to_json_lists import expand_lists_from_excel +from dsp_tools.utils.excel_to_json.lists import expand_lists_from_excel from dsp_tools.utils.logging import get_logger from dsp_tools.utils.project_create_lists import create_lists_on_server from dsp_tools.utils.project_validate import validate_project diff --git a/src/dsp_tools/utils/project_create_lists.py b/src/dsp_tools/utils/project_create_lists.py index 366abd7bf..d8ee459b8 
100644 --- a/src/dsp_tools/utils/project_create_lists.py +++ b/src/dsp_tools/utils/project_create_lists.py @@ -4,7 +4,7 @@ from dsp_tools.models.exceptions import BaseError, UserError from dsp_tools.models.listnode import ListNode from dsp_tools.models.project import Project -from dsp_tools.utils.excel_to_json_lists import expand_lists_from_excel +from dsp_tools.utils.excel_to_json.lists import expand_lists_from_excel from dsp_tools.utils.logging import get_logger from dsp_tools.utils.project_validate import validate_project from dsp_tools.utils.shared import login, parse_json_input, try_network_action diff --git a/src/dsp_tools/utils/project_validate.py b/src/dsp_tools/utils/project_validate.py index 84329448f..12eb92c41 100644 --- a/src/dsp_tools/utils/project_validate.py +++ b/src/dsp_tools/utils/project_validate.py @@ -10,7 +10,7 @@ import regex from dsp_tools.models.exceptions import BaseError -from dsp_tools.utils.excel_to_json_lists import expand_lists_from_excel +from dsp_tools.utils.excel_to_json.lists import expand_lists_from_excel def _check_for_duplicate_names(project_definition: dict[str, Any]) -> bool: diff --git a/src/dsp_tools/utils/shared.py b/src/dsp_tools/utils/shared.py index 302a54a6e..5ba54216d 100644 --- a/src/dsp_tools/utils/shared.py +++ b/src/dsp_tools/utils/shared.py @@ -12,9 +12,9 @@ import pandas as pd import regex +import requests from lxml import etree from requests import ReadTimeout, RequestException -import requests from urllib3.exceptions import ReadTimeoutError from dsp_tools.models.connection import Connection diff --git a/src/dsp_tools/utils/stack_handling.py b/src/dsp_tools/utils/stack_handling.py index 55d016f75..1ebd6c48e 100644 --- a/src/dsp_tools/utils/stack_handling.py +++ b/src/dsp_tools/utils/stack_handling.py @@ -1,8 +1,8 @@ -from dataclasses import dataclass import importlib.resources import shutil import subprocess import time +from dataclasses import dataclass from pathlib import Path from typing import Optional @@ 
-12,7 +12,6 @@ from dsp_tools.models.exceptions import UserError from dsp_tools.utils.logging import get_logger - from dsp_tools.utils.shared import http_call_with_retry logger = get_logger(__name__) diff --git a/src/dsp_tools/utils/xml_upload.py b/src/dsp_tools/utils/xml_upload.py index 9c101bc5e..b6d8cd669 100644 --- a/src/dsp_tools/utils/xml_upload.py +++ b/src/dsp_tools/utils/xml_upload.py @@ -15,9 +15,9 @@ from typing import Any, Optional, Union, cast from urllib.parse import quote_plus -from lxml import etree import pandas as pd import regex +from lxml import etree from dsp_tools.models.connection import Connection from dsp_tools.models.exceptions import BaseError, UserError diff --git a/test/e2e/test_cli.py b/test/e2e/test_cli.py index c6a32a2b4..51b9942d0 100644 --- a/test/e2e/test_cli.py +++ b/test/e2e/test_cli.py @@ -8,11 +8,11 @@ import copy import json -from pathlib import Path import shutil import subprocess -from typing import Any, Optional, cast import unittest +from pathlib import Path +from typing import Any, Optional, cast import jsonpath_ng import jsonpath_ng.ext diff --git a/test/unittests/test_excel2xml.py b/test/unittests/test_excel2xml.py index 145563ea9..292dc1c3d 100644 --- a/test/unittests/test_excel2xml.py +++ b/test/unittests/test_excel2xml.py @@ -5,11 +5,11 @@ from pathlib import Path from typing import Any, Callable, Optional, Sequence, Union -from lxml import etree import numpy as np import pandas as pd import pytest import regex +from lxml import etree from dsp_tools import excel2xml from dsp_tools.models.exceptions import BaseError diff --git a/test/unittests/test_excel_to_json/__init__.py b/test/unittests/test_excel_to_json/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/unittests/test_excel_to_json_lists.py b/test/unittests/test_excel_to_json/test_lists.py similarity index 99% rename from test/unittests/test_excel_to_json_lists.py rename to test/unittests/test_excel_to_json/test_lists.py index 
479745f1c..cd1e2e9ca 100644 --- a/test/unittests/test_excel_to_json_lists.py +++ b/test/unittests/test_excel_to_json/test_lists.py @@ -15,7 +15,7 @@ import regex from dsp_tools.models.exceptions import BaseError -from dsp_tools.utils import excel_to_json_lists as e2l +from dsp_tools.utils.excel_to_json import lists as e2l class TestExcelToJSONList(unittest.TestCase): diff --git a/test/unittests/test_excel_to_json/test_properties.py b/test/unittests/test_excel_to_json/test_properties.py new file mode 100644 index 000000000..b7ab2e83a --- /dev/null +++ b/test/unittests/test_excel_to_json/test_properties.py @@ -0,0 +1,592 @@ +"""unit tests for excel to properties""" + +# pylint: disable=missing-class-docstring,missing-function-docstring,protected-access, +# disable=wrong-import-order mypy: allow_untyped_calls + +import json +import os +import unittest +from typing import Any, cast + +import jsonpath_ng +import jsonpath_ng.ext +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +from dsp_tools.models.exceptions import BaseError, UserError +from dsp_tools.utils.excel_to_json import properties as e2j + + +class TestExcelToProperties(unittest.TestCase): + outfile = "testdata/tmp/_out_properties.json" + + @classmethod + def setUpClass(cls) -> None: + """Is executed once before the methods of this class are run""" + os.makedirs("testdata/tmp", exist_ok=True) + + @classmethod + def tearDownClass(cls) -> None: + """Is executed after the methods of this class have all run through""" + for file in os.listdir("testdata/tmp"): + os.remove("testdata/tmp/" + file) + os.rmdir("testdata/tmp") + + def test_excel2properties(self) -> None: + excelfile = "testdata/excel2json/excel2json_files/test-name (test_label)/properties.xlsx" + output_from_method, _ = e2j.excel2properties(excelfile, self.outfile) + + # define the expected values from the excel file + excel_names = [ + "correspondsToGenericAnthroponym", + "hasAnthroponym", + "hasGender", + 
"isDesignatedAs", + "hasTitle", + "hasStatus", + "hasLifeYearAmount", + "hasBirthDate", + "hasRepresentation", + "hasRemarks", + "hasTerminusPostQuem", + "hasGND", + "hasColor", + "hasDecimal", + "hasTime", + "hasInterval", + "hasBoolean", + "hasGeoname", + "partOfDocument", + "linkstoRegion", + "hasLinkToImage", + "hasLinkToResource", + "hasLinkToArchiveRepresentation", + "hasLinkToMovingImageRepesentation", + "hasLinkToAudioRepesentation", + ] + excel_supers = [ + ["hasLinkTo"], + ["hasValue", "dcterms:creator"], + ["hasValue"], + ["hasValue"], + ["hasLinkTo"], + ["hasValue"], + ["hasValue"], + ["hasValue"], + ["hasRepresentation"], + ["hasValue", "dcterms:description"], + ["hasValue"], + ["hasValue"], + ["hasColor"], + ["hasValue"], + ["hasValue"], + ["hasSequenceBounds"], + ["hasValue"], + ["hasValue"], + ["isPartOf"], + ["hasLinkTo"], + ["hasLinkTo"], + ["hasLinkTo"], + ["hasLinkTo"], + ["hasLinkTo"], + ["hasLinkTo"], + ] + excel_objects = [ + ":GenericAnthroponym", + "TextValue", + "ListValue", + "ListValue", + ":Titles", + "ListValue", + "IntValue", + "DateValue", + "Representation", + "TextValue", + "DateValue", + "UriValue", + "ColorValue", + "DecimalValue", + "TimeValue", + "IntervalValue", + "BooleanValue", + "GeonameValue", + ":Documents", + "Region", + "StillImageRepresentation", + "Resource", + "ArchiveRepresentation", + "MovingImageRepresentation", + "AudioRepresentation", + ] + + excel_labels = dict() + # there are also labels in other languages, but they are not tested + excel_labels["de"] = [ + "", + "only German", + "", + "", + "", + "", + "", + "", + "hat eine Multimediadatei", + "", + "", + "GND", + "Farbe", + "Dezimalzahl", + "Zeit", + "Zeitintervall", + "Bool'sche Variable", + "Link zu Geonames", + "ist Teil eines Dokuments", + "", + "", + "", + "", + "", + "", + ] + excel_labels["it"] = [ + "", + "", + "", + "only Italian", + "", + "", + "", + "", + "", + "", + "", + "GND", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + 
"", + "", + ] + + excel_comments = dict() + # there are also comments in other languages, but they are not tested + excel_comments["comment_fr"] = [ + "J'avais déjà examiné plusieurs propriétés quand, un jour, le notaire, qui me " + "donnait des indications nécessaires pour une de mes explorations, me dit :", + "Un étrange hasard m'a mis en possession de ce journal.", + "Je n'en sais rien du tout ; mais si vous voulez la voir, monsieur, voici les " + "indications précises pour la trouver.", + "Vous devrez arranger l'affaire avec le curé du village de --.\"", + "Un étrange hasard m'a mis en possession de ce journal.", + "", + "", + "only French", + "", + "", + "J'avais déjà examiné plusieurs propriétés quand, un jour, le notaire, qui me " + "donnait des indications nécessaires pour une de mes explorations, me dit :", + "Gemeinsame Normdatei", + "", + "Chiffre décimale", + "Temps", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ] + excel_comments["comment_it"] = [ + "Avevo già visto diverse proprietà quando un giorno il notaio,", + "Uno strano caso mi mise in possesso di questo diario.", + "Non ne so nulla; ma se volete vederla, signore, eccovi le indicazioni precise per trovarla.", + "Dovrete organizzare l'affare con il curato del villaggio di --\".", + "Uno strano caso mi mise in possesso di questo diario.", + "", + "", + "", + "", + "", + "Avevo già visto diverse proprietà quando un giorno il notaio,", + "Gemeinsame Normdatei", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ] + + excel_gui_elements = [ + "Searchbox", + "Richtext", + "List", + "Radio", + "Searchbox", + "List", + "Spinbox", + "Date", + "Searchbox", + "Textarea", + "Date", + "SimpleText", + "Colorpicker", + "Spinbox", + "TimeStamp", + "Interval", + "Checkbox", + "Geonames", + "Searchbox", + "Searchbox", + "Searchbox", + "Searchbox", + "Searchbox", + "Searchbox", + "Searchbox", + ] + + excel_gui_attributes_hasGender = {"hlist": "gender"} + 
excel_gui_attributes_hasGND = {"size": 100} + excel_gui_attributes_hasDecimal = {"min": 0.0, "max": 100.0} + + # read json file + with open(self.outfile, encoding="utf-8") as f: + output_from_file: list[dict[str, Any]] = json.load(f) + + # check that output from file and from method are equal + self.assertListEqual(output_from_file, output_from_method) + + # extract infos from json file + json_names = [match.value for match in jsonpath_ng.parse("$[*].name").find(output_from_file)] + json_supers = [match.value for match in jsonpath_ng.parse("$[*].super").find(output_from_file)] + json_objects = [match.value for match in jsonpath_ng.parse("$[*].object").find(output_from_file)] + + json_labels_all = [match.value for match in jsonpath_ng.parse("$[*].labels").find(output_from_file)] + json_labels: dict[str, list[str]] = dict() + for lang in ["de", "it"]: + json_labels[lang] = [label.get(lang, "").strip() for label in json_labels_all] + + json_comments: dict[str, list[str]] = dict() + for lang in ["fr", "it"]: + json_comments[f"comment_{lang}"] = [ + resource.get("comments", {}).get(lang, "").strip() for resource in output_from_file + ] + + json_gui_elements = [match.value for match in jsonpath_ng.parse("$[*].gui_element").find(output_from_file)] + + json_gui_attributes_hasGender = ( + jsonpath_ng.ext.parse("$[?name='hasGender'].gui_attributes").find(output_from_file)[0].value + ) + json_gui_attributes_hasGND = ( + jsonpath_ng.ext.parse("$[?name='hasGND'].gui_attributes").find(output_from_file)[0].value + ) + json_gui_attributes_hasDecimal = ( + jsonpath_ng.ext.parse("$[?name='hasDecimal'].gui_attributes").find(output_from_file)[0].value + ) + + # make checks + self.assertListEqual(excel_names, json_names) + self.assertListEqual(excel_supers, json_supers) + self.assertListEqual(excel_objects, json_objects) + self.assertDictEqual(excel_labels, json_labels) + self.assertDictEqual(excel_comments, json_comments) + self.assertListEqual(excel_gui_elements, json_gui_elements) + 
self.assertDictEqual(excel_gui_attributes_hasGND, json_gui_attributes_hasGND) + self.assertDictEqual(excel_gui_attributes_hasDecimal, json_gui_attributes_hasDecimal) + self.assertDictEqual(excel_gui_attributes_hasGender, json_gui_attributes_hasGender) + + def test_validate_properties(self) -> None: + # it is not possible to call the method to be tested directly. + # So let's make a reference to it, so that it can be found by the usage search + lambda x: e2j._validate_properties([], "file") # pylint: disable=expression-not-assigned,protected-access + + testcases = [ + ( + "testdata/invalid-testdata/excel2json/properties-invalid-super.xlsx", + "did not pass validation.\n" + "The problematic property is 'hasGeoname' in Excel row 3.\n" + "The problem is that the column 'super' has an invalid value: " + "'GeonameValue' is not valid under any of the given schemas", + ), + ( + "testdata/invalid-testdata/excel2json/properties-invalid-object.xlsx", + "did not pass validation.\n" + "The problematic property is 'hasBoolean' in Excel row 2.\n" + "The problem is that the column 'object' has an invalid value: " + "'hasValue' is not valid under any of the given schemas", + ), + ( + "testdata/invalid-testdata/excel2json/properties-invalid-gui_element.xlsx", + "did not pass validation.\n" + "The problematic property is 'hasInterval' in Excel row 4.\n" + r"The problem is that the column 'gui_element' has an invalid value: " + r"'Interval' was expected", + ), + ( + "testdata/invalid-testdata/excel2json/properties-invalid-gui_attribute.xlsx", + "did not pass validation.\n" + "The problematic property is 'hasInteger' in Excel row 4.\n" + r"The problem is that the column 'gui_attributes' has an invalid value: " + r"Additional properties are not allowed \('rows' was unexpected\)", + ), + ] + + for file, message in testcases: + with self.assertRaisesRegex(UserError, message): + e2j.excel2properties(file, self.outfile) + + def test__rename_deprecated_lang_cols(self) -> None: + original_df 
= pd.DataFrame( + {"en": [1, 2, 3], "de": [1, 2, 3], "fr": [1, 2, 3], "it": [1, 2, 3], "rm": [1, 2, 3]} + ) + expected_df = pd.DataFrame( + { + "label_en": [1, 2, 3], + "label_de": [1, 2, 3], + "label_fr": [1, 2, 3], + "label_it": [1, 2, 3], + "label_rm": [1, 2, 3], + } + ) + returned_df = e2j._rename_deprecated_lang_cols(df=original_df, excelfile="Test") + assert_frame_equal(original_df, returned_df) + returned_df = e2j._rename_deprecated_lang_cols(df=expected_df, excelfile="Test") + assert_frame_equal(original_df, returned_df) + + def test__do_property_excel_compliance(self) -> None: + original_df = pd.DataFrame( + { + "name": ["name_1", "name_2", "name_3", "name_4", "name_5", "name_6"], + "label_en": ["label_en_1", "label_en_2", pd.NA, pd.NA, "label_en_5", pd.NA], + "label_de": ["label_de_1", pd.NA, "label_de_3", pd.NA, pd.NA, pd.NA], + "label_fr": ["label_fr_1", pd.NA, pd.NA, "label_fr_4", pd.NA, pd.NA], + "label_it": ["label_it_1", pd.NA, pd.NA, pd.NA, "label_it_5", pd.NA], + "label_rm": ["label_rm_1", pd.NA, pd.NA, pd.NA, pd.NA, "label_rm_6"], + "comment_en": ["comment_en_1", "comment_en_2", pd.NA, pd.NA, "comment_en_5", pd.NA], + "comment_de": ["comment_de_1", pd.NA, "comment_de_3", pd.NA, pd.NA, pd.NA], + "comment_fr": ["comment_fr_1", pd.NA, pd.NA, "comment_fr_4", pd.NA, pd.NA], + "comment_it": ["comment_it_1", pd.NA, pd.NA, pd.NA, "comment_it_5", pd.NA], + "comment_rm": ["comment_rm_1", pd.NA, pd.NA, pd.NA, pd.NA, pd.NA], + "super": ["super_1", "super_2", "super_3", "super_4.1, super_4.2, super_4.3", "super_5", "super_6"], + "subject": ["subject_1", "subject_2", "subject_3", "subject_4", "subject_5", "subject_6"], + "object": ["object_1", "object_2", "object_3", "object_4", "object_5", "object_6"], + "gui_element": ["Simple", "Searchbox", "Date", "Searchbox", "List", "Searchbox"], + "gui_attributes": ["size: 32, maxlength: 128", pd.NA, pd.NA, pd.NA, "hlist: languages", pd.NA], + } + ) + e2j._do_property_excel_compliance(df=original_df, excelfile="Test") + 
+ original_df = pd.DataFrame( + { + "name": ["name_1", "name_2", "name_3", "name_4", "name_5", "name_6", "name_7", pd.NA], + "label_en": ["label_en_1", pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA], + "label_de": [pd.NA, pd.NA, "label_de_3", pd.NA, pd.NA, pd.NA, pd.NA, "label_de_8"], + "label_fr": [pd.NA, pd.NA, pd.NA, "label_fr_4", pd.NA, pd.NA, pd.NA, pd.NA], + "label_it": [pd.NA, pd.NA, pd.NA, pd.NA, "label_it_5", pd.NA, pd.NA, pd.NA], + "label_rm": [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, "label_rm_6", pd.NA, pd.NA], + "comment_en": ["comment_en_1", pd.NA, "comment_en_3", pd.NA, pd.NA, pd.NA, pd.NA, pd.NA], + "comment_de": [pd.NA, pd.NA, pd.NA, "comment_de_4", pd.NA, pd.NA, pd.NA, pd.NA], + "comment_fr": [pd.NA, pd.NA, pd.NA, pd.NA, "comment_fr_5", pd.NA, pd.NA, pd.NA], + "comment_it": [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, "comment_it_6", pd.NA, pd.NA], + "comment_rm": [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, "comment_rm_7", pd.NA], + "super": [ + pd.NA, + "super_2", + pd.NA, + "super_4.1, super_4.2, super_4.3", + "super_5", + "super_6", + "super_7", + pd.NA, + ], + "subject": [ + "subject_1", + "subject_2", + "subject_3", + "subject_4", + "subject_5", + "subject_6", + "subject_7", + pd.NA, + ], + "object": ["object_1", "object_2", "object_3", pd.NA, "object_5", "object_6", "object_7", pd.NA], + "gui_element": ["Simple", "Searchbox", "Date", "Date", pd.NA, "List", pd.NA, pd.NA], + "gui_attributes": ["size: 32, maxlength: 128", pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA], + } + ) + with self.assertRaises(BaseError) as context: + e2j._do_property_excel_compliance(df=original_df, excelfile="Test") + self.assertEqual( + context, + "The file '{excel_filename}' is missing values in some rows. 
See below for more information:\n" + "{error_str}", + ) + + def test__rename_deprecated_hlist(self) -> None: + original_df = pd.DataFrame({"hlist": [pd.NA, pd.NA, "languages"]}) + expected_df = pd.DataFrame({"gui_attributes": [pd.NA, pd.NA, "hlist:languages"]}) + returned_df = e2j._rename_deprecated_hlist(df=original_df, excelfile="Test") + assert_frame_equal(expected_df, returned_df) + + original_df = pd.DataFrame( + {"hlist": [pd.NA, pd.NA, "languages"], "gui_attributes": [pd.NA, "attribute_1", pd.NA]} + ) + expected_df = pd.DataFrame({"gui_attributes": [pd.NA, "attribute_1", "hlist:languages"]}) + returned_df = e2j._rename_deprecated_hlist(df=original_df, excelfile="Test") + assert_frame_equal(expected_df, returned_df) + + def test__unpack_gui_attributes(self) -> None: + test_dict = { + "maxlength:1, size:32": {"maxlength": "1", "size": "32"}, + "hlist: languages": {"hlist": "languages"}, + } + for original, expected in test_dict.items(): + self.assertDictEqual(e2j._unpack_gui_attributes(attribute_str=original), expected) + + def test__search_convert_numbers(self) -> None: + test_dict = {"1": 1, "string": "string", "1.453": 1.453, "sdf.asdf": "sdf.asdf"} + for original, expected in test_dict.items(): + self.assertEqual(e2j._search_convert_numbers(value_str=original), expected) + + def test__get_gui_attribute(self) -> None: + original_df = pd.DataFrame( + {"gui_attributes": [pd.NA, "max:1.4 / min:1.2", "hlist:", "234345", "hlist: languages,"]} + ) + self.assertIsNone(e2j._get_gui_attribute(df_row=original_df.loc[0, :], row_num=2, excelfile="Test")) + with self.assertRaises(UserError) as context: + e2j._get_gui_attribute(df_row=original_df.loc[1, :], row_num=3, excelfile="Test") + self.assertEqual( + "Row {row_num} of Excel file {excel_filename} contains invalid data in column 'gui_attributes'. 
" + "The expected format is '[attribute: value, attribute: value]'.", + context, + ) + with self.assertRaises(UserError) as context: + e2j._get_gui_attribute(df_row=original_df.loc[2, :], row_num=4, excelfile="Test") + self.assertEqual( + "Row {row_num} of Excel file {excel_filename} contains invalid data in column 'gui_attributes'. " + "The expected format is '[attribute: value, attribute: value]'.", + context, + ) + with self.assertRaises(UserError) as context: + e2j._get_gui_attribute(df_row=original_df.loc[3, :], row_num=5, excelfile="Test") + self.assertEqual( + "Row {row_num} of Excel file {excel_filename} contains invalid data in column 'gui_attributes'. " + "The expected format is '[attribute: value, attribute: value]'.", + context, + ) + expected_dict = {"hlist": "languages"} + returned_dict = e2j._get_gui_attribute(df_row=original_df.loc[4, :], row_num=6, excelfile="Test") + self.assertDictEqual(expected_dict, cast(dict[str, str], returned_dict)) + + def test__check_compliance_gui_attributes(self) -> None: + original_df = pd.DataFrame( + { + "gui_element": ["Spinbox", "List", "Searchbox", "Date", "Geonames", "Richtext", "TimeStamp"], + "gui_attributes": ["Spinbox_attr", "List_attr", pd.NA, pd.NA, pd.NA, pd.NA, pd.NA], + } + ) + returned_value = e2j._check_compliance_gui_attributes(df=original_df) + self.assertIsNone(cast(None, returned_value)) + original_df = pd.DataFrame( + { + "gui_element": ["Spinbox", "List", "Searchbox", "Date", "Geonames", "Richtext", "TimeStamp"], + "gui_attributes": ["Spinbox_attr", pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, "TimeStamp_attr"], + } + ) + expected_dict = {"wrong gui_attributes": [False, True, False, False, False, False, True]} + returned_dict = e2j._check_compliance_gui_attributes(df=original_df) + returned_dict = cast(dict[str, list[pd.Series]], returned_dict) + casted_dict: dict[str, Any] = {"wrong gui_attributes": list(returned_dict["wrong gui_attributes"])} + self.assertDictEqual(expected_dict, casted_dict) + + def 
test__row2prop(self) -> None: + original_df = pd.DataFrame( + { + "name": ["name_1", "name_2", "name_3"], + "label_en": ["label_en_1", "label_en_2", pd.NA], + "label_de": ["label_de_1", pd.NA, "label_de_3"], + "label_fr": ["label_fr_1", pd.NA, pd.NA], + "label_it": ["label_it_1", pd.NA, pd.NA], + "label_rm": ["label_rm_1", pd.NA, pd.NA], + "comment_en": ["comment_en_1", "comment_en_2", pd.NA], + "comment_de": ["comment_de_1", pd.NA, "comment_de_3"], + "comment_fr": ["comment_fr_1", pd.NA, pd.NA], + "comment_it": ["comment_it_1", pd.NA, pd.NA], + "comment_rm": ["comment_rm_1", pd.NA, pd.NA], + "super": ["super_1", "super_2.1, super_2.2", "super_3"], + "subject": ["subject_1", "subject_2", pd.NA], + "object": ["object_1", "object_2", "object_3"], + "gui_element": ["Simple", "Date", "List"], + "gui_attributes": ["size: 32, maxlength: 128", pd.NA, "hlist: languages"], + } + ) + returned_dict = e2j._row2prop(df_row=original_df.loc[0, :], row_num=0, excelfile="Test") + expected_dict = { + "name": "name_1", + "object": "object_1", + "gui_element": "Simple", + "labels": { + "en": "label_en_1", + "de": "label_de_1", + "fr": "label_fr_1", + "it": "label_it_1", + "rm": "label_rm_1", + }, + "super": ["super_1"], + "comments": { + "en": "comment_en_1", + "de": "comment_de_1", + "fr": "comment_fr_1", + "it": "comment_it_1", + "rm": "comment_rm_1", + }, + "gui_attributes": {"size": 32, "maxlength": 128}, + } + self.assertDictEqual(expected_dict, returned_dict) + + returned_dict = e2j._row2prop(df_row=original_df.loc[1, :], row_num=1, excelfile="Test") + expected_dict = { + "comments": {"en": "comment_en_2"}, + "gui_element": "Date", + "labels": {"en": "label_en_2"}, + "name": "name_2", + "object": "object_2", + "super": ["super_2.1", "super_2.2"], + } + self.assertDictEqual(expected_dict, returned_dict) + + returned_dict = e2j._row2prop(df_row=original_df.loc[2, :], row_num=2, excelfile="Test") + expected_dict = { + "comments": {"de": "comment_de_3"}, + "gui_attributes": 
{"hlist": "languages"}, + "gui_element": "List", + "labels": {"de": "label_de_3"}, + "name": "name_3", + "object": "object_3", + "super": ["super_3"], + } + self.assertDictEqual(expected_dict, returned_dict) + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/test/unittests/test_excel_to_json_resources.py b/test/unittests/test_excel_to_json/test_resources.py similarity index 99% rename from test/unittests/test_excel_to_json_resources.py rename to test/unittests/test_excel_to_json/test_resources.py index 1910c6beb..20c097b36 100644 --- a/test/unittests/test_excel_to_json_resources.py +++ b/test/unittests/test_excel_to_json/test_resources.py @@ -12,7 +12,7 @@ import pytest from dsp_tools.models.exceptions import BaseError -from dsp_tools.utils import excel_to_json_resources as e2j +from dsp_tools.utils.excel_to_json import resources as e2j class TestExcelToResource(unittest.TestCase): diff --git a/test/unittests/test_excel_to_json/test_utils.py b/test/unittests/test_excel_to_json/test_utils.py new file mode 100644 index 000000000..aaebca4df --- /dev/null +++ b/test/unittests/test_excel_to_json/test_utils.py @@ -0,0 +1,185 @@ +# pylint: disable=f-string-without-interpolation,missing-class-docstring,missing-function-docstring +# mypy: allow_untyped_calls + +import unittest +from typing import cast + +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal, assert_series_equal + +import dsp_tools.utils.excel_to_json.utils as utl +from dsp_tools.models.exceptions import BaseError + + +class TestUtils(unittest.TestCase): + def test_clean_data_frame(self) -> None: + original_df = pd.DataFrame( + { + " TitLE of Column 1 ": [1.54, " 0-1 ", "1-n ", " "], + " Title of Column 2 ": ["1", 1, " t ext ", "text"], + "Title of Column 3": ["", pd.NA, None, "text"], + } + ) + expected_df = pd.DataFrame( + { + "title of column 1": ["1.54", "0-1", "1-n", pd.NA], + "title of column 2": ["1", "1", "t ext", "text"], + "title of column 3": [pd.NA, 
pd.NA, pd.NA, "text"], + } + ) + returned_df = utl.clean_data_frame(df=original_df) + assert_frame_equal(expected_df, returned_df) + + def test_check_contains_required_columns_else_raise_error(self) -> None: + original_df = pd.DataFrame(columns=["col1", "col2", "col3", "extra_col"]) + required = {"col1", "col2", "col3"} + utl.check_contains_required_columns_else_raise_error(df=original_df, required_columns=required) + required = {"col1", "col2", "col3", "col4"} + with self.assertRaises(BaseError) as context: + utl.check_contains_required_columns_else_raise_error(df=original_df, required_columns=required) + self.assertEqual( + context, + "The following columns are missing in the excel: " + "{required_columns.difference(set(check_df.columns))}", + ) + + def test_check_column_for_duplicate_else_raise_error(self) -> None: + original_df = pd.DataFrame( + { + "col_1": ["1.54", "0-1", "1-n", "0-1", "1.54"], + "col_2": ["1.54", "0-1", "1-n", "text", "neu"], + } + ) + utl.check_column_for_duplicate_else_raise_error(df=original_df, to_check_column="col_2") + with self.assertRaises(BaseError) as context: + utl.check_column_for_duplicate_else_raise_error(df=original_df, to_check_column="col_1") + self.assertEqual( + context, + "The column '{duplicate_column}' may not contain any duplicate values. 
" + "The following values appeared multiple times '{duplicate_values}'.", + ) + + def test_check_required_values(self) -> None: + original_df = pd.DataFrame( + { + "col_1": ["1.54", "0-1", "1-n", pd.NA], + "col_2": ["1", "1", pd.NA, "text"], + "col_3": ["1", "1", "1", "text"], + } + ) + expected_dict = {"col_1": [False, False, False, True]} + returned_dict = utl.check_required_values(df=original_df, required_values_columns=["col_1", "col_3"]) + self.assertListEqual(list(expected_dict.keys()), list(returned_dict.keys())) + for key, expected_list in expected_dict.items(): + self.assertListEqual(list(returned_dict[key]), expected_list) + + def test_turn_bool_array_into_index_numbers(self) -> None: + original_series = pd.Series([False, True, False, True]) + expected_list = [1, 3] + returned_list = utl.turn_bool_array_into_index_numbers(series=original_series, true_remains=True) + self.assertListEqual(expected_list, returned_list) + expected_list = [0, 2] + returned_list = utl.turn_bool_array_into_index_numbers(series=original_series, true_remains=False) + self.assertListEqual(expected_list, returned_list) + + def test_get_wrong_row_numbers(self) -> None: + original_dict = { + "col_1": pd.Series([False, True, False, True]), + "col_2": pd.Series([False, False, True, False]), + } + expected_dict = {"col_1": [3, 5], "col_2": [4]} + returned_dict = utl.get_wrong_row_numbers(wrong_row_dict=original_dict, true_remains=True) + self.assertDictEqual(expected_dict, returned_dict) + + def test_update_dict_if_not_value_none(self) -> None: + original_dict = {0: 0} + original_update_dict = {1: 1, 2: 2, 3: None, 4: pd.NA, 5: "5"} + expected_dict = {0: 0, 1: 1, 2: 2, 5: "5"} + returned_dict = utl.update_dict_if_not_value_none( + additional_dict=original_update_dict, to_update_dict=original_dict + ) + self.assertDictEqual(expected_dict, returned_dict) + + def test_find_one_full_cell_in_cols(self) -> None: + required_cols = ["label_en", "label_de", "label_fr", "label_it", "label_rm"] + 
original_df = pd.DataFrame( + { + "label_en": [1, pd.NA, pd.NA, 4], + "label_de": [1, pd.NA, 3, 4], + "label_fr": [1, pd.NA, 3, pd.NA], + "label_it": [1, pd.NA, 3, 4], + "label_rm": [pd.NA, pd.NA, 3, 4], + } + ) + expected_array = pd.Series([False, True, False, False]) + returned_array = utl.find_one_full_cell_in_cols(df=original_df, required_columns=required_cols) + assert_series_equal(expected_array, returned_array) + original_df = pd.DataFrame( + { + "label_en": [1, 2, 3, 4], + "label_de": [1, pd.NA, 3, 4], + "label_fr": [1, pd.NA, 3, pd.NA], + "label_it": [1, pd.NA, 3, 4], + "label_rm": [pd.NA, pd.NA, 3, 4], + } + ) + returned_array = utl.find_one_full_cell_in_cols(df=original_df, required_columns=required_cols) + self.assertIsNone(returned_array) + + def test_col_must_or_not_empty_based_on_other_col(self) -> None: + original_df = pd.DataFrame({"substring": ["1", "2", "3", "4", "5", "6"], "check": [1, pd.NA, 3, 4, pd.NA, 6]}) + returned_value = utl.col_must_or_not_empty_based_on_other_col( + df=original_df, + substring_list=["1", "3", "6"], + substring_colname="substring", + check_empty_colname="check", + must_have_value=True, + ) + self.assertIsNone(returned_value) + expected_series = pd.Series([True, False, False, False, False, False]) + returned_series = utl.col_must_or_not_empty_based_on_other_col( + df=original_df, + substring_list=["1", "2"], + substring_colname="substring", + check_empty_colname="check", + must_have_value=False, + ) + assert_series_equal(expected_series, returned_series) + + def test__get_labels(self) -> None: + original_df = pd.DataFrame( + { + "label_en": ["text_en", "text_en"], + "label_de": ["text_de", pd.NA], + "label_fr": ["text_fr", pd.NA], + "label_it": ["text_it", pd.NA], + "label_rm": ["text_rm", pd.NA], + } + ) + expected_dict = {"de": "text_de", "en": "text_en", "fr": "text_fr", "it": "text_it", "rm": "text_rm"} + returned_dict = utl.get_labels(original_df.loc[0, :]) + self.assertDictEqual(expected_dict, returned_dict) + 
expected_dict = {"en": "text_en"} + returned_dict = utl.get_labels(original_df.loc[1, :]) + self.assertDictEqual(expected_dict, returned_dict) + + def test_get_comments(self) -> None: + original_df = pd.DataFrame( + { + "comment_en": ["text_en", pd.NA], + "comment_de": ["text_de", pd.NA], + "comment_fr": ["text_fr", pd.NA], + "comment_it": ["text_it", pd.NA], + "comment_rm": ["text_rm", pd.NA], + } + ) + expected_dict = {"de": "text_de", "en": "text_en", "fr": "text_fr", "it": "text_it", "rm": "text_rm"} + returned_dict = utl.get_comments(original_df.loc[0, :]) + self.assertDictEqual(expected_dict, cast(dict[str, str], returned_dict)) + returned_none = utl.get_comments(original_df.loc[1, :]) + self.assertIsNone(cast(None, returned_none)) + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/test/unittests/test_excel_to_json_properties.py b/test/unittests/test_excel_to_json_properties.py deleted file mode 100644 index 2985eac5c..000000000 --- a/test/unittests/test_excel_to_json_properties.py +++ /dev/null @@ -1,358 +0,0 @@ -"""unit tests for excel to properties""" - -# pylint: disable=missing-class-docstring,missing-function-docstring - -import json -import os -import unittest -from typing import Any - -import jsonpath_ng -import jsonpath_ng.ext -import pytest - -from dsp_tools.models.exceptions import BaseError -from dsp_tools.utils import excel_to_json_properties as e2j - - -class TestExcelToProperties(unittest.TestCase): - outfile = "testdata/tmp/_out_properties.json" - - @classmethod - def setUpClass(cls) -> None: - """Is executed once before the methods of this class are run""" - os.makedirs("testdata/tmp", exist_ok=True) - - @classmethod - def tearDownClass(cls) -> None: - """Is executed after the methods of this class have all run through""" - for file in os.listdir("testdata/tmp"): - os.remove("testdata/tmp/" + file) - os.rmdir("testdata/tmp") - - def test_excel2properties(self) -> None: - excelfile = 
"testdata/excel2json/excel2json_files/test-name (test_label)/properties.xlsx" - output_from_method, _ = e2j.excel2properties(excelfile, self.outfile) - - # define the expected values from the excel file - excel_names = [ - "correspondsToGenericAnthroponym", - "hasAnthroponym", - "hasGender", - "isDesignatedAs", - "hasTitle", - "hasStatus", - "hasLifeYearAmount", - "hasBirthDate", - "hasRepresentation", - "hasRemarks", - "hasTerminusPostQuem", - "hasGND", - "hasColor", - "hasDecimal", - "hasTime", - "hasInterval", - "hasBoolean", - "hasGeoname", - "partOfDocument", - "linkstoRegion", - "hasLinkToImage", - "hasLinkToResource", - "hasLinkToArchiveRepresentation", - "hasLinkToMovingImageRepesentation", - "hasLinkToAudioRepesentation", - ] - excel_supers = [ - ["hasLinkTo"], - ["hasValue", "dcterms:creator"], - ["hasValue"], - ["hasValue"], - ["hasLinkTo"], - ["hasValue"], - ["hasValue"], - ["hasValue"], - ["hasRepresentation"], - ["hasValue", "dcterms:description"], - ["hasValue"], - ["hasValue"], - ["hasColor"], - ["hasValue"], - ["hasValue"], - ["hasSequenceBounds"], - ["hasValue"], - ["hasValue"], - ["isPartOf"], - ["hasLinkTo"], - ["hasLinkTo"], - ["hasLinkTo"], - ["hasLinkTo"], - ["hasLinkTo"], - ["hasLinkTo"], - ] - excel_objects = [ - ":GenericAnthroponym", - "TextValue", - "ListValue", - "ListValue", - ":Titles", - "ListValue", - "IntValue", - "DateValue", - "Representation", - "TextValue", - "DateValue", - "UriValue", - "ColorValue", - "DecimalValue", - "TimeValue", - "IntervalValue", - "BooleanValue", - "GeonameValue", - ":Documents", - "Region", - "StillImageRepresentation", - "Resource", - "ArchiveRepresentation", - "MovingImageRepresentation", - "AudioRepresentation", - ] - - excel_labels = dict() - # there are also labels in other languages, but they are not tested - excel_labels["de"] = [ - "", - "only German", - "", - "", - "", - "", - "", - "", - "hat eine Multimediadatei", - "", - "", - "GND", - "Farbe", - "Dezimalzahl", - "Zeit", - "Zeitintervall", - 
"Bool'sche Variable", - "Link zu Geonames", - "ist Teil eines Dokuments", - "", - "", - "", - "", - "", - "", - ] - excel_labels["it"] = [ - "", - "", - "", - "only Italian", - "", - "", - "", - "", - "", - "", - "", - "GND", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - ] - - excel_comments = dict() - # there are also comments in other languages, but they are not tested - excel_comments["comment_fr"] = [ - "J'avais déjà examiné plusieurs propriétés quand, un jour, le notaire, qui me " - "donnait des indications nécessaires pour une de mes explorations, me dit :", - "Un étrange hasard m'a mis en possession de ce journal.", - "Je n'en sais rien du tout ; mais si vous voulez la voir, monsieur, voici les " - "indications précises pour la trouver.", - "Vous devrez arranger l'affaire avec le curé du village de --.\"", - "Un étrange hasard m'a mis en possession de ce journal.", - "", - "", - "only French", - "", - "", - "J'avais déjà examiné plusieurs propriétés quand, un jour, le notaire, qui me " - "donnait des indications nécessaires pour une de mes explorations, me dit :", - "Gemeinsame Normdatei", - "", - "Chiffre décimale", - "Temps", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - ] - excel_comments["comment_it"] = [ - "Avevo già visto diverse proprietà quando un giorno il notaio,", - "Uno strano caso mi mise in possesso di questo diario.", - "Non ne so nulla; ma se volete vederla, signore, eccovi le indicazioni precise per trovarla.", - "Dovrete organizzare l'affare con il curato del villaggio di --\".", - "Uno strano caso mi mise in possesso di questo diario.", - "", - "", - "", - "", - "", - "Avevo già visto diverse proprietà quando un giorno il notaio,", - "Gemeinsame Normdatei", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - ] - - excel_gui_elements = [ - "Searchbox", - "Richtext", - "List", - "Radio", - "Searchbox", - "List", - "Spinbox", - "Date", - "Searchbox", - "Textarea", - 
"Date", - "SimpleText", - "Colorpicker", - "Spinbox", - "TimeStamp", - "Interval", - "Checkbox", - "Geonames", - "Searchbox", - "Searchbox", - "Searchbox", - "Searchbox", - "Searchbox", - "Searchbox", - "Searchbox", - ] - - excel_gui_attributes_hasGender = {"hlist": "gender"} - excel_gui_attributes_hasGND = {"size": 100} - excel_gui_attributes_hasDecimal = {"min": 0.0, "max": 100.0} - - # read json file - with open(self.outfile, encoding="utf-8") as f: - output_from_file: list[dict[str, Any]] = json.load(f) - - # check that output from file and from method are equal - self.assertListEqual(output_from_file, output_from_method) - - # extract infos from json file - json_names = [match.value for match in jsonpath_ng.parse("$[*].name").find(output_from_file)] - json_supers = [match.value for match in jsonpath_ng.parse("$[*].super").find(output_from_file)] - json_objects = [match.value for match in jsonpath_ng.parse("$[*].object").find(output_from_file)] - - json_labels_all = [match.value for match in jsonpath_ng.parse("$[*].labels").find(output_from_file)] - json_labels: dict[str, list[str]] = dict() - for lang in ["de", "it"]: - json_labels[lang] = [label.get(lang, "").strip() for label in json_labels_all] - - json_comments: dict[str, list[str]] = dict() - for lang in ["fr", "it"]: - json_comments[f"comment_{lang}"] = [ - resource.get("comments", {}).get(lang, "").strip() for resource in output_from_file - ] - - json_gui_elements = [match.value for match in jsonpath_ng.parse("$[*].gui_element").find(output_from_file)] - - json_gui_attributes_hasGender = ( - jsonpath_ng.ext.parse("$[?name='hasGender'].gui_attributes").find(output_from_file)[0].value - ) - json_gui_attributes_hasGND = ( - jsonpath_ng.ext.parse("$[?name='hasGND'].gui_attributes").find(output_from_file)[0].value - ) - json_gui_attributes_hasDecimal = ( - jsonpath_ng.ext.parse("$[?name='hasDecimal'].gui_attributes").find(output_from_file)[0].value - ) - - # make checks - self.assertListEqual(excel_names, 
json_names) - self.assertListEqual(excel_supers, json_supers) - self.assertListEqual(excel_objects, json_objects) - self.assertDictEqual(excel_labels, json_labels) - self.assertDictEqual(excel_comments, json_comments) - self.assertListEqual(excel_gui_elements, json_gui_elements) - self.assertDictEqual(excel_gui_attributes_hasGND, json_gui_attributes_hasGND) - self.assertDictEqual(excel_gui_attributes_hasDecimal, json_gui_attributes_hasDecimal) - self.assertDictEqual(excel_gui_attributes_hasGender, json_gui_attributes_hasGender) - - def test_validate_properties(self) -> None: - # it is not possible to call the method to be tested directly. - # So let's make a reference to it, so that it can be found by the usage search - lambda x: e2j._validate_properties([], "file") # pylint: disable=expression-not-assigned,protected-access - - testcases = [ - ( - "testdata/invalid-testdata/excel2json/properties-invalid-super.xlsx", - "did not pass validation. The problematic property is 'hasGeoname' in Excel row 3. " - "The problem is that the column 'super' has an invalid value: " - "'GeonameValue' is not valid under any of the given schemas", - ), - ( - "testdata/invalid-testdata/excel2json/properties-invalid-object.xlsx", - "did not pass validation. The problematic property is 'hasBoolean' in Excel row 2. " - "The problem is that the column 'object' has an invalid value: " - "'hasValue' is not valid under any of the given schemas", - ), - ( - "testdata/invalid-testdata/excel2json/properties-invalid-gui_element.xlsx", - "did not pass validation. The problematic property is 'hasInterval' in Excel row 4. " - r"The problem is that the column 'gui_element' has an invalid value: " - r"'Interval' was expected", - ), - ( - "testdata/invalid-testdata/excel2json/properties-invalid-gui_attribute.xlsx", - "did not pass validation. The problematic property is 'hasInteger' in Excel row 4. 
" - r"The problem is that the column 'gui_attributes' has an invalid value: " - r"Additional properties are not allowed \('rows' was unexpected\)", - ), - ( - "testdata/invalid-testdata/excel2json/properties-duplicate-name.xlsx", - "Property names must be unique inside every ontology, but '.+' contains duplicates:\n" - r" - Row 3: hasGender\n - Row 4: hasGender\n - Row 5: isDesignatedAs\n - Row 6: isDesignatedAs", - ), - ] - - for file, message in testcases: - with self.assertRaisesRegex(BaseError, message): - e2j.excel2properties(file, self.outfile) - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/test/unittests/test_id_to_iri.py b/test/unittests/test_id_to_iri.py index 545051ae5..dc9387d33 100644 --- a/test/unittests/test_id_to_iri.py +++ b/test/unittests/test_id_to_iri.py @@ -1,8 +1,8 @@ # pylint: disable=missing-class-docstring,missing-function-docstring -from pathlib import Path import shutil import unittest +from pathlib import Path import pytest import regex diff --git a/test/unittests/test_shared.py b/test/unittests/test_shared.py index f1466e6e8..2326ad6d2 100644 --- a/test/unittests/test_shared.py +++ b/test/unittests/test_shared.py @@ -100,9 +100,9 @@ def test_validate_xml_tags_in_text_properties(self) -> None: def test_prepare_dataframe(self) -> None: original_df = pd.DataFrame( { - " TitLE of Column 1 ": ["1", " 0-1 ", "1-n ", pd.NA, " ", " ", "", " 0-n ", np.nan], + " TitLE of Column 1 ": ["1", " 0-1 ", "1-n ", pd.NA, " ", " ", "", " 0-n ", pd.NA], " Title of Column 2 ": [None, "1", 1, "text", "text", "text", "text", "text", "text"], - "Title of Column 3": ["", pd.NA, None, "text", "text", "text", "text", np.nan, "text"], + "Title of Column 3": ["", pd.NA, None, "text", "text", "text", "text", pd.NA, "text"], } ) expected_df = pd.DataFrame( @@ -115,16 +115,14 @@ def test_prepare_dataframe(self) -> None: returned_df = shared.prepare_dataframe( df=original_df, required_columns=[" TitLE of Column 1 ", " Title of Column 2 "], 
location_of_sheet="" ) - for expected, returned in zip(expected_df.iterrows(), returned_df.iterrows()): - i, expected_row = expected - _, returned_row = returned + for (i, expected_row), (_, returned_row) in zip(expected_df.iterrows(), returned_df.iterrows()): self.assertListEqual(list(expected_row), list(returned_row), msg=f"Failed in row {i}") def test_check_notna(self) -> None: na_values = [ None, pd.NA, - np.nan, + pd.NA, "", " ", "-", diff --git a/test/unittests/test_xmlupload.py b/test/unittests/test_xmlupload.py index d7c149580..75be9eb5c 100644 --- a/test/unittests/test_xmlupload.py +++ b/test/unittests/test_xmlupload.py @@ -13,9 +13,9 @@ from dsp_tools.utils.xml_upload import ( _convert_ark_v0_to_resource_iri, _determine_save_location_of_diagnostic_info, - parse_xml_file, _remove_circular_references, _transform_server_url_to_foldername, + parse_xml_file, ) diff --git a/testdata/invalid-testdata/excel2json/properties-invalid-super.xlsx b/testdata/invalid-testdata/excel2json/properties-invalid-super.xlsx index 05e57d864..47a2bf43e 100644 Binary files a/testdata/invalid-testdata/excel2json/properties-invalid-super.xlsx and b/testdata/invalid-testdata/excel2json/properties-invalid-super.xlsx differ