Skip to content

Commit

Permalink
chore(excel2json-lists): fix two ruff PLW0603 (#778)
Browse files Browse the repository at this point in the history
  • Loading branch information
Nora-Olivia-Ammann committed Jan 31, 2024
1 parent cf491e9 commit 5f53c38
Show file tree
Hide file tree
Showing 9 changed files with 95 additions and 66 deletions.
136 changes: 75 additions & 61 deletions src/dsp_tools/commands/excel2json/lists.py
@@ -1,6 +1,7 @@
"""This module handles all the operations which are used for the creation of JSON lists from Excel files."""
import importlib.resources
import json
from copy import deepcopy
from pathlib import Path
from typing import Any, Optional, Union

Expand All @@ -10,16 +11,11 @@
from openpyxl.cell import Cell
from openpyxl.worksheet.worksheet import Worksheet

from dsp_tools.commands.excel2json.input_error import MoreThanOneSheetProblem
from dsp_tools.commands.excel2json.models.input_error import MoreThanOneSheetProblem
from dsp_tools.commands.excel2json.models.list_node_name import ListNodeNames
from dsp_tools.models.exceptions import BaseError, InputError, UserError
from dsp_tools.utils.shared import simplify_name

list_of_lists_of_previous_cell_values: list[list[str]] = []
"""Module level variable used to ensure that there are no duplicate node names"""

list_of_previous_node_names: list[str] = []
"""Module level variable used to ensure that there are no duplicate node names"""


def expand_lists_from_excel(
lists_section: list[dict[str, Union[str, dict[str, Any]]]],
Expand All @@ -41,37 +37,42 @@ def expand_lists_from_excel(
Returns:
the same "lists" section, but without references to Excel files
"""
new_lists = []
new_lists: list[dict[str, Any]] = []
for _list in lists_section:
# case 1: this list is a JSON list: return it as it is
if "folder" not in _list["nodes"]:
new_lists.append(_list)
continue

# case 2: this is a reference to a folder with Excel files
foldername = _list["nodes"]["folder"] # type: ignore[index] # types are too complex to annotate them correctly
excel_file_paths = _extract_excel_file_paths(foldername)
try:
returned_lists_section = _make_json_lists_from_excel(excel_file_paths, verbose=False)
# we only need the "nodes" section of the first element of the returned "lists" section. This "nodes"
# section needs to replace the Excel folder reference. But the rest of the user-defined list element
# needs to stay intact, e.g. the labels and comments.
_list["nodes"] = returned_lists_section[0]["nodes"]
new_lists.append(_list)
print(
f" The list '{_list['name']}' contains a reference to the folder '{foldername}'. The Excel "
f"files therein have been temporarily expanded into the 'lists' section of your project."
)
except BaseError as err:
raise UserError(
f" WARNING: The list '{_list['name']}' contains a reference to the folder '{foldername}', but a "
f"problem occurred while trying to expand the Excel files therein into the 'lists' section of "
f"your project: {err.message}"
) from None
new_lists.append(_read_excel_make_lists(_list))

return new_lists


def _read_excel_make_lists(_list: dict[str, Union[str, dict[str, Any]]]) -> dict[str, Any]:
foldername = _list["nodes"]["folder"] # type: ignore[index] # types are too complex to annotate them correctly
excel_file_paths = _extract_excel_file_paths(foldername)
new_list = deepcopy(_list)
try:
returned_lists_section = _make_json_lists_from_excel(excel_file_paths, verbose=False)
# we only need the "nodes" section of the first element of the returned "lists" section. This "nodes"
# section needs to replace the Excel folder reference. But the rest of the user-defined list element
# needs to stay intact, e.g. the labels and comments.
new_list["nodes"] = returned_lists_section[0]["nodes"]
print(
f" The list '{_list['name']}' contains a reference to the folder '{foldername}'. The Excel "
f"files therein have been temporarily expanded into the 'lists' section of your project."
)
return new_list
except BaseError as err:
raise UserError(
f" WARNING: The list '{_list['name']}' contains a reference to the folder '{foldername}', but a "
f"problem occurred while trying to expand the Excel files therein into the 'lists' section of "
f"your project: {err.message}"
) from None


def excel2lists(
excelfolder: str,
path_to_output_file: Optional[str] = None,
Expand All @@ -92,13 +93,11 @@ def excel2lists(
Returns:
a tuple consisting of the "lists" section as Python list, and the success status (True if everything went well)
"""
# read the data
excel_file_paths = _extract_excel_file_paths(excelfolder)
if verbose:
print("The following Excel files will be processed:")
print(*(f" - {filename}" for filename in excel_file_paths), sep="\n")

# construct the "lists" section
finished_lists = _make_json_lists_from_excel(excel_file_paths, verbose=verbose)
validate_lists_section_with_schema(lists_section=finished_lists)

Expand All @@ -117,6 +116,7 @@ def _get_values_from_excel(
row: int,
col: int,
preval: list[str],
list_node_names: ListNodeNames,
verbose: bool = False,
) -> tuple[int, dict[str, Any]]:
"""
Expand All @@ -130,6 +130,7 @@ def _get_values_from_excel(
row: The index of the current row of the Excel sheet
col: The index of the current column of the Excel sheet
preval: List of previous values, needed to check the consistency of the list hierarchy
list_node_names: object containing the information which node names were used by previous lists
verbose: verbose switch
Raises:
Expand Down Expand Up @@ -159,13 +160,7 @@ def _get_values_from_excel(
preval.append(str(base_file_ws.cell(column=col - 1, row=row).value).strip())

while cell.value and regex.search(r"\p{L}", str(cell.value), flags=regex.UNICODE):
# check if all predecessors in row (values to the left) are consistent with the values in preval list
for idx, val in enumerate(preval[:-1]):
if val != str(base_file_ws.cell(column=idx + 1, row=row).value).strip():
raise UserError(
"ERROR: Inconsistency in Excel list: "
f"{val} not equal to {str(base_file_ws.cell(column=idx+1, row=row).value).strip()}"
)
_check_if_predecessors_in_row_are_consistent_with_preval(base_file_ws, preval, row)

# loop through the row until the last (furthest right) value is found
next_value = base_file_ws.cell(column=col + 1, row=row).value
Expand All @@ -177,12 +172,13 @@ def _get_values_from_excel(
col=col + 1,
row=row,
preval=preval,
list_node_names=list_node_names,
verbose=verbose,
)

# if value was last in row (no further values to the right), it's a node, continue here
else:
currentnode = _make_new_node(cell, col, excelfiles, preval, row, verbose)
currentnode = _make_new_node(cell, col, excelfiles, preval, row, list_node_names, verbose)
nodes.append(currentnode)

# go one row down and repeat loop if there is a value
Expand All @@ -198,33 +194,60 @@ def _get_values_from_excel(
return row - 1, parentnode


def _check_if_predecessors_in_row_are_consistent_with_preval(
base_file_ws: Worksheet, preval: list[str], row: int
) -> None:
for idx, val in enumerate(preval[:-1]):
if val != str(base_file_ws.cell(column=idx + 1, row=row).value).strip():
raise UserError(
"ERROR: Inconsistency in Excel list: "
f"{val} not equal to {str(base_file_ws.cell(column=idx + 1, row=row).value).strip()}"
)


def _make_new_node(
cell: Cell,
col: int,
excelfiles: dict[str, Worksheet],
preval: list[str],
row: int,
list_node_names: ListNodeNames,
verbose: bool = False,
) -> dict[str, Any]:
# check if there are duplicate nodes (i.e. identical rows), raise a UserError if so
_check_if_duplicate_nodes_exist(cell, list_node_names, preval)
# create a simplified version of the cell value and use it as name of the node
nodename = simplify_name(str(cell.value).strip())
list_node_names.previous_node_names.append(nodename)
# append a number (p.ex. node-name-2) if there are list nodes with identical names
n = list_node_names.previous_node_names.count(nodename)

if n > 1:
nodename = f"{nodename}-{n}"

labels_dict = _get_all_languages_of_node(excelfiles, col, row)

# create the current node from extracted cell values and append it to the nodes list
currentnode = {"name": nodename, "labels": labels_dict}
if verbose:
print(f"Added list node: {str(cell.value).strip()} ({nodename})")
return currentnode


def _check_if_duplicate_nodes_exist(cell: Cell, list_node_names: ListNodeNames, preval: list[str]) -> None:
new_check_list = preval.copy()
new_check_list.append(str(cell.value).strip())
list_of_lists_of_previous_cell_values.append(new_check_list)
if any(list_of_lists_of_previous_cell_values.count(x) > 1 for x in list_of_lists_of_previous_cell_values):
list_node_names.lists_with_previous_cell_values.append(new_check_list)
if any(
list_node_names.lists_with_previous_cell_values.count(x) > 1
for x in list_node_names.lists_with_previous_cell_values
):
raise UserError(
f"ERROR: There is at least one duplicate node in the list. "
f"Found duplicate in column {cell.column}, row {cell.row}:\n'{str(cell.value).strip()}'"
)

# create a simplified version of the cell value and use it as name of the node
nodename = simplify_name(str(cell.value).strip())
list_of_previous_node_names.append(nodename)
# append a number (p.ex. node-name-2) if there are list nodes with identical names
n = list_of_previous_node_names.count(nodename)

if n > 1:
nodename = f"{nodename}-{n}"
# read label values from the other Excel files (other languages)
def _get_all_languages_of_node(excelfiles: dict[str, Worksheet], col: int, row: int) -> dict[str, str]:
labels_dict: dict[str, str] = {}
for other_lang, ws_other_lang in excelfiles.items():
cell_value = ws_other_lang.cell(column=col, row=row).value
Expand All @@ -235,12 +258,7 @@ def _make_new_node(
)
else:
labels_dict[other_lang] = cell_value.strip()

# create the current node from extracted cell values and append it to the nodes list
currentnode = {"name": nodename, "labels": labels_dict}
if verbose:
print(f"Added list node: {str(cell.value).strip()} ({nodename})")
return currentnode
return labels_dict


def _make_json_lists_from_excel(
Expand All @@ -261,23 +279,18 @@ def _make_json_lists_from_excel(
Returns:
The finished "lists" section
"""
# reset the global variables
global list_of_previous_node_names
global list_of_lists_of_previous_cell_values
list_of_previous_node_names = []
list_of_lists_of_previous_cell_values = []

# Define starting point in Excel file
startrow = 1
startcol = 1

# make a dict with the language labels and the worksheets
lang_to_worksheet: dict[str, Worksheet] = {x.stem: _read_and_check_workbook(x) for x in excel_file_paths}

# take English as base file. If English is not available, take a random one.
base_lang = "en" if "en" in lang_to_worksheet else next(iter(lang_to_worksheet.keys()))
base_file = {base_lang: lang_to_worksheet[base_lang]}

list_node_names = ListNodeNames()

# construct the entire "lists" section as children of a fictive dummy parent node
_, _list = _get_values_from_excel(
excelfiles=lang_to_worksheet,
Expand All @@ -286,6 +299,7 @@ def _make_json_lists_from_excel(
row=startrow,
col=startcol,
preval=[],
list_node_names=list_node_names,
verbose=verbose,
)

Expand Down
Empty file.
15 changes: 15 additions & 0 deletions src/dsp_tools/commands/excel2json/models/list_node_name.py
@@ -0,0 +1,15 @@
from dataclasses import dataclass, field


@dataclass
class ListNodeNames:
"""
Contains the information to construct a list
Args:
lists_with_previous_cell_values: used to ensure that there are no duplicate node names
previous_node_names: used to ensure that there are no duplicate node names
"""

lists_with_previous_cell_values: list[list[str]] = field(default_factory=list)
previous_node_names: list[str] = field(default_factory=list)
2 changes: 1 addition & 1 deletion src/dsp_tools/commands/excel2json/properties.py
Expand Up @@ -11,7 +11,7 @@
import pandas as pd
import regex

from dsp_tools.commands.excel2json.input_error import (
from dsp_tools.commands.excel2json.models.input_error import (
InvalidExcelContentProblem,
JsonValidationPropertyProblem,
MissingValuesInRowProblem,
Expand Down
2 changes: 1 addition & 1 deletion src/dsp_tools/commands/excel2json/resources.py
Expand Up @@ -8,7 +8,7 @@
import pandas as pd
import regex

from dsp_tools.commands.excel2json.input_error import (
from dsp_tools.commands.excel2json.models.input_error import (
JsonValidationResourceProblem,
MissingValuesInRowProblem,
PositionInExcel,
Expand Down
2 changes: 1 addition & 1 deletion src/dsp_tools/commands/excel2json/utils.py
Expand Up @@ -8,7 +8,7 @@
import pandas as pd
import regex

from dsp_tools.commands.excel2json.input_error import (
from dsp_tools.commands.excel2json.models.input_error import (
DuplicatesInColumnProblem,
InvalidSheetNameProblem,
RequiredColumnMissingProblem,
Expand Down
2 changes: 1 addition & 1 deletion test/unittests/commands/excel2json/test_properties.py
Expand Up @@ -11,7 +11,7 @@
from pandas.testing import assert_frame_equal

from dsp_tools.commands.excel2json import properties as e2j
from dsp_tools.commands.excel2json.input_error import InvalidExcelContentProblem
from dsp_tools.commands.excel2json.models.input_error import InvalidExcelContentProblem
from dsp_tools.models.exceptions import InputError

# ruff: noqa: PT009 (pytest-unittest-assertion) (remove this line when pytest is used instead of unittest)
Expand Down
2 changes: 1 addition & 1 deletion test/unittests/commands/excel2json/test_utils.py
Expand Up @@ -7,7 +7,7 @@
from pytest_unordered import unordered

import dsp_tools.commands.excel2json.utils as utl
from dsp_tools.commands.excel2json.input_error import DuplicatesInColumnProblem
from dsp_tools.commands.excel2json.models.input_error import DuplicatesInColumnProblem

# ruff: noqa: PT009 (pytest-unittest-assertion) (remove this line when pytest is used instead of unittest)

Expand Down

0 comments on commit 5f53c38

Please sign in to comment.