-
Notifications
You must be signed in to change notification settings - Fork 4
/
lists.py
376 lines (309 loc) · 14.9 KB
/
lists.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
"""This module handles all the operations which are used for the creation of JSON lists from Excel files."""
import glob
import importlib.resources
import json
import os
from typing import Any, Optional, Union
import jsonschema
import regex
from openpyxl import load_workbook
from openpyxl.cell import Cell
from openpyxl.worksheet.worksheet import Worksheet
from dsp_tools.models.exceptions import BaseError, UserError
from dsp_tools.utils.shared import simplify_name
list_of_lists_of_previous_cell_values: list[list[str]] = []
"""Module level variable used to ensure that there are no duplicate node names"""
list_of_previous_node_names: list[str] = []
"""Module level variable used to ensure that there are no duplicate node names"""
def expand_lists_from_excel(
lists_section: list[dict[str, Union[str, dict[str, Any]]]],
) -> list[dict[str, Any]]:
"""
Checks if the "lists" section of a JSON project file contains references to Excel files.
Expands all Excel files to JSON,
and returns the expanded "lists" section.
If there are no references to Excel files,
the "lists" section is returned as is.
Args:
lists_section: the "lists" section of a parsed JSON project file.
If this is an empty list, an empty list will be returned.
Raises:
UserError: if a problem occurred while trying to expand the Excel files
Returns:
the same "lists" section, but without references to Excel files
"""
new_lists = []
for _list in lists_section:
# case 1: this list is a JSON list: return it as it is
if "folder" not in _list["nodes"]:
new_lists.append(_list)
continue
# case 2: this is a reference to a folder with Excel files
foldername = _list["nodes"]["folder"] # type: ignore[index] # types are too complex to annotate them correctly
excel_file_paths = _extract_excel_file_paths(foldername)
try:
returned_lists_section = _make_json_lists_from_excel(excel_file_paths, verbose=False)
# we only need the "nodes" section of the first element of the returned "lists" section. This "nodes"
# section needs to replace the Excel folder reference. But the rest of the user-defined list element
# needs to stay intact, e.g. the labels and comments.
_list["nodes"] = returned_lists_section[0]["nodes"]
new_lists.append(_list)
print(
f"\tThe list '{_list['name']}' contains a reference to the folder '{foldername}'. The Excel "
f"files therein have been temporarily expanded into the 'lists' section of your project."
)
except BaseError as err:
raise UserError(
f"\tWARNING: The list '{_list['name']}' contains a reference to the folder '{foldername}', but a "
f"problem occurred while trying to expand the Excel files therein into the 'lists' section of "
f"your project: {err.message}"
) from None
return new_lists
def _get_values_from_excel(
excelfiles: dict[str, Worksheet],
base_file: dict[str, Worksheet],
parentnode: dict[str, Any],
row: int,
col: int,
preval: list[str],
verbose: bool = False,
) -> tuple[int, dict[str, Any]]:
"""
This function calls itself recursively to go through the Excel files. It extracts the cell values and composes
the JSON list.
Args:
excelfiles: List of Excel files with the values in different languages
base_file: File name of the base file
parentnode: Name(s) of the parent node(s) of the current node
row: The index of the current row of the Excel sheet
col: The index of the current column of the Excel sheet
preval: List of previous values, needed to check the consistency of the list hierarchy
verbose: verbose switch
Raises:
UserError: if one of the Excel files contains invalid data
Returns:
int: Row index for the next loop (current row index minus 1)
dict: The JSON list up to the current recursion. At the last recursion, this is the final JSON list.
"""
nodes: list[dict[str, Any]] = []
currentnode: dict[str, Any] = dict()
base_file_ws: Worksheet = list(base_file.values())[0]
cell: Cell = base_file_ws.cell(column=col, row=row)
for excelfile in excelfiles.values():
if any((not excelfile["A1"].value, excelfile["B1"].value)):
raise UserError(
f"ERROR: Inconsistency in Excel list: The first row must consist of exactly one value, in cell A1. "
f"All other cells of row 1 must be empty.\nInstead, found the following:\n"
f" - Cell A1: '{excelfile['A1'].value}'\n"
f" - Cell B1: '{excelfile['B1'].value}'"
)
if col > 1:
# append the cell value of the parent node (which is one value to the left of the current cell) to the list of
# previous values
preval.append(str(base_file_ws.cell(column=col - 1, row=row).value).strip())
while cell.value and regex.search(r"\p{L}", str(cell.value), flags=regex.UNICODE):
# check if all predecessors in row (values to the left) are consistent with the values in preval list
for idx, val in enumerate(preval[:-1]):
if val != str(base_file_ws.cell(column=idx + 1, row=row).value).strip():
raise UserError(
"ERROR: Inconsistency in Excel list: "
f"{val} not equal to {str(base_file_ws.cell(column=idx+1, row=row).value).strip()}"
)
# loop through the row until the last (furthest right) value is found
next_value = base_file_ws.cell(column=col + 1, row=row).value
if next_value and regex.search(r"\p{L}", str(next_value), flags=regex.UNICODE):
row, _ = _get_values_from_excel(
excelfiles=excelfiles,
base_file=base_file,
parentnode=currentnode,
col=col + 1,
row=row,
preval=preval,
verbose=verbose,
)
# if value was last in row (no further values to the right), it's a node, continue here
else:
# check if there are duplicate nodes (i.e. identical rows), raise a UserError if so
new_check_list = preval.copy()
new_check_list.append(str(cell.value).strip())
list_of_lists_of_previous_cell_values.append(new_check_list)
if any(list_of_lists_of_previous_cell_values.count(x) > 1 for x in list_of_lists_of_previous_cell_values):
raise UserError(
f"ERROR: There is at least one duplicate node in the list. "
f"Found duplicate in column {cell.column}, row {cell.row}:\n'{str(cell.value).strip()}'"
)
# create a simplified version of the cell value and use it as name of the node
nodename = simplify_name(str(cell.value).strip())
list_of_previous_node_names.append(nodename)
# append a number (p.ex. node-name-2) if there are list nodes with identical names
n = list_of_previous_node_names.count(nodename)
if n > 1:
nodename = nodename + "-" + str(n)
# read label values from the other Excel files (other languages)
labels_dict: dict[str, str] = {}
for other_lang, ws_other_lang in excelfiles.items():
cell_value = ws_other_lang.cell(column=col, row=row).value
if not (isinstance(cell_value, str) and len(cell_value) > 0):
raise UserError(
"ERROR: Malformed Excel file: The Excel file with the language code "
f"'{other_lang}' should have a value in row {row}, column {col}"
)
else:
labels_dict[other_lang] = cell_value.strip()
# create current node from extracted cell values and append it to the nodes list
currentnode = {"name": nodename, "labels": labels_dict}
nodes.append(currentnode)
if verbose:
print(f"Added list node: {str(cell.value).strip()} ({nodename})")
# go one row down and repeat loop if there is a value
row += 1
cell = base_file_ws.cell(column=col, row=row)
if col > 1:
preval.pop()
# add the new nodes to the parentnode
parentnode["nodes"] = nodes
return row - 1, parentnode
def _make_json_lists_from_excel(
excel_file_paths: list[str],
verbose: bool = False,
) -> list[dict[str, Any]]:
"""
Reads Excel files and transforms them into a list of dictionaries that can be used as "lists" array of a JSON
project file.
Args:
excel_file_paths: Excel files to be processed
verbose: verbose switch
Raises:
UserError: if one of the Excel files contains invalid data
Returns:
The finished "lists" section
"""
# reset the global variables
global list_of_previous_node_names
global list_of_lists_of_previous_cell_values
list_of_previous_node_names = []
list_of_lists_of_previous_cell_values = []
# Define starting point in Excel file
startrow = 1
startcol = 1
# make a dict with the language labels and the worksheets
lang_to_worksheet: dict[str, Worksheet] = {}
for filepath in excel_file_paths:
lang_to_worksheet[os.path.basename(filepath)[0:2]] = load_workbook(filepath, read_only=True).worksheets[0]
# take English as base file. If English is not available, take a random one.
base_lang = "en" if "en" in lang_to_worksheet else list(lang_to_worksheet.keys())[0]
base_file = {base_lang: lang_to_worksheet[base_lang]}
# construct the entire "lists" section as children of a fictive dummy parent node
_, _list = _get_values_from_excel(
excelfiles=lang_to_worksheet,
base_file=base_file,
parentnode={},
row=startrow,
col=startcol,
preval=[],
verbose=verbose,
)
# extract the children of the fictive dummy parent node
finished_lists: list[dict[str, Any]] = _list["nodes"]
# the "comments" section is mandatory for the root node of each list, so make a copy of "labels" and insert it at
# the right place
for i, _list in enumerate(finished_lists):
finished_lists[i] = {
"name": _list["name"],
"labels": _list["labels"],
"comments": _list["labels"],
"nodes": _list["nodes"],
}
return finished_lists
def validate_lists_section_with_schema(
path_to_json_project_file: Optional[str] = None,
lists_section: Optional[list[dict[str, Any]]] = None,
) -> bool:
"""
This function checks if a "lists" section of a JSON project is valid according to the schema. The "lists" section
can be passed as path to the JSON project file, or as Python object. Only one of the two arguments should be passed.
Args:
path_to_json_project_file: path to the JSON project file that contains the "lists" section to validate
lists_section: the "lists" section as Python object
Raises:
UserError: if the validation fails
BaseError: if this function is called with invalid parameters
Returns:
True if the "lists" section passed validation
"""
if bool(path_to_json_project_file) == bool(lists_section):
raise BaseError("Validation of the 'lists' section works only if exactly one of the two arguments is given.")
with importlib.resources.files("dsp_tools").joinpath("resources/schema/lists-only.json").open(
encoding="utf-8"
) as schema_file:
lists_schema = json.load(schema_file)
if path_to_json_project_file:
with open(path_to_json_project_file, encoding="utf-8") as f:
project = json.load(f)
lists_section = project["project"].get("lists")
if not lists_section:
raise UserError(
f"Cannot validate 'lists' section of {path_to_json_project_file}, "
"because there is no 'lists' section in this file."
)
try:
jsonschema.validate(instance={"lists": lists_section}, schema=lists_schema)
except jsonschema.ValidationError as err:
raise UserError(
f"'lists' section did not pass validation. The error message is: {err.message}\n"
f"The error occurred at {err.json_path}"
) from None
return True
def _extract_excel_file_paths(excelfolder: str) -> list[str]:
"""
This method extracts the names of the Excel files that are in the folder, and asserts that they are named according
to the requirements.
Args:
excelfolder: path to the folder containing the Excel file(s)
Raises:
UserError: if excelfolder is not a directory, or if one of the files in it has an invalid name
Returns:
list of the Excel file paths to process
"""
if not os.path.isdir(excelfolder):
raise UserError(f"ERROR: {excelfolder} is not a directory.")
excel_file_paths = [
filename
for filename in glob.iglob(f"{excelfolder}/*.xlsx")
if not os.path.basename(filename).startswith("~$") and os.path.isfile(filename)
]
for filepath in excel_file_paths:
if not regex.search(r"^(de|en|fr|it|rm)\.xlsx$", os.path.basename(filepath)):
raise UserError(f"Invalid file name '{filepath}'. Expected format: 'languagecode.xlsx'")
return excel_file_paths
def excel2lists(
excelfolder: str,
path_to_output_file: Optional[str] = None,
verbose: bool = False,
) -> tuple[list[dict[str, Any]], bool]:
"""
Converts lists described in Excel files into a "lists" section that can be inserted into a JSON project file.
Args:
excelfolder: path to the folder containing the Excel file(s)
path_to_output_file: if provided, the output is written into this JSON file
verbose: verbose switch
Raises:
UserError: if something went wrong
BaseError: if something went wrong
Returns:
a tuple consisting of the "lists" section as Python list, and the success status (True if everything went well)
"""
# read the data
excel_file_paths = _extract_excel_file_paths(excelfolder)
if verbose:
print("The following Excel files will be processed:")
print(*(f" - {filename}" for filename in excel_file_paths), sep="\n")
# construct the "lists" section
finished_lists = _make_json_lists_from_excel(excel_file_paths, verbose=verbose)
validate_lists_section_with_schema(lists_section=finished_lists)
# write final "lists" section
if path_to_output_file:
with open(path_to_output_file, "w", encoding="utf-8") as fp:
json.dump(finished_lists, fp, indent=4, ensure_ascii=False)
print(f"lists section was created successfully and written to file '{path_to_output_file}'")
return finished_lists, True