Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(excel2json): modularise functions (DEV-3025) #655

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
159 changes: 84 additions & 75 deletions src/dsp_tools/commands/excel2json/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,44 @@ def expand_lists_from_excel(
return new_lists


def excel2lists(
    excelfolder: str,
    path_to_output_file: Optional[str] = None,
    verbose: bool = False,
) -> tuple[list[dict[str, Any]], bool]:
    """
    Converts lists described in Excel files into a "lists" section that can be inserted into a JSON project file.

    Args:
        excelfolder: path to the folder containing the Excel file(s)
        path_to_output_file: if provided, the output is written into this JSON file
        verbose: verbose switch

    Raises:
        UserError: if something went wrong
        BaseError: if something went wrong

    Returns:
        a tuple consisting of the "lists" section as Python list, and the success status (True if everything went well)
    """
    # read the data
    excel_file_paths = _extract_excel_file_paths(excelfolder)
    if verbose:
        print("The following Excel files will be processed:")
        # BUG FIX: the f-string previously ignored the loop variable and printed
        # a constant placeholder for every file; print the actual path instead
        print(*(f" - {filename}" for filename in excel_file_paths), sep="\n")

    # construct the "lists" section and make sure it conforms to the JSON schema
    finished_lists = _make_json_lists_from_excel(excel_file_paths, verbose=verbose)
    validate_lists_section_with_schema(lists_section=finished_lists)

    # write final "lists" section to file, if requested
    if path_to_output_file:
        with open(path_to_output_file, "w", encoding="utf-8") as fp:
            json.dump(finished_lists, fp, indent=4, ensure_ascii=False)
        print(f"lists section was created successfully and written to file '{path_to_output_file}'")

    return finished_lists, True


def _get_values_from_excel(
excelfiles: dict[str, Worksheet],
base_file: dict[str, Worksheet],
Expand Down Expand Up @@ -143,43 +181,8 @@ def _get_values_from_excel(

# if value was last in row (no further values to the right), it's a node, continue here
else:
# check if there are duplicate nodes (i.e. identical rows), raise a UserError if so
new_check_list = preval.copy()
new_check_list.append(str(cell.value).strip())
list_of_lists_of_previous_cell_values.append(new_check_list)

if any(list_of_lists_of_previous_cell_values.count(x) > 1 for x in list_of_lists_of_previous_cell_values):
raise UserError(
f"ERROR: There is at least one duplicate node in the list. "
f"Found duplicate in column {cell.column}, row {cell.row}:\n'{str(cell.value).strip()}'"
)

# create a simplified version of the cell value and use it as name of the node
nodename = simplify_name(str(cell.value).strip())
list_of_previous_node_names.append(nodename)

# append a number (p.ex. node-name-2) if there are list nodes with identical names
n = list_of_previous_node_names.count(nodename)
if n > 1:
nodename = f"{nodename}-{n}"

# read label values from the other Excel files (other languages)
labels_dict: dict[str, str] = {}
for other_lang, ws_other_lang in excelfiles.items():
cell_value = ws_other_lang.cell(column=col, row=row).value
if not (isinstance(cell_value, str) and len(cell_value) > 0):
raise UserError(
"ERROR: Malformed Excel file: The Excel file with the language code "
f"'{other_lang}' should have a value in row {row}, column {col}"
)
else:
labels_dict[other_lang] = cell_value.strip()

# create current node from extracted cell values and append it to the nodes list
currentnode = {"name": nodename, "labels": labels_dict}
currentnode = _make_new_node(cell, col, excelfiles, preval, row, verbose)
nodes.append(currentnode)
if verbose:
print(f"Added list node: {str(cell.value).strip()} ({nodename})")

# go one row down and repeat loop if there is a value
row += 1
Expand All @@ -194,6 +197,51 @@ def _get_values_from_excel(
return row - 1, parentnode


def _make_new_node(
    cell: Cell,
    col: int,
    excelfiles: dict[str, Worksheet],
    preval: list[str],
    row: int,
    verbose: bool = False,
) -> dict[str, Any]:
    """
    Build one list node (a dict with "name" and "labels") from an Excel cell,
    reading the labels of the other languages from the corresponding cell
    of the other worksheets.

    Raises:
        UserError: if the node is a duplicate of an earlier node,
            or if another language's file is missing the label for this cell
    """
    cell_text = str(cell.value).strip()

    # duplicate detection: register the full path of this node (ancestors + own value)
    # in the module-level list, then fail if the identical path was seen before
    list_of_lists_of_previous_cell_values.append([*preval, cell_text])
    if any(list_of_lists_of_previous_cell_values.count(entry) > 1 for entry in list_of_lists_of_previous_cell_values):
        raise UserError(
            f"ERROR: There is at least one duplicate node in the list. "
            f"Found duplicate in column {cell.column}, row {cell.row}:\n'{cell_text}'"
        )

    # the node name is a simplified version of the cell value
    nodename = simplify_name(cell_text)
    list_of_previous_node_names.append(nodename)
    occurrence = list_of_previous_node_names.count(nodename)
    if occurrence > 1:
        # disambiguate identical names with a counter suffix (p.ex. node-name-2)
        nodename = f"{nodename}-{occurrence}"

    # collect the labels of the other languages from the same cell position
    labels_dict: dict[str, str] = {}
    for other_lang, ws_other_lang in excelfiles.items():
        cell_value = ws_other_lang.cell(column=col, row=row).value
        if isinstance(cell_value, str) and len(cell_value) > 0:
            labels_dict[other_lang] = cell_value.strip()
        else:
            raise UserError(
                "ERROR: Malformed Excel file: The Excel file with the language code "
                f"'{other_lang}' should have a value in row {row}, column {col}"
            )

    if verbose:
        print(f"Added list node: {cell_text} ({nodename})")
    return {"name": nodename, "labels": labels_dict}


def _make_json_lists_from_excel(
excel_file_paths: list[Path],
verbose: bool = False,
Expand Down Expand Up @@ -329,42 +377,3 @@ def _extract_excel_file_paths(excelfolder: str) -> list[Path]:
raise UserError(f"Invalid file name '{filepath}'. Expected format: 'languagecode.xlsx'")

return excel_file_paths


def excel2lists(
    excelfolder: str,
    path_to_output_file: Optional[str] = None,
    verbose: bool = False,
) -> tuple[list[dict[str, Any]], bool]:
    """
    Converts lists described in Excel files into a "lists" section that can be inserted into a JSON project file.

    Args:
        excelfolder: path to the folder containing the Excel file(s)
        path_to_output_file: if provided, the output is written into this JSON file
        verbose: verbose switch

    Raises:
        UserError: if something went wrong
        BaseError: if something went wrong

    Returns:
        a tuple consisting of the "lists" section as Python list, and the success status (True if everything went well)
    """
    # read the data
    excel_file_paths = _extract_excel_file_paths(excelfolder)
    if verbose:
        print("The following Excel files will be processed:")
        # BUG FIX: the f-string previously ignored the loop variable and printed
        # a constant placeholder for every file; print the actual path instead
        print(*(f" - {filename}" for filename in excel_file_paths), sep="\n")

    # construct the "lists" section and make sure it conforms to the JSON schema
    finished_lists = _make_json_lists_from_excel(excel_file_paths, verbose=verbose)
    validate_lists_section_with_schema(lists_section=finished_lists)

    # write final "lists" section
    if path_to_output_file:
        with open(path_to_output_file, "w", encoding="utf-8") as fp:
            json.dump(finished_lists, fp, indent=4, ensure_ascii=False)
        print(f"lists section was created successfully and written to file '{path_to_output_file}'")

    return finished_lists, True
112 changes: 68 additions & 44 deletions src/dsp_tools/commands/excel2json/project.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
from pathlib import Path
from typing import Any

import regex

Expand Down Expand Up @@ -41,14 +42,51 @@ def excel2json(
True if everything went well
"""

overall_success = True
listfolder, onto_folders = _validate_folder_structure_get_filenames(data_model_files)

overall_success, project = _create_project_json(data_model_files, listfolder, onto_folders)
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved

with open(path_to_output_file, "w", encoding="utf-8") as f:
json.dump(project, f, indent=4, ensure_ascii=False)

# validate input
# --------------
print(f"JSON project file successfully saved at {path_to_output_file}")

return overall_success


def _validate_folder_structure_get_filenames(data_model_files: str) -> tuple[list[Path], list[Path]]:
    """
    Check that the input folder contains only a 'lists' subfolder and ontology folders,
    print the files that will be processed,
    and return the 'lists' folder and the ontology folders.

    Raises:
        UserError: if the input is not a directory, or if it contains unexpected entries
    """
    root = Path(data_model_files)
    if not root.is_dir():
        raise UserError(f"ERROR: {data_model_files} is not a directory.")
    # ignore hidden files and Excel lock files (names starting with '.' or '~$')
    folder = [entry for entry in root.glob("*") if not regex.search(r"^(\.|~\$).+", entry.name)]

    onto_folders, processed_onto = _get_validate_onto_folder(data_model_files, folder)
    listfolder, processed_lists = _get_validate_list_folder(data_model_files, folder)
    processed_files = [*processed_onto, *processed_lists]

    # every entry of the folder must be accounted for by either kind of subfolder
    if len(folder) != len(onto_folders) + len(listfolder):
        raise UserError(
            f"The only allowed subfolders in '{data_model_files}' are 'lists' "
            "and folders that match the pattern 'onto_name (onto_label)'"
        )

    print("The following files will be processed:")
    print(*(f" - {file}" for file in processed_files), sep="\n")
    return listfolder, onto_folders


def _get_validate_list_folder(data_model_files: str, folder: list[Path]) -> tuple[list[Path], list[str]]:
processed_files: list[str] = []
listfolder = [x for x in folder if x.is_dir() and x.name == "lists"]
if listfolder:
listfolder_contents = [x for x in Path(listfolder[0]).glob("*") if not regex.search(r"^(\.|~\$).+", x.name)]
if not all(regex.search(r"(de|en|fr|it|rm).xlsx", file.name) for file in listfolder_contents):
raise UserError(
f"The only files allowed in '{data_model_files}/lists' are en.xlsx, de.xlsx, fr.xlsx, it.xlsx, rm.xlsx"
)
processed_files = [f"{data_model_files}/lists/{file.name}" for file in listfolder_contents]
return listfolder, processed_files


def _get_validate_onto_folder(data_model_files: str, folder: list[Path]) -> tuple[list[Path], list[str]]:
processed_files = []
onto_folders = [x for x in folder if x.is_dir() and regex.search(r"([\w.-]+) \(([\w.\- ]+)\)", x.name)]
if not onto_folders:
Expand All @@ -63,47 +101,19 @@ def excel2json(
"and one file 'resources.xlsx', but nothing else."
)
processed_files.extend([f"{data_model_files}/{onto_folder.name}/{file}" for file in contents])
return onto_folders, processed_files

listfolder = [x for x in folder if x.is_dir() and x.name == "lists"]
if listfolder:
listfolder_contents = [x for x in Path(listfolder[0]).glob("*") if not regex.search(r"^(\.|~\$).+", x.name)]
if not all(regex.search(r"(de|en|fr|it|rm).xlsx", file.name) for file in listfolder_contents):
raise UserError(
f"The only files allowed in '{data_model_files}/lists' are en.xlsx, de.xlsx, fr.xlsx, it.xlsx, rm.xlsx"
)
processed_files = [f"{data_model_files}/lists/{file.name}" for file in listfolder_contents] + processed_files

if len(onto_folders) + len(listfolder) != len(folder):
raise UserError(
f"The only allowed subfolders in '{data_model_files}' are 'lists' "
"and folders that match the pattern 'onto_name (onto_label)'"
)

print("The following files will be processed:")
print(*(f" - {file}" for file in processed_files), sep="\n")

# create output
# -------------
def _create_project_json(
data_model_files: str, listfolder: list[Path], onto_folders: list[Path]
) -> tuple[bool, dict[str, Any]]:
overall_success = True
lists, success = excel2lists(excelfolder=f"{data_model_files}/lists") if listfolder else (None, True)
if not success:
overall_success = False

ontologies = []
for onto_folder in onto_folders:
name, label = regex.search(r"([\w.-]+) \(([\w.\- ]+)\)", onto_folder.name).groups() # type: ignore[union-attr]
resources, success1 = excel2resources(f"{data_model_files}/{onto_folder.name}/resources.xlsx")
properties, success2 = excel2properties(f"{data_model_files}/{onto_folder.name}/properties.xlsx")
if not success1 or not success2:
overall_success = False
ontologies.append(
{
"name": name,
"label": label,
"properties": properties,
"resources": resources,
}
)

ontologies, success = _get_ontologies(data_model_files, onto_folders)
if not success:
overall_success = False
schema = "https://raw.githubusercontent.com/dasch-swiss/dsp-tools/main/src/dsp_tools/resources/schema/project.json"
project = {
"prefixes": {"": ""},
Expand All @@ -119,10 +129,24 @@ def excel2json(
if lists:
project["project"]["lists"] = lists # type: ignore[index]
project["project"]["ontologies"] = ontologies # type: ignore[index]
return overall_success, project

with open(path_to_output_file, "w", encoding="utf-8") as f:
json.dump(project, f, indent=4, ensure_ascii=False)

print(f"JSON project file successfully saved at {path_to_output_file}")

return overall_success
def _get_ontologies(data_model_files: str, onto_folders: list[Path]) -> tuple[list[dict[str, Any]], bool]:
success = True
ontologies = []
for onto_folder in onto_folders:
name, label = regex.search(r"([\w.-]+) \(([\w.\- ]+)\)", onto_folder.name).groups() # type: ignore[union-attr]
resources, success1 = excel2resources(f"{data_model_files}/{onto_folder.name}/resources.xlsx")
properties, success2 = excel2properties(f"{data_model_files}/{onto_folder.name}/properties.xlsx")
if not success1 or not success2:
success = False
ontologies.append(
{
"name": name,
"label": label,
"properties": properties,
"resources": resources,
}
)
return ontologies, success