Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(excel2json): modularise functions (DEV-3025) #655

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
159 changes: 84 additions & 75 deletions src/dsp_tools/commands/excel2json/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,44 @@ def expand_lists_from_excel(
return new_lists


def excel2lists(
    excelfolder: str,
    path_to_output_file: Optional[str] = None,
    verbose: bool = False,
) -> tuple[list[dict[str, Any]], bool]:
    """
    Converts lists described in Excel files into a "lists" section that can be inserted into a JSON project file.

    Args:
        excelfolder: path to the folder containing the Excel file(s)
        path_to_output_file: if provided, the output is written into this JSON file
        verbose: verbose switch

    Raises:
        UserError: if something went wrong
        BaseError: if something went wrong

    Returns:
        a tuple consisting of the "lists" section as Python list, and the success status (True if everything went well)
    """
    # read the data
    excel_file_paths = _extract_excel_file_paths(excelfolder)
    if verbose:
        print("The following Excel files will be processed:")
        # BUG FIX: the f-string previously ignored the loop variable and printed
        # a constant placeholder for every file; print the actual path instead
        print(*(f" - {filename}" for filename in excel_file_paths), sep="\n")

    # construct the "lists" section and make sure it conforms to the JSON schema
    finished_lists = _make_json_lists_from_excel(excel_file_paths, verbose=verbose)
    validate_lists_section_with_schema(lists_section=finished_lists)

    # write final "lists" section to file, if requested
    if path_to_output_file:
        with open(path_to_output_file, "w", encoding="utf-8") as fp:
            json.dump(finished_lists, fp, indent=4, ensure_ascii=False)
        print(f"lists section was created successfully and written to file '{path_to_output_file}'")

    return finished_lists, True


def _get_values_from_excel(
excelfiles: dict[str, Worksheet],
base_file: dict[str, Worksheet],
Expand Down Expand Up @@ -143,43 +181,8 @@ def _get_values_from_excel(

# if value was last in row (no further values to the right), it's a node, continue here
else:
# check if there are duplicate nodes (i.e. identical rows), raise a UserError if so
new_check_list = preval.copy()
new_check_list.append(str(cell.value).strip())
list_of_lists_of_previous_cell_values.append(new_check_list)

if any(list_of_lists_of_previous_cell_values.count(x) > 1 for x in list_of_lists_of_previous_cell_values):
raise UserError(
f"ERROR: There is at least one duplicate node in the list. "
f"Found duplicate in column {cell.column}, row {cell.row}:\n'{str(cell.value).strip()}'"
)

# create a simplified version of the cell value and use it as name of the node
nodename = simplify_name(str(cell.value).strip())
list_of_previous_node_names.append(nodename)

# append a number (p.ex. node-name-2) if there are list nodes with identical names
n = list_of_previous_node_names.count(nodename)
if n > 1:
nodename = f"{nodename}-{n}"

# read label values from the other Excel files (other languages)
labels_dict: dict[str, str] = {}
for other_lang, ws_other_lang in excelfiles.items():
cell_value = ws_other_lang.cell(column=col, row=row).value
if not (isinstance(cell_value, str) and len(cell_value) > 0):
raise UserError(
"ERROR: Malformed Excel file: The Excel file with the language code "
f"'{other_lang}' should have a value in row {row}, column {col}"
)
else:
labels_dict[other_lang] = cell_value.strip()

# create current node from extracted cell values and append it to the nodes list
currentnode = {"name": nodename, "labels": labels_dict}
currentnode = _make_new_node(cell, col, excelfiles, preval, row, verbose)
nodes.append(currentnode)
if verbose:
print(f"Added list node: {str(cell.value).strip()} ({nodename})")

# go one row down and repeat loop if there is a value
row += 1
Expand All @@ -194,6 +197,51 @@ def _get_values_from_excel(
return row - 1, parentnode


def _make_new_node(
    cell: Cell,
    col: int,
    excelfiles: dict[str, Worksheet],
    preval: list[str],
    row: int,
    verbose: bool = False,
) -> dict[str, Any]:
    """
    Build one list node (a dict with "name" and "labels") from an Excel cell,
    reading the labels of the other languages from the corresponding cell
    of the other worksheets.

    Raises:
        UserError: if the node is a duplicate of an earlier node,
            or if another language's file is missing the label for this cell
    """
    cell_text = str(cell.value).strip()

    # duplicate detection: register the full path of this node (ancestors + own value)
    # in the module-level list, then fail if the identical path was seen before
    list_of_lists_of_previous_cell_values.append([*preval, cell_text])
    if any(list_of_lists_of_previous_cell_values.count(entry) > 1 for entry in list_of_lists_of_previous_cell_values):
        raise UserError(
            f"ERROR: There is at least one duplicate node in the list. "
            f"Found duplicate in column {cell.column}, row {cell.row}:\n'{cell_text}'"
        )

    # the node name is a simplified version of the cell value
    nodename = simplify_name(cell_text)
    list_of_previous_node_names.append(nodename)
    occurrence = list_of_previous_node_names.count(nodename)
    if occurrence > 1:
        # disambiguate identical names with a counter suffix (p.ex. node-name-2)
        nodename = f"{nodename}-{occurrence}"

    # collect the labels of the other languages from the same cell position
    labels_dict: dict[str, str] = {}
    for other_lang, ws_other_lang in excelfiles.items():
        cell_value = ws_other_lang.cell(column=col, row=row).value
        if isinstance(cell_value, str) and len(cell_value) > 0:
            labels_dict[other_lang] = cell_value.strip()
        else:
            raise UserError(
                "ERROR: Malformed Excel file: The Excel file with the language code "
                f"'{other_lang}' should have a value in row {row}, column {col}"
            )

    if verbose:
        print(f"Added list node: {cell_text} ({nodename})")
    return {"name": nodename, "labels": labels_dict}


def _make_json_lists_from_excel(
excel_file_paths: list[Path],
verbose: bool = False,
Expand Down Expand Up @@ -329,42 +377,3 @@ def _extract_excel_file_paths(excelfolder: str) -> list[Path]:
raise UserError(f"Invalid file name '{filepath}'. Expected format: 'languagecode.xlsx'")

return excel_file_paths


def excel2lists(
    excelfolder: str,
    path_to_output_file: Optional[str] = None,
    verbose: bool = False,
) -> tuple[list[dict[str, Any]], bool]:
    """
    Converts lists described in Excel files into a "lists" section that can be inserted into a JSON project file.

    Args:
        excelfolder: path to the folder containing the Excel file(s)
        path_to_output_file: if provided, the output is written into this JSON file
        verbose: verbose switch

    Raises:
        UserError: if something went wrong
        BaseError: if something went wrong

    Returns:
        a tuple consisting of the "lists" section as Python list, and the success status (True if everything went well)
    """
    # read the data
    excel_file_paths = _extract_excel_file_paths(excelfolder)
    if verbose:
        print("The following Excel files will be processed:")
        # BUG FIX: the f-string previously ignored the loop variable and printed
        # a constant placeholder for every file; print the actual path instead
        print(*(f" - {filename}" for filename in excel_file_paths), sep="\n")

    # construct the "lists" section and make sure it conforms to the JSON schema
    finished_lists = _make_json_lists_from_excel(excel_file_paths, verbose=verbose)
    validate_lists_section_with_schema(lists_section=finished_lists)

    # write final "lists" section
    if path_to_output_file:
        with open(path_to_output_file, "w", encoding="utf-8") as fp:
            json.dump(finished_lists, fp, indent=4, ensure_ascii=False)
        print(f"lists section was created successfully and written to file '{path_to_output_file}'")

    return finished_lists, True
112 changes: 68 additions & 44 deletions src/dsp_tools/commands/excel2json/project.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
from pathlib import Path
from typing import Any

import regex

Expand Down Expand Up @@ -41,14 +42,51 @@ def excel2json(
True if everything went well
"""

overall_success = True
listfolder, onto_folders = _validate_folder_structure_get_filenames(data_model_files)

overall_success, project = _create_project_json(data_model_files, listfolder, onto_folders)
Nora-Olivia-Ammann marked this conversation as resolved.
Show resolved Hide resolved

with open(path_to_output_file, "w", encoding="utf-8") as f:
json.dump(project, f, indent=4, ensure_ascii=False)

# validate input
# --------------
print(f"JSON project file successfully saved at {path_to_output_file}")

return overall_success


def _validate_folder_structure_get_filenames(data_model_files: str) -> tuple[list[Path], list[Path]]:
    """
    Check that the input folder contains only a 'lists' subfolder and ontology folders,
    print the files that will be processed,
    and return the 'lists' folder and the ontology folders.

    Raises:
        UserError: if the input is not a directory, or if it contains unexpected entries
    """
    root = Path(data_model_files)
    if not root.is_dir():
        raise UserError(f"ERROR: {data_model_files} is not a directory.")
    # ignore hidden files and Excel lock files (names starting with '.' or '~$')
    folder = [entry for entry in root.glob("*") if not regex.search(r"^(\.|~\$).+", entry.name)]

    onto_folders, processed_onto = _get_validate_onto_folder(data_model_files, folder)
    listfolder, processed_lists = _get_validate_list_folder(data_model_files, folder)
    processed_files = [*processed_onto, *processed_lists]

    # every entry of the folder must be accounted for by either kind of subfolder
    if len(folder) != len(onto_folders) + len(listfolder):
        raise UserError(
            f"The only allowed subfolders in '{data_model_files}' are 'lists' "
            "and folders that match the pattern 'onto_name (onto_label)'"
        )

    print("The following files will be processed:")
    print(*(f" - {file}" for file in processed_files), sep="\n")
    return listfolder, onto_folders


def _get_validate_list_folder(data_model_files: str, folder: list[Path]) -> tuple[list[Path], list[str]]:
processed_files: list[str] = []
listfolder = [x for x in folder if x.is_dir() and x.name == "lists"]
if listfolder:
listfolder_contents = [x for x in Path(listfolder[0]).glob("*") if not regex.search(r"^(\.|~\$).+", x.name)]
if not all(regex.search(r"(de|en|fr|it|rm).xlsx", file.name) for file in listfolder_contents):
raise UserError(
f"The only files allowed in '{data_model_files}/lists' are en.xlsx, de.xlsx, fr.xlsx, it.xlsx, rm.xlsx"
)
processed_files = [f"{data_model_files}/lists/{file.name}" for file in listfolder_contents]
return listfolder, processed_files


def _get_validate_onto_folder(data_model_files: str, folder: list[Path]) -> tuple[list[Path], list[str]]:
processed_files = []
onto_folders = [x for x in folder if x.is_dir() and regex.search(r"([\w.-]+) \(([\w.\- ]+)\)", x.name)]
if not onto_folders:
Expand All @@ -63,47 +101,19 @@ def excel2json(
"and one file 'resources.xlsx', but nothing else."
)
processed_files.extend([f"{data_model_files}/{onto_folder.name}/{file}" for file in contents])
return onto_folders, processed_files

listfolder = [x for x in folder if x.is_dir() and x.name == "lists"]
if listfolder:
listfolder_contents = [x for x in Path(listfolder[0]).glob("*") if not regex.search(r"^(\.|~\$).+", x.name)]
if not all(regex.search(r"(de|en|fr|it|rm).xlsx", file.name) for file in listfolder_contents):
raise UserError(
f"The only files allowed in '{data_model_files}/lists' are en.xlsx, de.xlsx, fr.xlsx, it.xlsx, rm.xlsx"
)
processed_files = [f"{data_model_files}/lists/{file.name}" for file in listfolder_contents] + processed_files

if len(onto_folders) + len(listfolder) != len(folder):
raise UserError(
f"The only allowed subfolders in '{data_model_files}' are 'lists' "
"and folders that match the pattern 'onto_name (onto_label)'"
)

print("The following files will be processed:")
print(*(f" - {file}" for file in processed_files), sep="\n")

# create output
# -------------
def _create_project_json(
data_model_files: str, listfolder: list[Path], onto_folders: list[Path]
) -> tuple[bool, dict[str, Any]]:
overall_success = True
lists, success = excel2lists(excelfolder=f"{data_model_files}/lists") if listfolder else (None, True)
if not success:
overall_success = False

ontologies = []
for onto_folder in onto_folders:
name, label = regex.search(r"([\w.-]+) \(([\w.\- ]+)\)", onto_folder.name).groups() # type: ignore[union-attr]
resources, success1 = excel2resources(f"{data_model_files}/{onto_folder.name}/resources.xlsx")
properties, success2 = excel2properties(f"{data_model_files}/{onto_folder.name}/properties.xlsx")
if not success1 or not success2:
overall_success = False
ontologies.append(
{
"name": name,
"label": label,
"properties": properties,
"resources": resources,
}
)

ontologies, success = _get_ontologies(data_model_files, onto_folders)
if not success:
overall_success = False
schema = "https://raw.githubusercontent.com/dasch-swiss/dsp-tools/main/src/dsp_tools/resources/schema/project.json"
project = {
"prefixes": {"": ""},
Expand All @@ -119,10 +129,24 @@ def excel2json(
if lists:
project["project"]["lists"] = lists # type: ignore[index]
project["project"]["ontologies"] = ontologies # type: ignore[index]
return overall_success, project

with open(path_to_output_file, "w", encoding="utf-8") as f:
json.dump(project, f, indent=4, ensure_ascii=False)

print(f"JSON project file successfully saved at {path_to_output_file}")

return overall_success
def _get_ontologies(data_model_files: str, onto_folders: list[Path]) -> tuple[list[dict[str, Any]], bool]:
success = True
ontologies = []
for onto_folder in onto_folders:
name, label = regex.search(r"([\w.-]+) \(([\w.\- ]+)\)", onto_folder.name).groups() # type: ignore[union-attr]
resources, success1 = excel2resources(f"{data_model_files}/{onto_folder.name}/resources.xlsx")
properties, success2 = excel2properties(f"{data_model_files}/{onto_folder.name}/properties.xlsx")
if not success1 or not success2:
success = False
ontologies.append(
{
"name": name,
"label": label,
"properties": properties,
"resources": resources,
}
)
return ontologies, success