-
Notifications
You must be signed in to change notification settings - Fork 4
/
project.py
156 lines (129 loc) · 6.17 KB
/
project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import json
from pathlib import Path
from typing import Any
import regex
from dsp_tools.commands.excel2json.lists import excel2lists
from dsp_tools.commands.excel2json.properties import excel2properties
from dsp_tools.commands.excel2json.resources import excel2resources
from dsp_tools.models.exceptions import UserError
def excel2json(
    data_model_files: str,
    path_to_output_file: str,
) -> bool:
    """
    Convert a folder of Excel files into a single JSON data model file.
    The input folder must have this layout:
    ::
        data_model_files
        |-- lists
        |   |-- de.xlsx
        |   `-- en.xlsx
        `-- onto_name (onto_label)
            |-- properties.xlsx
            `-- resources.xlsx
    File names must match the example exactly. The "lists" folder is optional,
    since lists are optional in a DSP project. Only XLSX files are allowed.

    Args:
        data_model_files: path to the folder (called "data_model_files" in the example)
        path_to_output_file: path to the file where the output JSON file will be saved

    Raises:
        UserError: if something went wrong
        BaseError: if something went wrong

    Returns:
        True if everything went well
    """
    # validate the folder layout first, then build the project JSON from its contents
    listfolder, onto_folders = _validate_folder_structure_get_filenames(data_model_files)
    overall_success, project = _create_project_json(data_model_files, listfolder, onto_folders)
    with open(path_to_output_file, "w", encoding="utf-8") as outfile:
        json.dump(project, outfile, indent=4, ensure_ascii=False)
    print(f"JSON project file successfully saved at {path_to_output_file}")
    return overall_success
def _validate_folder_structure_get_filenames(data_model_files: str) -> tuple[list[Path], list[Path]]:
    """
    Check that the input folder contains only a "lists" folder and/or
    "onto_name (onto_label)" folders, and announce the files to be processed.

    Args:
        data_model_files: path to the folder containing the Excel files

    Raises:
        UserError: if the path is not a directory, or contains unexpected entries

    Returns:
        the "lists" folder (as a 0- or 1-element list) and the ontology folders
    """
    root = Path(data_model_files)
    if not root.is_dir():
        raise UserError(f"ERROR: {data_model_files} is not a directory.")
    visible_entries = [entry for entry in root.glob("*") if _non_hidden(entry)]
    onto_folders, onto_files = _get_validate_onto_folder(data_model_files, visible_entries)
    listfolder, list_files = _get_validate_list_folder(data_model_files, visible_entries)
    processed_files = [*onto_files, *list_files]
    # every visible entry must be accounted for by one of the two categories
    if len(visible_entries) != len(onto_folders) + len(listfolder):
        raise UserError(
            f"The only allowed subfolders in '{data_model_files}' are 'lists' "
            "and folders that match the pattern 'onto_name (onto_label)'"
        )
    print("The following files will be processed:")
    print(*(f" - {file}" for file in processed_files), sep="\n")
    return listfolder, onto_folders
def _get_validate_list_folder(data_model_files: str, folder: list[Path]) -> tuple[list[Path], list[str]]:
processed_files: list[str] = []
listfolder = [x for x in folder if x.is_dir() and x.name == "lists"]
if listfolder:
listfolder_contents = [x for x in Path(listfolder[0]).glob("*") if _non_hidden(x)]
if not all(regex.search(r"(de|en|fr|it|rm).xlsx", file.name) for file in listfolder_contents):
raise UserError(
f"The only files allowed in '{data_model_files}/lists' are en.xlsx, de.xlsx, fr.xlsx, it.xlsx, rm.xlsx"
)
processed_files = [f"{data_model_files}/lists/{file.name}" for file in listfolder_contents]
return listfolder, processed_files
def _get_validate_onto_folder(data_model_files: str, folder: list[Path]) -> tuple[list[Path], list[str]]:
    """
    Find the ontology subfolders ("onto_name (onto_label)") and validate their contents.

    Args:
        data_model_files: path to the folder containing the Excel files (used to build file paths)
        folder: the non-hidden entries of that folder

    Raises:
        UserError: if there is no ontology folder, or an ontology folder does not contain
            exactly 'properties.xlsx' and 'resources.xlsx'

    Returns:
        the ontology folders and the list of files to process
    """
    # same pattern as in _get_ontologies(), which extracts name and label from it
    onto_folders = [x for x in folder if x.is_dir() and regex.search(r"([\w.-]+) \(([\w.\- ]+)\)", x.name)]
    if not onto_folders:
        raise UserError(
            f"'{data_model_files}' must contain at least one subfolder named after the pattern 'onto_name (onto_label)'"
        )
    processed_files: list[str] = []
    for onto_folder in onto_folders:
        contents = sorted(x.name for x in Path(onto_folder).glob("*") if _non_hidden(x))
        if contents != ["properties.xlsx", "resources.xlsx"]:
            raise UserError(
                f"ERROR: '{data_model_files}/{onto_folder.name}' must contain one file 'properties.xlsx' "
                "and one file 'resources.xlsx', but nothing else."
            )
        processed_files += [f"{data_model_files}/{onto_folder.name}/{file}" for file in contents]
    return onto_folders, processed_files
def _non_hidden(path: Path) -> bool:
return not regex.search(r"^(\.|~\$).+", path.name)
def _create_project_json(
    data_model_files: str, listfolder: list[Path], onto_folders: list[Path]
) -> tuple[bool, dict[str, Any]]:
    """
    Build the project JSON skeleton and fill in the lists and ontologies
    converted from the Excel files.

    Args:
        data_model_files: path to the folder containing the Excel files
        listfolder: the "lists" folder (as a 0- or 1-element list)
        onto_folders: the ontology folders

    Returns:
        True if all conversions succeeded, and the assembled project dict
    """
    lists, lists_ok = excel2lists(excelfolder=f"{data_model_files}/lists") if listfolder else (None, True)
    ontologies, ontos_ok = _get_ontologies(data_model_files, onto_folders)
    overall_success = True
    if not lists_ok or not ontos_ok:
        overall_success = False
    schema = "https://raw.githubusercontent.com/dasch-swiss/dsp-tools/main/src/dsp_tools/resources/schema/project.json"
    # skeleton with empty placeholders for the user to fill in afterwards
    project: dict[str, Any] = {
        "prefixes": {"": ""},
        "$schema": schema,
        "project": {
            "shortcode": "",
            "shortname": "",
            "longname": "",
            "descriptions": {"en": ""},
            "keywords": [""],
        },
    }
    if lists:
        project["project"]["lists"] = lists
    project["project"]["ontologies"] = ontologies
    return overall_success, project
def _get_ontologies(data_model_files: str, onto_folders: list[Path]) -> tuple[list[dict[str, Any]], bool]:
    """
    Convert the Excel files of every ontology folder into ontology dicts.

    Args:
        data_model_files: path to the folder containing the Excel files
        onto_folders: the ontology folders, already validated to match
            the pattern 'onto_name (onto_label)'

    Returns:
        the list of ontology dicts, and True if all conversions succeeded
    """
    ontologies: list[dict[str, Any]] = []
    all_good = True
    for onto_folder in onto_folders:
        # extract onto name and label from the folder name (pattern was pre-validated)
        match = regex.search(r"([\w.-]+) \(([\w.\- ]+)\)", onto_folder.name)
        onto_name, onto_label = match.groups()  # type: ignore[union-attr]
        resources, res_ok = excel2resources(f"{data_model_files}/{onto_folder.name}/resources.xlsx")
        properties, prop_ok = excel2properties(f"{data_model_files}/{onto_folder.name}/properties.xlsx")
        if not (res_ok and prop_ok):
            all_good = False
        ontologies.append(
            {
                "name": onto_name,
                "label": onto_label,
                "properties": properties,
                "resources": resources,
            }
        )
    return ontologies, all_good