-
Notifications
You must be signed in to change notification settings - Fork 4
/
project.py
127 lines (105 loc) · 4.82 KB
/
project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import json
import os
import regex
from dsp_tools.models.exceptions import BaseError
from dsp_tools.utils.excel_to_json.lists import excel2lists
from dsp_tools.utils.excel_to_json.properties import excel2properties
from dsp_tools.utils.excel_to_json.resources import excel2resources
# Entries whose name starts with "." (hidden files) or "~$" (temporary files, as Excel
# creates them for open workbooks) are ignored everywhere.
_HIDDEN_FILE_PATTERN = r"^(\.|~\$).+"
# Ontology folders are named "onto_name (onto_label)": group 1 is the name, group 2 the label.
_ONTO_FOLDER_PATTERN = r"([\w.-]+) \(([\w.\- ]+)\)"
# The only file names that are allowed inside the "lists" folder (matched against the full name).
_LIST_FILE_PATTERN = r"(de|en|fr|it|rm)\.xlsx"


def _visible_entries(directory: str) -> "list[os.DirEntry[str]]":
    """Return the entries of `directory` (in scandir order), without hidden and temporary files."""
    # the context manager closes the scandir iterator deterministically
    with os.scandir(directory) as entries:
        return [x for x in entries if not regex.search(_HIDDEN_FILE_PATTERN, x.name)]


def excel2json(
    data_model_files: str,
    path_to_output_file: str,
) -> bool:
    """
    Converts a folder containing Excel files into a JSON data model file. The folder must be structured like this:

    ::

        data_model_files
        |-- lists
        |   |-- de.xlsx
        |   `-- en.xlsx
        `-- onto_name (onto_label)
            |-- properties.xlsx
            `-- resources.xlsx

    The names of the files must be exactly like in the example. The folder "lists" can be missing, because it is
    optional to have lists in a DSP project. Only XLSX files are allowed.
    Hidden files (starting with ".") and temporary files (starting with "~$") are ignored.

    Args:
        data_model_files: path to the folder (called "data_model_files" in the example)
        path_to_output_file: path to the file where the output JSON file will be saved

    Raises:
        BaseError: if data_model_files is not a directory,
            if it contains no folder named after the pattern "onto_name (onto_label)",
            if an ontology folder contains anything else than "properties.xlsx" and "resources.xlsx",
            if the "lists" folder contains a file other than en.xlsx, de.xlsx, fr.xlsx, it.xlsx, rm.xlsx,
            or if data_model_files contains anything else than "lists" and ontology folders

    Returns:
        True if everything went well, False if one of the Excel converters reported a failure
    """
    overall_success = True

    # validate input
    # --------------
    if not os.path.isdir(data_model_files):
        raise BaseError(f"ERROR: {data_model_files} is not a directory.")
    folder = _visible_entries(data_model_files)

    # parse every ontology folder name only once, and keep (entry, name, label) for the output step
    onto_folders = []
    for entry in folder:
        match = regex.search(_ONTO_FOLDER_PATTERN, entry.name)
        if match and entry.is_dir():
            name, label = match.groups()
            onto_folders.append((entry, name, label))
    if not onto_folders:
        raise BaseError(
            f"'{data_model_files}' must contain at least one subfolder named after the pattern 'onto_name (onto_label)'"
        )

    processed_files = []
    for onto_folder, _, _ in onto_folders:
        contents = sorted(x.name for x in _visible_entries(onto_folder.path))
        if contents != ["properties.xlsx", "resources.xlsx"]:
            raise BaseError(
                f"ERROR: '{data_model_files}/{onto_folder.name}' must contain one file 'properties.xlsx' "
                "and one file 'resources.xlsx', but nothing else."
            )
        processed_files.extend(f"{data_model_files}/{onto_folder.name}/{file}" for file in contents)

    # a directory can contain at most one entry called "lists"
    listfolder = next((x for x in folder if x.name == "lists" and x.is_dir()), None)
    if listfolder is not None:
        listfolder_contents = _visible_entries(listfolder.path)
        # fullmatch + escaped dot: names like "hide.xlsx" or "en.xlsx.bak" must be rejected
        if not all(regex.fullmatch(_LIST_FILE_PATTERN, file.name) for file in listfolder_contents):
            raise BaseError(
                f"The only files allowed in '{data_model_files}/lists' are en.xlsx, de.xlsx, fr.xlsx, it.xlsx, rm.xlsx"
            )
        # the list files are reported before the ontology files
        processed_files = [f"{data_model_files}/lists/{file.name}" for file in listfolder_contents] + processed_files

    # every visible entry must be accounted for: either an ontology folder or the "lists" folder
    recognized_count = len(onto_folders) + (0 if listfolder is None else 1)
    if recognized_count != len(folder):
        raise BaseError(
            f"The only allowed subfolders in '{data_model_files}' are 'lists' "
            "and folders that match the pattern 'onto_name (onto_label)'"
        )

    print("The following files will be processed:")
    print(*(f" - {file}" for file in processed_files), sep="\n")

    # create output
    # -------------
    if listfolder is not None:
        lists, success = excel2lists(excelfolder=f"{data_model_files}/lists")
    else:
        lists, success = None, True
    if not success:
        overall_success = False

    ontologies = []
    for onto_folder, name, label in onto_folders:
        resources, success1 = excel2resources(f"{data_model_files}/{onto_folder.name}/resources.xlsx")
        properties, success2 = excel2properties(f"{data_model_files}/{onto_folder.name}/properties.xlsx")
        if not success1 or not success2:
            overall_success = False
        ontologies.append(
            {
                "name": name,
                "label": label,
                "properties": properties,
                "resources": resources,
            }
        )

    # skeleton of the project file: the empty metadata fields are left for the user to fill in
    schema = "https://raw.githubusercontent.com/dasch-swiss/dsp-tools/main/src/dsp_tools/resources/schema/project.json"
    project = {
        "prefixes": {"": ""},
        "$schema": schema,
        "project": {
            "shortcode": "",
            "shortname": "",
            "longname": "",
            "descriptions": {"en": ""},
            "keywords": [""],
        },
    }
    # "lists" is only written if there are lists, and it comes before "ontologies" in the output
    if lists:
        project["project"]["lists"] = lists  # type: ignore[index]
    project["project"]["ontologies"] = ontologies  # type: ignore[index]

    with open(path_to_output_file, "w", encoding="utf-8") as f:
        json.dump(project, f, indent=4, ensure_ascii=False)

    print(f"JSON project file successfully saved at {path_to_output_file}")

    return overall_success