-
Notifications
You must be signed in to change notification settings - Fork 4
/
test_lists.py
178 lines (148 loc) · 9.15 KB
/
test_lists.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
"""unit tests for Excel to JSON list"""
# pylint: disable=missing-class-docstring,missing-function-docstring
import copy
import json
import re
import unittest

import jsonpath_ng
import jsonpath_ng.ext
import pandas as pd
import pytest
import regex
from pytest_unordered import unordered

from dsp_tools.commands.excel2json import lists as e2l
from dsp_tools.models.exceptions import BaseError
# Reference "lists" section, read once when this module is imported and used by the tests as valid input.
with open("testdata/excel2json/lists-multilingual-output-expected.json", encoding="utf-8") as valf:
    lists_section_valid = json.loads(valf.read())
def test_expand_lists_from_excel() -> None:
    """Check ``e2l.expand_lists_from_excel()`` with and without Excel references in the input.

    First, the "lists" section of the systematic test project is expanded and compared with the
    expanded version stored in the testdata folder. Then the already expanded version is passed in,
    and the result must be equal to it again.
    """
    list_name = "expanded lists section of json-project/test-project-systematic.json"
    with open("testdata/json-project/test-project-systematic.json", encoding="utf-8") as f:
        lists_with_excel_reference = json.load(f)["project"]["lists"]
    expanded = e2l.expand_lists_from_excel(lists_with_excel_reference)
    with open("testdata/excel2json/lists-section-expanded.json", encoding="utf-8") as f:
        expanded_expected = json.load(f)[list_name]
    assert unordered(expanded) == expanded_expected

    # Hand over a deep copy: if the very same object were used as input and as expectation,
    # an in-place modification by the function under test could never be detected.
    re_expanded = e2l.expand_lists_from_excel(copy.deepcopy(expanded_expected))
    assert unordered(re_expanded) == expanded_expected
def test_excel2lists_invalid_excel1() -> None:
    """excel2lists() on the folder "lists-invalid-1" must raise a BaseError about a duplicate."""
    invalid_folder = "testdata/invalid-testdata/excel2json/lists-invalid-1"
    with pytest.raises(BaseError, match=r"Found duplicate in column 2, row 9"):
        e2l.excel2lists(excelfolder=invalid_folder)
def test_excel2lists_invalid_excel2() -> None:
    """excel2lists() on the folder "lists-invalid-2" must raise a BaseError about a missing value."""
    invalid_folder = "testdata/invalid-testdata/excel2json/lists-invalid-2"
    error_pattern = r"The Excel file with the language code 'de' should have a value in row 10, column 2"
    with pytest.raises(BaseError, match=error_pattern):
        e2l.excel2lists(excelfolder=invalid_folder)
class TestValidateListSection:
    """Tests for ``e2l.validate_lists_section_with_schema()``.

    Tests that need an invalid "lists" section derive it from a deep copy of the module-level
    ``lists_section_valid``, so that the shared data stays valid regardless of the test order.
    """

    def test_correct(self) -> None:
        """validate the valid "lists" section: should not raise an error"""
        assert e2l.validate_lists_section_with_schema(lists_section=lists_section_valid) is True

    def test_without_comments(self) -> None:
        """remove mandatory "comments" section from root node: should raise an error"""
        lists_section_invalid = copy.deepcopy(lists_section_valid)
        del lists_section_invalid[0]["comments"]
        expected_msg = re.escape(
            "'lists' section did not pass validation. The error message is: 'comments' is a required property"
        )
        with pytest.raises(BaseError, match=expected_msg):
            e2l.validate_lists_section_with_schema(lists_section=lists_section_invalid)

    def test_invalid_lang(self) -> None:
        """insert invalid language code in "comments" section: should raise an error"""
        # work on a copy: modifying the shared data would invalidate it for every test that runs afterwards
        lists_section_invalid = copy.deepcopy(lists_section_valid)
        lists_section_invalid[0]["comments"]["eng"] = "wrong English label"
        expected_msg = re.escape(
            "'lists' section did not pass validation. The error message is: 'eng' does not match any of the regexes"
        )
        with pytest.raises(BaseError, match=expected_msg):
            e2l.validate_lists_section_with_schema(lists_section=lists_section_invalid)

    def test_wrong_signature_wrong_data(self) -> None:
        """wrong usage of the function (no argument at all is given): should raise an error"""
        # NOTE: despite its name, this test passes no data; the name is kept for stable test IDs
        expected_msg = re.escape("works only if exactly one of the two arguments is given")
        with pytest.raises(BaseError, match=expected_msg):
            e2l.validate_lists_section_with_schema()

    def test_wrong_signature_no_data(self) -> None:
        """wrong usage of the function (both arguments are given at once): should raise an error"""
        # NOTE: despite its name, this test passes both a file path and data; the name is kept for stable test IDs
        expected_msg = re.escape("works only if exactly one of the two arguments is given")
        with pytest.raises(BaseError, match=expected_msg):
            e2l.validate_lists_section_with_schema(
                path_to_json_project_file="testdata/json-project/test-project-systematic.json",
                lists_section=lists_section_valid,
            )

    def test_file_without_list(self) -> None:
        """pass a file that doesn't have a "lists" section"""
        tp_minimal = "testdata/json-project/test-project-minimal.json"
        expected_msg = re.escape("there is no 'lists' section")
        with pytest.raises(BaseError, match=expected_msg):
            e2l.validate_lists_section_with_schema(path_to_json_project_file=tp_minimal)
class TestExcelToJSONList(unittest.TestCase):
    """Compare the output of ``e2l.excel2lists()`` with the Excel file it was created from."""

    def test_excel2lists_multilingual(self) -> None:
        self._check_excel2lists("testdata/excel2json/lists-multilingual")

    def test_excel2lists_monolingual(self) -> None:
        self._check_excel2lists("testdata/excel2json/lists-monolingual")

    def _check_excel2lists(self, excelfolder: str) -> None:
        """Run excel2lists() on a folder and compare the result with the folder's "de.xlsx".

        Two things are checked: the output contains as many "name" entries as the Excel file has
        non-empty rows, and every row that reaches into the last column is found exactly once as a
        nested node in the output.

        Args:
            excelfolder: path to the folder with the Excel files, passed on to excel2lists()
        """
        input_df = pd.read_excel(f"{excelfolder}/de.xlsx", header=None, dtype="str")
        # blank out every cell that doesn't contain at least one letter, then drop the rows that are entirely empty
        input_df = input_df.map(
            lambda x: x if pd.notna(x) and regex.search(r"\p{L}", str(x), flags=regex.UNICODE) else pd.NA
        )
        input_df = input_df.dropna(axis="index", how="all")

        output_from_method, _ = e2l.excel2lists(excelfolder=excelfolder, path_to_output_file="")

        # check that the output has the same number of nodes than the Excel file has rows
        output_nodes_matches = jsonpath_ng.parse("$..name").find(output_from_method)
        self.assertEqual(
            len(input_df.index),
            len(output_nodes_matches),
            "The output JSON file doesn't have the same number of nodes than the Excel file has rows",
        )

        # check that the longest Excel row(s) were correctly translated to the deepest-nested node(s):
        # count() returns a Series that is indexed by the column labels, so index[-1] is the label of the
        # last column (assumed to be non-empty), and notna() on that column yields a boolean Series with
        # True for every row that has a value there
        last_column_label = input_df.count().index[-1]
        longest_rows_selector = input_df[last_column_label].notna()
        for index, row in input_df.loc[longest_rows_selector].iterrows():
            # index is a label/index/hashable, but an int is needed to compute the 1-based row number
            excel_row_number = int(str(index)) + 1
            jsonpath_elems = [cell.strip() for cell in row]
            # NOTE(review): the cells come from "de.xlsx" but are matched against "labels.en" - confirm
            # that this is intended
            parser_string = "$" + "".join(f'.nodes[?(@.labels.en == "{elem}")]' for elem in jsonpath_elems)
            node_match = jsonpath_ng.ext.parse(parser_string).find(output_from_method)
            self.assertEqual(
                len(node_match),
                1,
                f'The node "{jsonpath_elems[-1]}" from Excel row {excel_row_number} was not correctly translated '
                f"to the output JSON file.",
            )
if __name__ == "__main__":
    # hand pytest's exit code on to the shell, so that a failing run doesn't exit with status 0
    raise SystemExit(pytest.main([__file__]))