Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(excel2json): modularise unittests (DEV-3040) #659

Merged
merged 8 commits into from
Nov 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 2 additions & 3 deletions src/dsp_tools/commands/excel2json/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,8 @@ def check_contains_required_columns_else_raise_error(df: pd.DataFrame, required_
UserError: if there are required columns missing
"""
if not required_columns.issubset(set(df.columns)):
raise UserError(
f"The following columns are missing in the excel:\n" f"{required_columns.difference(set(df.columns))}"
)
cols = ", ".join(required_columns.difference(set(df.columns)))
raise UserError(f"The following columns are missing in the excel:\n{cols}")


def check_column_for_duplicate_else_raise_error(df: pd.DataFrame, to_check_column: str) -> None:
Expand Down
37 changes: 0 additions & 37 deletions test/unittests/commands/excel2json/test_excel2json.py

This file was deleted.

247 changes: 138 additions & 109 deletions test/unittests/commands/excel2json/test_lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,146 +3,175 @@
# pylint: disable=missing-class-docstring,missing-function-docstring

import json
import os
import shutil
import re
import unittest
from typing import Any

import jsonpath_ng
import jsonpath_ng.ext
import pandas as pd
import pytest
import regex
from pytest_unordered import unordered

from dsp_tools.commands.excel2json import lists as e2l
from dsp_tools.models.exceptions import BaseError

# Expected "lists" section for the multilingual test data. It is parsed once, when this
# module is imported, and the resulting object is shared by the tests below.
with open("testdata/excel2json/lists-multilingual-output-expected.json", encoding="utf-8") as valf:
    lists_section_valid = json.load(valf)

class TestExcelToJSONList(unittest.TestCase):
lists_section_valid: list[dict[str, Any]]

@classmethod
def setUpClass(cls) -> None:
"""Is executed once before the methods of this class are run"""
os.makedirs("testdata/tmp", exist_ok=True)
def test_expand_lists_from_excel() -> None:
    """Expand the "lists" section of the systematic test project and compare it with the stored expectation.

    Afterwards, the already expanded section is passed through the expansion once more,
    and the result must still be equal to it (ignoring order).
    """
    expected_key = "expanded lists section of json-project/test-project-systematic.json"

    # "lists" section as it appears in the systematic test project
    with open("testdata/json-project/test-project-systematic.json", encoding="utf-8") as project_file:
        unexpanded = json.load(project_file)["project"]["lists"]
    expanded = e2l.expand_lists_from_excel(unexpanded)

    # expanded version stored in the testdata folder
    with open("testdata/excel2json/lists-section-expanded.json", encoding="utf-8") as expected_file:
        expected = json.load(expected_file)[expected_key]
    assert unordered(expanded) == expected

    # the expanded version must come back unchanged when it is expanded again
    expanded_again = e2l.expand_lists_from_excel(expected)
    assert unordered(expanded_again) == expected


def test_excel2lists_invalid_excel1() -> None:
    """The "lists-invalid-1" folder must be rejected with the duplicate-entry error message."""
    invalid_folder = "testdata/invalid-testdata/excel2json/lists-invalid-1"
    with pytest.raises(BaseError, match=r"Found duplicate in column 2, row 9"):
        e2l.excel2lists(excelfolder=invalid_folder)


def test_excel2lists_invalid_excel2() -> None:
    """The "lists-invalid-2" folder must be rejected with the missing-value error message."""
    invalid_folder = "testdata/invalid-testdata/excel2json/lists-invalid-2"
    missing_value_msg = r"The Excel file with the language code 'de' should have a value in row 10, column 2"
    with pytest.raises(BaseError, match=missing_value_msg):
        e2l.excel2lists(excelfolder=invalid_folder)


class TestValidateListSection:
    """Tests for e2l.validate_lists_section_with_schema().

    Tests that have to modify the "lists" section work on a private copy that is freshly
    loaded from disk, so that the module-level ``lists_section_valid`` stays valid and the
    tests do not depend on the order in which they are executed.
    """

    @staticmethod
    def _load_valid_lists_section() -> list[dict[str, Any]]:
        """Return a fresh copy of the valid "lists" section that a test may freely mutate."""
        with open("testdata/excel2json/lists-multilingual-output-expected.json", encoding="utf-8") as f:
            fresh_copy: list[dict[str, Any]] = json.load(f)
        return fresh_copy

    def test_correct(self) -> None:
        """validate the valid "lists" section: should not raise an error"""
        assert e2l.validate_lists_section_with_schema(lists_section=lists_section_valid) is True

    def test_without_comments(self) -> None:
        """remove mandatory "comments" section from root node: should raise an error"""
        lists_section_invalid = self._load_valid_lists_section()
        del lists_section_invalid[0]["comments"]
        expected_msg = re.escape(
            "'lists' section did not pass validation. The error message is: 'comments' is a required property"
        )
        with pytest.raises(BaseError, match=expected_msg):
            e2l.validate_lists_section_with_schema(lists_section=lists_section_invalid)

    def test_invalid_lang(self) -> None:
        """insert invalid language code in "comments" section: should raise an error"""
        # mutate a private copy: changing the shared module-level object would leave it
        # invalid for every test that runs afterwards
        lists_section_invalid = self._load_valid_lists_section()
        lists_section_invalid[0]["comments"]["eng"] = "wrong English label"

        expected_msg = re.escape(
            "'lists' section did not pass validation. The error message is: 'eng' does not match any of the regexes"
        )
        with pytest.raises(BaseError, match=expected_msg):
            e2l.validate_lists_section_with_schema(lists_section=lists_section_invalid)

    def test_wrong_signature_wrong_data(self) -> None:
        """wrong usage of the function (both arguments are given): should raise an error"""
        expected_msg = re.escape("works only if exactly one of the two arguments is given")
        with pytest.raises(BaseError, match=expected_msg):
            e2l.validate_lists_section_with_schema(
                path_to_json_project_file="testdata/json-project/test-project-systematic.json",
                lists_section=lists_section_valid,
            )

    def test_wrong_signature_no_data(self) -> None:
        """wrong usage of the function (no argument is given): should raise an error"""
        expected_msg = re.escape("works only if exactly one of the two arguments is given")
        with pytest.raises(BaseError, match=expected_msg):
            e2l.validate_lists_section_with_schema()

    def test_file_without_list(self) -> None:
        """pass a file that doesn't have a "lists" section"""
        tp_minimal = "testdata/json-project/test-project-minimal.json"
        expected_msg = r"there is no 'lists' section"
        with pytest.raises(BaseError, match=expected_msg):
            e2l.validate_lists_section_with_schema(path_to_json_project_file=tp_minimal)

def test_excel2lists_monolingual_multilingual(self) -> None:
for mode in ["monolingual", "multilingual"]:
# create output files
input_df = pd.read_excel(f"testdata/excel2json/lists-{mode}/de.xlsx", header=None, dtype="str")
input_df = input_df.map(
lambda x: x if pd.notna(x) and regex.search(r"\p{L}", str(x), flags=regex.UNICODE) else pd.NA
)
input_df = input_df.dropna(axis="index", how="all")
excelfolder = f"testdata/excel2json/lists-{mode}"
outfile = f"testdata/tmp/lists_output_{mode}.json"
output_from_method, _ = e2l.excel2lists(excelfolder=excelfolder, path_to_output_file=outfile)

# check that output from file and from method are equal
with open(outfile, encoding="utf-8") as f:
output_from_file: list[dict[str, Any]] = json.load(f)
self.assertListEqual(output_from_file, output_from_method)

# check that the output file has the same number of nodes than the Excel file has rows
output_nodes_matches = jsonpath_ng.parse("$..name").find(output_from_file)

class TestExcelToJSONList(unittest.TestCase):
    """Compare the output of e2l.excel2lists() with the Excel files it was created from."""

    def test_excel2lists_multilingual(self) -> None:
        self._check_output_matches_excel("testdata/excel2json/lists-multilingual")

    def test_excel2lists_monolingual(self) -> None:
        self._check_output_matches_excel("testdata/excel2json/lists-monolingual")

    def _check_output_matches_excel(self, excelfolder: str) -> None:
        """Run excel2lists() on a folder and compare its output with the folder's "de.xlsx".

        Args:
            excelfolder: path to the folder with the Excel files, which must contain a "de.xlsx"
        """
        # read the Excel file, blank out every cell that doesn't contain a letter, and drop the empty rows
        input_df = pd.read_excel(f"{excelfolder}/de.xlsx", header=None, dtype="str")
        input_df = input_df.map(
            lambda x: x if pd.notna(x) and regex.search(r"\p{L}", str(x), flags=regex.UNICODE) else pd.NA
        )
        input_df = input_df.dropna(axis="index", how="all")
        output_from_method, _ = e2l.excel2lists(excelfolder=excelfolder, path_to_output_file="")

        # check that the output has the same number of nodes as the Excel file has rows
        output_nodes_matches = jsonpath_ng.parse("$..name").find(output_from_method)
        self.assertEqual(
            len(input_df.index),
            len(output_nodes_matches),
            "The output JSON file doesn't have the same number of nodes than the Excel file has rows",
        )

        # check that the longest Excel row(s) were correctly translated to the deepest-nested node(s):
        # count() returns a Series that is indexed by the column labels, so index[-1] is the label of
        # the last column, and notna() on that column marks the rows that have a value in it
        last_column_label = input_df.count().index[-1]
        longest_rows_selector = input_df[last_column_label].notna()
        for row_label, row in input_df.loc[longest_rows_selector].iterrows():
            row_number = int(str(row_label))  # the label is a hashable, but an int is needed
            jsonpath_elems = [cell.strip() for cell in row]
            # one nested ".nodes[...]" filter per cell leads to the node that represents this row
            parser_string = "$" + "".join(f'.nodes[?(@.labels.en == "{elem}")]' for elem in jsonpath_elems)
            node_match = jsonpath_ng.ext.parse(parser_string).find(output_from_method)
            self.assertEqual(
                len(node_match),
                1,
                f'The node "{jsonpath_elems[-1]}" from Excel row {row_number+1} was not correctly translated to the '
                f"output JSON file.",
            )


if __name__ == "__main__":
Expand Down
18 changes: 18 additions & 0 deletions test/unittests/commands/excel2json/test_project.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import json
import unittest

from dsp_tools.commands.excel2json.project import _create_project_json, _validate_folder_structure_get_filenames

# pylint: disable=missing-function-docstring


class TestCreateProject(unittest.TestCase):
    """Test the creation of the project JSON from a folder of Excel files"""

    def test_excel_to_json_project(self) -> None:
        source_folder = "testdata/excel2json/excel2json_files"
        expected_output_path = "testdata/excel2json/excel2json-expected-output.json"

        # build the project from the Excel folder
        list_dir, onto_dirs = _validate_folder_structure_get_filenames(source_folder)
        created = _create_project_json(source_folder, list_dir, onto_dirs)
        project_json = created[1]

        # compare it with the expected output stored in the testdata folder
        with open(expected_output_path, encoding="utf-8") as expected_file:
            expected_json = json.load(expected_file)
        self.assertDictEqual(project_json, expected_json)