Skip to content

Commit

Permalink
refactor(excel2json): modularise unittests (DEV-3040) (#659)
Browse files Browse the repository at this point in the history
  • Loading branch information
Nora-Olivia-Ammann committed Nov 30, 2023
1 parent 876fb11 commit 107b1b0
Show file tree
Hide file tree
Showing 6 changed files with 283 additions and 274 deletions.
5 changes: 2 additions & 3 deletions src/dsp_tools/commands/excel2json/utils.py
Expand Up @@ -80,9 +80,8 @@ def check_contains_required_columns_else_raise_error(df: pd.DataFrame, required_
UserError: if there are required columns missing
"""
if not required_columns.issubset(set(df.columns)):
raise UserError(
f"The following columns are missing in the excel:\n" f"{required_columns.difference(set(df.columns))}"
)
cols = ", ".join(required_columns.difference(set(df.columns)))
raise UserError(f"The following columns are missing in the excel:\n{cols}")


def check_column_for_duplicate_else_raise_error(df: pd.DataFrame, to_check_column: str) -> None:
Expand Down
37 changes: 0 additions & 37 deletions test/unittests/commands/excel2json/test_excel2json.py

This file was deleted.

247 changes: 138 additions & 109 deletions test/unittests/commands/excel2json/test_lists.py
Expand Up @@ -3,146 +3,175 @@
# pylint: disable=missing-class-docstring,missing-function-docstring

import json
import os
import shutil
import re
import unittest
from typing import Any

import jsonpath_ng
import jsonpath_ng.ext
import pandas as pd
import pytest
import regex
from pytest_unordered import unordered

from dsp_tools.commands.excel2json import lists as e2l
from dsp_tools.models.exceptions import BaseError

with open("testdata/excel2json/lists-multilingual-output-expected.json", encoding="utf-8") as valf:
lists_section_valid = json.load(valf)

class TestExcelToJSONList(unittest.TestCase):
lists_section_valid: list[dict[str, Any]]

@classmethod
def setUpClass(cls) -> None:
"""Is executed once before the methods of this class are run"""
os.makedirs("testdata/tmp", exist_ok=True)
def test_expand_lists_from_excel() -> None:
# take the "lists" section of the systematic test project, expand it, and check if it is equal to the expanded
# version stored in the testdata folder
list_name = "expanded lists section of json-project/test-project-systematic.json"
with open("testdata/json-project/test-project-systematic.json", encoding="utf-8") as f:
lists_with_excel_reference = json.load(f)["project"]["lists"]
lists_with_excel_reference_output = e2l.expand_lists_from_excel(lists_with_excel_reference)
with open("testdata/excel2json/lists-section-expanded.json", encoding="utf-8") as f:
lists_with_excel_reference_output_expected = json.load(f)[list_name]

@classmethod
def tearDownClass(cls) -> None:
"""Is executed after the methods of this class have all run through"""
shutil.rmtree("testdata/tmp", ignore_errors=True)
assert unordered(lists_with_excel_reference_output) == lists_with_excel_reference_output_expected

def setUp(self) -> None:
"""Is executed before each test method"""
with open("testdata/excel2json/lists-multilingual-output-expected.json", encoding="utf-8") as f:
self.lists_section_valid = json.load(f)

def test_expand_lists_from_excel(self) -> None:
# take the "lists" section of the systematic test project, expand it, and check if it is equal to the expanded
# version stored in the testdata folder
list_name = "expanded lists section of json-project/test-project-systematic.json"
with open("testdata/json-project/test-project-systematic.json", encoding="utf-8") as f:
lists_with_excel_reference = json.load(f)["project"]["lists"]
lists_with_excel_reference_output = e2l.expand_lists_from_excel(lists_with_excel_reference)
with open("testdata/excel2json/lists-section-expanded.json", encoding="utf-8") as f:
lists_with_excel_reference_output_expected = json.load(f)[list_name]
self.assertListEqual(lists_with_excel_reference_output, lists_with_excel_reference_output_expected)

# take the expanded version, and make sure that it is returned unchanged
lists_without_excel_reference = lists_with_excel_reference_output_expected
lists_without_excel_reference_output = e2l.expand_lists_from_excel(lists_without_excel_reference)
self.assertListEqual(lists_without_excel_reference, lists_without_excel_reference_output)

def test_validate_lists_section(self) -> None:
# take the expanded version, and make sure that it is returned unchanged
lists_without_excel_reference = lists_with_excel_reference_output_expected
lists_without_excel_reference_output = e2l.expand_lists_from_excel(lists_without_excel_reference)
assert unordered(lists_without_excel_reference_output) == lists_without_excel_reference


def test_excel2lists_invalid_excel1() -> None:
    """Running excel2lists on the 'lists-invalid-1' fixture folder must raise the duplicate error."""
    invalid_folder = "testdata/invalid-testdata/excel2json/lists-invalid-1"
    duplicate_pattern = r"Found duplicate in column 2, row 9"
    with pytest.raises(BaseError, match=duplicate_pattern):
        e2l.excel2lists(excelfolder=invalid_folder)


def test_excel2lists_invalid_excel2() -> None:
    """Running excel2lists on the 'lists-invalid-2' fixture folder must raise the missing-value error."""
    invalid_folder = "testdata/invalid-testdata/excel2json/lists-invalid-2"
    missing_value_pattern = (
        r"The Excel file with the language code 'de' should have a value in row 10, column 2"
    )
    with pytest.raises(BaseError, match=missing_value_pattern):
        e2l.excel2lists(excelfolder=invalid_folder)


class TestValidateListSection:
def test_correct(self) -> None:
"""validate the valid "lists" section: should not raise an error"""
self.assertTrue(e2l.validate_lists_section_with_schema(lists_section=self.lists_section_valid))
assert e2l.validate_lists_section_with_schema(lists_section=lists_section_valid) is True

def test_validate_lists_section_without_comments(self) -> None:
def test_without_comments(self) -> None:
"""remove mandatory "comments" section from root node: should raise an error"""
del self.lists_section_valid[0]["comments"]
with self.assertRaisesRegex(
BaseError,
"'lists' section did not pass validation. The error message is: 'comments' is a required property",
):
e2l.validate_lists_section_with_schema(lists_section=self.lists_section_valid)

def test_validate_lists_section_with_invalid_lang(self) -> None:
with open("testdata/excel2json/lists-multilingual-output-expected.json", encoding="utf-8") as f:
lists_section_valid_invalid = json.load(f)
del lists_section_valid_invalid[0]["comments"]
expected_msg = re.escape(
"'lists' section did not pass validation. The error message is: 'comments' is a required property"
)
with pytest.raises(BaseError, match=expected_msg):
e2l.validate_lists_section_with_schema(lists_section=lists_section_valid_invalid)

def test_invalid_lang(self) -> None:
"""insert invalid language code in "comments" section: should raise an error"""
self.lists_section_valid[0]["comments"]["eng"] = "wrong English label"
with self.assertRaisesRegex(
BaseError,
"'lists' section did not pass validation. The error message is: 'eng' does not match any of the regexes",
):
e2l.validate_lists_section_with_schema(lists_section=self.lists_section_valid)

def test_validate_lists_section_wrong_signature(self) -> None:
lists_section_valid[0]["comments"]["eng"] = "wrong English label"

expected_msg = re.escape(
"'lists' section did not pass validation. The error message is: 'eng' does not match any of the regexes"
)
with pytest.raises(BaseError, match=expected_msg):
e2l.validate_lists_section_with_schema(lists_section=lists_section_valid)

def test_wrong_signature_wrong_data(self) -> None:
"""wrong usage of the function: should raise an error"""
with self.assertRaisesRegex(BaseError, "works only if exactly one of the two arguments is given"):
expected_msg = re.escape("works only if exactly one of the two arguments is given")
with pytest.raises(BaseError, match=expected_msg):
e2l.validate_lists_section_with_schema()

def test_wrong_signature_no_data(self) -> None:
"""wrong usage of the function: should raise an error"""
expected_msg = r"works only if exactly one of the two arguments is given"
with pytest.raises(BaseError, match=expected_msg):
e2l.validate_lists_section_with_schema(
path_to_json_project_file="testdata/json-project/test-project-systematic.json",
lists_section=self.lists_section_valid,
lists_section=lists_section_valid,
)
with self.assertRaisesRegex(BaseError, "works only if exactly one of the two arguments is given"):
e2l.validate_lists_section_with_schema()

def test_validate_lists_section_file_without_list(self) -> None:
def test_file_without_list(self) -> None:
"""pass a file that doesn't have a "lists" section"""
tp_minimal = "testdata/json-project/test-project-minimal.json"
with self.assertRaisesRegex(BaseError, "there is no 'lists' section"):
expected_msg = r"there is no 'lists' section"
with pytest.raises(BaseError, match=expected_msg):
e2l.validate_lists_section_with_schema(path_to_json_project_file=tp_minimal)

def test_excel2lists_monolingual_multilingual(self) -> None:
for mode in ["monolingual", "multilingual"]:
# create output files
input_df = pd.read_excel(f"testdata/excel2json/lists-{mode}/de.xlsx", header=None, dtype="str")
input_df = input_df.map(
lambda x: x if pd.notna(x) and regex.search(r"\p{L}", str(x), flags=regex.UNICODE) else pd.NA
)
input_df = input_df.dropna(axis="index", how="all")
excelfolder = f"testdata/excel2json/lists-{mode}"
outfile = f"testdata/tmp/lists_output_{mode}.json"
output_from_method, _ = e2l.excel2lists(excelfolder=excelfolder, path_to_output_file=outfile)

# check that output from file and from method are equal
with open(outfile, encoding="utf-8") as f:
output_from_file: list[dict[str, Any]] = json.load(f)
self.assertListEqual(output_from_file, output_from_method)

# check that the output file has the same number of nodes than the Excel file has rows
output_nodes_matches = jsonpath_ng.parse("$..name").find(output_from_file)

class TestExcelToJSONList(unittest.TestCase):
def test_excel2lists_multilingual(self) -> None:
# create output files
input_df = pd.read_excel("testdata/excel2json/lists-multilingual/de.xlsx", header=None, dtype="str")
input_df = input_df.map(
lambda x: x if pd.notna(x) and regex.search(r"\p{L}", str(x), flags=regex.UNICODE) else pd.NA
)
input_df = input_df.dropna(axis="index", how="all")
excelfolder = "testdata/excel2json/lists-multilingual"
output_from_method, _ = e2l.excel2lists(excelfolder=excelfolder, path_to_output_file="")

# check that the output file has the same number of nodes than the Excel file has rows
output_nodes_matches = jsonpath_ng.parse("$..name").find(output_from_method)
self.assertTrue(
len(input_df.index) == len(output_nodes_matches),
"The output JSON file doesn't have the same number of nodes than the Excel file has rows",
)

# check that the longest Excel row(s) were correctly translated to the deepest-nested node(s)
last_non_empty_column_index = input_df.count().index[-1]
longest_rows_selector = input_df[last_non_empty_column_index].notna()
# count() returns a Series that maps each column number to the number of entries it contains
# index[-1] returns the number of the last non-empty column (in this test case: 3)
# input_df[3].notna() returns a boolean Series with 'true' for every non-empty cell in column 3
for index, row in input_df.loc[longest_rows_selector].iterrows():
index = int(str(index)) # index is a label/index/hashable, but we need an int
jsonpath_elems = [cell.strip() for cell in row]
parser_string = "$"
for elem in jsonpath_elems:
parser_string = parser_string + f'.nodes[?(@.labels.en == "{elem}")]'
node_match = jsonpath_ng.ext.parse(parser_string).find(output_from_method)
self.assertTrue(
len(input_df.index) == len(output_nodes_matches),
"The output JSON file doesn't have the same number of nodes than the Excel file has rows",
len(node_match) == 1,
f'The node "{jsonpath_elems[-1]}" from Excel row {index+1} was not correctly translated to the '
f"output JSON file.",
)

# check that the longest Excel row(s) were correctly translated to the deepest-nested node(s)
last_non_empty_column_index = input_df.count().index[-1]
longest_rows_selector = input_df[last_non_empty_column_index].notna()
# count() returns a Series that maps each column number to the number of entries it contains
# index[-1] returns the number of the last non-empty column (in this test case: 3)
# input_df[3].notna() returns a boolean Series with 'true' for every non-empty cell in column 3
for index, row in input_df.loc[longest_rows_selector].iterrows():
index = int(str(index)) # index is a label/index/hashable, but we need an int
jsonpath_elems = [cell.strip() for cell in row]
parser_string = "$"
for elem in jsonpath_elems:
parser_string = parser_string + f'.nodes[?(@.labels.en == "{elem}")]'
node_match = jsonpath_ng.ext.parse(parser_string).find(output_from_file)
self.assertTrue(
len(node_match) == 1,
f'The node "{jsonpath_elems[-1]}" from Excel row {index+1} was not correctly translated to the '
f"output JSON file.",
)

def test_excel2lists_invalid1(self) -> None:
# make sure that the invalid lists raise an Error
with self.assertRaisesRegex(BaseError, r"Found duplicate in column 2, row 9"):
e2l.excel2lists(excelfolder="testdata/invalid-testdata/excel2json/lists-invalid-1")

def test_excel2lists_invalid2(self) -> None:
with self.assertRaisesRegex(
BaseError, r"The Excel file with the language code 'de' should have a value in row 10, column 2"
):
e2l.excel2lists(excelfolder="testdata/invalid-testdata/excel2json/lists-invalid-2")
def test_excel2lists_monolingual(self) -> None:
    """Compare the JSON produced from the monolingual Excel folder with the content of its de.xlsx file.

    Two properties are checked: the number of "name" entries in the output equals the number of
    non-empty Excel rows, and every row that has a value in the last column resolves to exactly
    one nested node when following the `labels.en` values level by level.
    """

    def _letters_or_na(cell: object) -> object:
        # a cell only counts as content if it holds at least one Unicode letter
        if pd.notna(cell) and regex.search(r"\p{L}", str(cell), flags=regex.UNICODE):
            return cell
        return pd.NA

    # read the Excel file and discard rows that are empty after the letter filter
    excel_df = pd.read_excel("testdata/excel2json/lists-monolingual/de.xlsx", header=None, dtype="str")
    excel_df = excel_df.map(_letters_or_na)
    excel_df = excel_df.dropna(axis="index", how="all")

    # build the lists; an empty string is passed as output path, only the return value is inspected
    produced_lists, _ = e2l.excel2lists(
        excelfolder="testdata/excel2json/lists-monolingual",
        path_to_output_file="",
    )

    # the output must contain as many "name" entries as the Excel file has rows
    name_matches = jsonpath_ng.parse("$..name").find(produced_lists)
    self.assertTrue(
        len(excel_df.index) == len(name_matches),
        "The output JSON file doesn't have the same number of nodes than the Excel file has rows",
    )

    # count() maps each column label to its number of entries, so index[-1] is the label of the
    # last column; rows with a value in that column are the longest rows of the sheet
    last_column = excel_df.count().index[-1]
    deepest_rows = excel_df.loc[excel_df[last_column].notna()]
    for row_label, row in deepest_rows.iterrows():
        row_number = int(str(row_label))  # the label is a generic hashable, but an int is needed
        labels = [cell.strip() for cell in row]
        # one filter step per cell: descend through "nodes" by matching the labels.en value
        jsonpath = "$" + "".join(f'.nodes[?(@.labels.en == "{label}")]' for label in labels)
        hits = jsonpath_ng.ext.parse(jsonpath).find(produced_lists)
        self.assertTrue(
            len(hits) == 1,
            f'The node "{labels[-1]}" from Excel row {row_number+1} was not correctly translated to the '
            f"output JSON file.",
        )


if __name__ == "__main__":
Expand Down
18 changes: 18 additions & 0 deletions test/unittests/commands/excel2json/test_project.py
@@ -0,0 +1,18 @@
import json
import unittest

from dsp_tools.commands.excel2json.project import _create_project_json, _validate_folder_structure_get_filenames

# pylint: disable=missing-function-docstring


class TestCreateProject(unittest.TestCase):
    """Test the create project command"""

    def test_excel_to_json_project(self) -> None:
        """Build the project JSON from the Excel fixture folder and compare it with the stored expected file."""
        source_dir = "testdata/excel2json/excel2json_files"
        expected_path = "testdata/excel2json/excel2json-expected-output.json"

        # resolve the list folder and ontology folders, then assemble the project from them
        list_dir, ontology_dirs = _validate_folder_structure_get_filenames(source_dir)
        _, built_project = _create_project_json(source_dir, list_dir, ontology_dirs)

        with open(expected_path, encoding="utf-8") as expected_file:
            expected_project = json.load(expected_file)
        self.assertDictEqual(built_project, expected_project)

0 comments on commit 107b1b0

Please sign in to comment.