diff --git a/docs/file-formats/xml-data-file.md b/docs/file-formats/xml-data-file.md index 75f1cce9e..cfba02093 100644 --- a/docs/file-formats/xml-data-file.md +++ b/docs/file-formats/xml-data-file.md @@ -730,35 +730,35 @@ and the third column is how DSP-APP displays the second column. Behavior of simple text (`SimpleText`/`Textarea` + `utf8`): -| input to `excel2xml` | XML file | DSP-APP | Remarks | -| -------------------- | -------- | ------- | -------------------------------------------- | -| `<` | `<` | < | | -| `>` | `>` | > | | -| | `<` | ⛔ | invalid XML | -| | `>` | > | discouraged by XML standard, but possible | -| `&` | `&` | & | | -| | `&` | ⛔ | invalid XML | -| `>` | ⛔ | | discouraged: The leading `&` will be escaped | -| `` | ⛔ | | discouraged: Simple text is not rich text | -| | `` | ⛔ | forbidden: Simple text is not rich text | +| input to `excel2xml` | XML file | DSP-APP | Remarks | +| -------------------- | ------------- | ------- | -------------------------------------------- | +| `<` | `<` | < | | +| `>` | `>` | > | | +| | `<` | ⛔ | invalid XML | +| | `>` | > | discouraged by XML standard, but possible | +| `&` | `&` | & | | +| | `&` | ⛔ | invalid XML | +| `>` | ⛔ | | discouraged: The leading `&` will be escaped | +| `` | `<tag>` | `` | discouraged: Simple text is not rich text | +| | `` | ⛔ | forbidden: Simple text is not rich text | Behavior of text with markup (`Richtext` + `xml`): -| input to `excel2xml` | XML file | DSP-APP | Remarks | -| --------------------- | ------------------- | ------------- | ----------------------------------------- | -| `<` | ⛔ | | invalid XML | -| `>` | `>` | > | discouraged by XML standard, but possible | -| | `<` | ⛔ | invalid XML | -| | `>` | > | discouraged by XML standard, but possible | -| `<` | `<` | < | | -| `>` | `>` | > | | -| `&` | ⛔ | | invalid XML | -| | `&` | ⛔ | invalid XML | -| `&` | `&` | & | | -| `text` | `text` | _text_ | | -| `unclosed text` | ⛔ | | invalid XML | -| | `<not a tag>` | `` | | +| input to `excel2xml` | XML file | DSP-APP | Remarks | +| -------------------- | ---------------------- | ---------------- | ----------------------------------------- | +| `<` | `<` | `<` | | +| `>` | `>` | > | | +| | `<` | ⛔ | invalid XML | +| | `>` | > | discouraged by XML standard, but possible | +| `<` | `<` | < | | +| `>` | `>` | > | | +| `&` | `&` | & | | +| | `&` | ⛔ | invalid XML | +| `&` | `&` | & | | +| `text` | `text` | _text_ | | +| `unclosed ` | `unclosed <tag>` | `unclosed ` | | +| | `<not a tag>` | `` | | #### Special Characters: Rules diff --git a/src/dsp_tools/commands/excel2xml/excel2xml_lib.py b/src/dsp_tools/commands/excel2xml/excel2xml_lib.py index b4ed6257f..6f50a0d5a 100644 --- a/src/dsp_tools/commands/excel2xml/excel2xml_lib.py +++ b/src/dsp_tools/commands/excel2xml/excel2xml_lib.py @@ -1315,13 +1315,14 @@ def make_text_prop( # write the text into the tag, without validation value_.text = str(val.value) else: + escaped_text = _escape_reserved_chars(str(val.value)) # enforce that the text is well-formed XML: serialize tag ... - content = etree.tostring(value_, encoding="unicode") + serialized = etree.tostring(value_, encoding="unicode") # ... insert text at the very end of the string, and add ending tag to the previously single tag ... - content = regex.sub(r"/>$", f">{val.value}", content) + serialized = regex.sub(r"/>$", f">{escaped_text}", serialized) # ... try to parse it again try: - value_ = etree.fromstring(content) + value_ = etree.fromstring(serialized) except etree.XMLSyntaxError: raise BaseError( "The XML tags contained in a richtext property (encoding=xml) must be well-formed. " @@ -1333,6 +1334,55 @@ def make_text_prop( return prop_ +def _escape_reserved_chars(text: str) -> str: + """ + From richtext strings (encoding="xml"), escape the reserved characters <, > and &, + but only if they are not part of a standard standoff tag or escape sequence. + The standard standoff tags allowed by DSP-API are documented here: + https://docs.dasch.swiss/2023.12.01/DSP-API/03-endpoints/api-v2/text/standard-standoff/ + + Args: + text: the richtext string to be escaped + + Returns: + the escaped richtext string + """ + allowed_tags = [ + "a( [^>]+)?", # is the only tag that can have attributes + "p", + "em", + "strong", + "u", + "sub", + "sup", + "strike", + "h1", + "ol", + "ul", + "li", + "tbody", + "table", + "tr", + "td", + "br", + "hr", + "pre", + "cite", + "blockquote", + "code", + ] + allowed_tags_regex = "|".join(allowed_tags) + lookahead = rf"(?!/?({allowed_tags_regex})>)" + lookbehind = rf"(?" + illegal_amp = r"&(?![#a-zA-Z0-9]+;)" + text = regex.sub(illegal_lt, "<", text) + text = regex.sub(illegal_gt, ">", text) + text = regex.sub(illegal_amp, "&", text) + return text + + def make_time_prop( name: str, value: Union[PropertyElement, str, Iterable[Union[PropertyElement, str]]], diff --git a/src/dsp_tools/commands/xmlupload/read_validate_xml_file.py b/src/dsp_tools/commands/xmlupload/read_validate_xml_file.py index 2684977fa..d682dea32 100644 --- a/src/dsp_tools/commands/xmlupload/read_validate_xml_file.py +++ b/src/dsp_tools/commands/xmlupload/read_validate_xml_file.py @@ -48,7 +48,7 @@ def validate_and_parse_xml_file( def _check_if_link_targets_exist(root: etree._Element) -> None: """ Make sure that all targets of links (resptr and salsah-links) - are either IRIsl or IDs that exist in the present XML file. + are either IRIs or IDs that exist in the present XML file. Args: root: parsed XML file diff --git a/test/unittests/commands/excel2xml/test_excel2xml_lib.py b/test/unittests/commands/excel2xml/test_excel2xml_lib.py index 46cb2154a..5c56630eb 100644 --- a/test/unittests/commands/excel2xml/test_excel2xml_lib.py +++ b/test/unittests/commands/excel2xml/test_excel2xml_lib.py @@ -490,7 +490,6 @@ def test_make_resptr_prop(self) -> None: @pytest.mark.filterwarnings("ignore::UserWarning") def test_make_text_prop(self) -> None: - # standard tests prop = "text" method = excel2xml.make_text_prop different_values = ["text_1", " ", "!", "?", "-", "_", "None"] @@ -501,82 +500,80 @@ def test_make_text_prop(self) -> None: lambda: excel2xml.make_text_prop(":test", excel2xml.PropertyElement(value="a", encoding="unicode")), ) - # encoding="utf8" - testcases_utf8 = [ - [ - "text < text/>", - "text < text/>", - ], - [ - "text < text> & text", - "text < text> & text", - ], - [ - "text text", - "text <text text > text", - ], - [ - 'text < text text="text"> text', - 'text < text text="text"> text', - ], - [ - 'text text', - 'text <text text="text" > text', - ], - ] - for orig, exp in testcases_utf8: - received = etree.tostring( - excel2xml.make_text_prop(":test", excel2xml.PropertyElement(orig, encoding="utf8")), encoding="unicode" - ) - received = regex.sub(r" xmlns(:.+?)?=\".+?\"", "", received) - expected = ( - '' - + exp - + "" - ) - self.assertEqual(received, expected) - - # test encoding="xml" - testcases_xml = [ - [ - "text and text", - "text and text", - ], - [ - 'a link text', - 'a link text', - ], - [ - "1 < 2", - "1 < 2", - ], - [ - "<escaped tag>", - "<escaped tag>", - ], - ] - all_inputs = " ".join([inp for inp, _ in testcases_xml]) - all_outputs = " ".join([output for _, output in testcases_xml]) - testcases_xml.append([all_inputs, all_outputs]) - - for orig, exp in testcases_xml: - received = etree.tostring( - excel2xml.make_text_prop(":test", excel2xml.PropertyElement(orig, encoding="xml")), encoding="unicode" - ) - received = regex.sub(r" xmlns(:.+?)?=\".+?\"", "", received) - expected = ( - '' + exp + "" - ) - self.assertEqual(received, expected) - - invalid_xml_texts = ["text < text", "text & text", "text tag", 'text text'] - for inv in invalid_xml_texts: - with self.assertRaisesRegex( - BaseError, - r"The XML tags contained in a richtext property \(encoding=xml\) must be well-formed", - msg=f"Failed with '{inv}'", - ): - excel2xml.make_text_prop(":test", excel2xml.PropertyElement(inv, encoding="xml")) + def test_make_text_prop_utf8_lt_gt_amp(self) -> None: + original = "1 < 2 & 4 > 3" + expected = "1 < 2 & 4 > 3" + returned = etree.tostring(excel2xml.make_text_prop(":test", original), encoding="unicode") + returned = regex.sub(r"]+)?>", "", returned) + assert returned == expected + + def test_make_text_prop_utf8_pseudo_tag(self) -> None: + original = "txt txt txt" + expected = "txt <txt>txt</txt> txt" + returned = etree.tostring(excel2xml.make_text_prop(":test", original), encoding="unicode") + returned = regex.sub(r"]+)?>", "", returned) + assert returned == expected + + def test_make_text_prop_utf8_escape(self) -> None: + original = "txt & txt" + expected = "txt &amp; txt" + returned = etree.tostring(excel2xml.make_text_prop(":test", original), encoding="unicode") + returned = regex.sub(r"]+)?>", "", returned) + assert returned == expected + + def test_make_text_prop_xml_standard_standoff_tag(self) -> None: + original = "text and text" + expected = "text and text" + returned = etree.tostring( + excel2xml.make_text_prop(":test", excel2xml.PropertyElement(original, encoding="xml")), encoding="unicode" + ) + returned = regex.sub(r"]+)?>", "", returned) + assert returned == expected + + def test_make_text_prop_xml_unsupported_tag(self) -> None: + original = "text tag text" + expected = "text <unsupported>tag</unsupported> text" + returned = etree.tostring( + excel2xml.make_text_prop(":test", excel2xml.PropertyElement(original, encoding="xml")), encoding="unicode" + ) + returned = regex.sub(r"]+)?>", "", returned) + assert returned == expected + + def test_make_text_prop_xml_salsah_link(self) -> None: + original = 'a link text' + expected = 'a link text' + returned = etree.tostring( + excel2xml.make_text_prop(":test", excel2xml.PropertyElement(original, encoding="xml")), encoding="unicode" + ) + returned = regex.sub(r"]+)?>", "", returned) + assert returned == expected + + def test_make_text_prop_xml_escaped_tag(self) -> None: + original = "text and text" + expected = "text and text" + returned = etree.tostring( + excel2xml.make_text_prop(":test", excel2xml.PropertyElement(original, encoding="xml")), encoding="unicode" + ) + returned = regex.sub(r"]+)?>", "", returned) + assert returned == expected + + def test_make_text_prop_xml_lt_gt_amp(self) -> None: + original = "1 < 2 & 4 > 3" + expected = "1 < 2 & 4 > 3" + returned = etree.tostring( + excel2xml.make_text_prop(":test", excel2xml.PropertyElement(original, encoding="xml")), encoding="unicode" + ) + returned = regex.sub(r"]+)?>", "", returned) + assert returned == expected + + def test_make_text_prop_xml_escape_sequence(self) -> None: + original = "text & text" + expected = "text & text" + returned = etree.tostring( + excel2xml.make_text_prop(":test", excel2xml.PropertyElement(original, encoding="xml")), encoding="unicode" + ) + returned = regex.sub(r"]+)?>", "", returned) + assert returned == expected def test_make_time_prop(self) -> None: prop = "time"