Skip to content

Commit

Permalink
fix(excel2xml): make_text_prop: allow <, >, & in rich texts (DEV-3131) (
Browse files Browse the repository at this point in the history
  • Loading branch information
jnussbaum committed Dec 29, 2023
1 parent ea56066 commit 228c79f
Show file tree
Hide file tree
Showing 4 changed files with 153 additions and 106 deletions.
50 changes: 25 additions & 25 deletions docs/file-formats/xml-data-file.md
Expand Up @@ -730,35 +730,35 @@ and the third column is how DSP-APP displays the second column.

Behavior of simple text (`SimpleText`/`Textarea` + `utf8`):

| input to `excel2xml` | XML file | DSP-APP | Remarks |
| -------------------- | -------- | ------- | -------------------------------------------- |
| `<` | `&lt;` | &lt; | |
| `>` | `&gt;` | &gt; | |
| | `<` || invalid XML |
| | `>` | &gt; | discouraged by XML standard, but possible |
| `&` | `&amp;` | &amp; | |
| | `&` || invalid XML |
| `&gt;` || | discouraged: The leading `&` will be escaped |
| `<tag>` | | | discouraged: Simple text is not rich text |
| | `<tag>` || forbidden: Simple text is not rich text |
| input to `excel2xml` | XML file | DSP-APP | Remarks |
| -------------------- | ------------- | ------- | -------------------------------------------- |
| `<` | `&lt;` | &lt; | |
| `>` | `&gt;` | &gt; | |
| | `<` || invalid XML |
| | `>` | &gt; | discouraged by XML standard, but possible |
| `&` | `&amp;` | &amp; | |
| | `&` || invalid XML |
| `&gt;` | | | discouraged: The leading `&` will be escaped |
| `<tag>` | `&lt;tag&gt;` | `<tag>` | discouraged: Simple text is not rich text |
| | `<tag>` || forbidden: Simple text is not rich text |


Behavior of text with markup (`Richtext` + `xml`):

| input to `excel2xml` | XML file | DSP-APP | Remarks |
| --------------------- | ------------------- | ------------- | ----------------------------------------- |
| `<` | | | invalid XML |
| `>` | `&gt;` | &gt; | discouraged by XML standard, but possible |
| | `<` || invalid XML |
| | `>` | &gt; | discouraged by XML standard, but possible |
| `&lt;` | `&lt;` | &lt; | |
| `&gt;` | `&gt;` | &gt; | |
| `&` | | | invalid XML |
| | `&` || invalid XML |
| `&amp;` | `&amp;` | & | |
| `<em>text</em>` | `<em>text</em>` | _text_ | |
| `unclosed <tag> text` | | | invalid XML |
| | `&lt;not a tag&gt;` | `<not a tag>` | |
| input to `excel2xml` | XML file | DSP-APP | Remarks |
| -------------------- | ---------------------- | ---------------- | ----------------------------------------- |
| `<` | `&lt;` | &lt; | |
| `>` | `&gt;` | &gt; | |
| | `<` | | invalid XML |
| | `>` | &gt; | discouraged by XML standard, but possible |
| `&lt;` | `&lt;` | &lt; | |
| `&gt;` | `&gt;` | &gt; | |
| `&` | `&amp;` | & | |
| | `&` | | invalid XML |
| `&amp;` | `&amp;` | & | |
| `<em>text</em>` | `<em>text</em>` | _text_ | |
| `unclosed <tag>` | `unclosed &lt;tag&gt;` | `unclosed <tag>` | |
| | `&lt;not a tag&gt;` | `<not a tag>` | |


#### Special Characters: Rules
Expand Down
56 changes: 53 additions & 3 deletions src/dsp_tools/commands/excel2xml/excel2xml_lib.py
Expand Up @@ -1315,13 +1315,14 @@ def make_text_prop(
# write the text into the tag, without validation
value_.text = str(val.value)
else:
escaped_text = _escape_reserved_chars(str(val.value))
# enforce that the text is well-formed XML: serialize tag ...
content = etree.tostring(value_, encoding="unicode")
serialized = etree.tostring(value_, encoding="unicode")
# ... insert text at the very end of the string, and add ending tag to the previously single <text/> tag ...
content = regex.sub(r"/>$", f">{val.value}</text>", content)
serialized = regex.sub(r"/>$", f">{escaped_text}</text>", serialized)
# ... try to parse it again
try:
value_ = etree.fromstring(content)
value_ = etree.fromstring(serialized)
except etree.XMLSyntaxError:
raise BaseError(
"The XML tags contained in a richtext property (encoding=xml) must be well-formed. "
Expand All @@ -1333,6 +1334,55 @@ def make_text_prop(
return prop_


def _escape_reserved_chars(text: str) -> str:
"""
From richtext strings (encoding="xml"), escape the reserved characters <, > and &,
but only if they are not part of a standard standoff tag or escape sequence.
The standard standoff tags allowed by DSP-API are documented here:
https://docs.dasch.swiss/2023.12.01/DSP-API/03-endpoints/api-v2/text/standard-standoff/
Args:
text: the richtext string to be escaped
Returns:
the escaped richtext string
"""
allowed_tags = [
"a( [^>]+)?", # <a> is the only tag that can have attributes
"p",
"em",
"strong",
"u",
"sub",
"sup",
"strike",
"h1",
"ol",
"ul",
"li",
"tbody",
"table",
"tr",
"td",
"br",
"hr",
"pre",
"cite",
"blockquote",
"code",
]
allowed_tags_regex = "|".join(allowed_tags)
lookahead = rf"(?!/?({allowed_tags_regex})>)"
lookbehind = rf"(?<!</?({allowed_tags_regex}))"
illegal_lt = rf"<{lookahead}"
illegal_gt = rf"{lookbehind}>"
illegal_amp = r"&(?![#a-zA-Z0-9]+;)"
text = regex.sub(illegal_lt, "&lt;", text)
text = regex.sub(illegal_gt, "&gt;", text)
text = regex.sub(illegal_amp, "&amp;", text)
return text


def make_time_prop(
name: str,
value: Union[PropertyElement, str, Iterable[Union[PropertyElement, str]]],
Expand Down
2 changes: 1 addition & 1 deletion src/dsp_tools/commands/xmlupload/read_validate_xml_file.py
Expand Up @@ -48,7 +48,7 @@ def validate_and_parse_xml_file(
def _check_if_link_targets_exist(root: etree._Element) -> None:
"""
Make sure that all targets of links (resptr and salsah-links)
are either IRIsl or IDs that exist in the present XML file.
are either IRIs or IDs that exist in the present XML file.
Args:
root: parsed XML file
Expand Down
151 changes: 74 additions & 77 deletions test/unittests/commands/excel2xml/test_excel2xml_lib.py
Expand Up @@ -490,7 +490,6 @@ def test_make_resptr_prop(self) -> None:

@pytest.mark.filterwarnings("ignore::UserWarning")
def test_make_text_prop(self) -> None:
# standard tests
prop = "text"
method = excel2xml.make_text_prop
different_values = ["text_1", " ", "!", "?", "-", "_", "None"]
Expand All @@ -501,82 +500,80 @@ def test_make_text_prop(self) -> None:
lambda: excel2xml.make_text_prop(":test", excel2xml.PropertyElement(value="a", encoding="unicode")),
)

# encoding="utf8"
testcases_utf8 = [
[
"text < text/>",
"text &lt; text/&gt;",
],
[
"text < text> & text",
"text &lt; text&gt; &amp; text",
],
[
"text <text text > text",
"text &lt;text text &gt; text",
],
[
'text < text text="text"> text',
'text &lt; text text="text"&gt; text',
],
[
'text <text text="text" > text',
'text &lt;text text="text" &gt; text',
],
]
for orig, exp in testcases_utf8:
received = etree.tostring(
excel2xml.make_text_prop(":test", excel2xml.PropertyElement(orig, encoding="utf8")), encoding="unicode"
)
received = regex.sub(r" xmlns(:.+?)?=\".+?\"", "", received)
expected = (
'<text-prop name=":test"><text permissions="prop-default" encoding="utf8">'
+ exp
+ "</text></text-prop>"
)
self.assertEqual(received, expected)

# test encoding="xml"
testcases_xml = [
[
"text <strong>and</strong> text",
"text <strong>and</strong> text",
],
[
'a <a class="salsah-link" href="IRI:test_thing_0:IRI">link</a> text',
'a <a class="salsah-link" href="IRI:test_thing_0:IRI">link</a> text',
],
[
"1 &lt; 2",
"1 &lt; 2",
],
[
"&lt;escaped tag&gt;",
"&lt;escaped tag&gt;",
],
]
all_inputs = " ".join([inp for inp, _ in testcases_xml])
all_outputs = " ".join([output for _, output in testcases_xml])
testcases_xml.append([all_inputs, all_outputs])

for orig, exp in testcases_xml:
received = etree.tostring(
excel2xml.make_text_prop(":test", excel2xml.PropertyElement(orig, encoding="xml")), encoding="unicode"
)
received = regex.sub(r" xmlns(:.+?)?=\".+?\"", "", received)
expected = (
'<text-prop name=":test"><text permissions="prop-default" encoding="xml">' + exp + "</text></text-prop>"
)
self.assertEqual(received, expected)

invalid_xml_texts = ["text < text", "text & text", "text <unclosed> tag", 'text <unclosed tag="tag"> text']
for inv in invalid_xml_texts:
with self.assertRaisesRegex(
BaseError,
r"The XML tags contained in a richtext property \(encoding=xml\) must be well-formed",
msg=f"Failed with '{inv}'",
):
excel2xml.make_text_prop(":test", excel2xml.PropertyElement(inv, encoding="xml"))
def test_make_text_prop_utf8_lt_gt_amp(self) -> None:
    """A plain string with <, & and > must be serialized with all three characters escaped."""
    text_prop = excel2xml.make_text_prop(":test", "1 < 2 & 4 > 3")
    xml_str = etree.tostring(text_prop, encoding="unicode")
    # drop the wrapping <text-prop> and <text> tags, so that only the text content is compared
    content = regex.sub(r"</?text(-prop)?( [^>]+)?>", "", xml_str)
    assert content == "1 &lt; 2 &amp; 4 &gt; 3"

def test_make_text_prop_utf8_pseudo_tag(self) -> None:
    """Something that looks like a tag inside a plain string must come out fully escaped."""
    text_prop = excel2xml.make_text_prop(":test", "txt <txt>txt</txt> txt")
    xml_str = etree.tostring(text_prop, encoding="unicode")
    # drop the wrapping <text-prop> and <text> tags, so that only the text content is compared
    content = regex.sub(r"</?text(-prop)?( [^>]+)?>", "", xml_str)
    assert content == "txt &lt;txt&gt;txt&lt;/txt&gt; txt"

def test_make_text_prop_utf8_escape(self) -> None:
    """An escape sequence inside a plain string is not interpreted: its leading & gets escaped again."""
    text_prop = excel2xml.make_text_prop(":test", "txt &amp; txt")
    xml_str = etree.tostring(text_prop, encoding="unicode")
    # drop the wrapping <text-prop> and <text> tags, so that only the text content is compared
    content = regex.sub(r"</?text(-prop)?( [^>]+)?>", "", xml_str)
    assert content == "txt &amp;amp; txt"

def test_make_text_prop_xml_standard_standoff_tag(self) -> None:
    """With encoding="xml", a <strong> tag must pass through unchanged."""
    richtext = "text <strong>and</strong> text"
    prop_elem = excel2xml.PropertyElement(richtext, encoding="xml")
    text_prop = excel2xml.make_text_prop(":test", prop_elem)
    xml_str = etree.tostring(text_prop, encoding="unicode")
    # drop the wrapping <text-prop> and <text> tags, so that only the text content is compared
    content = regex.sub(r"</?text(-prop)?( [^>]+)?>", "", xml_str)
    assert content == richtext

def test_make_text_prop_xml_unsupported_tag(self) -> None:
    """With encoding="xml", the angle brackets of an <unsupported> tag must be escaped."""
    prop_elem = excel2xml.PropertyElement("text <unsupported>tag</unsupported> text", encoding="xml")
    text_prop = excel2xml.make_text_prop(":test", prop_elem)
    xml_str = etree.tostring(text_prop, encoding="unicode")
    # drop the wrapping <text-prop> and <text> tags, so that only the text content is compared
    content = regex.sub(r"</?text(-prop)?( [^>]+)?>", "", xml_str)
    assert content == "text &lt;unsupported&gt;tag&lt;/unsupported&gt; text"

def test_make_text_prop_xml_salsah_link(self) -> None:
    """With encoding="xml", an <a> tag with attributes (a salsah-link) must pass through unchanged."""
    richtext = 'a <a class="salsah-link" href="IRI:test_thing_0:IRI">link</a> text'
    prop_elem = excel2xml.PropertyElement(richtext, encoding="xml")
    text_prop = excel2xml.make_text_prop(":test", prop_elem)
    xml_str = etree.tostring(text_prop, encoding="unicode")
    # drop the wrapping <text-prop> and <text> tags, so that only the text content is compared
    content = regex.sub(r"</?text(-prop)?( [^>]+)?>", "", xml_str)
    assert content == richtext

def test_make_text_prop_xml_escaped_tag(self) -> None:
    """
    With encoding="xml", a tag whose angle brackets are already escaped must stay escaped.

    The input uses &lt; and &gt; instead of < and >, so it differs from the input
    of the test for unescaped standard standoff tags.
    """
    original = "text &lt;strong&gt;and&lt;/strong&gt; text"
    expected = "text &lt;strong&gt;and&lt;/strong&gt; text"
    returned = etree.tostring(
        excel2xml.make_text_prop(":test", excel2xml.PropertyElement(original, encoding="xml")), encoding="unicode"
    )
    # drop the wrapping <text-prop> and <text> tags, so that only the text content is compared
    returned = regex.sub(r"</?text(-prop)?( [^>]+)?>", "", returned)
    assert returned == expected

def test_make_text_prop_xml_lt_gt_amp(self) -> None:
    """With encoding="xml", free-standing <, & and > must be escaped."""
    prop_elem = excel2xml.PropertyElement("1 < 2 & 4 > 3", encoding="xml")
    text_prop = excel2xml.make_text_prop(":test", prop_elem)
    xml_str = etree.tostring(text_prop, encoding="unicode")
    # drop the wrapping <text-prop> and <text> tags, so that only the text content is compared
    content = regex.sub(r"</?text(-prop)?( [^>]+)?>", "", xml_str)
    assert content == "1 &lt; 2 &amp; 4 &gt; 3"

def test_make_text_prop_xml_escape_sequence(self) -> None:
    """With encoding="xml", an existing escape sequence must not be escaped a second time."""
    richtext = "text &amp; text"
    prop_elem = excel2xml.PropertyElement(richtext, encoding="xml")
    text_prop = excel2xml.make_text_prop(":test", prop_elem)
    xml_str = etree.tostring(text_prop, encoding="unicode")
    # drop the wrapping <text-prop> and <text> tags, so that only the text content is compared
    content = regex.sub(r"</?text(-prop)?( [^>]+)?>", "", xml_str)
    assert content == richtext

def test_make_time_prop(self) -> None:
prop = "time"
Expand Down

0 comments on commit 228c79f

Please sign in to comment.