Skip to content

Commit

Permalink
fix(xmlupload): sanitize textvalues: remove whitespaces and newlines (D…
Browse files Browse the repository at this point in the history
  • Loading branch information
jnussbaum committed Aug 24, 2023
1 parent ecd09e9 commit d6c8110
Show file tree
Hide file tree
Showing 5 changed files with 187 additions and 42 deletions.
22 changes: 17 additions & 5 deletions docs/file-formats/xml-data-file.md
Expand Up @@ -688,11 +688,23 @@ contains a link to the resource `http://rdfh.ch/4123/nyOODvYySV2nJ5RWRdmOdQ`, w
`gui_element` ([defined in the ontology](./json-project/ontologies.md#textvalue))
as follows:

| `gui_element`<br/>(JSON ontology) | `encoding`<br/>(XML data) | How DSP-APP renders the whitespaces |
| --------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
| `SimpleText` | `utf8` | Pretty-print whitespaces and newlines from the XML are taken into the text field as they are. |
| `Textarea` | `utf8` | Pretty-print whitespaces and newlines from the XML are taken into the text field as they are. |
| `Richtext` | `xml` | Pretty-print whitespaces and newlines from the XML are removed. If you want a newline in the text field, use `<br />` instead. |
| `gui_element`<br/>(JSON ontology) | `encoding`<br/>(XML data) |
| --------------------------------- | ------------------------- |
| `SimpleText` | `utf8` |
| `Textarea` | `utf8` |
| `Richtext` | `xml` |

Handling of pretty-print whitespaces and newlines:

- encoding `utf8`:
    - leading and trailing whitespace is removed (from every line, and from the text as a whole)
    - multiple spaces are replaced by a single space
    - (multiple) tabstops are replaced by a single space
    - newlines inside the text are kept
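- encoding `xml`:
    - leading and trailing whitespace is removed
    - line breaks and indentation don't have a meaning in XML, so (multiple) line breaks are replaced by a single space, and multiple spaces or tabstops are replaced by a single space.
    - Newlines can be created with `<br/>`. Spaces directly after a `<br/>` are removed.
    - Multiple spaces and tabstops are kept only inside `<code>` and `<pre>` tags.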


#### Special characters: Overview
Expand Down
75 changes: 70 additions & 5 deletions src/dsp_tools/models/xmlvalue.py
@@ -1,6 +1,7 @@
from typing import Optional, Union, cast

from lxml import etree
import regex

from dsp_tools.models.value import KnoraStandoffXml

Expand All @@ -23,14 +24,78 @@ def __init__(
self.comment = node.get("comment")
self.permissions = node.get("permissions")
if val_type == "text" and node.get("encoding") == "xml":
node.attrib.clear()
xmlstr = etree.tostring(node, encoding="unicode", method="xml")
xmlstr = xmlstr.replace('<text xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">', "")
xmlstr = xmlstr.replace("</text>", "")
self.value = KnoraStandoffXml(xmlstr)
xmlstr_orig = etree.tostring(node, encoding="unicode", method="xml")
xmlstr_cleaned = self._cleanup_formatted_text(xmlstr_orig)
self.value = KnoraStandoffXml(xmlstr_cleaned)
self.resrefs = list({x.split(":")[1] for x in self.value.get_all_iris() or []})
elif val_type == "text" and node.get("encoding") == "utf8":
str_orig = "".join(node.itertext())
str_cleaned = self._cleanup_unformatted_text(str_orig)
self.value = str_cleaned
elif val_type == "list":
listname = cast(str, listname)
self.value = listname + ":" + "".join(node.itertext())
else:
self.value = "".join(node.itertext())

def _cleanup_formatted_text(self, xmlstr_orig: str) -> str:
"""
In a xml-encoded text value from the XML file,
there may be non-text characters that must be removed.
This method:
- removes the <text> tags
- replaces (multiple) line breaks by a space
- replaces multiple spaces or tabstops by a single space (except within <code> or <pre> tags)
Args:
xmlstr_orig: original string from the XML file
Returns:
purged string, suitable to be sent to DSP-API
"""
# remove the <text> tags
xmlstr = regex.sub("<text.*?>", "", xmlstr_orig)
xmlstr = regex.sub("</text>", "", xmlstr)

# replace (multiple) line breaks by a space
xmlstr = regex.sub("\n+", " ", xmlstr)

# replace multiple spaces or tabstops by a single space (except within <code> or <pre> tags)
# the regex selects all spaces/tabstops not followed by </xyz> without <xyz in between.
# credits: https://stackoverflow.com/a/46937770/14414188
xmlstr = regex.sub("( {2,}|\t+)(?!(.(?!<(code|pre)))*</(code|pre)>)", " ", xmlstr)

# remove spaces after <br/> tags (except within <code> tags)
xmlstr = regex.sub("((?<=<br/?>) )(?!(.(?!<code))*</code>)", "", xmlstr)

# remove leading and trailing spaces
xmlstr = xmlstr.strip()

return xmlstr

def _cleanup_unformatted_text(self, string_orig: str) -> str:
"""
In a utf8-encoded text value from the XML file,
there may be non-text characters that must be removed.
This method:
- removes the <text> tags
- replaces multiple spaces or tabstops by a single space
Args:
string_orig: original string from the XML file
Returns:
purged string, suitable to be sent to DSP-API
"""
# remove the <text> tags
string = regex.sub("<text.*?>", "", string_orig)
string = regex.sub("</text>", "", string)

# replace multiple spaces or tabstops by a single space
string = regex.sub(r" {2,}|\t+", " ", string)

# remove leading and trailing spaces (of every line, but also of the entire string)
string = "\n".join([s.strip() for s in string.split("\n")])
string = string.strip()

return string
54 changes: 26 additions & 28 deletions test/unittests/test_xmlupload.py
Expand Up @@ -148,38 +148,36 @@ def test_remove_circular_references(self) -> None:
stashed_xml_texts_expected = {
"test_thing_1": {
"testonto:hasRichtext": [
"\n This text contains links to all resources:\n"
' <a class="salsah-link" href="IRI:test_thing_0:IRI">test_thing_0</a>\n'
' <a class="salsah-link" href="IRI:test_thing_1:IRI">test_thing_1</a>\n'
' <a class="salsah-link" href="IRI:image_thing_0:IRI">image_thing_0</a>\n'
' <a class="salsah-link" href="IRI:compound_thing_0:IRI">compound_thing_0</a>\n'
' <a class="salsah-link" href="IRI:partof_thing_1:IRI">partof_thing_1</a>\n'
' <a class="salsah-link" href="IRI:partof_thing_2:IRI">partof_thing_2</a>\n'
' <a class="salsah-link" href="IRI:partof_thing_3:IRI">partof_thing_3</a>\n'
' <a class="salsah-link" href="IRI:document_thing_1:IRI">document_thing_1</a>\n'
' <a class="salsah-link" href="IRI:text_thing_1:IRI">text_thing_1</a>\n'
' <a class="salsah-link" href="IRI:zip_thing_1:IRI">zip_thing_1</a>\n'
' <a class="salsah-link" href="IRI:audio_thing_1:IRI">audio_thing_1</a>\n'
' <a class="salsah-link" href="IRI:test_thing_2:IRI">test_thing_2</a>\n'
" \n "
"This text contains links to all resources: "
'<a class="salsah-link" href="IRI:test_thing_0:IRI">test_thing_0</a> '
'<a class="salsah-link" href="IRI:test_thing_1:IRI">test_thing_1</a> '
'<a class="salsah-link" href="IRI:image_thing_0:IRI">image_thing_0</a> '
'<a class="salsah-link" href="IRI:compound_thing_0:IRI">compound_thing_0</a> '
'<a class="salsah-link" href="IRI:partof_thing_1:IRI">partof_thing_1</a> '
'<a class="salsah-link" href="IRI:partof_thing_2:IRI">partof_thing_2</a> '
'<a class="salsah-link" href="IRI:partof_thing_3:IRI">partof_thing_3</a> '
'<a class="salsah-link" href="IRI:document_thing_1:IRI">document_thing_1</a> '
'<a class="salsah-link" href="IRI:text_thing_1:IRI">text_thing_1</a> '
'<a class="salsah-link" href="IRI:zip_thing_1:IRI">zip_thing_1</a> '
'<a class="salsah-link" href="IRI:audio_thing_1:IRI">audio_thing_1</a> '
'<a class="salsah-link" href="IRI:test_thing_2:IRI">test_thing_2</a>'
]
},
"test_thing_2": {
"testonto:hasRichtext": [
"\n This text contains links to all resources:\n"
' <a class="salsah-link" href="IRI:test_thing_0:IRI">test_thing_0</a>\n'
' <a class="salsah-link" href="IRI:test_thing_1:IRI">test_thing_1</a>\n'
' <a class="salsah-link" href="IRI:image_thing_0:IRI">image_thing_0</a>\n'
' <a class="salsah-link" href="IRI:compound_thing_0:IRI">compound_thing_0</a>\n'
' <a class="salsah-link" href="IRI:partof_thing_1:IRI">partof_thing_1</a>\n'
' <a class="salsah-link" href="IRI:partof_thing_2:IRI">partof_thing_2</a>\n'
' <a class="salsah-link" href="IRI:partof_thing_3:IRI">partof_thing_3</a>\n'
' <a class="salsah-link" href="IRI:document_thing_1:IRI">document_thing_1</a>\n'
' <a class="salsah-link" href="IRI:text_thing_1:IRI">text_thing_1</a>\n'
' <a class="salsah-link" href="IRI:zip_thing_1:IRI">zip_thing_1</a>\n'
' <a class="salsah-link" href="IRI:audio_thing_1:IRI">audio_thing_1</a>\n'
' <a class="salsah-link" href="IRI:test_thing_2:IRI">test_thing_2</a>\n'
" \n "
"This text contains links to all resources: "
'<a class="salsah-link" href="IRI:test_thing_0:IRI">test_thing_0</a> '
'<a class="salsah-link" href="IRI:test_thing_1:IRI">test_thing_1</a> '
'<a class="salsah-link" href="IRI:image_thing_0:IRI">image_thing_0</a> '
'<a class="salsah-link" href="IRI:compound_thing_0:IRI">compound_thing_0</a> '
'<a class="salsah-link" href="IRI:partof_thing_1:IRI">partof_thing_1</a> '
'<a class="salsah-link" href="IRI:partof_thing_2:IRI">partof_thing_2</a> '
'<a class="salsah-link" href="IRI:partof_thing_3:IRI">partof_thing_3</a> '
'<a class="salsah-link" href="IRI:document_thing_1:IRI">document_thing_1</a> '
'<a class="salsah-link" href="IRI:text_thing_1:IRI">text_thing_1</a> '
'<a class="salsah-link" href="IRI:zip_thing_1:IRI">zip_thing_1</a> '
'<a class="salsah-link" href="IRI:audio_thing_1:IRI">audio_thing_1</a> '
'<a class="salsah-link" href="IRI:test_thing_2:IRI">test_thing_2</a>'
]
},
}
Expand Down
74 changes: 74 additions & 0 deletions test/unittests/test_xmlvalue.py
@@ -0,0 +1,74 @@
import unittest

import pytest
from lxml import etree

from dsp_tools.models.xmlvalue import XMLValue


class TestXmlValue(unittest.TestCase):
"""Test the XMLValue class"""

def test_cleanup_unformatted_text(self) -> None:
"""Test the removal of whitespaces and line breaks in utf8-encoded text values"""
# Input: a <text> element with encoding="utf8", as it would appear in a pretty-printed XML data file.
# NOTE(review): the whitespace inside this literal is what the test is about - do not reformat it.
unformatted_text_orig = """<text permissions="prop-default" encoding="utf8">
Poem
with 1 line break:
and 2 line breaks:
and 3 line breaks:
and multiple spaces and tabstops ...
and spaces on empty lines.
</text>"""
# Expected: line breaks are kept, lines carry no leading/trailing whitespace,
# and the text as a whole is stripped.
unformatted_text_expected = (
"Poem"
"\n"
"with 1 line break:\n"
"and 2 line breaks:\n\n"
"and 3 line breaks:\n\n\n"
"and multiple spaces and tabstops ...\n\n"
"and spaces on empty lines."
)
# The cleanup is triggered by constructing the XMLValue from the parsed node.
unformatted_node = etree.fromstring(unformatted_text_orig)
unformatted_xml_value = XMLValue(node=unformatted_node, val_type="text")
self.assertEqual(unformatted_xml_value.value, unformatted_text_expected)

def test_cleanup_formatted_text(self) -> None:
"""Test the removal of whitespaces and line breaks in xml-formatted text values"""
# Input: a <text> element with encoding="xml" that contains standoff markup.
# NOTE(review): the whitespace inside this literal is what the test is about - do not reformat it.
formatted_text_orig = """<text permissions="prop-default" encoding="xml">
This is <em>italicized and <strong>bold</strong></em> text!
It contains <code>monospace text that preserves whitespaces and &amp; HTML-escapes</code>.
The same <pre>is true for preformatted text</pre>.
It contains multiple whitespaces and tabstops.<br/><br/>
Line breaks must be done with <code><br/></code> tags.<br/>
Otherwise they will be removed.<br/><br/>
It contains links to a resource:
<a class="salsah-link" href="IRI:test_thing_0:IRI">test_thing_0</a>
</text>"""
# Expected: line breaks of the XML file become single spaces, and no space follows a <br/> tag.
formatted_text_expected = (
"This is <em>italicized and <strong>bold</strong></em> text! "
"It contains <code>monospace text that preserves whitespaces and &amp; HTML-escapes</code>. "
"The same <pre>is true for preformatted text</pre>. "
"It contains multiple whitespaces and tabstops.<br/><br/>"
"Line breaks must be done with <code><br/></code> tags.<br/>"
"Otherwise they will be removed.<br/><br/>"
"It contains links to a resource: "
'<a class="salsah-link" href="IRI:test_thing_0:IRI">test_thing_0</a>'
)
# The cleanup is triggered by constructing the XMLValue from the parsed node.
formatted_node = etree.fromstring(formatted_text_orig)
formatted_xml_value = XMLValue(node=formatted_node, val_type="text")
# The value is compared via str(), because for xml-encoded text it is wrapped in an object, not a plain str.
self.assertEqual(str(formatted_xml_value.value), formatted_text_expected)


# Allow running this test module directly: hand this file over to pytest.
if __name__ == "__main__":
pytest.main([__file__])
4 changes: 0 additions & 4 deletions testdata/xml-data/test-data-systematic.xml
Expand Up @@ -175,7 +175,6 @@
<text-prop name=":hasSimpleText">
<text permissions="prop-default" encoding="utf8">Dies ist ein einfacher Text ohne Markup</text>
<text permissions="prop-restricted" encoding="xml">Ein XML-formatierter Text <strong>mit Markup</strong></text>
<text encoding="utf8"> </text>
<text encoding="utf8">_</text>
<text encoding="utf8">!</text>
<text encoding="utf8">?</text>
Expand All @@ -202,9 +201,6 @@
<text permissions="prop-default" encoding="xml">
Another text without salsah-links
</text>
<text permissions="prop-default" encoding="xml">
Another text without salsah-links
</text>
</text-prop>
<boolean-prop name=":hasBoolean">
<boolean>false</boolean>
Expand Down

0 comments on commit d6c8110

Please sign in to comment.