Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(excel2xml): allow for single tags in XML text (DEV-3427) #885

Merged
merged 4 commits into from
Mar 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/dsp_tools/commands/excel2xml/excel2xml_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -1416,9 +1416,9 @@ def _escape_reserved_chars(text: str) -> str:
"code",
]
allowed_tags_regex = "|".join(allowed_tags)
lookahead = rf"(?!/?({allowed_tags_regex})>)"
lookbehind = rf"(?<!</?({allowed_tags_regex}))"
lookahead = rf"(?!/?({allowed_tags_regex})/?>)"
illegal_lt = rf"<{lookahead}"
lookbehind = rf"(?<!</?({allowed_tags_regex})/?)"
illegal_gt = rf"{lookbehind}>"
illegal_amp = r"&(?![#a-zA-Z0-9]+;)"
text = regex.sub(illegal_lt, "&lt;", text)
Expand Down
37 changes: 37 additions & 0 deletions test/unittests/commands/excel2xml/test_excel2xml_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from lxml import etree

from dsp_tools import excel2xml
from dsp_tools.commands.excel2xml.excel2xml_lib import _escape_reserved_chars
from dsp_tools.models.exceptions import BaseError

# ruff: noqa: PT009 (pytest-unittest-assertion) (remove this line when pytest is used instead of unittest)
Expand Down Expand Up @@ -823,6 +824,42 @@ def test_create_json_list_mapping(self) -> None:
self.assertDictEqual(testlist_mapping_returned, testlist_mapping_expected)


class TestEscapedChars:
    """Expectations for ``_escape_reserved_chars`` on text mixing tags and reserved characters."""

    def test_single_br(self) -> None:
        """A self-closing ``<br/>`` is expected to come back unchanged."""
        original = "Text <br/> text after"
        assert _escape_reserved_chars(original) == original

    def test_single_br_with_other(self) -> None:
        """A stray ``>`` directly after ``<br/>`` is expected to be escaped, the tag itself kept."""
        escaped = _escape_reserved_chars("Text <br/>> text after")
        assert escaped == "Text <br/>&gt; text after"

    def test_wrong_single_br(self) -> None:
        """A malformed ``<br//>`` is expected to have both angle brackets escaped."""
        escaped = _escape_reserved_chars("Text <br//> text after")
        assert escaped == "Text &lt;br//&gt; text after"

    def test_emphasis(self) -> None:
        """``<em>`` tags are expected to be kept while a lone ``&`` becomes ``&amp;``."""
        escaped = _escape_reserved_chars("Text before [<em>emphasis</em>] Text after illegal amp: &")
        assert escaped == "Text before [<em>emphasis</em>] Text after illegal amp: &amp;"

    def test_link(self) -> None:
        """An ``<a>`` tag carrying attributes is expected to come back unchanged."""
        original = 'Before <a class="salsah-link" href="IRI:link:IRI">link</a> after'
        assert _escape_reserved_chars(original) == original

    def test_illegal_angular(self) -> None:
        """Angle brackets of an unknown tag are expected to be escaped on both the opening and closing tag."""
        escaped = _escape_reserved_chars("Before <TagNotKnown>in tags</TagNotKnown> After.")
        assert escaped == "Before &lt;TagNotKnown&gt;in tags&lt;/TagNotKnown&gt; After."


def _strip_namespace(element: etree._Element) -> str:
"""Removes the namespace from the XML element."""
xml = etree.tostring(element, encoding="unicode")
Expand Down