Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

XPath3.1: mimic handling of multiple root element nodes #2351

Draft
wants to merge 29 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
8e1f170
html_tools/fix: Add forest_transplanting to handle invalid DOM
Constantin1489 May 2, 2024
1f776ff
requirements/fix: Upgrade and pin elementpath to support fragment option
Constantin1489 May 2, 2024
bf5c2c7
html_tools/fix:
Constantin1489 May 2, 2024
9f0cb35
html_tools/fix: Another option
Constantin1489 May 2, 2024
879d0b2
html_tools/fix:
Constantin1489 May 7, 2024
ed2aaf4
tests/test_xpath_selector_unit/test: Add test.
Constantin1489 May 7, 2024
dd8b4fe
html_tools/docs: Remove comments
Constantin1489 May 7, 2024
fbd5512
tests/test_xpath_selector_unit/fix: Typo
Constantin1489 May 7, 2024
20195e7
tests/test_xpath_selector_unit/test: Fix test and add more small test…
Constantin1489 May 7, 2024
220f484
tests/test_xpath_selector_unit/test: Check error occurs.
Constantin1489 May 7, 2024
e84b9f1
tests/test_xpath_selector_unit/test: Fix
Constantin1489 May 7, 2024
60777e4
tests/test_xpath_selector_unit/test: Add more unintuitive tests
Constantin1489 May 7, 2024
e325e02
tests/test_xpath_selector_unit/test: Trigger test again
Constantin1489 May 7, 2024
6a2e1cf
tests/test_xpath_selector_unit/fix: Trigger test again. why it doesn'…
Constantin1489 May 7, 2024
55b2c6c
tests/test_xpath_selector_unit/test: Oops fix test name
Constantin1489 May 7, 2024
93a9585
tests/test_xpath_selector_unit/test: Failed successfully
Constantin1489 May 7, 2024
e6b13c9
tests/test_xpath_selector_unit/test: Add count test
Constantin1489 May 7, 2024
2e3e781
tests/test_xpath_selector_unit/chore: Trigger CICD
Constantin1489 May 7, 2024
c295c5e
tests/test_xpath_selector_unit/test: Add same behavior for xpath 1
Constantin1489 May 7, 2024
5acd31f
tests/test_xpath_selector_unit/test: Fix misc
Constantin1489 May 7, 2024
de7b66b
tests/test_xpath_selector_unit/test: Fix answer
Constantin1489 May 7, 2024
66a7dae
html_tools/docs: Fix old comment
Constantin1489 May 7, 2024
4d266ca
tests/test_xpath_selector_unit/feat: Do forest_transplanting by default
Constantin1489 May 9, 2024
ebf7fd4
tests/test_xpath_selector_unit/test: Fix tests
Constantin1489 May 9, 2024
26e4a58
tests/test_xpath_selector_unit/test: Add context node related tests
Constantin1489 May 9, 2024
dbf4e87
requirements/chore: Change minimum version of elementpath
Constantin1489 May 16, 2024
7cd764f
html_tools/fix: Improve speed for function calls
Constantin1489 May 17, 2024
48a5aa2
Merge branch 'dgtlmoon:master' into transplanting
Constantin1489 May 22, 2024
3619877
Revert "html_tools/docs: Fix old comment"
Constantin1489 May 26, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
30 changes: 26 additions & 4 deletions changedetectionio/html_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
from xml.sax.saxutils import escape as xml_escape
import json
import re
from itertools import chain
from elementpath import select as elementpath_select
# xpath 2.0-3.1
from elementpath.xpath3 import XPath3Parser


# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
Expand Down Expand Up @@ -110,22 +114,40 @@ def elementpath_tostring(obj):

return str(obj)

def forest_transplanting(root):
"""
libxml2 violates DOM rules. it means there can be multiple root element
nodes. So I choose just transplating them to a new root by default.
See also, https://gitlab.gnome.org/GNOME/libxml2/-/issues/716
This will emulate xpath1 of html of libxml2 like '/html[2]/*'.
To make this function work, 'fragment=True' in elementpath.select is required.
"""
from lxml import etree

root_siblings_preceding = [ s for s in root.itersiblings(preceding=True)]
root_siblings = [s for s in root.itersiblings()]

new_root = etree.Element("new_root")

root_siblings_preceding.reverse()
for node in chain(root_siblings_preceding, [root], root_siblings):
new_root.append(node)
return new_root, True

# Return str Utf-8 of matched rules
def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False):
from lxml import etree, html
import elementpath
# xpath 2.0-3.1
from elementpath.xpath3 import XPath3Parser

parser = etree.HTMLParser()
if is_rss:
# So that we can keep CDATA for cdata_in_document_to_text() to process
parser = etree.XMLParser(strip_cdata=False)

tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
tree, is_fragment = forest_transplanting(tree)
html_block = ""

r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser)
r = elementpath_select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser, fragment=is_fragment)
#@note: //title/text() wont work where <title>CDATA..

if type(r) != list:
Expand Down
145 changes: 120 additions & 25 deletions changedetectionio/tests/test_xpath_selector_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
("some $i in //hotel/branch/staff satisfies $i/age < 20", "false"),
("every $i in /hotel/branch/staff satisfies $i/age > 20", "true"),
("every $i in //hotel/branch/staff satisfies $i/age > 20 ", "true"),
("let $x := branch[@location = 'California'], $y := branch[@location = 'Las Vegas'] return (avg($x/staff/age), avg($y/staff/age))", "27.5"),
("let $x := hotel/branch[@location = 'California'], $y := hotel/branch[@location = 'Las Vegas'] return (avg($x/staff/age), avg($y/staff/age))", "27.5"),
("let $x := //branch[@location = 'California'], $y := //branch[@location = 'Las Vegas'] return (avg($x/staff/age), avg($y/staff/age))", "27.5"),
("let $nu := 1, $de := 1000 return 'probability = ' || $nu div $de * 100 || '%'", "0.1%"),
("let $nu := 2, $probability := function ($argument) { 'probability = ' || $nu div $argument * 100 || '%'}, $de := 5 return $probability($de)", "40%"),
Expand Down Expand Up @@ -99,45 +99,45 @@ def test_hotels(html_content, xpath, answer):
</branches_to_visit>"""
@pytest.mark.parametrize("html_content", [branches_to_visit])
@pytest.mark.parametrize("xpath, answer", [
("manager[@name = 'Godot']/branch union manager[@name = 'Freya']/branch", "Area 51"),
("branches_to_visit/manager[@name = 'Godot']/branch union branches_to_visit/manager[@name = 'Freya']/branch", "Area 51"),
("//manager[@name = 'Godot']/branch union //manager[@name = 'Freya']/branch", "Stalsk12"),
("manager[@name = 'Godot']/branch | manager[@name = 'Freya']/branch", "Stalsk12"),
("branches_to_visit/manager[@name = 'Godot']/branch | branches_to_visit/manager[@name = 'Freya']/branch", "Stalsk12"),
("//manager[@name = 'Godot']/branch | //manager[@name = 'Freya']/branch", "Stalsk12"),
("manager/branch intersect manager[@name = 'Godot']/branch", "A place with no name"),
("branches_to_visit/manager/branch intersect branches_to_visit/manager[@name = 'Godot']/branch", "A place with no name"),
("//manager/branch intersect //manager[@name = 'Godot']/branch", "A place with no name"),
("manager[@name = 'Godot']/branch intersect manager[@name = 'Freya']/branch", ""),
("manager/branch except manager[@name = 'Godot']/branch", "Barcelona"),
("manager[@name = 'Godot']/branch[1] eq 'Area 51'", "true"),
("branches_to_visit/manager[@name = 'Godot']/branch intersect branches_to_visit/manager[@name = 'Freya']/branch", ""),
("branches_to_visit/manager/branch except branches_to_visit/manager[@name = 'Godot']/branch", "Barcelona"),
("branches_to_visit/manager[@name = 'Godot']/branch[1] eq 'Area 51'", "true"),
("//manager[@name = 'Godot']/branch[1] eq 'Area 51'", "true"),
("manager[@name = 'Godot']/branch[1] eq 'Seoul'", "false"),
("branches_to_visit/manager[@name = 'Godot']/branch[1] eq 'Seoul'", "false"),
("//manager[@name = 'Godot']/branch[1] eq 'Seoul'", "false"),
("manager[@name = 'Godot']/branch[2] eq manager[@name = 'Freya']/branch[2]", "false"),
("branches_to_visit/manager[@name = 'Godot']/branch[2] eq branches_to_visit/manager[@name = 'Freya']/branch[2]", "false"),
("//manager[@name = 'Godot']/branch[2] eq //manager[@name = 'Freya']/branch[2]", "false"),
("manager[1]/@room_no lt manager[2]/@room_no", "false"),
("branches_to_visit/manager[1]/@room_no lt branches_to_visit/manager[2]/@room_no", "false"),
("//manager[1]/@room_no lt //manager[2]/@room_no", "false"),
("manager[1]/@room_no gt manager[2]/@room_no", "true"),
("branches_to_visit/manager[1]/@room_no gt branches_to_visit/manager[2]/@room_no", "true"),
("//manager[1]/@room_no gt //manager[2]/@room_no", "true"),
("manager[@name = 'Godot']/branch[1] = 'Area 51'", "true"),
("branches_to_visit/manager[@name = 'Godot']/branch[1] = 'Area 51'", "true"),
("//manager[@name = 'Godot']/branch[1] = 'Area 51'", "true"),
("manager[@name = 'Godot']/branch[1] = 'Seoul'", "false"),
("branches_to_visit/manager[@name = 'Godot']/branch[1] = 'Seoul'", "false"),
("//manager[@name = 'Godot']/branch[1] = 'Seoul'", "false"),
("manager[@name = 'Godot']/branch = 'Area 51'", "true"),
("branches_to_visit/manager[@name = 'Godot']/branch = 'Area 51'", "true"),
("//manager[@name = 'Godot']/branch = 'Area 51'", "true"),
("manager[@name = 'Godot']/branch = 'Barcelona'", "false"),
("branches_to_visit/manager[@name = 'Godot']/branch = 'Barcelona'", "false"),
("//manager[@name = 'Godot']/branch = 'Barcelona'", "false"),
("manager[1]/@room_no > manager[2]/@room_no", "true"),
("branches_to_visit/manager[1]/@room_no > branches_to_visit/manager[2]/@room_no", "true"),
("//manager[1]/@room_no > //manager[2]/@room_no", "true"),
("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is manager[1]/branch[1]", "false"),
("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is branches_to_visit/manager[1]/branch[1]", "false"),
("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[1]/branch[1]", "false"),
("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is manager[1]/branch[3]", "true"),
("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is branches_to_visit/manager[1]/branch[3]", "true"),
("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[1]/branch[3]", "true"),
("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] << manager[1]/branch[1]", "false"),
("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] << branches_to_visit/manager[1]/branch[1]", "false"),
("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] << //manager[1]/branch[1]", "false"),
("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] >> manager[1]/branch[1]", "true"),
("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] >> branches_to_visit/manager[1]/branch[1]", "true"),
("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] >> //manager[1]/branch[1]", "true"),
("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is manager[@name = 'Freya']/branch[ . = 'Stalsk12']", "false"),
("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is branches_to_visit/manager[@name = 'Freya']/branch[ . = 'Stalsk12']", "false"),
("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[@name = 'Freya']/branch[ . = 'Stalsk12']", "false"),
("manager[1]/@name || manager[2]/@name", "GodotFreya"),
("branches_to_visit/manager[1]/@name || branches_to_visit/manager[2]/@name", "GodotFreya"),
("//manager[1]/@name || //manager[2]/@name", "GodotFreya"),
])
def test_branches_to_visit(html_content, xpath, answer):
Expand Down Expand Up @@ -170,10 +170,10 @@ def test_branches_to_visit(html_content, xpath, answer):
("(1 + 9 * 9 + 5) div 6", "14.5"),
("23 idiv 3", "7"),
("23 div 3", "7.66666666"),
("for $i in ./trip return $i/traveler/duration * $i/traveler/price", "21002.04"),
("for $i in ./trip return $i/traveler/duration ", "4"),
("for $i in ./trips/trip return $i/traveler/duration * $i/traveler/price", "21002.04"),
("for $i in ./trips/trip return $i/traveler/duration ", "4"),
("for $i in .//trip return $i/traveler/duration * $i/traveler/price", "21002.04"),
("sum(for $i in ./trip return $i/traveler/duration * $i/traveler/price)", "29002.04"),
("sum(for $i in ./trips/trip return $i/traveler/duration * $i/traveler/price)", "29002.04"),
("sum(for $i in .//trip return $i/traveler/duration * $i/traveler/price)", "29002.04"),
#("trip[1]/depart - trip[1]/arrive", "fail_to_get_answer"),
#("//trip[1]/depart - //trip[1]/arrive", "fail_to_get_answer"),
Expand Down Expand Up @@ -201,3 +201,98 @@ def test_trips(html_content, xpath, answer):
html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
assert type(html_content) == str
assert answer in html_content

DOM_violation_two_html_root_element = """<!DOCTYPE html>
<html>
<body>
<h1>Hello world1</h1>
<p>First paragraph.</p>
</body>
</html>
<html>
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The second html root element.

<body>
<h1>Hello world2</h1>
<p>Browsers parse this part by fixing it but lxml doesn't and returns two root element node</p>
<p>Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one.</p>
</body>
</html>"""
@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element])
@pytest.mark.parametrize("xpath, answer", [
(".", "Hello world1"),
(".", "First paragraph."),
(".", "Hello world2"),
(".", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
(".", "Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one."),
("/*", "Hello world1"),
("/*", "First paragraph."),
("/*", "Hello world2"),
("/*", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
("/*", "Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one."),
("html", "Hello world1"),
("html", "First paragraph."),
("html", "Hello world2"),
("html", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
("html", "Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one."),
("/html", "Hello world1"),
("/html", "First paragraph."),
("/html", "Hello world2"),
("/html", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
("/html", "Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one."),
("/html/body/p[1]", "First paragraph."),
("/html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the critical point. why do I choose one element in the browser inspect window, but lxml returns two? Because there are two html tag elements and two body tag elements.

("count(/html/body/p[1])", "2"),
("count(/html)", "2"),
("count(//html)", "2"),
("count(//body)", "2"),
("count(/html/body)", "2"),
("//html/body/p[1]", "First paragraph."),
("//html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
("//body/p[1]", "First paragraph."),
("//body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
("/html[2]/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
("//html[2]/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
])
def test_broken_DOM_01(html_content, xpath, answer):
# In normal situation, DOM's root element node is only one. So when DOM violation happens, Exception occurs.
with pytest.raises(Exception):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I intentionally add this test to reproduce the problem.
And, in the future, libxml2 may implement "html5"(https://gitlab.gnome.org/GNOME/libxml2/-/issues/211). As I posted the issue, this problem will be gone, and this test will fail. The day, please remove these tests.

from lxml import etree, html
import elementpath
from elementpath.xpath3 import XPath3Parser
parser = etree.HTMLParser()
tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
# just example xpath
# Error will occur.
r = elementpath.select(tree, xpath.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser)

html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
assert type(html_content) == str
assert answer in html_content

@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element])
@pytest.mark.parametrize("xpath, answer", [
("/html[2]/body/p[1]", "First paragraph."),
("//html[2]/body/p[1]", "First paragraph."),
])
def test_Broken_DOM_02(html_content, xpath, answer):
# In normal situation, DOM's root element node is only one. So when DOM violation happens, Exception occurs.
html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
assert type(html_content) == str
# Check the answer is *not in* the html_content
assert answer not in html_content

@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element])
@pytest.mark.parametrize("xpath, answer", [
("/html/body/p[1]", 2),
("/html", 2),
("//html", 2),
("//body", 2),
("/html/body", 2),
])
def test_Broken_DOM_03(html_content, xpath, answer):
"""just test for xpath1"""
from lxml import etree, html
parser = etree.HTMLParser()
tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)

# test xpath 1
assert len(tree.xpath(xpath)) == answer
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ beautifulsoup4
lxml >=4.8.0,<6,!=5.2.0,!=5.2.1

# XPath 2.0-3.1 support - 4.2.0 broke something?
elementpath==4.1.5
elementpath>=4.2.1

selenium~=4.14.0

Expand Down