-
-
Notifications
You must be signed in to change notification settings - Fork 867
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
XPath3.1: mimic handling of multiple root element nodes #2351
base: master
Are you sure you want to change the base?
Changes from all commits
8e1f170
1f776ff
bf5c2c7
9f0cb35
879d0b2
ed2aaf4
dd8b4fe
fbd5512
20195e7
220f484
e84b9f1
60777e4
e325e02
6a2e1cf
55b2c6c
93a9585
e6b13c9
2e3e781
c295c5e
5acd31f
de7b66b
66a7dae
4d266ca
ebf7fd4
26e4a58
dbf4e87
7cd764f
48a5aa2
3619877
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -68,7 +68,7 @@ | |
("some $i in //hotel/branch/staff satisfies $i/age < 20", "false"), | ||
("every $i in /hotel/branch/staff satisfies $i/age > 20", "true"), | ||
("every $i in //hotel/branch/staff satisfies $i/age > 20 ", "true"), | ||
("let $x := branch[@location = 'California'], $y := branch[@location = 'Las Vegas'] return (avg($x/staff/age), avg($y/staff/age))", "27.5"), | ||
("let $x := hotel/branch[@location = 'California'], $y := hotel/branch[@location = 'Las Vegas'] return (avg($x/staff/age), avg($y/staff/age))", "27.5"), | ||
("let $x := //branch[@location = 'California'], $y := //branch[@location = 'Las Vegas'] return (avg($x/staff/age), avg($y/staff/age))", "27.5"), | ||
("let $nu := 1, $de := 1000 return 'probability = ' || $nu div $de * 100 || '%'", "0.1%"), | ||
("let $nu := 2, $probability := function ($argument) { 'probability = ' || $nu div $argument * 100 || '%'}, $de := 5 return $probability($de)", "40%"), | ||
|
@@ -99,45 +99,45 @@ def test_hotels(html_content, xpath, answer): | |
</branches_to_visit>""" | ||
@pytest.mark.parametrize("html_content", [branches_to_visit]) | ||
@pytest.mark.parametrize("xpath, answer", [ | ||
("manager[@name = 'Godot']/branch union manager[@name = 'Freya']/branch", "Area 51"), | ||
("branches_to_visit/manager[@name = 'Godot']/branch union branches_to_visit/manager[@name = 'Freya']/branch", "Area 51"), | ||
("//manager[@name = 'Godot']/branch union //manager[@name = 'Freya']/branch", "Stalsk12"), | ||
("manager[@name = 'Godot']/branch | manager[@name = 'Freya']/branch", "Stalsk12"), | ||
("branches_to_visit/manager[@name = 'Godot']/branch | branches_to_visit/manager[@name = 'Freya']/branch", "Stalsk12"), | ||
("//manager[@name = 'Godot']/branch | //manager[@name = 'Freya']/branch", "Stalsk12"), | ||
("manager/branch intersect manager[@name = 'Godot']/branch", "A place with no name"), | ||
("branches_to_visit/manager/branch intersect branches_to_visit/manager[@name = 'Godot']/branch", "A place with no name"), | ||
("//manager/branch intersect //manager[@name = 'Godot']/branch", "A place with no name"), | ||
("manager[@name = 'Godot']/branch intersect manager[@name = 'Freya']/branch", ""), | ||
("manager/branch except manager[@name = 'Godot']/branch", "Barcelona"), | ||
("manager[@name = 'Godot']/branch[1] eq 'Area 51'", "true"), | ||
("branches_to_visit/manager[@name = 'Godot']/branch intersect branches_to_visit/manager[@name = 'Freya']/branch", ""), | ||
("branches_to_visit/manager/branch except branches_to_visit/manager[@name = 'Godot']/branch", "Barcelona"), | ||
("branches_to_visit/manager[@name = 'Godot']/branch[1] eq 'Area 51'", "true"), | ||
("//manager[@name = 'Godot']/branch[1] eq 'Area 51'", "true"), | ||
("manager[@name = 'Godot']/branch[1] eq 'Seoul'", "false"), | ||
("branches_to_visit/manager[@name = 'Godot']/branch[1] eq 'Seoul'", "false"), | ||
("//manager[@name = 'Godot']/branch[1] eq 'Seoul'", "false"), | ||
("manager[@name = 'Godot']/branch[2] eq manager[@name = 'Freya']/branch[2]", "false"), | ||
("branches_to_visit/manager[@name = 'Godot']/branch[2] eq branches_to_visit/manager[@name = 'Freya']/branch[2]", "false"), | ||
("//manager[@name = 'Godot']/branch[2] eq //manager[@name = 'Freya']/branch[2]", "false"), | ||
("manager[1]/@room_no lt manager[2]/@room_no", "false"), | ||
("branches_to_visit/manager[1]/@room_no lt branches_to_visit/manager[2]/@room_no", "false"), | ||
("//manager[1]/@room_no lt //manager[2]/@room_no", "false"), | ||
("manager[1]/@room_no gt manager[2]/@room_no", "true"), | ||
("branches_to_visit/manager[1]/@room_no gt branches_to_visit/manager[2]/@room_no", "true"), | ||
("//manager[1]/@room_no gt //manager[2]/@room_no", "true"), | ||
("manager[@name = 'Godot']/branch[1] = 'Area 51'", "true"), | ||
("branches_to_visit/manager[@name = 'Godot']/branch[1] = 'Area 51'", "true"), | ||
("//manager[@name = 'Godot']/branch[1] = 'Area 51'", "true"), | ||
("manager[@name = 'Godot']/branch[1] = 'Seoul'", "false"), | ||
("branches_to_visit/manager[@name = 'Godot']/branch[1] = 'Seoul'", "false"), | ||
("//manager[@name = 'Godot']/branch[1] = 'Seoul'", "false"), | ||
("manager[@name = 'Godot']/branch = 'Area 51'", "true"), | ||
("branches_to_visit/manager[@name = 'Godot']/branch = 'Area 51'", "true"), | ||
("//manager[@name = 'Godot']/branch = 'Area 51'", "true"), | ||
("manager[@name = 'Godot']/branch = 'Barcelona'", "false"), | ||
("branches_to_visit/manager[@name = 'Godot']/branch = 'Barcelona'", "false"), | ||
("//manager[@name = 'Godot']/branch = 'Barcelona'", "false"), | ||
("manager[1]/@room_no > manager[2]/@room_no", "true"), | ||
("branches_to_visit/manager[1]/@room_no > branches_to_visit/manager[2]/@room_no", "true"), | ||
("//manager[1]/@room_no > //manager[2]/@room_no", "true"), | ||
("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is manager[1]/branch[1]", "false"), | ||
("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is branches_to_visit/manager[1]/branch[1]", "false"), | ||
("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[1]/branch[1]", "false"), | ||
("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is manager[1]/branch[3]", "true"), | ||
("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is branches_to_visit/manager[1]/branch[3]", "true"), | ||
("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[1]/branch[3]", "true"), | ||
("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] << manager[1]/branch[1]", "false"), | ||
("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] << branches_to_visit/manager[1]/branch[1]", "false"), | ||
("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] << //manager[1]/branch[1]", "false"), | ||
("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] >> manager[1]/branch[1]", "true"), | ||
("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] >> branches_to_visit/manager[1]/branch[1]", "true"), | ||
("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] >> //manager[1]/branch[1]", "true"), | ||
("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is manager[@name = 'Freya']/branch[ . = 'Stalsk12']", "false"), | ||
("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is branches_to_visit/manager[@name = 'Freya']/branch[ . = 'Stalsk12']", "false"), | ||
("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[@name = 'Freya']/branch[ . = 'Stalsk12']", "false"), | ||
("manager[1]/@name || manager[2]/@name", "GodotFreya"), | ||
("branches_to_visit/manager[1]/@name || branches_to_visit/manager[2]/@name", "GodotFreya"), | ||
("//manager[1]/@name || //manager[2]/@name", "GodotFreya"), | ||
]) | ||
def test_branches_to_visit(html_content, xpath, answer): | ||
|
@@ -170,10 +170,10 @@ def test_branches_to_visit(html_content, xpath, answer): | |
("(1 + 9 * 9 + 5) div 6", "14.5"), | ||
("23 idiv 3", "7"), | ||
("23 div 3", "7.66666666"), | ||
("for $i in ./trip return $i/traveler/duration * $i/traveler/price", "21002.04"), | ||
("for $i in ./trip return $i/traveler/duration ", "4"), | ||
("for $i in ./trips/trip return $i/traveler/duration * $i/traveler/price", "21002.04"), | ||
("for $i in ./trips/trip return $i/traveler/duration ", "4"), | ||
("for $i in .//trip return $i/traveler/duration * $i/traveler/price", "21002.04"), | ||
("sum(for $i in ./trip return $i/traveler/duration * $i/traveler/price)", "29002.04"), | ||
("sum(for $i in ./trips/trip return $i/traveler/duration * $i/traveler/price)", "29002.04"), | ||
("sum(for $i in .//trip return $i/traveler/duration * $i/traveler/price)", "29002.04"), | ||
#("trip[1]/depart - trip[1]/arrive", "fail_to_get_answer"), | ||
#("//trip[1]/depart - //trip[1]/arrive", "fail_to_get_answer"), | ||
|
@@ -201,3 +201,98 @@ def test_trips(html_content, xpath, answer): | |
html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True) | ||
assert type(html_content) == str | ||
assert answer in html_content | ||
|
||
DOM_violation_two_html_root_element = """<!DOCTYPE html> | ||
<html> | ||
<body> | ||
<h1>Hello world1</h1> | ||
<p>First paragraph.</p> | ||
</body> | ||
</html> | ||
<html> | ||
<body> | ||
<h1>Hello world2</h1> | ||
<p>Browsers parse this part by fixing it but lxml doesn't and returns two root element node</p> | ||
<p>Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one.</p> | ||
</body> | ||
</html>""" | ||
@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element]) | ||
@pytest.mark.parametrize("xpath, answer", [ | ||
(".", "Hello world1"), | ||
(".", "First paragraph."), | ||
(".", "Hello world2"), | ||
(".", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), | ||
(".", "Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one."), | ||
("/*", "Hello world1"), | ||
("/*", "First paragraph."), | ||
("/*", "Hello world2"), | ||
("/*", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), | ||
("/*", "Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one."), | ||
("html", "Hello world1"), | ||
("html", "First paragraph."), | ||
("html", "Hello world2"), | ||
("html", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), | ||
("html", "Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one."), | ||
("/html", "Hello world1"), | ||
("/html", "First paragraph."), | ||
("/html", "Hello world2"), | ||
("/html", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), | ||
("/html", "Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one."), | ||
("/html/body/p[1]", "First paragraph."), | ||
("/html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is the critical point: why do I select one element in the browser's inspect window, while lxml returns two? Because the document contains two html elements and two body elements. |
||
("count(/html/body/p[1])", "2"), | ||
("count(/html)", "2"), | ||
("count(//html)", "2"), | ||
("count(//body)", "2"), | ||
("count(/html/body)", "2"), | ||
("//html/body/p[1]", "First paragraph."), | ||
("//html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), | ||
("//body/p[1]", "First paragraph."), | ||
("//body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), | ||
("/html[2]/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), | ||
("//html[2]/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), | ||
]) | ||
def test_broken_DOM_01(html_content, xpath, answer): | ||
# A well-formed DOM has exactly one root element node. The input used here has two <html> roots, | ||
# and the direct elementpath evaluation below is expected to raise for it. | ||
with pytest.raises(Exception): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I intentionally add this test to reproduce the problem. |
||
from lxml import etree, html | ||
import elementpath | ||
from elementpath.xpath3 import XPath3Parser | ||
parser = etree.HTMLParser() | ||
tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser) | ||
# Evaluate the parametrized XPath directly with elementpath's XPath3Parser; | ||
# this is the call that is expected to raise. | ||
r = elementpath.select(tree, xpath.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser) | ||
|
||
# NOTE(review): indentation is not visible in this view; presumably the lines below run after the pytest.raises block - confirm. | ||
# html_tools.xpath_filter must return a str that contains the expected answer text for the same input. | ||
html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True) | ||
assert type(html_content) == str | ||
assert answer in html_content | ||
|
||
@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element]) | ||
@pytest.mark.parametrize("xpath, answer", [ | ||
("/html[2]/body/p[1]", "First paragraph."), | ||
("//html[2]/body/p[1]", "First paragraph."), | ||
]) | ||
def test_Broken_DOM_02(html_content, xpath, answer): | ||
# Negative case: no exception is expected here. The XPath is run through html_tools.xpath_filter | ||
# and the result must be a str that does NOT contain the given answer text. | ||
html_content = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True) | ||
assert type(html_content) == str | ||
# The answer text must be absent from the filtered output. | ||
assert answer not in html_content | ||
|
||
@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element]) | ||
@pytest.mark.parametrize("xpath, answer", [ | ||
("/html/body/p[1]", 2), | ||
("/html", 2), | ||
("//html", 2), | ||
("//body", 2), | ||
("/html/body", 2), | ||
]) | ||
def test_Broken_DOM_03(html_content, xpath, answer): | ||
"""just test for xpath1""" | ||
from lxml import etree, html | ||
# Parse the raw markup with lxml's HTML parser (no elementpath involved in this test). | ||
parser = etree.HTMLParser() | ||
tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser) | ||
|
||
# lxml's own xpath() must return exactly `answer` matching nodes for the expression. | ||
assert len(tree.xpath(xpath)) == answer |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The second html root element.