Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup URL percent-encoding behavior. #2990

Merged
merged 13 commits into from
Dec 15, 2023
51 changes: 43 additions & 8 deletions httpx/_urlparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,10 +260,8 @@ def urlparse(url: str = "", **kwargs: typing.Optional[str]) -> ParseResult:
# For 'path' we need to drop ? and # from the GEN_DELIMS set.
parsed_path: str = quote(path, safe=SUB_DELIMS + ":/[]@")
# For 'query' we need to drop '#' from the GEN_DELIMS set.
# We also exclude '/' because it is more robust to replace it with a percent
# encoding despite it not being a requirement of the spec.
parsed_query: typing.Optional[str] = (
None if query is None else quote(query, safe=SUB_DELIMS + ":?[]@")
None if query is None else quote(query, safe=SUB_DELIMS + ":/?[]@")
)
# For 'fragment' we can include all of the GEN_DELIMS set.
parsed_fragment: typing.Optional[str] = (
Expand Down Expand Up @@ -432,13 +430,12 @@ def is_safe(string: str, safe: str = "/") -> bool:
if char not in NON_ESCAPED_CHARS:
return False

# Any '%' characters must be valid '%xx' escape sequences.
return string.count("%") == len(PERCENT_ENCODED_REGEX.findall(string))
return True


def quote(string: str, safe: str = "/") -> str:
def percent_encoded(string: str, safe: str = "/") -> str:
"""
Use percent-encoding to quote a string if required.
Use percent-encoding to quote a string.
"""
if is_safe(string, safe=safe):
return string
tomchristie marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -449,6 +446,39 @@ def quote(string: str, safe: str = "/") -> str:
)


def quote(string: str, safe: str = "/") -> str:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe we should also add unit tests for these functions, rather than simply testing them with httpx.URL.
This would be a more robust approach, in my opinion.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO current approach is fine.

This is clearly a code-smell, because our test cases ought to be tests against our public API, rather than testing implementation details. Perhaps there's some cases where it's a necessary hack, but... perhaps not?

from #2492 (comment)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree testing the public API should be sufficient unless something private is particularly expensive to test via the public API.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a matter of preference, but if we encounter regression in our httpx.URL tests, we will go through these functions and find the method that isn't working properly, which is why unit tests are useful.

"""
Use percent-encoding to quote a string, omitting existing '%xx' escape sequences.

See: https://www.rfc-editor.org/rfc/rfc3986#section-2.1

* `string`: The string to be percent-escaped.
* `safe`: A string containing characters that may be treated as safe, and do not
need to be escaped. Unreserved characters are always treated as safe.
See: https://www.rfc-editor.org/rfc/rfc3986#section-2.3
"""
tomchristie marked this conversation as resolved.
Show resolved Hide resolved
parts = []
current_position = 0
for match in re.finditer(PERCENT_ENCODED_REGEX, string):
start_position, end_position = match.start(), match.end()
matched_text = match.group(0)
# Add any text up to the '%xx' escape sequence.
if start_position != current_position:
leading_text = string[current_position:start_position]
parts.append(percent_encoded(leading_text, safe=safe))

# Add the '%xx' escape sequence.
parts.append(matched_text)
current_position = end_position

# Add any text after the final '%xx' escape sequence.
if current_position != len(string):
trailing_text = string[current_position:]
parts.append(percent_encoded(trailing_text, safe=safe))

return "".join(parts)
tomchristie marked this conversation as resolved.
Show resolved Hide resolved


def urlencode(items: typing.List[typing.Tuple[str, str]]) -> str:
"""
We can use a much simpler version of the stdlib urlencode here because
Expand All @@ -464,4 +494,9 @@ def urlencode(items: typing.List[typing.Tuple[str, str]]) -> str:
- https://github.com/encode/httpx/issues/2721
- https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlencode
"""
return "&".join([quote(k, safe="") + "=" + quote(v, safe="") for k, v in items])
return "&".join(
[
percent_encoded(k, safe="") + "=" + percent_encoded(v, safe="")
for k, v in items
]
)
tomchristie marked this conversation as resolved.
Show resolved Hide resolved
104 changes: 70 additions & 34 deletions tests/models/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,56 +70,92 @@ def test_url_no_authority():
# Tests for percent encoding across path, query, and fragment...


def test_path_percent_encoding():
# Test percent encoding for SUB_DELIMS ALPHA NUM and allowable GEN_DELIMS
url = httpx.URL("https://example.com/!$&'()*+,;= abc ABC 123 :/[]@")
assert url.raw_path == b"/!$&'()*+,;=%20abc%20ABC%20123%20:/[]@"
assert url.path == "/!$&'()*+,;= abc ABC 123 :/[]@"
assert url.query == b""
assert url.fragment == ""


def test_query_percent_encoding():
# Test percent encoding for SUB_DELIMS ALPHA NUM and allowable GEN_DELIMS
url = httpx.URL("https://example.com/?!$&'()*+,;= abc ABC 123 :/[]@" + "?")
assert url.raw_path == b"/?!$&'()*+,;=%20abc%20ABC%20123%20:%2F[]@?"
assert url.path == "/"
assert url.query == b"!$&'()*+,;=%20abc%20ABC%20123%20:%2F[]@?"
assert url.fragment == ""


def test_fragment_percent_encoding():
# Test percent encoding for SUB_DELIMS ALPHA NUM and allowable GEN_DELIMS
url = httpx.URL("https://example.com/#!$&'()*+,;= abc ABC 123 :/[]@" + "?#")
assert url.raw_path == b"/"
assert url.path == "/"
assert url.query == b""
assert url.fragment == "!$&'()*+,;= abc ABC 123 :/[]@?#"
@pytest.mark.parametrize(
"url,raw_path,path,query,fragment",
[
# URL with unescaped chars in path.
(
"https://example.com/!$&'()*+,;= abc ABC 123 :/[]@",
b"/!$&'()*+,;=%20abc%20ABC%20123%20:/[]@",
"/!$&'()*+,;= abc ABC 123 :/[]@",
b"",
"",
),
# URL with escaped chars in path.
(
"https://example.com/!$&'()*+,;=%20abc%20ABC%20123%20:/[]@",
b"/!$&'()*+,;=%20abc%20ABC%20123%20:/[]@",
"/!$&'()*+,;= abc ABC 123 :/[]@",
b"",
"",
),
# URL with mix of unescaped and escaped chars in path.
# WARNING: This has the incorrect behaviour, adding the test as an interim step.
(
"https://example.com/ %61%62%63",
b"/%20%61%62%63",
"/ abc",
b"",
"",
),
# URL with unescaped chars in query.
(
"https://example.com/?!$&'()*+,;= abc ABC 123 :/[]@?",
b"/?!$&'()*+,;=%20abc%20ABC%20123%20:/[]@?",
"/",
b"!$&'()*+,;=%20abc%20ABC%20123%20:/[]@?",
"",
),
# URL with escaped chars in query.
(
"https://example.com/?!$&%27()*+,;=%20abc%20ABC%20123%20:%2F[]@?",
b"/?!$&%27()*+,;=%20abc%20ABC%20123%20:%2F[]@?",
"/",
b"!$&%27()*+,;=%20abc%20ABC%20123%20:%2F[]@?",
"",
),
# URL with mix of unescaped and escaped chars in query.
(
"https://example.com/?%20%97%98%99",
b"/?%20%97%98%99",
"/",
b"%20%97%98%99",
"",
),
# URL encoding characters in fragment.
(
"https://example.com/#!$&'()*+,;= abc ABC 123 :/[]@?#",
b"/",
"/",
b"",
"!$&'()*+,;= abc ABC 123 :/[]@?#",
),
],
)
def test_path_query_fragment(url, raw_path, path, query, fragment):
url = httpx.URL(url)
assert url.raw_path == raw_path
assert url.path == path
assert url.query == query
assert url.fragment == fragment


def test_url_query_encoding():
"""
URL query parameters should use '%20' to encoding spaces,
URL query parameters should use '%20' for encoding spaces,
and should treat '/' as a safe character. This behaviour differs
across clients, but we're matching browser behaviour here.

See https://github.com/encode/httpx/issues/2536
and https://github.com/encode/httpx/discussions/2460
"""
url = httpx.URL("https://www.example.com/?a=b c&d=e/f")
assert url.raw_path == b"/?a=b%20c&d=e%2Ff"
assert url.raw_path == b"/?a=b%20c&d=e/f"

url = httpx.URL("https://www.example.com/", params={"a": "b c", "d": "e/f"})
assert url.raw_path == b"/?a=b%20c&d=e%2Ff"


def test_url_with_url_encoded_path():
url = httpx.URL("https://www.example.com/path%20to%20somewhere")
assert url.path == "/path to somewhere"
assert url.query == b""
assert url.raw_path == b"/path%20to%20somewhere"


def test_url_params():
url = httpx.URL("https://example.org:123/path/to/somewhere", params={"a": "123"})
assert str(url) == "https://example.org:123/path/to/somewhere?a=123"
Expand Down