diff --git a/httpx/_urlparse.py b/httpx/_urlparse.py index c44e3519a0..07bbea9070 100644 --- a/httpx/_urlparse.py +++ b/httpx/_urlparse.py @@ -260,10 +260,8 @@ def urlparse(url: str = "", **kwargs: typing.Optional[str]) -> ParseResult: # For 'path' we need to drop ? and # from the GEN_DELIMS set. parsed_path: str = quote(path, safe=SUB_DELIMS + ":/[]@") # For 'query' we need to drop '#' from the GEN_DELIMS set. - # We also exclude '/' because it is more robust to replace it with a percent - # encoding despite it not being a requirement of the spec. parsed_query: typing.Optional[str] = ( - None if query is None else quote(query, safe=SUB_DELIMS + ":?[]@") + None if query is None else quote(query, safe=SUB_DELIMS + ":/?[]@") ) # For 'fragment' we can include all of the GEN_DELIMS set. parsed_fragment: typing.Optional[str] = ( @@ -432,13 +430,12 @@ def is_safe(string: str, safe: str = "/") -> bool: if char not in NON_ESCAPED_CHARS: return False - # Any '%' characters must be valid '%xx' escape sequences. - return string.count("%") == len(PERCENT_ENCODED_REGEX.findall(string)) + return True -def quote(string: str, safe: str = "/") -> str: +def percent_encoded(string: str, safe: str = "/") -> str: """ - Use percent-encoding to quote a string if required. + Use percent-encoding to quote a string. """ if is_safe(string, safe=safe): return string @@ -449,6 +446,39 @@ def quote(string: str, safe: str = "/") -> str: ) +def quote(string: str, safe: str = "/") -> str: + """ + Use percent-encoding to quote a string, omitting existing '%xx' escape sequences. + + See: https://www.rfc-editor.org/rfc/rfc3986#section-2.1 + + * `string`: The string to be percent-escaped. + * `safe`: A string containing characters that may be treated as safe, and do not + need to be escaped. Unreserved characters are always treated as safe. + See: https://www.rfc-editor.org/rfc/rfc3986#section-2.3 + """ + parts = [] + current_position = 0 + for match in re.finditer(PERCENT_ENCODED_REGEX, string): + start_position, end_position = match.start(), match.end() + matched_text = match.group(0) + # Add any text up to the '%xx' escape sequence. + if start_position != current_position: + leading_text = string[current_position:start_position] + parts.append(percent_encoded(leading_text, safe=safe)) + + # Add the '%xx' escape sequence. + parts.append(matched_text) + current_position = end_position + + # Add any text after the final '%xx' escape sequence. + if current_position != len(string): + trailing_text = string[current_position:] + parts.append(percent_encoded(trailing_text, safe=safe)) + + return "".join(parts) + + def urlencode(items: typing.List[typing.Tuple[str, str]]) -> str: """ We can use a much simpler version of the stdlib urlencode here because @@ -464,4 +494,9 @@ def urlencode(items: typing.List[typing.Tuple[str, str]]) -> str: - https://github.com/encode/httpx/issues/2721 - https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlencode """ - return "&".join([quote(k, safe="") + "=" + quote(v, safe="") for k, v in items]) + return "&".join( + [ + percent_encoded(k, safe="") + "=" + percent_encoded(v, safe="") + for k, v in items + ] + ) diff --git a/tests/models/test_url.py b/tests/models/test_url.py index 4683a321d6..79e1605a5a 100644 --- a/tests/models/test_url.py +++ b/tests/models/test_url.py @@ -70,36 +70,79 @@ def test_url_no_authority(): # Tests for percent encoding across path, query, and fragment... -def test_path_percent_encoding(): - # Test percent encoding for SUB_DELIMS ALPHA NUM and allowable GEN_DELIMS - url = httpx.URL("https://example.com/!$&'()*+,;= abc ABC 123 :/[]@") - assert url.raw_path == b"/!$&'()*+,;=%20abc%20ABC%20123%20:/[]@" - assert url.path == "/!$&'()*+,;= abc ABC 123 :/[]@" - assert url.query == b"" - assert url.fragment == "" - - -def test_query_percent_encoding(): - # Test percent encoding for SUB_DELIMS ALPHA NUM and allowable GEN_DELIMS - url = httpx.URL("https://example.com/?!$&'()*+,;= abc ABC 123 :/[]@" + "?") - assert url.raw_path == b"/?!$&'()*+,;=%20abc%20ABC%20123%20:%2F[]@?" - assert url.path == "/" - assert url.query == b"!$&'()*+,;=%20abc%20ABC%20123%20:%2F[]@?" - assert url.fragment == "" - - -def test_fragment_percent_encoding(): - # Test percent encoding for SUB_DELIMS ALPHA NUM and allowable GEN_DELIMS - url = httpx.URL("https://example.com/#!$&'()*+,;= abc ABC 123 :/[]@" + "?#") - assert url.raw_path == b"/" - assert url.path == "/" - assert url.query == b"" - assert url.fragment == "!$&'()*+,;= abc ABC 123 :/[]@?#" +@pytest.mark.parametrize( + "url,raw_path,path,query,fragment", + [ + # URL with unescaped chars in path. + ( + "https://example.com/!$&'()*+,;= abc ABC 123 :/[]@", + b"/!$&'()*+,;=%20abc%20ABC%20123%20:/[]@", + "/!$&'()*+,;= abc ABC 123 :/[]@", + b"", + "", + ), + # URL with escaped chars in path. + ( + "https://example.com/!$&'()*+,;=%20abc%20ABC%20123%20:/[]@", + b"/!$&'()*+,;=%20abc%20ABC%20123%20:/[]@", + "/!$&'()*+,;= abc ABC 123 :/[]@", + b"", + "", + ), + # URL with mix of unescaped and escaped chars in path. + # WARNING: This has the incorrect behaviour, adding the test as an interim step. + ( + "https://example.com/ %61%62%63", + b"/%20%61%62%63", + "/ abc", + b"", + "", + ), + # URL with unescaped chars in query. + ( + "https://example.com/?!$&'()*+,;= abc ABC 123 :/[]@?", + b"/?!$&'()*+,;=%20abc%20ABC%20123%20:/[]@?", + "/", + b"!$&'()*+,;=%20abc%20ABC%20123%20:/[]@?", + "", + ), + # URL with escaped chars in query. + ( + "https://example.com/?!$&%27()*+,;=%20abc%20ABC%20123%20:%2F[]@?", + b"/?!$&%27()*+,;=%20abc%20ABC%20123%20:%2F[]@?", + "/", + b"!$&%27()*+,;=%20abc%20ABC%20123%20:%2F[]@?", + "", + ), + # URL with mix of unescaped and escaped chars in query. + ( + "https://example.com/?%20%97%98%99", + b"/?%20%97%98%99", + "/", + b"%20%97%98%99", + "", + ), + # URL encoding characters in fragment. + ( + "https://example.com/#!$&'()*+,;= abc ABC 123 :/[]@?#", + b"/", + "/", + b"", + "!$&'()*+,;= abc ABC 123 :/[]@?#", + ), + ], +) +def test_path_query_fragment(url, raw_path, path, query, fragment): + url = httpx.URL(url) + assert url.raw_path == raw_path + assert url.path == path + assert url.query == query + assert url.fragment == fragment def test_url_query_encoding(): """ - URL query parameters should use '%20' to encoding spaces, + URL query parameters should use '%20' for encoding spaces, and should treat '/' as a safe character. This behaviour differs across clients, but we're matching browser behaviour here. @@ -107,19 +150,12 @@ def test_url_query_encoding(): and https://github.com/encode/httpx/discussions/2460 """ url = httpx.URL("https://www.example.com/?a=b c&d=e/f") - assert url.raw_path == b"/?a=b%20c&d=e%2Ff" + assert url.raw_path == b"/?a=b%20c&d=e/f" url = httpx.URL("https://www.example.com/", params={"a": "b c", "d": "e/f"}) assert url.raw_path == b"/?a=b%20c&d=e%2Ff" -def test_url_with_url_encoded_path(): - url = httpx.URL("https://www.example.com/path%20to%20somewhere") - assert url.path == "/path to somewhere" - assert url.query == b"" - assert url.raw_path == b"/path%20to%20somewhere" - - def test_url_params(): url = httpx.URL("https://example.org:123/path/to/somewhere", params={"a": "123"}) assert str(url) == "https://example.org:123/path/to/somewhere?a=123"