Skip to content

Commit

Permalink
wsgi: Stop replacing invalid UTF-8 on py3
Browse files Browse the repository at this point in the history
For more context, see #467 and #497.

On py3, urllib.parse.unquote() defaults to decoding via UTF-8 and
replacing invalid UTF-8 sequences with "\N{REPLACEMENT CHARACTER}".
This causes a few problems:

- Since WSGI requires that bytes be decoded as Latin-1 on py3, we
  have to do an extra re-encode/decode cycle in encode_dance().
- Applications written for Latin-1 are broken, as there are valid
  Latin-1 sequences that are mangled because of the replacement.
- Applications written for UTF-8 cannot differentiate between a
  replacement character that was intentionally sent by the client
  versus an invalid byte sequence.

Fortunately, unquote() allows us to specify the encoding that should
be used. By specifying Latin-1, we can drop encode_dance() entirely
and preserve as much information from the wire as we can.
  • Loading branch information
tipabu authored and jstasiak committed Feb 28, 2019
1 parent a915bb6 commit f0bc79e
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 17 deletions.
13 changes: 4 additions & 9 deletions eventlet/wsgi.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,14 +59,6 @@ def addr_to_host_port(addr):
return (host, port)


def encode_dance(s):
    """Convert ``s`` into the string form placed in the WSGI environ.

    Non-bytes input is first encoded to UTF-8 bytes, using the
    ``'replace'`` error handler for anything that cannot be encoded;
    bytes input is taken as-is.  When ``six.PY2`` is true those bytes are
    returned unchanged; otherwise they are decoded as Latin-1 and the
    resulting text is returned.
    """
    raw = s if isinstance(s, bytes) else s.encode('utf-8', 'replace')
    # On py2 the byte string is handed back directly; on py3 each byte is
    # mapped to the code point of the same value via Latin-1.
    return raw if six.PY2 else raw.decode('latin1')


# Collections of error codes to compare against. Not all attributes are set
# on errno module on all platforms, so some are literals :(
BAD_SOCK = set((errno.EBADF, 10053))
Expand Down Expand Up @@ -646,7 +638,10 @@ def get_environ(self):

pq = self.path.split('?', 1)
env['RAW_PATH_INFO'] = pq[0]
env['PATH_INFO'] = encode_dance(urllib.parse.unquote(pq[0]))
if six.PY2:
env['PATH_INFO'] = urllib.parse.unquote(pq[0])
else:
env['PATH_INFO'] = urllib.parse.unquote(pq[0], encoding='latin1')
if len(pq) > 1:
env['QUERY_STRING'] = pq[1]

Expand Down
23 changes: 15 additions & 8 deletions tests/wsgi_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1443,21 +1443,28 @@ def wsgi_app(environ, start_response):

self.site.application = wsgi_app
sock = eventlet.connect(self.server_addr)
sock.sendall(b'GET /%E4%BD%A0%E5%A5%BD HTTP/1.1\r\nHost: localhost\r\nConnection: close\r\n\r\n')
# This is a properly-quoted request for the UTF-8 path /你好
sock.sendall(b'GET /%E4%BD%A0%E5%A5%BD HTTP/1.1\r\nHost: localhost\r\n\r\n')
result = read_http(sock)
assert result.status == 'HTTP/1.1 200 OK'
# that was only preparation, actual test below
# Like above, but the octets are reversed before being quoted,
# so the result should *not* be interpreted as UTF-8
sock.sendall(b'GET /%BD%A5%E5%A0%BD%E4 HTTP/1.1\r\nHost: localhost\r\nConnection: close\r\n\r\n')
result = read_http(sock)
assert result.status == 'HTTP/1.1 200 OK'

# that was only preparation, actual tests below
# Per PEP-0333 https://www.python.org/dev/peps/pep-0333/#unicode-issues
# in all WSGI environment strings application must observe either bytes in latin-1 (ISO-8859-1)
# or unicode code points \u0000..\u00ff
# wsgi_decoding_dance from Werkzeug to emulate concerned application
msg = 'Expected PATH_INFO to be a native string, not {0}'.format(type(g[0]))
assert isinstance(g[0], str), msg
if six.PY2:
assert g[0] == u'/你好'.encode('utf-8')
else:
decoded = g[0].encode('latin1').decode('utf-8', 'replace')
assert decoded == u'/你好'
# Fortunately, WSGI strings have the same literal representation on both py2 and py3
assert g[0] == '/\xe4\xbd\xa0\xe5\xa5\xbd'

msg = 'Expected PATH_INFO to be a native string, not {0}'.format(type(g[1]))
assert isinstance(g[1], str), msg
assert g[1] == '/\xbd\xa5\xe5\xa0\xbd\xe4'

@tests.skip_if_no_ipv6
def test_ipv6(self):
Expand Down

0 comments on commit f0bc79e

Please sign in to comment.