wsgi: Stop replacing invalid UTF-8 on py3

For more context, see #467 and #497.

On py3, urllib.parse.unquote() defaults to decoding via UTF-8 and replacing invalid UTF-8 sequences with "\N{REPLACEMENT CHARACTER}". This causes a few problems:

- Since WSGI requires that bytes be decoded as Latin-1 on py3, we have to do an extra re-encode/decode cycle in encode_dance().
- Applications written for Latin-1 are broken, as there are valid Latin-1 sequences that are mangled because of the replacement.
- Applications written for UTF-8 cannot differentiate between a replacement character that was intentionally sent by the client and an invalid byte sequence.

Fortunately, unquote() allows us to specify the encoding that should be used. By specifying Latin-1, we can drop encode_dance() entirely and preserve as much information from the wire as we can.
eventlet · Feb 28, 2019 · f0bc79e · f0bc79e
1 parent a915bb6
commit f0bc79e
Show file tree

Hide file tree

Showing 2 changed files with 19 additions and 17 deletions.
diff --git a/eventlet/wsgi.py b/eventlet/wsgi.py
@@ -59,14 +59,6 @@ def addr_to_host_port(addr):
     return (host, port)
 
 
-def encode_dance(s):
-    if not isinstance(s, bytes):
-        s = s.encode('utf-8', 'replace')
-    if six.PY2:
-        return s
-    return s.decode('latin1')
-
-
 # Collections of error codes to compare against.  Not all attributes are set
 # on errno module on all platforms, so some are literals :(
 BAD_SOCK = set((errno.EBADF, 10053))
@@ -646,7 +638,10 @@ def get_environ(self):
 
         pq = self.path.split('?', 1)
         env['RAW_PATH_INFO'] = pq[0]
-        env['PATH_INFO'] = encode_dance(urllib.parse.unquote(pq[0]))
+        if six.PY2:
+            env['PATH_INFO'] = urllib.parse.unquote(pq[0])
+        else:
+            env['PATH_INFO'] = urllib.parse.unquote(pq[0], encoding='latin1')
         if len(pq) > 1:
             env['QUERY_STRING'] = pq[1]
 

diff --git a/tests/wsgi_test.py b/tests/wsgi_test.py
@@ -1443,21 +1443,28 @@ def wsgi_app(environ, start_response):
 
         self.site.application = wsgi_app
         sock = eventlet.connect(self.server_addr)
-        sock.sendall(b'GET /%E4%BD%A0%E5%A5%BD HTTP/1.1\r\nHost: localhost\r\nConnection: close\r\n\r\n')
+        # This is a properly-quoted request for the UTF-8 path /你好
+        sock.sendall(b'GET /%E4%BD%A0%E5%A5%BD HTTP/1.1\r\nHost: localhost\r\n\r\n')
         result = read_http(sock)
         assert result.status == 'HTTP/1.1 200 OK'
-        # that was only preparation, actual test below
+        # Like above, but the octets are reversed before being quoted,
+        # so the result should *not* be interpreted as UTF-8
+        sock.sendall(b'GET /%BD%A5%E5%A0%BD%E4 HTTP/1.1\r\nHost: localhost\r\nConnection: close\r\n\r\n')
+        result = read_http(sock)
+        assert result.status == 'HTTP/1.1 200 OK'
+
+        # that was only preparation, actual tests below
         # Per PEP-0333 https://www.python.org/dev/peps/pep-0333/#unicode-issues
         # in all WSGI environment strings application must observe either bytes in latin-1 (ISO-8859-1)
         # or unicode code points \u0000..\u00ff
-        # wsgi_decoding_dance from Werkzeug to emulate concerned application
         msg = 'Expected PATH_INFO to be a native string, not {0}'.format(type(g[0]))
         assert isinstance(g[0], str), msg
-        if six.PY2:
-            assert g[0] == u'/你好'.encode('utf-8')
-        else:
-            decoded = g[0].encode('latin1').decode('utf-8', 'replace')
-            assert decoded == u'/你好'
+        # Fortunately, WSGI strings have the same literal representation on both py2 and py3
+        assert g[0] == '/\xe4\xbd\xa0\xe5\xa5\xbd'
+
+        msg = 'Expected PATH_INFO to be a native string, not {0}'.format(type(g[1]))
+        assert isinstance(g[1], str), msg
+        assert g[1] == '/\xbd\xa5\xe5\xa0\xbd\xe4'
 
     @tests.skip_if_no_ipv6
     def test_ipv6(self):