Permalink
Browse files

Fixed #19508 -- Implemented uri_to_iri as per RFC.

Thanks Loic Bistuer for helping in shaping the patch and Claude Paroz
for the review.
  • Loading branch information...
coder9042 authored and loic committed Jul 22, 2014
1 parent 3af5af1 commit 10b17a22bec2eaf44c3315614aea87c127caee46
@@ -206,7 +206,6 @@ def get_path_info(environ):
"""
path_info = get_bytes_from_wsgi(environ, 'PATH_INFO', '/')
- # It'd be better to implement URI-to-IRI decoding, see #19508.
return path_info.decode(UTF_8)
@@ -236,7 +235,6 @@ def get_script_name(environ):
else:
script_name = get_bytes_from_wsgi(environ, 'SCRIPT_NAME', '')
- # It'd be better to implement URI-to-IRI decoding, see #19508.
return script_name.decode(UTF_8)
@@ -251,16 +249,15 @@ def get_bytes_from_wsgi(environ, key, default):
# Under Python 3, non-ASCII values in the WSGI environ are arbitrarily
# decoded with ISO-8859-1. This is wrong for Django websites where UTF-8
# is the default. Re-encode to recover the original bytestring.
- return value if six.PY2 else value.encode(ISO_8859_1)
+ return value.encode(ISO_8859_1) if six.PY3 else value
def get_str_from_wsgi(environ, key, default):
"""
- Get a value from the WSGI environ dictionary as bytes.
+ Get a value from the WSGI environ dictionary as str.
key and default should be str objects. Under Python 2 they may also be
unicode objects provided they only contain ASCII characters.
"""
- value = environ.get(str(key), str(default))
- # Same comment as above
- return value if six.PY2 else value.encode(ISO_8859_1).decode(UTF_8, errors='replace')
+ value = get_bytes_from_wsgi(environ, key, default)
+ return value.decode(UTF_8, errors='replace') if six.PY3 else value
@@ -15,9 +15,11 @@
from wsgiref.util import FileWrapper # NOQA: for backwards compatibility
from django.core.exceptions import ImproperlyConfigured
+from django.core.handlers.wsgi import ISO_8859_1, UTF_8
from django.core.management.color import color_style
from django.core.wsgi import get_wsgi_application
from django.utils import six
+from django.utils.encoding import uri_to_iri
from django.utils.module_loading import import_string
from django.utils.six.moves import socketserver
@@ -117,6 +119,21 @@ def log_message(self, format, *args):
sys.stderr.write(msg)
+ def get_environ(self):
+ env = super(WSGIRequestHandler, self).get_environ()
+
+ path = self.path
+ if '?' in path:
+ path = path.partition('?')[0]
+
+ path = uri_to_iri(path).encode(UTF_8)
+ # Under Python 3, non-ASCII values in the WSGI environ are arbitrarily
+ # decoded with ISO-8859-1. We replicate this behavior here.
+ # Refs comment in `get_bytes_from_wsgi()`.
+ env['PATH_INFO'] = path.decode(ISO_8859_1) if six.PY3 else path
+
+ return env
+
def run(addr, port, wsgi_handler, ipv6=False, threading=False):
server_address = (addr, port)
View
@@ -12,19 +12,19 @@
from django.conf import settings
from django.core import urlresolvers
from django.core.handlers.base import BaseHandler
-from django.core.handlers.wsgi import WSGIRequest
+from django.core.handlers.wsgi import WSGIRequest, ISO_8859_1, UTF_8
from django.core.signals import (request_started, request_finished,
got_request_exception)
from django.db import close_old_connections
from django.http import SimpleCookie, HttpRequest, QueryDict
from django.template import TemplateDoesNotExist
from django.test import signals
from django.utils.functional import curry, SimpleLazyObject
-from django.utils.encoding import force_bytes, force_str
+from django.utils.encoding import force_bytes, force_str, uri_to_iri
from django.utils.http import urlencode
from django.utils.itercompat import is_iterable
from django.utils import six
-from django.utils.six.moves.urllib.parse import unquote, urlparse, urlsplit
+from django.utils.six.moves.urllib.parse import urlparse, urlsplit
from django.test.utils import ContextList
__all__ = ('Client', 'RequestFactory', 'encode_file', 'encode_multipart')
@@ -270,11 +270,11 @@ def _get_path(self, parsed):
# If there are parameters, add them
if parsed[3]:
path += str(";") + force_str(parsed[3])
- path = unquote(path)
- # WSGI requires latin-1 encoded strings. See get_path_info().
- if six.PY3:
- path = path.encode('utf-8').decode('iso-8859-1')
- return path
+ path = uri_to_iri(path).encode(UTF_8)
+ # Under Python 3, non-ASCII values in the WSGI environ are arbitrarily
+ # decoded with ISO-8859-1. We replicate this behavior here.
+ # Refs comment in `get_bytes_from_wsgi()`.
+ return path.decode(ISO_8859_1) if six.PY3 else path
def get(self, path, data=None, secure=False, **extra):
"Construct a GET request."
View
@@ -1,3 +1,4 @@
+# -*- encoding: utf-8 -*-
from __future__ import unicode_literals
import codecs
@@ -7,7 +8,9 @@
from django.utils.functional import Promise
from django.utils import six
-from django.utils.six.moves.urllib.parse import quote
+from django.utils.six.moves.urllib.parse import quote, unquote
+if six.PY3:
+ from urllib.parse import unquote_to_bytes
class DjangoUnicodeDecodeError(UnicodeDecodeError):
@@ -185,7 +188,9 @@ def iri_to_uri(iri):
assuming input is either UTF-8 or unicode already, we can simplify things a
little from the full method.
- Returns an ASCII string containing the encoded result.
+ Takes an IRI in UTF-8 bytes (e.g. '/I \xe2\x99\xa5 Django/') or unicode
+ (e.g. '/I ♥ Django/') and returns ASCII bytes containing the encoded result
+ (e.g. '/I%20%E2%99%A5%20Django/').
"""
# The list of safe characters here is constructed from the "reserved" and
# "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986:
@@ -204,6 +209,38 @@ def iri_to_uri(iri):
return quote(force_bytes(iri), safe=b"/#%[]=:;$&()+,!?*@'~")
+def uri_to_iri(uri):
+ """
+ Converts a Uniform Resource Identifier (URI) into an Internationalized
+ Resource Identifier (IRI).
+
+ This is the algorithm from section 3.2 of RFC 3987.
+
+ Takes a URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns
+ unicode containing the decoded result (e.g. '/I ♥ Django/').
+ """
+ if uri is None:
+ return uri
+ uri = force_bytes(uri)
+ iri = unquote_to_bytes(uri) if six.PY3 else unquote(uri)
+ return repercent_broken_unicode(iri).decode('utf-8')
+
+
+def repercent_broken_unicode(path):
+ """
+ As per section 3.2 of RFC 3987, step three of converting a URI into an IRI,
+ we need to re-percent-encode any octet produced that is not part of a
+ strictly legal UTF-8 octet sequence.
+ """
+ try:
+ path.decode('utf-8')
+ except UnicodeDecodeError as e:
+ repercent = quote(path[e.start:e.end], safe=b"/#%[]=:;$&()+,!?*@'~")
+ path = repercent_broken_unicode(
+ path[:e.start] + force_bytes(repercent) + path[e.end:])
+ return path
+
+
def filepath_to_uri(path):
"""Convert a file system path to a URI portion that is suitable for
inclusion in a URL.
View
@@ -173,11 +173,11 @@ URL from an IRI_ -- very loosely speaking, a URI_ that can contain Unicode
characters. Quoting and converting an IRI to URI can be a little tricky, so
Django provides some assistance.
-* The function ``django.utils.encoding.iri_to_uri()`` implements the
- conversion from IRI to URI as required by the specification (:rfc:`3987`).
+* The function :func:`django.utils.encoding.iri_to_uri()` implements the
+ conversion from IRI to URI as required by the specification (:rfc:`3987#section-3.1`).
-* The functions ``django.utils.http.urlquote()`` and
- ``django.utils.http.urlquote_plus()`` are versions of Python's standard
+* The functions :func:`django.utils.http.urlquote()` and
+ :func:`django.utils.http.urlquote_plus()` are versions of Python's standard
``urllib.quote()`` and ``urllib.quote_plus()`` that work with non-ASCII
characters. (The data is converted to UTF-8 prior to encoding.)
@@ -213,12 +213,29 @@ you can construct your IRI without worrying about whether it contains
non-ASCII characters and then, right at the end, call ``iri_to_uri()`` on the
result.
-The ``iri_to_uri()`` function is also idempotent, which means the following is
-always true::
+Similarly, Django provides :func:`django.utils.encoding.uri_to_iri()` which
+implements the conversion from URI to IRI as per :rfc:`3987#section-3.2`.
+It decodes all percent-encodings except those that don't represent a valid
+UTF-8 sequence.
+
+An example to demonstrate::
+
+ >>> uri_to_iri('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93')
+ '/♥♥/?utf8=✓'
+ >>> uri_to_iri('%A9helloworld')
+ '%A9helloworld'
+
+In the first example, the UTF-8 characters and reserved characters are
+unquoted. In the second, the percent-encoding remains unchanged because it
+does not decode to a valid UTF-8 sequence.
+
+Both ``iri_to_uri()`` and ``uri_to_iri()`` functions are idempotent, which means the
+following is always true::
iri_to_uri(iri_to_uri(some_string)) = iri_to_uri(some_string)
+ uri_to_iri(uri_to_iri(some_string)) = uri_to_iri(some_string)
-So you can safely call it multiple times on the same IRI without risking
+So you can safely call them multiple times on the same URI/IRI without risking
double-quoting problems.
.. _URI: http://www.ietf.org/rfc/rfc2396.txt
View
@@ -271,7 +271,20 @@ The functions defined in this module share the following properties:
since we are assuming input is either UTF-8 or unicode already, we can
simplify things a little from the full method.
- Returns an ASCII string containing the encoded result.
+ Takes an IRI in UTF-8 bytes and returns ASCII bytes containing the encoded
+ result.
+
+.. function:: uri_to_iri(uri)
+
+ .. versionadded:: 1.8
+
+ Converts a Uniform Resource Identifier into an Internationalized Resource
+ Identifier.
+
+ This is the algorithm from section 3.2 of :rfc:`3987#section-3.2`.
+
+ Takes a URI in ASCII bytes and returns a unicode string containing the
+ decoded result.
.. function:: filepath_to_uri(path)
View
@@ -348,6 +348,9 @@ Requests and Responses
* The :attr:`HttpResponse.charset <django.http.HttpResponse.charset>` attribute
was added.
+* ``WSGIRequestHandler`` now follows :rfc:`3987#section-3.2` when converting
+ URI to IRI, using :func:`~django.utils.encoding.uri_to_iri`.
+
Tests
^^^^^
View
@@ -161,3 +161,28 @@ class HandlerSuspiciousOpsTest(TestCase):
def test_suspiciousop_in_view_returns_400(self):
response = self.client.get('/suspicious/')
self.assertEqual(response.status_code, 400)
+
+
+@override_settings(ROOT_URLCONF='handlers.urls')
+class HandlerNotFoundTest(TestCase):
+
+ def test_invalid_urls(self):
+ response = self.client.get('~%A9helloworld')
+ self.assertEqual(response.status_code, 404)
+ self.assertContains(response, '~%A9helloworld', status_code=404)
+
+ response = self.client.get('d%aao%aaw%aan%aal%aao%aaa%aad%aa/')
+ self.assertEqual(response.status_code, 404)
+ self.assertContains(response, 'd%AAo%AAw%AAn%AAl%AAo%AAa%AAd%AA', status_code=404)
+
+ response = self.client.get('/%E2%99%E2%99%A5/')
+ self.assertEqual(response.status_code, 404)
+ self.assertContains(response, '%E2%99\u2665', status_code=404)
+
+ response = self.client.get('/%E2%98%8E%E2%A9%E2%99%A5/')
+ self.assertEqual(response.status_code, 404)
+ self.assertContains(response, '\u260e%E2%A9\u2665', status_code=404)
+
+ def test_environ_path_info_type(self):
+ environ = RequestFactory().get('/%E2%A8%87%87%A5%E2%A8%A0').environ
+ self.assertIsInstance(environ['PATH_INFO'], six.text_type)
@@ -5,8 +5,8 @@
import datetime
from django.utils import six
-from django.utils.encoding import (filepath_to_uri, force_bytes,
- force_text, iri_to_uri, python_2_unicode_compatible)
+from django.utils.encoding import (filepath_to_uri, force_bytes, force_text,
+ iri_to_uri, uri_to_iri)
from django.utils.http import urlquote_plus
@@ -40,29 +40,67 @@ def test_force_bytes_strings_only(self):
today = datetime.date.today()
self.assertEqual(force_bytes(today, strings_only=True), today)
+
+class TestRFC3987IEncodingUtils(unittest.TestCase):
+
def test_filepath_to_uri(self):
self.assertEqual(filepath_to_uri('upload\\чубака.mp4'),
'upload/%D1%87%D1%83%D0%B1%D0%B0%D0%BA%D0%B0.mp4')
self.assertEqual(filepath_to_uri('upload\\чубака.mp4'.encode('utf-8')),
'upload/%D1%87%D1%83%D0%B1%D0%B0%D0%BA%D0%B0.mp4')
def test_iri_to_uri(self):
- self.assertEqual(iri_to_uri('red%09ros\xe9#red'),
- 'red%09ros%C3%A9#red')
+ cases = [
+ # Valid UTF-8 sequences are encoded.
+ ('red%09rosé#red', 'red%09ros%C3%A9#red'),
+ ('/blog/for/Jürgen Münster/', '/blog/for/J%C3%BCrgen%20M%C3%BCnster/'),
+ ('locations/%s' % urlquote_plus('Paris & Orléans'), 'locations/Paris+%26+Orl%C3%A9ans'),
+
+ # Reserved chars remain unescaped.
+ ('%&', '%&'),
+ ('red&♥ros%#red', 'red&%E2%99%A5ros%#red'),
+ ]
+
+ for iri, uri in cases:
+ self.assertEqual(iri_to_uri(iri), uri)
+
+ # Test idempotency.
+ self.assertEqual(iri_to_uri(iri_to_uri(iri)), uri)
+
+ def test_uri_to_iri(self):
+ cases = [
+ # Valid UTF-8 sequences are decoded.
+ ('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
+ ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
+
+ # Broken UTF-8 sequences remain escaped.
+ ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
+ ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
+ ('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'),
+ ('/%E2%E2%99%A5%E2%99%A5%99/', '/%E2♥♥%99/'),
+ ('/%E2%99%A5%E2%99%A5/?utf8=%9C%93%E2%9C%93%9C%93', '/♥♥/?utf8=%9C%93✓%9C%93'),
+ ]
- self.assertEqual(iri_to_uri('/blog/for/J\xfcrgen M\xfcnster/'),
- '/blog/for/J%C3%BCrgen%20M%C3%BCnster/')
+ for uri, iri in cases:
+ self.assertEqual(uri_to_iri(uri), iri)
- self.assertEqual(iri_to_uri('locations/%s' % urlquote_plus('Paris & Orl\xe9ans')),
- 'locations/Paris+%26+Orl%C3%A9ans')
+ # Test idempotency.
+ self.assertEqual(uri_to_iri(uri_to_iri(uri)), iri)
- def test_iri_to_uri_idempotent(self):
- self.assertEqual(iri_to_uri(iri_to_uri('red%09ros\xe9#red')),
- 'red%09ros%C3%A9#red')
+ def test_complementarity(self):
+ cases = [
+ ('/blog/for/J%C3%BCrgen%20M%C3%BCnster/', '/blog/for/J\xfcrgen M\xfcnster/'),
+ ('%&', '%&'),
+ ('red&%E2%99%A5ros%#red', 'red&♥ros%#red'),
+ ('/%E2%99%A5%E2%99%A5/', '/♥♥/'),
+ ('/%E2%99%A5%E2%99%A5/?utf8=%E2%9C%93', '/♥♥/?utf8=✓'),
+ ('/%AAd%AAj%AAa%AAn%AAg%AAo%AA/', '/%AAd%AAj%AAa%AAn%AAg%AAo%AA/'),
+ ('/%E2%99%A5%E2%E2%99%A5/', '/♥%E2♥/'),
+ ('/%E2%99%A5%E2%99%E2%99%A5/', '/♥%E2%99♥/'),
+ ('/%E2%E2%99%A5%E2%99%A5%99/', '/%E2♥♥%99/'),
+ ('/%E2%99%A5%E2%99%A5/?utf8=%9C%93%E2%9C%93%9C%93', '/♥♥/?utf8=%9C%93✓%9C%93'),
+ ]
- @unittest.skipIf(six.PY3, "tests a class not defining __str__ under Python 2")
- def test_decorated_class_without_str(self):
- with self.assertRaises(ValueError):
- @python_2_unicode_compatible
- class NoStr(object):
- pass
+ for uri, iri in cases:
+ self.assertEqual(iri_to_uri(uri_to_iri(uri)), uri)
+ self.assertEqual(uri_to_iri(iri_to_uri(iri)), iri)

2 comments on commit 10b17a2

@coder9042

This comment has been minimized.

Show comment Hide comment
@coder9042

coder9042 Oct 15, 2014

Contributor

@loic Thanks for corrections and committing the patch.

Contributor

coder9042 replied Oct 15, 2014

@loic Thanks for corrections and committing the patch.

@loic

This comment has been minimized.

Show comment Hide comment
@loic

loic Oct 15, 2014

Member

My pleasure, thank you for the patch!

Member

loic replied Oct 15, 2014

My pleasure, thank you for the patch!

Please sign in to comment.