Simplified smart_urlquote and added some basic tests.

django · Jul 28, 2013 · b70c371 · b70c371
1 parent 0d0ccf8
commit b70c371
Show file tree

Hide file tree

Showing 3 changed files with 21 additions and 12 deletions.
diff --git a/django/utils/html.py b/django/utils/html.py
@@ -4,13 +4,13 @@
 
 import re
 try:
-    from urllib.parse import quote, urlsplit, urlunsplit
+    from urllib.parse import quote, unquote, urlsplit, urlunsplit
 except ImportError:     # Python 2
-    from urllib import quote
+    from urllib import quote, unquote
     from urlparse import urlsplit, urlunsplit
 
 from django.utils.safestring import SafeData, mark_safe
-from django.utils.encoding import force_bytes, force_text
+from django.utils.encoding import force_text, force_str
 from django.utils.functional import allow_lazy
 from django.utils import six
 from django.utils.text import normalize_newlines
@@ -26,7 +26,6 @@
 DOTS = ['&middot;', '*', '\u2022', '&#149;', '&bull;', '&#8226;']
 
 unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
-unquoted_percents_re = re.compile(r'%(?![0-9A-Fa-f]{2})')
 word_split_re = re.compile(r'(\s+)')
 simple_url_re = re.compile(r'^https?://\[?\w', re.IGNORECASE)
 simple_url_2_re = re.compile(r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)$', re.IGNORECASE)
@@ -185,11 +184,9 @@ def smart_urlquote(url):
         # invalid IPv6 URL (normally square brackets in hostname part).
         pass
 
-    # An URL is considered unquoted if it contains no % characters or
-    # contains a % not followed by two hexadecimal digits. See #9655.
-    if '%' not in url or unquoted_percents_re.search(url):
-        # See http://bugs.python.org/issue2637
-        url = quote(force_bytes(url), safe=b'!*\'();:@&=+$,/?#[]~')
+    url = unquote(force_str(url))
+    # See http://bugs.python.org/issue2637
+    url = quote(url, safe=b'!*\'();:@&=+$,/?#[]~')
 
     return force_text(url)
 

diff --git a/tests/defaultfilters/tests.py b/tests/defaultfilters/tests.py
@@ -249,9 +249,10 @@ def test_urlize(self):
             '<a href="https://google.com" rel="nofollow">https://google.com</a>')
 
         # Check urlize doesn't overquote already quoted urls - see #9655
-        self.assertEqual(urlize('http://hi.baidu.com/%D6%D8%D0%C2%BF'),
-            '<a href="http://hi.baidu.com/%D6%D8%D0%C2%BF" rel="nofollow">'
-            'http://hi.baidu.com/%D6%D8%D0%C2%BF</a>')
+        # The test string is the URL-quoted version of 'http://hi.baidu.com/重新开始'
+        self.assertEqual(urlize('http://hi.baidu.com/%E9%87%8D%E6%96%B0%E5%BC%80%E5%A7%8B'),
+            '<a href="http://hi.baidu.com/%E9%87%8D%E6%96%B0%E5%BC%80%E5%A7%8B" rel="nofollow">'
+            'http://hi.baidu.com/%E9%87%8D%E6%96%B0%E5%BC%80%E5%A7%8B</a>')
         self.assertEqual(urlize('www.mystore.com/30%OffCoupons!'),
             '<a href="http://www.mystore.com/30%25OffCoupons!" rel="nofollow">'
             'www.mystore.com/30%OffCoupons!</a>')

diff --git a/tests/utils_tests/test_html.py b/tests/utils_tests/test_html.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 
 from datetime import datetime
@@ -181,3 +182,13 @@ def test_remove_tags(self):
         )
         for value, tags, output in items:
             self.assertEqual(f(value, tags), output)
+
+    def test_smart_urlquote(self):
+        quote = html.smart_urlquote
+        # Ensure that IDNs are properly quoted
+        self.assertEqual(quote('http://öäü.com/'), 'http://xn--4ca9at.com/')
+        self.assertEqual(quote('http://öäü.com/öäü/'), 'http://xn--4ca9at.com/%C3%B6%C3%A4%C3%BC/')
+        # Ensure that everything unsafe is quoted; the characters !*'();:@&=+$,/?#[]~ are considered safe as per RFC
+        self.assertEqual(quote('http://example.com/path/öäü/'), 'http://example.com/path/%C3%B6%C3%A4%C3%BC/')
+        self.assertEqual(quote('http://example.com/%C3%B6/ä/'), 'http://example.com/%C3%B6/%C3%A4/')
+        self.assertEqual(quote('http://example.com/?x=1&y=2'), 'http://example.com/?x=1&y=2')