Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Fixed #7267 - UnicodeDecodeError in clean_html

Thanks to Nikolay for the report, and gav and aaugustin for the patch.

git-svn-id: http://code.djangoproject.com/svn/django/trunk@16118 bcc190cf-cafb-0310-a4f2-bffc1f526a37
  • Loading branch information...
commit cf11e3789b6643cf451d79d675a97c4de94542b0 1 parent 2ac4f17
Luke Plant authored April 28, 2011
8  django/utils/html.py
@@ -13,7 +13,7 @@
13 13
 TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']
14 14
 
15 15
 # List of possible strings used for bullets in bulleted lists.
16  
-DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']
  16
+DOTS = [u'&middot;', u'*', u'\u2022', u'&#149;', u'&bull;', u'&#8226;']
17 17
 
18 18
 unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
19 19
 word_split_re = re.compile(r'(\s+)')
@@ -180,13 +180,13 @@ def clean_html(text):
180 180
     text = html_gunk_re.sub('', text)
181 181
     # Convert hard-coded bullets into HTML unordered lists.
182 182
     def replace_p_tags(match):
183  
-        s = match.group().replace('</p>', '</li>')
  183
+        s = match.group().replace(u'</p>', u'</li>')
184 184
         for d in DOTS:
185  
-            s = s.replace('<p>%s' % d, '<li>')
  185
+            s = s.replace(u'<p>%s' % d, u'<li>')
186 186
         return u'<ul>\n%s\n</ul>' % s
187 187
     text = hard_coded_bullets_re.sub(replace_p_tags, text)
188 188
     # Remove stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the bottom
189 189
     # of the text.
190  
-    text = trailing_empty_content_re.sub('', text)
  190
+    text = trailing_empty_content_re.sub(u'', text)
191 191
     return text
192 192
 clean_html = allow_lazy(clean_html, unicode)
12  tests/regressiontests/utils/html.py
@@ -121,3 +121,15 @@ def test_escapejs(self):
121 121
         )
122 122
         for value, output in items:
123 123
             self.check_output(f, value, output)
  124
+
  125
+    def test_clean_html(self):
  126
+        f = html.clean_html
  127
+        items = (
  128
+            (u'<p>I <i>believe</i> in <b>semantic markup</b>!</p>', u'<p>I <em>believe</em> in <strong>semantic markup</strong>!</p>'),
  129
+            (u'I escape & I don\'t <a href="#" target="_blank">target</a>', u'I escape &amp; I don\'t <a href="#" >target</a>'),
  130
+            (u'<p>I kill whitespace</p><br clear="all"><p>&nbsp;</p>', u'<p>I kill whitespace</p>'),
  131
+            # also a regression test for #7267: this used to raise an UnicodeDecodeError
  132
+            (u'<p>* foo</p><p>* bar</p>', u'<ul>\n<li> foo</li><li> bar</li>\n</ul>'),
  133
+        )
  134
+        for value, output in items:
  135
+            self.check_output(f, value, output)

0 notes on commit cf11e37

Please sign in to comment.
Something went wrong with that request. Please try again.