Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Fixed #19237 -- Used HTML parser to strip tags

The regex method used until now for the strip_tags utility is fast,
but subject to flaws and security issues. Consensus and good
practice lead us to use a slower but safer method.
  • Loading branch information...
commit dc51ec8bc214cf60ebb99732363624c23df8005f 1 parent 01948e3
Claude Paroz authored
28  django/utils/html.py
@@ -16,6 +16,9 @@
16 16
 from django.utils import six
17 17
 from django.utils.text import normalize_newlines
18 18
 
  19
+from .html_parser import HTMLParser
  20
+
  21
+
19 22
 # Configuration for urlize() function.
20 23
 TRAILING_PUNCTUATION = ['.', ',', ':', ';', '.)']
21 24
 WRAPPING_PUNCTUATION = [('(', ')'), ('<', '>'), ('[', ']'), ('&lt;', '&gt;')]
@@ -33,7 +36,6 @@
33 36
 html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
34 37
 hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
35 38
 trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
36  
-strip_tags_re = re.compile(r'</?\S([^=>]*=(\s*"[^"]*"|\s*\'[^\']*\'|\S*)|[^>])*?>', re.IGNORECASE)
37 39
 
38 40
 
39 41
 def escape(text):
@@ -116,9 +118,31 @@ def linebreaks(value, autoescape=False):
116 118
     return '\n\n'.join(paras)
117 119
 linebreaks = allow_lazy(linebreaks, six.text_type)
118 120
 
  121
+
  122
+class MLStripper(HTMLParser):
  123
+    def __init__(self):
  124
+        HTMLParser.__init__(self)
  125
+        self.reset()
  126
+        self.fed = []
  127
+    def handle_data(self, d):
  128
+        self.fed.append(d)
  129
+    def handle_entityref(self, name):
  130
+        self.fed.append('&%s;' % name)
  131
+    def handle_charref(self, name):
  132
+        self.fed.append('&#%s;' % name)
  133
+    def get_data(self):
  134
+        return ''.join(self.fed)
  135
+
119 136
 def strip_tags(value):
120 137
     """Returns the given HTML with all tags stripped."""
121  
-    return strip_tags_re.sub('', force_text(value))
  138
+    s = MLStripper()
  139
+    s.feed(value)
  140
+    data = s.get_data()
  141
+    try:
  142
+        res = s.close()
  143
+    except Exception as e:
  144
+        data += s.rawdata
  145
+    return data
122 146
 strip_tags = allow_lazy(strip_tags)
123 147
 
124 148
 def remove_tags(html, tags):
8  tests/utils_tests/test_html.py
@@ -5,6 +5,7 @@
5 5
 
6 6
 from django.utils import html
7 7
 from django.utils._os import upath
  8
+from django.utils.encoding import force_text
8 9
 from django.utils.unittest import TestCase
9 10
 
10 11
 
@@ -63,10 +64,12 @@ def test_linebreaks(self):
63 64
     def test_strip_tags(self):
64 65
         f = html.strip_tags
65 66
         items = (
  67
+            ('<p>See: &#39;&eacute; is an apostrophe followed by e acute</p>',
  68
+             'See: &#39;&eacute; is an apostrophe followed by e acute'),
66 69
             ('<adf>a', 'a'),
67 70
             ('</adf>a', 'a'),
68 71
             ('<asdf><asdf>e', 'e'),
69  
-            ('<f', '<f'),
  72
+            ('hi, <f x', 'hi, <f x'),
70 73
             ('</fe', '</fe'),
71 74
             ('<x>b<y>', 'b'),
72 75
             ('a<p onclick="alert(\'<test>\')">b</p>c', 'abc'),
@@ -81,8 +84,9 @@ def test_strip_tags(self):
81 84
         for filename in ('strip_tags1.html', 'strip_tags2.txt'):
82 85
             path = os.path.join(os.path.dirname(upath(__file__)), 'files', filename)
83 86
             with open(path, 'r') as fp:
  87
+                content = force_text(fp.read())
84 88
                 start = datetime.now()
85  
-                stripped = html.strip_tags(fp.read())
  89
+                stripped = html.strip_tags(content)
86 90
                 elapsed = datetime.now() - start
87 91
             self.assertEqual(elapsed.seconds, 0)
88 92
             self.assertIn("Please try again.", stripped)

0 notes on commit dc51ec8

Please sign in to comment.
Something went wrong with that request. Please try again.