Permalink
Browse files

Fixed #19237 -- Improved strip_tags utility

The previous pattern didn't properly address cases where '>'
was present inside quoted tag content.
  • Loading branch information...
1 parent be64dd3 commit bf1871d874a371ad0ae6c7e098e7665a468dca16 @khoomeister khoomeister committed with claudep Nov 24, 2012
Showing with 5 additions and 1 deletion.
  1. +2 −1 django/utils/html.py
  2. +3 −0 tests/regressiontests/utils/html.py
View
@@ -33,6 +33,7 @@
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
+strip_tags_re = re.compile(r'</?\S([^=]*=(\s*"[^"]*"|\s*\'[^\']*\'|\S*)|[^>])*?>', re.IGNORECASE)
def escape(text):
@@ -117,7 +118,7 @@ def linebreaks(value, autoescape=False):
def strip_tags(value):
"""Returns the given HTML with all tags stripped."""
- return re.sub(r'<[^>]*?>', '', force_text(value))
+ return strip_tags_re.sub('', force_text(value))
strip_tags = allow_lazy(strip_tags)
def remove_tags(html, tags):
@@ -65,6 +65,9 @@ def test_strip_tags(self):
('<f', '<f'),
('</fe', '</fe'),
('<x>b<y>', 'b'),
+ ('a<p onclick="alert(\'<test>\')">b</p>c', 'abc'),
+ ('a<p a >b</p>c', 'abc'),
+ ('d<a:b c:d>e</p>f', 'def'),
)
for value, output in items:
self.check_output(f, value, output)

1 comment on commit bf1871d

Contributor

litchfield commented on bf1871d Apr 3, 2013

Unfortunately, this new regex causes catastrophic backtracking on some strings, which makes the whole process hang as if it were stuck in an infinite loop. Refer to my comments on #19237 for details.

Please sign in to comment.