Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Moved text plugin cleaning to cms.plugins.text.models.AbstractText.clean

Moved html cleaning to cms.utils.html.clean_html
  • Loading branch information...
commit 7610771fa254f902599e53f1bd1d22714d387b4a 1 parent 9ca7738
@ojii ojii authored
View
32 cms/plugins/text/forms.py
@@ -1,29 +1,7 @@
from cms.plugins.text.models import Text
+from cms.utils.html import clean_html
from django import forms
from django.forms.models import ModelForm
-import html5lib
-from html5lib import sanitizer
-
-
-def _get_inner_body(doc):
- # find 'body'
- def _rec(node):
- if node.type == 5: # Element Type
- if node.name == 'body': # the body element
- return node
- for child in node.childNodes:
- childfound = _rec(child)
- if childfound:
- return childfound
- return None
- body = _rec(doc)
- # if the first element after <body> is a html tag, this returns an Element
- # instance, otherwise a (unicode) string, this is why we need to check
- # the output of this and potentially call .toxml() again.
- out = reduce(lambda x,y:x.toxml()+y.toxml(), body.childNodes)
- if isinstance(out, basestring):
- return out
- return out.toxml()
class TextForm(ModelForm):
body = forms.CharField()
@@ -31,11 +9,3 @@ class TextForm(ModelForm):
class Meta:
model = Text
exclude = ('page', 'position', 'placeholder', 'language', 'plugin_type')
-
- parser = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
-
- def clean_body(self):
- data = self.cleaned_data['body']
- doc = self.parser.parse(data)
- html = _get_inner_body(doc)
- return html
View
13 cms/plugins/text/models.py
@@ -1,11 +1,11 @@
-from django.db import models
-from django.utils.translation import ugettext_lazy as _
from cms.models import CMSPlugin
+from cms.plugins.text.utils import (plugin_admin_html_to_tags,
+ plugin_tags_to_admin_html, plugin_tags_to_id_list, replace_plugin_tags)
+from cms.utils.html import clean_html
+from django.db import models
from django.utils.html import strip_tags
from django.utils.text import truncate_words
-from cms.plugins.text.utils import plugin_admin_html_to_tags,\
- plugin_tags_to_admin_html, plugin_tags_to_id_list,\
- replace_plugin_tags
+from django.utils.translation import ugettext_lazy as _
_old_tree_cache = {}
@@ -34,6 +34,9 @@ def _get_body_admin(self):
def __unicode__(self):
    """
    Return a short preview of the body for display purposes.

    Tags are stripped from ``self.body``, the result is truncated to three
    words and then cut to at most 30 characters before "..." is appended.
    """
    plain_text = strip_tags(self.body)
    preview = truncate_words(plain_text, 3)[:30]
    return u"%s" % (preview + "...")
def clean(self):
    """
    Replace ``self.body`` with its cleaned version.

    The body is passed through ``clean_html`` with ``full=False``, so only
    the markup inside ``<body>`` is stored back on the instance.
    """
    sanitized_body = clean_html(self.body, full=False)
    self.body = sanitized_body
+
def clean_plugins(self):
ids = plugin_tags_to_id_list(self.body)
plugins = CMSPlugin.objects.filter(parent=self)
View
41 cms/utils/html.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+import html5lib
+from html5lib import sanitizer
+
+
+DEFAULT_PARSER = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
+
+
+def _get_inner_body(doc):
+ # find 'body'
+ def _rec(node):
+ if node.type == 5: # Element Type
+ if node.name == 'body': # the body element
+ return node
+ for child in node.childNodes:
+ childfound = _rec(child)
+ if childfound:
+ return childfound
+ return None
+ body = _rec(doc)
+ # if the first element after <body> is a html tag, this returns an Element
+ # instance, otherwise a (unicode) string, this is why we need to check
+ # the output of this and potentially call .toxml() again.
+ out = reduce(lambda x,y:x.toxml()+y.toxml(), body.childNodes)
+ if isinstance(out, basestring):
+ return out
+ return out.toxml()
+
+
def clean_html(data, full=True, parser=DEFAULT_PARSER):
    """
    Sanitize an HTML string by running it through an html5lib parser.

    ``data`` is handed to ``parser.parse``; unless another parser is given,
    the module-level ``DEFAULT_PARSER`` (built with html5lib's
    ``HTMLSanitizer`` tokenizer) is used.

    When ``full`` is true the whole parsed document is serialized with
    ``toxml()``.  Otherwise only the markup found inside ``<body>`` is
    returned, without the ``<body>`` tags themselves.
    """
    document = parser.parse(data)
    if not full:
        return _get_inner_body(document)
    return document.toxml()
Please sign in to comment.
Something went wrong with that request. Please try again.