Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP

Loading…

change html_minify to recursive function #69

Merged
merged 3 commits into from

2 participants

@hrbonz

As per discussions in #21, I implemented a minify version using recursive functions to walk and clean the tree. This method is, I think, more robust and happens to be faster.

@hrbonz

I found a bug in the code, let me change this before merging

@hrbonz

Made a little fix, good to get reviewed

@andrewsmedina

@hrbonz thanks, I will review :)

@hrbonz

A few issues/PRs could be closed if this is accepted: #66 and #68, for example.

@hrbonz

Any chance to look at this?

@andrewsmedina andrewsmedina merged commit b8318a2 into cobrateam:master

1 check passed

Details default The Travis CI build passed
@andrewsmedina

Thanks @hrbonz

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Commits on Feb 18, 2014
  1. @hrbonz
Commits on Feb 19, 2014
  1. @hrbonz
Commits on Feb 20, 2014
  1. @hrbonz
This page is out of date. Refresh to see the latest.
View
206 htmlmin/minify.py
@@ -7,63 +7,161 @@
import re
import bs4
-
-from HTMLParser import HTMLParser
-from .util import force_decode, between_two_tags
-
-EXCLUDE_TAGS = ("pre", "script", "textarea",)
-
-TAGS_PATTERN = "<%s>%d</%s>"
-
-cond_regex = re.compile(r"<!--\[if .*\]>.*<!\[endif\]-->")
-
-
-def is_conditional_comment(text):
- return cond_regex.match(text)
+from .util import force_decode
+
+EXCLUDE_TAGS = ("pre", "script", "textarea")
+# element list coming from
+# https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/HTML5/HTML5_element_list
+# combining text-level semantics & edits
+TEXT_FLOW = ("a", "em", "strong", "small", "s", "cite", "q", "dfn", "abbr", "data", "time", "code", "var", "samp", "kbd", "sub", "i", "b", "u", "mark", "ruby", "rt", "rp", "bdi", "bdo", "span", "br", "wbr", "ins", "del")
+
+# fold the doctype element: if True, no newline is added after the
+# doctype element. If False, a newline will be inserted
+FOLD_DOCTYPE = True
+re_multi_space = re.compile(r'\s+', re.MULTILINE|re.UNICODE)
+re_single_nl = re.compile(r'^\n$', re.MULTILINE|re.UNICODE)
+re_only_space = re.compile(r'^\s+$', re.MULTILINE|re.UNICODE)
+re_start_space = re.compile(r'^\s+', re.MULTILINE|re.UNICODE)
+re_end_space = re.compile(r'\s+$', re.MULTILINE|re.UNICODE)
+# see http://en.wikipedia.org/wiki/Conditional_comment
+re_cond_comment = re.compile(r'\[if .*\]>.*<!\[endif\]',
+ re.MULTILINE|re.DOTALL|re.UNICODE)
+re_cond_comment_start_space = re.compile(r'(\[if .*\]>)\s+',
+ re.MULTILINE|re.DOTALL|re.UNICODE)
+re_cond_comment_end_space = re.compile(r'\s+(<!\[endif\])',
+ re.MULTILINE|re.DOTALL|re.UNICODE)
def html_minify(html_code, ignore_comments=True, parser="html5lib"):
html_code = force_decode(html_code)
soup = bs4.BeautifulSoup(html_code, parser)
- html_code = unicode(soup)
- exclude_tags = {}
-
- for tag in EXCLUDE_TAGS:
- exclude_tags[tag] = [unicode(e) for e in soup.findAll(name=tag)
- if len(e.text) > 0]
-
- for index, elem in enumerate(exclude_tags[tag]):
- html_code = html_code.replace(elem,
- TAGS_PATTERN % (tag, index, tag))
-
- soup = bs4.BeautifulSoup(html_code, parser)
-
- if ignore_comments:
- f = lambda text: isinstance(text, bs4.Comment) and not \
- cond_regex.match(text.output_ready())
- [comment.extract() for comment in soup.findAll(text=f)]
-
- html_code = unicode(soup)
- html_code = html_code.replace(" \n", " ")
- lines = html_code.split("\n")
- minified_lines = []
-
- for index, line in enumerate(lines):
- minified_line = line.strip()
- if not between_two_tags(minified_line, minified_lines, index):
- minified_line = " %s" % minified_line
- minified_lines.append(unicode(minified_line))
- if minified_line.endswith("</a>") and \
- not lines[index + 1].startswith("</body>"):
- minified_lines.append(u" ")
-
- spaces_pattern = re.compile(r"\s+")
- content = "".join(minified_lines)
- content = spaces_pattern.sub(" ", content)
-
- for tag in EXCLUDE_TAGS:
- for index, e in enumerate(exclude_tags[tag]):
- content = content.replace(TAGS_PATTERN % (tag, index, tag), e)
-
- return content
+ mini_soup = space_minify(soup, ignore_comments)
+ if FOLD_DOCTYPE is True:
+ # monkey patching to remove new line after doctype
+ bs4.element.Doctype.SUFFIX = u'>'
+ return unicode(mini_soup)
+
+def space_minify(soup, ignore_comments=True):
+ """recursive function to reduce space characters in html code.
+
+ :param soup: a BeautifulSoup of the code to reduce
+ :type soup: bs4.BeautifulSoup
+ :param ignore_comments: whether or not to keep comments in the
+ result
+ :type ignore_comments: bool
+ """
+ # if tag excluded from minification, just pass
+ if str(soup.name) in EXCLUDE_TAGS:
+ return
+
+ # loop through the children of this element
+ if hasattr(soup, 'children'):
+ for child in soup.children:
+ space_minify(child, ignore_comments)
+
+ # if the element is a string ...
+ if is_navstr(soup):
+ # ... but not a comment, CData, Doctype or others (see
+ # bs4/element.py for list).
+ if not is_prestr(soup):
+ # reduce multiple space characters
+ new_string = re_multi_space.sub(' ', soup.string)
+ (prev_flow, next_flow) = is_inflow(soup)
+ # if the string is in a flow of text, don't remove lone
+ # spaces
+ if prev_flow and next_flow:
+ new_string = re_only_space.sub(' ', new_string)
+ # else, remove spaces, they are between grouping, section,
+ # metadata or other types of block
+ else:
+ new_string = re_only_space.sub('', new_string)
+ # if the previous element is not text then remove leading
+ # spaces
+ if prev_flow:
+ new_string = re_start_space.sub(' ', new_string)
+ else:
+ new_string = re_start_space.sub('', new_string)
+ # if the next element is not text then remove trailing
+ # spaces
+ if next_flow:
+ new_string = re_end_space.sub(' ', new_string)
+ else:
+ new_string = re_end_space.sub('', new_string)
+ # bs4 sometimes adds a lone newline in the body
+ new_string = re_single_nl.sub('', new_string)
+ soup.string.replace_with(new_string)
+ # Conditional comment content is HTML code so it should be
+ # minified
+ elif is_cond_comment(soup):
+ new_string = re_multi_space.sub(' ', soup.string)
+ new_string = re_cond_comment_start_space.sub(r'\1',
+ new_string)
+ new_string = re_cond_comment_end_space.sub(r'\1', new_string)
+ new_comment = bs4.element.Comment(new_string)
+ soup.string.replace_with(new_comment)
+ # if ignore_comments is True and this is a comment but not a
+ # conditional comment
+ elif ignore_comments == True and is_comment(soup):
+ # remove the element
+ soup.string.replace_with(u'')
+ return soup
+
+def is_navstr(soup):
+ """test whether an element is a NavigableString or not, return a
+ boolean.
+
+ :param soup: a BeautifulSoup of the code to reduce
+ :type soup: bs4.BeautifulSoup
+ """
+ return isinstance(soup, bs4.element.NavigableString)
+
+def is_prestr(soup):
+ """test whether an element is a PreformattedString or not, return a
+ boolean.
+
+ :param soup: a BeautifulSoup of the code to reduce
+ :type soup: bs4.BeautifulSoup
+ """
+ return isinstance(soup, bs4.element.PreformattedString)
+
+def is_comment(soup):
+ """test whether an element is a Comment, return a boolean.
+
+ :param soup: a BeautifulSoup of the code to reduce
+ :type soup: bs4.BeautifulSoup
+ """
+ return isinstance(soup, bs4.element.Comment) \
+ and not re_cond_comment.search(soup.string)
+
+def is_cond_comment(soup):
+ """test whether an element is a conditional comment, return a
+ boolean.
+
+ :param soup: a BeautifulSoup of the code to reduce
+ :type soup: bs4.BeautifulSoup
+ """
+ return isinstance(soup, bs4.element.Comment) \
+ and re_cond_comment.search(soup.string)
+
+def is_inflow(soup):
+ """test whether an element belongs to a text flow, returns a tuple
+ of two booleans describing the flow around the element. The first
+ boolean represents the flow before the element, the second boolean
+ represents the flow after the element.
+
+ :param soup: a BeautifulSoup of the code to reduce
+ :type soup: bs4.BeautifulSoup
+ """
+ if soup.previous_sibling is not None and \
+ soup.previous_sibling.name in TEXT_FLOW:
+ prev_flow = True
+ else:
+ prev_flow = False
+ if soup.next_sibling is not None and \
+ soup.next_sibling.name in TEXT_FLOW:
+ next_flow = True
+ else:
+ next_flow = False
+
+ return (prev_flow, next_flow)
View
4 htmlmin/tests/resources/line_break.html
@@ -1,2 +1,4 @@
<html><body><p>this is a
- multiline paragraph</p></body></html>
+ multiline paragraph</p>
+<p>this is another
+multiline paragraph</p></body></html>
View
2  htmlmin/tests/resources/line_break_minified.html
@@ -1 +1 @@
-<html><head></head><body><p>this is a multiline paragraph</p></body></html>
+<html><head></head><body><p>this is a multiline paragraph</p><p>this is another multiline paragraph</p></body></html>
View
2  htmlmin/tests/resources/non_ascii_in_excluded_element_minified.html
@@ -1 +1 @@
-<html><head><meta charset="utf-8"/><title>Unicode</title></head><body><p>This is a UTF-8 string with non ascii characters: ɐ ɑ ɒ ɓ ɔ ɕ ɖ ɗ ɘ ə ɚ ɛ ɜ </p><textarea>and smore more non ascii inside an excluded element ʘ ʙ ʚ ʛ ʜ ʝ ʞ ʟ ʠ ʡ ʢ</textarea></body></html>
+<html><head><meta charset="utf-8"/><title>Unicode</title></head><body><p>This is a UTF-8 string with non ascii characters: ɐ ɑ ɒ ɓ ɔ ɕ ɖ ɗ ɘ ə ɚ ɛ ɜ</p><textarea>and smore more non ascii inside an excluded element ʘ ʙ ʚ ʛ ʜ ʝ ʞ ʟ ʠ ʡ ʢ</textarea></body></html>
View
2  htmlmin/tests/resources/non_ascii_minified.html
@@ -1 +1 @@
-<html><head><meta charset="utf-8"/><title>Unicode</title></head><body><p>This is a UTF-8 string with non ascii characters: ɐ ɑ ɒ ɓ ɔ ɕ ɖ ɗ ɘ ə ɚ ɛ ɜ </p></body></html>
+<html><head><meta charset="utf-8"/><title>Unicode</title></head><body><p>This is a UTF-8 string with non ascii characters: ɐ ɑ ɒ ɓ ɔ ɕ ɖ ɗ ɘ ə ɚ ɛ ɜ</p></body></html>
View
2  htmlmin/tests/resources/with_conditional_comments_minified.html
@@ -1 +1 @@
-<!DOCTYPE html><html lang="pt-BR"><head><meta charset="utf-8"/><title>Page Title</title></head><body><!--[if IE]> this conditional comment should be included, because I like it! <![endif]--><h1>Header</h1></body></html>
+<!DOCTYPE html><html lang="pt-BR"><head><meta charset="utf-8"/><title>Page Title</title></head><body><!--[if IE]>this conditional comment should be included, because I like it!<![endif]--><h1>Header</h1></body></html>
View
14 htmlmin/tests/resources/with_multiple_line_conditional_comments.html
@@ -0,0 +1,14 @@
+<!DOCTYPE html>
+<html lang="en">
+ <head>
+ <meta charset="utf-8">
+ <title>Page Title</title>
+ <!--[if IE]>
+ <link rel="stylesheet" type="text/css" href="all-ie-only.css" />
+ <![endif]-->
+ </head>
+
+ <body>
+ <h1>Header</h1>
+ </body>
+</html>
View
1  htmlmin/tests/resources/with_multiple_line_conditional_comments_minified.html
@@ -0,0 +1 @@
+<!DOCTYPE html><html lang="en"><head><meta charset="utf-8"/><title>Page Title</title><!--[if IE]><link rel="stylesheet" type="text/css" href="all-ie-only.css" /><![endif]--></head><body><h1>Header</h1></body></html>
View
16 htmlmin/tests/test_middleware.py
@@ -39,7 +39,7 @@ def test_should_minify_response_when_mime_type_is_html(self):
RequestMock(), response_mock,
)
- minified = "<html><head></head><body>some text here </body></html>"
+ minified = "<html><head></head><body>some text here</body></html>"
self.assertEqual(minified, response.content)
def test_should_minify_with_any_charset(self):
@@ -49,7 +49,7 @@ def test_should_minify_with_any_charset(self):
RequestMock(), response_mock,
)
- minified = "<html><head></head><body>some text here </body></html>"
+ minified = "<html><head></head><body>some text here</body></html>"
self.assertEqual(minified, response.content)
def test_should_not_minify_not_html_content(self):
@@ -74,7 +74,7 @@ def test_should_minify_if_exclude_from_minifying_is_unset(self):
old = settings.EXCLUDE_FROM_MINIFYING
del settings.EXCLUDE_FROM_MINIFYING
- minified = "<html><head></head><body>some text here </body></html>"
+ minified = "<html><head></head><body>some text here</body></html>"
response = HtmlMinifyMiddleware().process_response(
RequestMock(), ResponseMock(),
)
@@ -92,7 +92,7 @@ def test_should_not_minify_response_with_minify_response_false(self):
self.assertEqual(html_not_minified, response.content)
def test_should_minify_response_with_minify_response_true(self):
- minified = "<html><head></head><body>some text here </body></html>"
+ minified = "<html><head></head><body>some text here</body></html>"
response_mock = ResponseMock()
response_mock.minify_response = True
response = HtmlMinifyMiddleware().process_response(
@@ -105,7 +105,7 @@ def test_should_keep_comments_when_they_are_enabled(self):
settings.KEEP_COMMENTS_ON_MINIFYING = True
minified = "<html><!-- some comment --><head></head><body>" + \
- "some text here </body></html>"
+ "some text here</body></html>"
response_mock = ResponseWithCommentMock()
response = HtmlMinifyMiddleware().process_response(
RequestMock(), response_mock,
@@ -118,7 +118,7 @@ def test_should_remove_comments_they_are_disabled(self):
old = settings.KEEP_COMMENTS_ON_MINIFYING
settings.KEEP_COMMENTS_ON_MINIFYING = False
- minified = "<html><head></head><body>some text here </body></html>"
+ minified = "<html><head></head><body>some text here</body></html>"
response_mock = ResponseWithCommentMock()
response = HtmlMinifyMiddleware().process_response(
RequestMock(), response_mock,
@@ -131,7 +131,7 @@ def test_should_remove_comments_when_the_setting_is_not_specified(self):
old = settings.KEEP_COMMENTS_ON_MINIFYING
del settings.KEEP_COMMENTS_ON_MINIFYING
- minified = "<html><head></head><body>some text here </body></html>"
+ minified = "<html><head></head><body>some text here</body></html>"
response_mock = ResponseWithCommentMock()
response = HtmlMinifyMiddleware().process_response(
RequestMock(), response_mock,
@@ -174,7 +174,7 @@ def test_should_minify_when_DEBUG_is_false_and_MINIFY_is_unset(self):
del settings.HTML_MINIFY
settings.DEBUG = False
- minified = "<html><head></head><body>some text here </body></html>"
+ minified = "<html><head></head><body>some text here</body></html>"
response = HtmlMinifyMiddleware().process_response(
RequestMock(), ResponseMock(),
View
6 htmlmin/tests/test_minify.py
@@ -72,7 +72,7 @@ def test_should_not_drop_blank_lines_from_the_begin_of_a_textarea(self):
def test_html_should_be_minified(self):
html = "<html> <body>some text here</body> </html>"
- minified = "<html><head></head><body>some text here </body></html>"
+ minified = "<html><head></head><body>some text here</body></html>"
self.assertEqual(minified, html_minify(html))
def test_minify_function_should_return_a_unicode_object(self):
@@ -113,6 +113,10 @@ def test_should_not_exclude_conditional_comments(self):
html, minified = self._normal_and_minified('with_conditional_comments')
self.assertEqual(minified, html_minify(html))
+ def test_should_not_rm_multiline_conditional_comments(self):
+ html, minified = self._normal_and_minified('with_multiple_line_conditional_comments')
+ self.assertEqual(minified, html_minify(html))
+
def test_should_touch_attributes_only_on_tags(self):
html = '<html>\n <body>I selected you!</body>\n </html>'
minified = '<html><head></head><body>I selected you!</body></html>'
Something went wrong with that request. Please try again.