Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

change html_minify to recursive function #69

Merged
merged 3 commits into from
Feb 24, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
206 changes: 152 additions & 54 deletions htmlmin/minify.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,63 +7,161 @@
import re

import bs4

from HTMLParser import HTMLParser

from .util import force_decode, between_two_tags

# Tag names that html_minify() swaps for placeholders before collapsing
# whitespace and restores afterwards, so their content is kept verbatim.
EXCLUDE_TAGS = ("pre", "script", "textarea",)

# Placeholder template, filled with (tag name, integer index, tag name).
TAGS_PATTERN = "<%s>%d</%s>"

# Conditional comment of the form "<!--[if ...]>...<![endif]-->"; used with
# .match(). Compiled without re.DOTALL, so "." does not cross a newline and a
# conditional comment spanning several lines is not matched.
cond_regex = re.compile(r"<!--\[if .*\]>.*<!\[endif\]-->")


def is_conditional_comment(text):
    """Match ``text`` against ``cond_regex`` from its first character.

    :param text: the string to test
    :returns: the match object when ``text`` starts with a conditional
        comment, ``None`` otherwise
    """
    match = cond_regex.match(text)
    return match
from .util import force_decode

# Tags whose content is never minified.
EXCLUDE_TAGS = ("pre", "script", "textarea")
# element list coming from
# https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/HTML5/HTML5_element_list
# combining text-level semantics & edits
# ("sup" belongs to that list next to "sub"; without it the spaces around a
# <sup> element would be stripped and the neighbouring words glued together)
TEXT_FLOW = (
    "a", "em", "strong", "small", "s", "cite", "q", "dfn", "abbr", "data",
    "time", "code", "var", "samp", "kbd", "sub", "sup", "i", "b", "u",
    "mark", "ruby", "rt", "rp", "bdi", "bdo", "span", "br", "wbr",
    "ins", "del",
)

# Fold the doctype element: if True, no newline is added after the
# doctype element. If False, a newline will be inserted.
FOLD_DOCTYPE = True

# any run of whitespace
re_multi_space = re.compile(r'\s+', re.MULTILINE | re.UNICODE)
# a line made of a single newline
re_single_nl = re.compile(r'^\n$', re.MULTILINE | re.UNICODE)
# a line made only of whitespace
re_only_space = re.compile(r'^\s+$', re.MULTILINE | re.UNICODE)
# leading / trailing whitespace
re_start_space = re.compile(r'^\s+', re.MULTILINE | re.UNICODE)
re_end_space = re.compile(r'\s+$', re.MULTILINE | re.UNICODE)

# see http://en.wikipedia.org/wiki/Conditional_comment
# The patterns below are applied to the *content* of a comment node, hence
# no "<!--" / "-->" delimiters.
re_cond_comment = re.compile(r'\[if .*\]>.*<!\[endif\]',
                             re.MULTILINE | re.DOTALL | re.UNICODE)
# Whitespace right after the opening "[if ...]>". The condition is matched
# non-greedily: with DOTALL a greedy ".*" would run to the LAST "]>" in the
# comment and strip whitespace there instead of after the opener.
re_cond_comment_start_space = re.compile(r'(\[if .*?\]>)\s+',
                                         re.MULTILINE | re.DOTALL | re.UNICODE)
# Whitespace right before the closing "<![endif]".
re_cond_comment_end_space = re.compile(r'\s+(<!\[endif\])',
                                       re.MULTILINE | re.DOTALL | re.UNICODE)


# Public entry point: decode ``html_code``, parse it with BeautifulSoup using
# ``parser`` and return minified markup as a unicode string.
# NOTE(review): this span reads like a rendered diff whose +/- markers and
# indentation were stripped. A placeholder/line-joining body that ends at
# ``return content`` is immediately followed by a second body built on
# space_minify(). Taken literally, everything after ``return content`` is
# unreachable -- confirm against the real file which body is current.
def html_minify(html_code, ignore_comments=True, parser="html5lib"):
html_code = force_decode(html_code)
soup = bs4.BeautifulSoup(html_code, parser)
# --- first body: placeholder substitution + line joining ---
html_code = unicode(soup)
exclude_tags = {}

# Swap every non-empty EXCLUDE_TAGS element for a "<tag>N</tag>" placeholder
# (TAGS_PATTERN) so the whitespace collapsing below cannot touch its content.
for tag in EXCLUDE_TAGS:
exclude_tags[tag] = [unicode(e) for e in soup.findAll(name=tag)
if len(e.text) > 0]

for index, elem in enumerate(exclude_tags[tag]):
html_code = html_code.replace(elem,
TAGS_PATTERN % (tag, index, tag))

# Re-parse the markup now that it carries the placeholders.
soup = bs4.BeautifulSoup(html_code, parser)

# Remove comment nodes, except those whose rendered form matches cond_regex.
if ignore_comments:
f = lambda text: isinstance(text, bs4.Comment) and not \
cond_regex.match(text.output_ready())
[comment.extract() for comment in soup.findAll(text=f)]

html_code = unicode(soup)
html_code = html_code.replace(" \n", " ")
lines = html_code.split("\n")
minified_lines = []

# Strip each line; a single space is put back in front of it unless
# between_two_tags() returns true (helper defined in .util, not visible here).
for index, line in enumerate(lines):
minified_line = line.strip()
if not between_two_tags(minified_line, minified_lines, index):
minified_line = " %s" % minified_line
minified_lines.append(unicode(minified_line))
# Keep a space after a line ending in "</a>" unless "</body>" follows.
# NOTE(review): lines[index + 1] looks like it raises IndexError when the
# very last line ends with "</a>" -- confirm.
if minified_line.endswith("</a>") and \
not lines[index + 1].startswith("</body>"):
minified_lines.append(u" ")

# Join the lines and collapse every whitespace run into one space.
spaces_pattern = re.compile(r"\s+")
content = "".join(minified_lines)
content = spaces_pattern.sub(" ", content)

# Put the original excluded elements back in place of their placeholders.
for tag in EXCLUDE_TAGS:
for index, e in enumerate(exclude_tags[tag]):
content = content.replace(TAGS_PATTERN % (tag, index, tag), e)

return content
# --- second body: recursive minification of the parsed tree ---
mini_soup = space_minify(soup, ignore_comments)
if FOLD_DOCTYPE is True:
# monkey patching to remove new line after doctype
# (assigns a class attribute on bs4.element.Doctype, so the change is not
# limited to this call)
bs4.element.Doctype.SUFFIX = u'>'
return unicode(mini_soup)

def space_minify(soup, ignore_comments=True):
    """Recursively reduce the space characters of a parsed HTML tree.

    The tree is modified in place: children are minified first, then, when
    the element itself is a string node, its text is rewritten.

    * plain text: whitespace runs are collapsed to one space; leading and
      trailing whitespace is kept (as one space) only on the side where the
      sibling is a ``TEXT_FLOW`` element, and dropped otherwise;
    * conditional comments: whitespace is collapsed and trimmed right after
      the opening ``[if ...]>`` and right before the closing ``<![endif]``;
    * other comments: replaced by an empty string when ``ignore_comments``
      is true;
    * elements named in ``EXCLUDE_TAGS`` are left untouched.

    :param soup: the element to minify (a whole document, a tag or a
                 string node)
    :type soup: bs4.BeautifulSoup
    :param ignore_comments: when true, ordinary comments are removed from
                            the result; conditional comments are always
                            kept
    :type ignore_comments: bool
    :returns: ``soup`` itself, including for excluded tags
    """
    # Tag excluded from minification: leave it exactly as it is. The name
    # is compared directly; wrapping it in str() would raise on Python 2
    # for a non-ASCII tag name.
    if soup.name in EXCLUDE_TAGS:
        return soup

    # Minify the children first. Iterate over a snapshot because the
    # recursive calls replace nodes inside the very list being walked.
    if hasattr(soup, 'children'):
        for child in list(soup.children):
            space_minify(child, ignore_comments)

    # Only string nodes carry text to rewrite.
    if not is_navstr(soup):
        return soup

    if not is_prestr(soup):
        # Plain text (not a comment, CData, Doctype or others, see
        # bs4/element.py for the list): reduce multiple space characters.
        new_string = re_multi_space.sub(' ', soup.string)
        (prev_flow, next_flow) = is_inflow(soup)
        # A whitespace-only string inside a flow of text is a word
        # separator and must survive as one space; anywhere else it sits
        # between grouping, section, metadata or other block elements.
        if prev_flow and next_flow:
            new_string = re_only_space.sub(' ', new_string)
        else:
            new_string = re_only_space.sub('', new_string)
        # Leading spaces are kept (as one space) only when the previous
        # sibling is a text-flow element.
        if prev_flow:
            new_string = re_start_space.sub(' ', new_string)
        else:
            new_string = re_start_space.sub('', new_string)
        # Trailing spaces are kept (as one space) only when the next
        # sibling is a text-flow element.
        if next_flow:
            new_string = re_end_space.sub(' ', new_string)
        else:
            new_string = re_end_space.sub('', new_string)
        # bs4 sometimes adds a lone newline in the body
        new_string = re_single_nl.sub('', new_string)
        soup.string.replace_with(new_string)
    elif is_cond_comment(soup):
        # Conditional comment content is HTML code so it should be
        # minified as well.
        new_string = re_multi_space.sub(' ', soup.string)
        new_string = re_cond_comment_start_space.sub(r'\1', new_string)
        new_string = re_cond_comment_end_space.sub(r'\1', new_string)
        # Re-wrap as a Comment so it is still serialized as a comment.
        new_comment = bs4.element.Comment(new_string)
        soup.string.replace_with(new_comment)
    elif ignore_comments and is_comment(soup):
        # Ordinary (non conditional) comment that must not be kept:
        # blank it out.
        soup.string.replace_with(u'')
    return soup

def is_navstr(soup):
    """Tell whether ``soup`` is a string node of the parsed tree.

    :param soup: the element to test
    :returns: True when ``soup`` is an instance of
              ``bs4.element.NavigableString``, False otherwise
    :rtype: bool
    """
    string_type = bs4.element.NavigableString
    return isinstance(soup, string_type)

def is_prestr(soup):
    """Tell whether ``soup`` is a preformatted string node.

    :param soup: the element to test
    :returns: True when ``soup`` is an instance of
              ``bs4.element.PreformattedString``, False otherwise
    :rtype: bool
    """
    preformatted_type = bs4.element.PreformattedString
    return isinstance(soup, preformatted_type)

def is_comment(soup):
    """Tell whether ``soup`` is an ordinary (non conditional) comment.

    :param soup: the element to test
    :returns: True when ``soup`` is a ``bs4.element.Comment`` whose text
              does not match ``re_cond_comment``, False otherwise
    :rtype: bool
    """
    if not isinstance(soup, bs4.element.Comment):
        return False
    # A conditional comment is not an ordinary comment.
    return re_cond_comment.search(soup.string) is None

def is_cond_comment(soup):
    """Tell whether ``soup`` is a conditional comment.

    A conditional comment is a ``bs4.element.Comment`` whose text matches
    ``re_cond_comment`` (``[if ...]> ... <![endif]``).

    :param soup: the element to test
    :returns: True for a conditional comment, False otherwise
    :rtype: bool
    """
    # bool() turns the match object (or None) returned by search() into
    # the boolean the contract promises.
    return isinstance(soup, bs4.element.Comment) \
        and bool(re_cond_comment.search(soup.string))

def is_inflow(soup):
    """Describe the text flow on both sides of an element.

    A side is "in flow" when the sibling on that side exists and its tag
    name is listed in ``TEXT_FLOW``.

    :param soup: the element whose siblings are inspected
    :returns: ``(prev_flow, next_flow)``, the flow before the element and
              the flow after it
    :rtype: tuple of two bool
    """
    before = soup.previous_sibling
    after = soup.next_sibling
    prev_flow = before is not None and before.name in TEXT_FLOW
    next_flow = after is not None and after.name in TEXT_FLOW
    return (prev_flow, next_flow)
4 changes: 3 additions & 1 deletion htmlmin/tests/resources/line_break.html
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
<html><body><p>this is a
multiline paragraph</p></body></html>
multiline paragraph</p>
<p>this is another
multiline paragraph</p></body></html>
2 changes: 1 addition & 1 deletion htmlmin/tests/resources/line_break_minified.html
Original file line number Diff line number Diff line change
@@ -1 +1 @@
<html><head></head><body><p>this is a multiline paragraph</p></body></html>
<html><head></head><body><p>this is a multiline paragraph</p><p>this is another multiline paragraph</p></body></html>
Original file line number Diff line number Diff line change
@@ -1 +1 @@
<html><head><meta charset="utf-8"/><title>Unicode</title></head><body><p>This is a UTF-8 string with non ascii characters: ɐ ɑ ɒ ɓ ɔ ɕ ɖ ɗ ɘ ə ɚ ɛ ɜ </p><textarea>and smore more non ascii inside an excluded element ʘ ʙ ʚ ʛ ʜ ʝ ʞ ʟ ʠ ʡ ʢ</textarea></body></html>
<html><head><meta charset="utf-8"/><title>Unicode</title></head><body><p>This is a UTF-8 string with non ascii characters: ɐ ɑ ɒ ɓ ɔ ɕ ɖ ɗ ɘ ə ɚ ɛ ɜ</p><textarea>and smore more non ascii inside an excluded element ʘ ʙ ʚ ʛ ʜ ʝ ʞ ʟ ʠ ʡ ʢ</textarea></body></html>
2 changes: 1 addition & 1 deletion htmlmin/tests/resources/non_ascii_minified.html
Original file line number Diff line number Diff line change
@@ -1 +1 @@
<html><head><meta charset="utf-8"/><title>Unicode</title></head><body><p>This is a UTF-8 string with non ascii characters: ɐ ɑ ɒ ɓ ɔ ɕ ɖ ɗ ɘ ə ɚ ɛ ɜ </p></body></html>
<html><head><meta charset="utf-8"/><title>Unicode</title></head><body><p>This is a UTF-8 string with non ascii characters: ɐ ɑ ɒ ɓ ɔ ɕ ɖ ɗ ɘ ə ɚ ɛ ɜ</p></body></html>
Original file line number Diff line number Diff line change
@@ -1 +1 @@
<!DOCTYPE html><html lang="pt-BR"><head><meta charset="utf-8"/><title>Page Title</title></head><body><!--[if IE]> this conditional comment should be included, because I like it! <![endif]--><h1>Header</h1></body></html>
<!DOCTYPE html><html lang="pt-BR"><head><meta charset="utf-8"/><title>Page Title</title></head><body><!--[if IE]>this conditional comment should be included, because I like it!<![endif]--><h1>Header</h1></body></html>
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Page Title</title>
<!--[if IE]>
<link rel="stylesheet" type="text/css" href="all-ie-only.css" />
<![endif]-->
</head>

<body>
<h1>Header</h1>
</body>
</html>
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<!DOCTYPE html><html lang="en"><head><meta charset="utf-8"/><title>Page Title</title><!--[if IE]><link rel="stylesheet" type="text/css" href="all-ie-only.css" /><![endif]--></head><body><h1>Header</h1></body></html>
16 changes: 8 additions & 8 deletions htmlmin/tests/test_middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def test_should_minify_response_when_mime_type_is_html(self):
RequestMock(), response_mock,
)

minified = "<html><head></head><body>some text here </body></html>"
minified = "<html><head></head><body>some text here</body></html>"
self.assertEqual(minified, response.content)

def test_should_minify_with_any_charset(self):
Expand All @@ -49,7 +49,7 @@ def test_should_minify_with_any_charset(self):
RequestMock(), response_mock,
)

minified = "<html><head></head><body>some text here </body></html>"
minified = "<html><head></head><body>some text here</body></html>"
self.assertEqual(minified, response.content)

def test_should_not_minify_not_html_content(self):
Expand All @@ -74,7 +74,7 @@ def test_should_minify_if_exclude_from_minifying_is_unset(self):
old = settings.EXCLUDE_FROM_MINIFYING
del settings.EXCLUDE_FROM_MINIFYING

minified = "<html><head></head><body>some text here </body></html>"
minified = "<html><head></head><body>some text here</body></html>"
response = HtmlMinifyMiddleware().process_response(
RequestMock(), ResponseMock(),
)
Expand All @@ -92,7 +92,7 @@ def test_should_not_minify_response_with_minify_response_false(self):
self.assertEqual(html_not_minified, response.content)

def test_should_minify_response_with_minify_response_true(self):
minified = "<html><head></head><body>some text here </body></html>"
minified = "<html><head></head><body>some text here</body></html>"
response_mock = ResponseMock()
response_mock.minify_response = True
response = HtmlMinifyMiddleware().process_response(
Expand All @@ -105,7 +105,7 @@ def test_should_keep_comments_when_they_are_enabled(self):
settings.KEEP_COMMENTS_ON_MINIFYING = True

minified = "<html><!-- some comment --><head></head><body>" + \
"some text here </body></html>"
"some text here</body></html>"
response_mock = ResponseWithCommentMock()
response = HtmlMinifyMiddleware().process_response(
RequestMock(), response_mock,
Expand All @@ -118,7 +118,7 @@ def test_should_remove_comments_they_are_disabled(self):
old = settings.KEEP_COMMENTS_ON_MINIFYING
settings.KEEP_COMMENTS_ON_MINIFYING = False

minified = "<html><head></head><body>some text here </body></html>"
minified = "<html><head></head><body>some text here</body></html>"
response_mock = ResponseWithCommentMock()
response = HtmlMinifyMiddleware().process_response(
RequestMock(), response_mock,
Expand All @@ -131,7 +131,7 @@ def test_should_remove_comments_when_the_setting_is_not_specified(self):
old = settings.KEEP_COMMENTS_ON_MINIFYING
del settings.KEEP_COMMENTS_ON_MINIFYING

minified = "<html><head></head><body>some text here </body></html>"
minified = "<html><head></head><body>some text here</body></html>"
response_mock = ResponseWithCommentMock()
response = HtmlMinifyMiddleware().process_response(
RequestMock(), response_mock,
Expand Down Expand Up @@ -174,7 +174,7 @@ def test_should_minify_when_DEBUG_is_false_and_MINIFY_is_unset(self):
del settings.HTML_MINIFY
settings.DEBUG = False

minified = "<html><head></head><body>some text here </body></html>"
minified = "<html><head></head><body>some text here</body></html>"

response = HtmlMinifyMiddleware().process_response(
RequestMock(), ResponseMock(),
Expand Down
6 changes: 5 additions & 1 deletion htmlmin/tests/test_minify.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def test_should_not_drop_blank_lines_from_the_begin_of_a_textarea(self):

# Checks html_minify() on a one-line document with spaces between the tags.
# NOTE(review): ``minified`` is assigned twice in a row, so only the second
# value (no space before "</body>") is compared. This looks like the removed
# and added lines of a diff hunk shown side by side -- confirm.
def test_html_should_be_minified(self):
html = "<html> <body>some text here</body> </html>"
minified = "<html><head></head><body>some text here </body></html>"
minified = "<html><head></head><body>some text here</body></html>"
self.assertEqual(minified, html_minify(html))

def test_minify_function_should_return_a_unicode_object(self):
Expand Down Expand Up @@ -113,6 +113,10 @@ def test_should_not_exclude_conditional_comments(self):
html, minified = self._normal_and_minified('with_conditional_comments')
self.assertEqual(minified, html_minify(html))

def test_should_not_rm_multiline_conditional_comments(self):
    # html_minify() applied to the fixture's source text must equal the
    # expected text that _normal_and_minified() returns with it.
    fixture = 'with_multiple_line_conditional_comments'
    html, minified = self._normal_and_minified(fixture)
    self.assertEqual(minified, html_minify(html))

def test_should_touch_attributes_only_on_tags(self):
html = '<html>\n <body>I selected you!</body>\n </html>'
minified = '<html><head></head><body>I selected you!</body></html>'
Expand Down