Merge da6d885 into e8ddeee

getnikola · Apr 27, 2014 · 6d00845 · 6d00845
2 parents e8ddeee + da6d885
commit 6d00845
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 7 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -76,6 +76,7 @@ Features
 Bugfixes
 --------
 
+* Fix extra root tags being added by lxml.html.tostring
 * not having typogrify installed now produces a valid error (Issue #1262)
 * Pages were not rebuilt when DEMOTE_HEADERS was changed (Issue #1261)
 * code.css was not rebuilt, even though there were changes in v6.4.0 to its

diff --git a/nikola/post.py b/nikola/post.py
@@ -415,6 +415,18 @@ def text(self, lang=None, teaser_only=False, strip_html=False, show_read_more_li
         All links in the returned HTML will be relative.
         The HTML returned is a bare fragment, not a full document.
         """
+        def strip_root_element(el):
+            ''' Strips root tag from an Element.
+
+            Required because lxml has a tendency to add <div>, <body>
+            root tags to strings which are generated by using
+            lxml.html.tostring()
+
+            :param Element el: the root element to strip
+            '''
+            return (el.text or '') + ''.join(
+                [lxml.html.tostring(child, encoding='unicode')
+                    for child in el.iterchildren()])
 
         if lang is None:
             lang = nikola.utils.LocaleBorg().current_lang
@@ -439,10 +451,7 @@ def text(self, lang=None, teaser_only=False, strip_html=False, show_read_more_li
         # data here is a full HTML doc, including HTML and BODY tags
         # which is not ideal (Issue #464)
         try:
-            body = document.body
-            data = (body.text or '') + ''.join(
-                [lxml.html.tostring(child, encoding='unicode')
-                    for child in body.iterchildren()])
+            data = strip_root_element(document.body)
         except IndexError:  # No body there, it happens sometimes
             pass
 
@@ -466,7 +475,10 @@ def text(self, lang=None, teaser_only=False, strip_html=False, show_read_more_li
                             remaining_paragraph_count=self.remaining_paragraph_count)
                 # This closes all open tags and sanitizes the broken HTML
                 document = lxml.html.fromstring(teaser)
-                data = lxml.html.tostring(document, encoding='unicode')
+                try:
+                    data = strip_root_element(document)
+                except IndexError:
+                    pass
 
         if data and strip_html:
             try:
@@ -481,9 +493,9 @@ def text(self, lang=None, teaser_only=False, strip_html=False, show_read_more_li
                 try:
                     document = lxml.html.fromstring(data)
                     demote_headers(document, self.demote_headers)
+                    data = strip_root_element(document)
+                except (lxml.etree.ParserError, IndexError):
                     data = lxml.html.tostring(document, encoding='unicode')
-                except lxml.etree.ParserError:
-                    pass
 
         return data