Merge pull request #3580 from getnikola/fix-3573-div-closed-too-soon

Fix #3573, fix #3564 — fix <div> closed too soon on index page
getnikola · Oct 13, 2021 · 334000e · 334000e
2 parents 9f10746 + 7a5e440
commit 334000e
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 11 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -19,6 +19,8 @@ Features
 Bugfixes
 --------
 
+* Fix bug with posts after the first one appearing shifted due to a
+  ``<div>`` closed too early (Issue #3573, #3564)
 * Fix support for files outside of site root directory on Windows
 * Support passing ``--backend`` and ``--db-file`` to ``nikola auto``
 * Support blank values in certain WordPress-imported structures

diff --git a/nikola/post.py b/nikola/post.py
@@ -910,10 +910,7 @@ def text(self, lang=None, teaser_only=False, strip_html=False, show_read_more_li
         if self.hyphenate:
             hyphenate(document, real_lang)
 
-        try:
-            data = lxml.html.tostring(document.body, encoding='unicode')
-        except Exception:
-            data = lxml.html.tostring(document, encoding='unicode')
+        data = utils.html_tostring_fragment(document)
 
         if teaser_only:
             teaser_regexp = self.config.get('TEASER_REGEXP', TEASER_REGEXP)
@@ -936,10 +933,7 @@ def text(self, lang=None, teaser_only=False, strip_html=False, show_read_more_li
                         post_title=self.title(lang))
                 # This closes all open tags and sanitizes the broken HTML
                 document = lxml.html.fromstring(teaser)
-                try:
-                    data = lxml.html.tostring(document.body, encoding='unicode')
-                except IndexError:
-                    data = lxml.html.tostring(document, encoding='unicode')
+                data = utils.html_tostring_fragment(document)
 
         if data and strip_html:
             try:
@@ -952,11 +946,11 @@ def text(self, lang=None, teaser_only=False, strip_html=False, show_read_more_li
             if self.demote_headers:
                 # see above
                 try:
-                    document = lxml.html.fromstring(data)
+                    document = lxml.html.fragment_fromstring(data, "body")
                     demote_headers(document, self.demote_headers)
-                    data = lxml.html.tostring(document.body, encoding='unicode')
+                    data = utils.html_tostring_fragment(document)
                 except (lxml.etree.ParserError, IndexError):
-                    data = lxml.html.tostring(document, encoding='unicode')
+                    pass
 
         return data
 

diff --git a/nikola/utils.py b/nikola/utils.py
@@ -30,6 +30,7 @@
 import datetime
 import hashlib
 import io
+import lxml.html
 import operator
 import os
 import re
@@ -656,6 +657,25 @@ def get_theme_chain(theme, themes_dirs):
     return themes
 
 
+def html_tostring_fragment(document):
+    """Convert a HTML snippet to a fragment, ready for insertion elsewhere.
+
+    The element is serialised with ``lxml.html.tostring`` (preferring
+    ``document.body``), then a leading ``<html>``/``<body>`` and a trailing
+    ``</html>``/``</body>`` wrapper tag are stripped, together with any
+    surrounding whitespace.
+
+    :param document: parsed lxml HTML element (whole document or fragment).
+    :return: the serialised markup as ``str``, without the wrapper tags.
+    """
+    try:
+        doc = lxml.html.tostring(document.body, encoding='unicode').strip()
+    except Exception:
+        # Best-effort fallback: if ``document.body`` cannot be obtained or
+        # serialised, serialise the element itself instead.
+        doc = lxml.html.tostring(document, encoding='unicode').strip()
+    # Both lists go outermost-first so nested wrappers peel off in order:
+    # "<html><body>…</body></html>" loses <html> then <body> at the start,
+    # and </html> then </body> at the end.
+    start_fragments = ["<html>", "<body>"]
+    end_fragments = ["</html>", "</body>"]
+    for start in start_fragments:
+        if doc.startswith(start):
+            doc = doc[len(start):].strip()
+    for end in end_fragments:
+        if doc.endswith(end):
+            doc = doc[:-len(end)].strip()
+    return doc
+
+
 INCOMPLETE_LANGUAGES_WARNED = set()