django · felixxm · Feb 7, 2024 · Jan 3, 2023
diff --git a/django/utils/text.py b/django/utils/text.py
@@ -2,12 +2,20 @@
 import re
 import secrets
 import unicodedata
+from collections import deque
 from gzip import GzipFile
 from gzip import compress as gzip_compress
+from html import escape
+from html.parser import HTMLParser
 from io import BytesIO
 
 from django.core.exceptions import SuspiciousFileOperation
-from django.utils.functional import SimpleLazyObject, keep_lazy_text, lazy
+from django.utils.functional import (
+    SimpleLazyObject,
+    cached_property,
+    keep_lazy_text,
+    lazy,
+)
 from django.utils.regex_helper import _lazy_re_compile
 from django.utils.translation import gettext as _
 from django.utils.translation import gettext_lazy, pgettext
@@ -80,6 +88,101 @@ def add_truncation_text(text, truncate=None):
     return f"{text}{truncate}"
 
 
+def calculate_truncate_chars_length(length, replacement):
+    """Return `length` minus the truncation text's non-combining chars."""
+    suffix = add_truncation_text("", replacement)
+    for _ in (c for c in suffix if not unicodedata.combining(c)):
+        length -= 1
+        if length == 0:
+            break  # Stop as soon as the allowance hits zero.
+    return length
+
+
+class TruncateHTMLParser(HTMLParser):
+    """Base truncating parser; subclasses supply process(data) -> (units, html)."""
+
+    class TruncationCompleted(Exception):
+        """Signal that truncation is done and parsing must stop."""
+
+    def __init__(self, *, length, replacement, convert_charrefs=True):
+        super().__init__(convert_charrefs=convert_charrefs)
+        self.replacement = replacement
+        self.remaining = length
+        self.output = ""
+        self.tags = deque()  # Open non-void tags, most recent first.
+
+    @cached_property
+    def void_elements(self):
+        from django.utils.html import VOID_ELEMENTS
+
+        return VOID_ELEMENTS
+
+    def handle_startendtag(self, tag, attrs):
+        self.handle_starttag(tag, attrs)
+        if tag not in self.void_elements:
+            self.handle_endtag(tag)
+
+    def handle_starttag(self, tag, attrs):
+        self.output += self.get_starttag_text()
+        if tag not in self.void_elements:
+            self.tags.appendleft(tag)
+
+    def handle_endtag(self, tag):
+        if tag in self.void_elements:
+            return
+        self.output += f"</{tag}>"
+        if tag in self.tags:
+            self.tags.remove(tag)
+
+    def handle_data(self, data):
+        """Append `data`, or its truncated form once it overruns the budget."""
+        units, escaped = self.process(data)
+        consumed = len(units)
+        if consumed > self.remaining:
+            self.remaining = 0
+            self.output += add_truncation_text(escaped, self.replacement)
+            raise self.TruncationCompleted
+        self.remaining -= consumed
+        self.output += escaped
+
+    def feed(self, data):
+        """Parse `data`; if truncation happened, close the tags left open."""
+        try:
+            super().feed(data)
+        except self.TruncationCompleted:
+            self.output += "".join(f"</{tag}>" for tag in self.tags)
+            self.tags.clear()
+        self.reset()
+
+
+class TruncateCharsHTMLParser(TruncateHTMLParser):
+    """Truncate HTML by characters, reserving room for the truncation text."""
+
+    def __init__(self, *, length, replacement, convert_charrefs=True):
+        self.length = length
+        self.processed_chars = 0
+        budget = calculate_truncate_chars_length(length, replacement)
+        super().__init__(
+            length=budget, replacement=replacement, convert_charrefs=convert_charrefs
+        )
+
+    def process(self, data):
+        self.processed_chars += len(data)
+        exact_fit = self.processed_chars == self.length
+        if exact_fit and len(self.output) + len(data) == len(self.rawdata):
+            # Escape like the truncating path: charrefs in `data` may be decoded.
+            self.output += escape(data)
+            raise self.TruncationCompleted
+        return data, escape(data[: self.remaining])
+
+
+class TruncateWordsHTMLParser(TruncateHTMLParser):
+    def process(self, data):
+        # Split on whitespace runs that have non-whitespace on both sides.
+        words = re.split(r"(?<=\S)\s+(?=\S)", data)
+        return words, escape(" ".join(words[: self.remaining]))
+
+
 class Truncator(SimpleLazyObject):
     """
     An object used to truncate text, either by characters or words.
@@ -108,19 +211,16 @@ def chars(self, num, truncate=None, html=False):
             return ""
         text = unicodedata.normalize("NFC", self._wrapped)
 
-        # Calculate the length to truncate to (max length - end_text length)
-        truncate_len = length
-        for char in add_truncation_text("", truncate):
-            if not unicodedata.combining(char):
-                truncate_len -= 1
-                if truncate_len == 0:
-                    break
         if html:
-            return self._truncate_html(length, truncate, text, truncate_len, False)
-        return self._text_chars(length, truncate, text, truncate_len)
+            parser = TruncateCharsHTMLParser(length=length, replacement=truncate)
+            parser.feed(text)
+            parser.close()
+            return parser.output
+        return self._text_chars(length, truncate, text)
 
-    def _text_chars(self, length, truncate, text, truncate_len):
+    def _text_chars(self, length, truncate, text):
         """Truncate a string after a certain number of chars."""
+        truncate_len = calculate_truncate_chars_length(length, truncate)
         s_len = 0
         end_index = None
         for i, char in enumerate(text):
@@ -149,7 +249,10 @@ def words(self, num, truncate=None, html=False):
         if length <= 0:
             return ""
         if html:
-            return self._truncate_html(length, truncate, self._wrapped, length, True)
+            parser = TruncateWordsHTMLParser(length=length, replacement=truncate)
+            parser.feed(self._wrapped)
+            parser.close()
+            return parser.output
         return self._text_words(length, truncate)
 
     def _text_words(self, length, truncate):
@@ -164,94 +267,6 @@ def _text_words(self, length, truncate):
             return add_truncation_text(" ".join(words), truncate)
         return " ".join(words)
 
-    def _truncate_html(self, length, truncate, text, truncate_len, words):
-        """
-        Truncate HTML to a certain number of chars (not counting tags and
-        comments), or, if words is True, then to a certain number of words.
-        Close opened tags if they were correctly closed in the given HTML.
-
-        Preserve newlines in the HTML.
-        """
-        if words and length <= 0:
-            return ""
-
-        size_limited = False
-        if len(text) > self.MAX_LENGTH_HTML:
-            text = text[: self.MAX_LENGTH_HTML]
-            size_limited = True
-
-        html4_singlets = (
-            "br",
-            "col",
-            "link",
-            "base",
-            "img",
-            "param",
-            "area",
-            "hr",
-            "input",
-        )
-
-        # Count non-HTML chars/words and keep note of open tags
-        pos = 0
-        end_text_pos = 0
-        current_len = 0
-        open_tags = []
-
-        regex = re_words if words else re_chars
-
-        while current_len <= length:
-            m = regex.search(text, pos)
-            if not m:
-                # Checked through whole string
-                break
-            pos = m.end(0)
-            if m[1]:
-                # It's an actual non-HTML word or char
-                current_len += 1
-                if current_len == truncate_len:
-                    end_text_pos = pos
-                continue
-            # Check for tag
-            tag = re_tag.match(m[0])
-            if not tag or current_len >= truncate_len:
-                # Don't worry about non tags or tags after our truncate point
-                continue
-            closing_tag, tagname, self_closing = tag.groups()
-            # Element names are always case-insensitive
-            tagname = tagname.lower()
-            if self_closing or tagname in html4_singlets:
-                pass
-            elif closing_tag:
-                # Check for match in open tags list
-                try:
-                    i = open_tags.index(tagname)
-                except ValueError:
-                    pass
-                else:
-                    # SGML: An end tag closes, back to the matching start tag,
-                    # all unclosed intervening start tags with omitted end tags
-                    open_tags = open_tags[i + 1 :]
-            else:
-                # Add it to the start of the open tags list
-                open_tags.insert(0, tagname)
-
-        truncate_text = add_truncation_text("", truncate)
-
-        if current_len <= length:
-            if size_limited and truncate_text:
-                text += truncate_text
-            return text
-
-        out = text[:end_text_pos]
-        if truncate_text:
-            out += truncate_text
-        # Close any tags still open
-        for tag in open_tags:
-            out += "</%s>" % tag
-        # Return string
-        return out
-
 
 @keep_lazy_text
 def get_valid_filename(name):

diff --git a/docs/releases/5.1.txt b/docs/releases/5.1.txt
@@ -368,6 +368,11 @@ Miscellaneous
   :meth:`~django.test.SimpleTestCase.assertInHTML` now add ``": "`` to the
   ``msg_prefix``. This is consistent with the behavior of other assertions.
 
+* ``django.utils.text.Truncator`` used by :tfilter:`truncatechars_html` and
+  :tfilter:`truncatewords_html` template filters now uses
+  :py:class:`html.parser.HTMLParser` subclasses. This results in a more robust
+  and faster operation, but there may be small differences in the output.
+
 .. _deprecated-features-5.1:
 
 Features deprecated in 5.1

diff --git a/tests/template_tests/filter_tests/test_truncatewords_html.py b/tests/template_tests/filter_tests/test_truncatewords_html.py
@@ -24,15 +24,15 @@ def test_truncate2(self):
             truncatewords_html(
                 '<p>one <a href="#">two - three <br>four</a> five</p>', 4
             ),
-            '<p>one <a href="#">two - three …</a></p>',
+            '<p>one <a href="#">two - three <br> …</a></p>',
         )
 
     def test_truncate3(self):
         self.assertEqual(
             truncatewords_html(
                 '<p>one <a href="#">two - three <br>four</a> five</p>', 5
             ),
-            '<p>one <a href="#">two - three <br>four …</a></p>',
+            '<p>one <a href="#">two - three <br>four</a> …</p>',
         )
 
     def test_truncate4(self):
@@ -53,7 +53,7 @@ def test_truncate_complex(self):
             truncatewords_html(
                 "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo est&aacute;?</i>", 3
             ),
-            "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo …</i>",
+            "<i>Buenos días! ¿Cómo …</i>",
         )
 
     def test_invalid_arg(self):