Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed #30686 -- Used Python HTMLParser in utils.text.Truncator #16421

Merged
merged 1 commit into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
215 changes: 115 additions & 100 deletions django/utils/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,20 @@
import re
import secrets
import unicodedata
from collections import deque
from gzip import GzipFile
from gzip import compress as gzip_compress
from html import escape
from html.parser import HTMLParser
from io import BytesIO

from django.core.exceptions import SuspiciousFileOperation
from django.utils.functional import SimpleLazyObject, keep_lazy_text, lazy
from django.utils.functional import (
SimpleLazyObject,
cached_property,
keep_lazy_text,
lazy,
)
from django.utils.regex_helper import _lazy_re_compile
from django.utils.translation import gettext as _
from django.utils.translation import gettext_lazy, pgettext
Expand Down Expand Up @@ -80,6 +88,101 @@ def add_truncation_text(text, truncate=None):
return f"{text}{truncate}"


def calculate_truncate_chars_length(length, replacement):
    """
    Return how many characters of text may be kept once room has been reserved
    for the truncation text built from ``replacement``.

    Starting from ``length``, one is subtracted for every non-combining
    character of the truncation text; counting stops as soon as the budget
    reaches zero.
    """
    budget = length
    suffix = add_truncation_text("", replacement)
    for ch in suffix:
        # Combining marks attach to the previous character and are not counted.
        if unicodedata.combining(ch):
            continue
        budget -= 1
        if budget == 0:
            break
    return budget


class TruncateHTMLParser(HTMLParser):
    """
    Base parser that copies HTML into ``self.output`` until a budget of text
    units has been used up.

    Start and end tags are echoed as they are encountered, and open non-void
    elements are tracked so that they can be closed when truncation happens.
    Subclasses decide what a "unit" is by implementing ``process()``.
    """

    class TruncationCompleted(Exception):
        """Raised from the handlers to stop parsing once output is truncated."""

    def __init__(self, *, length, replacement, convert_charrefs=True):
        super().__init__(convert_charrefs=convert_charrefs)
        # Open non-void elements, most recently opened first.
        self.tags = deque()
        self.output = ""
        # Number of units that may still be emitted before truncating.
        self.remaining = length
        self.replacement = replacement

    @cached_property
    def void_elements(self):
        # Imported inside the method rather than at module level.
        from django.utils.html import VOID_ELEMENTS

        return VOID_ELEMENTS

    def process(self, data):
        """
        Split ``data`` into countable units.

        Must return a ``(units, output)`` pair: ``len(units)`` is charged
        against ``self.remaining`` and ``output`` is the text appended to
        ``self.output`` as-is.
        """
        raise NotImplementedError(
            "subclasses of TruncateHTMLParser must provide a process() method"
        )

    def handle_startendtag(self, tag, attrs):
        # Self-closing syntax: emit the start tag and, for non-void elements,
        # an explicit end tag as well.
        self.handle_starttag(tag, attrs)
        if tag not in self.void_elements:
            self.handle_endtag(tag)

    def handle_starttag(self, tag, attrs):
        # Echo the tag exactly as it appeared in the input.
        self.output += self.get_starttag_text()
        if tag not in self.void_elements:
            self.tags.appendleft(tag)

    def handle_endtag(self, tag):
        if tag not in self.void_elements:
            self.output += f"</{tag}>"
            try:
                self.tags.remove(tag)
            except ValueError:
                # End tag without a tracked start tag: nothing to untrack.
                pass

    def handle_data(self, data):
        """
        Emit ``data`` if it fits in the remaining budget; otherwise emit the
        truncated text and raise ``TruncationCompleted``.
        """
        data, output = self.process(data)
        data_len = len(data)
        if self.remaining < data_len:
            self.remaining = 0
            self.output += add_truncation_text(output, self.replacement)
            raise self.TruncationCompleted
        self.remaining -= data_len
        self.output += output

    def feed(self, data):
        """
        Parse ``data`` in one go and leave the result in ``self.output``.

        The parser is reset afterwards, so a later ``close()`` call is a
        harmless no-op.
        """
        try:
            super().feed(data)
            # HTMLParser may hold back trailing input (e.g. text ending in an
            # unterminated "&", or an unfinished tag) until close(). Flush it
            # here so it is counted and emitted instead of being discarded by
            # the reset() below, and so that truncation triggered while
            # flushing is handled by the same except clause.
            super().close()
        except self.TruncationCompleted:
            # Close every element that is still open, innermost first.
            self.output += "".join([f"</{tag}>" for tag in self.tags])
            self.tags.clear()
            # Parsing stopped midway; drop the unparsed remainder.
            self.reset()
        else:
            # Everything was consumed without reaching the limit.
            self.reset()


class TruncateCharsHTMLParser(TruncateHTMLParser):
    """
    HTML truncation parser that counts individual characters of text.

    ``length`` is the requested maximum; the budget handed to the base class
    is that maximum reduced by ``calculate_truncate_chars_length()`` to leave
    room for the truncation text.
    """

    def __init__(self, *, length, replacement, convert_charrefs=True):
        self.length = length
        # Running total of text characters seen so far.
        self.processed_chars = 0
        budget = calculate_truncate_chars_length(length, replacement)
        super().__init__(
            length=budget,
            replacement=replacement,
            convert_charrefs=convert_charrefs,
        )

    def process(self, data):
        """
        Return ``(data, escaped_prefix)`` where the prefix is limited to the
        characters that still fit in the remaining budget.
        """
        chunk_len = len(data)
        self.processed_chars += chunk_len
        hit_limit_exactly = self.processed_chars == self.length
        # When the text seen so far is exactly ``length`` characters and this
        # chunk completes the raw input, keep it whole and stop.
        if hit_limit_exactly and len(self.output) + chunk_len == len(self.rawdata):
            # NOTE(review): unlike the path below, data is appended without
            # escape() here -- confirm this is intended.
            self.output += data
            raise self.TruncationCompleted
        kept = "".join(data[: self.remaining])
        return data, escape(kept)


class TruncateWordsHTMLParser(TruncateHTMLParser):
    """HTML truncation parser that counts whitespace-separated words."""

    def process(self, data):
        """
        Return ``(words, escaped_text)`` where ``words`` is ``data`` split on
        whitespace runs that sit between two non-whitespace characters, and
        the text joins (with single spaces) only the words that still fit.
        """
        words = re.split(r"(?<=\S)\s+(?=\S)", data)
        kept = words[: self.remaining]
        return words, escape(" ".join(kept))


class Truncator(SimpleLazyObject):
"""
An object used to truncate text, either by characters or words.
Expand Down Expand Up @@ -108,19 +211,16 @@ def chars(self, num, truncate=None, html=False):
return ""
felixxm marked this conversation as resolved.
Show resolved Hide resolved
text = unicodedata.normalize("NFC", self._wrapped)

# Calculate the length to truncate to (max length - end_text length)
truncate_len = length
for char in add_truncation_text("", truncate):
if not unicodedata.combining(char):
truncate_len -= 1
if truncate_len == 0:
break
if html:
return self._truncate_html(length, truncate, text, truncate_len, False)
return self._text_chars(length, truncate, text, truncate_len)
parser = TruncateCharsHTMLParser(length=length, replacement=truncate)
parser.feed(text)
parser.close()
return parser.output
return self._text_chars(length, truncate, text)

def _text_chars(self, length, truncate, text, truncate_len):
def _text_chars(self, length, truncate, text):
"""Truncate a string after a certain number of chars."""
truncate_len = calculate_truncate_chars_length(length, truncate)
s_len = 0
end_index = None
for i, char in enumerate(text):
Expand Down Expand Up @@ -149,7 +249,10 @@ def words(self, num, truncate=None, html=False):
if length <= 0:
return ""
if html:
return self._truncate_html(length, truncate, self._wrapped, length, True)
parser = TruncateWordsHTMLParser(length=length, replacement=truncate)
parser.feed(self._wrapped)
parser.close()
return parser.output
return self._text_words(length, truncate)

def _text_words(self, length, truncate):
Expand All @@ -164,94 +267,6 @@ def _text_words(self, length, truncate):
return add_truncation_text(" ".join(words), truncate)
return " ".join(words)

def _truncate_html(self, length, truncate, text, truncate_len, words):
    """
    Truncate HTML to a certain number of chars (not counting tags and
    comments), or, if words is True, then to a certain number of words.
    Close opened tags if they were correctly closed in the given HTML.

    Preserve newlines in the HTML.

    ``length`` is the maximum number of units allowed; ``truncate_len`` is
    the unit count at which the cut position is recorded. ``truncate`` is
    passed to add_truncation_text() to build the text appended after the cut.
    Input longer than ``self.MAX_LENGTH_HTML`` is cut to that size first.

    Return ``text`` unchanged when it fits within ``length`` (with the
    truncation text appended only if the input was size limited); otherwise
    return the truncated text, the truncation text, and closing tags for the
    elements that are still open.
    """
    if words and length <= 0:
        return ""

    # Cap the amount of input scanned by the regexes below.
    size_limited = False
    if len(text) > self.MAX_LENGTH_HTML:
        text = text[: self.MAX_LENGTH_HTML]
        size_limited = True

    # Elements that never get a closing tag and so are never tracked as open.
    html4_singlets = (
        "br",
        "col",
        "link",
        "base",
        "img",
        "param",
        "area",
        "hr",
        "input",
    )

    # Count non-HTML chars/words and keep note of open tags
    pos = 0
    end_text_pos = 0
    current_len = 0
    open_tags = []

    regex = re_words if words else re_chars

    # Scan one unit past ``length`` so that "fits exactly" can be told apart
    # from "needs truncating" after the loop.
    while current_len <= length:
        m = regex.search(text, pos)
        if not m:
            # Checked through whole string
            break
        pos = m.end(0)
        if m[1]:
            # It's an actual non-HTML word or char
            current_len += 1
            if current_len == truncate_len:
                # Remember where the output must be cut.
                end_text_pos = pos
            continue
        # Check for tag
        tag = re_tag.match(m[0])
        if not tag or current_len >= truncate_len:
            # Don't worry about non tags or tags after our truncate point
            continue
        closing_tag, tagname, self_closing = tag.groups()
        # Element names are always case-insensitive
        tagname = tagname.lower()
        if self_closing or tagname in html4_singlets:
            pass
        elif closing_tag:
            # Check for match in open tags list
            try:
                i = open_tags.index(tagname)
            except ValueError:
                pass
            else:
                # SGML: An end tag closes, back to the matching start tag,
                # all unclosed intervening start tags with omitted end tags
                open_tags = open_tags[i + 1 :]
        else:
            # Add it to the start of the open tags list
            open_tags.insert(0, tagname)

    truncate_text = add_truncation_text("", truncate)

    # The whole (possibly size-limited) text fits: nothing to cut.
    if current_len <= length:
        if size_limited and truncate_text:
            text += truncate_text
        return text

    out = text[:end_text_pos]
    if truncate_text:
        out += truncate_text
    # Close any tags still open
    for tag in open_tags:
        out += "</%s>" % tag
    # Return string
    return out


@keep_lazy_text
def get_valid_filename(name):
Expand Down
5 changes: 5 additions & 0 deletions docs/releases/5.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,11 @@ Miscellaneous
:meth:`~django.test.SimpleTestCase.assertInHTML` now add ``": "`` to the
``msg_prefix``. This is consistent with the behavior of other assertions.

* ``django.utils.text.Truncator``, used by the :tfilter:`truncatechars_html`
and :tfilter:`truncatewords_html` template filters, now uses
:py:class:`html.parser.HTMLParser` subclasses. This makes truncation more
robust and faster, but the output may differ slightly; for example, character
references such as ``&iacute;`` are now emitted as the decoded character.

.. _deprecated-features-5.1:

Features deprecated in 5.1
Expand Down
6 changes: 3 additions & 3 deletions tests/template_tests/filter_tests/test_truncatewords_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,15 @@ def test_truncate2(self):
truncatewords_html(
'<p>one <a href="#">two - three <br>four</a> five</p>', 4
),
'<p>one <a href="#">two - three …</a></p>',
'<p>one <a href="#">two - three <br> …</a></p>',
felixxm marked this conversation as resolved.
Show resolved Hide resolved
)

def test_truncate3(self):
self.assertEqual(
truncatewords_html(
'<p>one <a href="#">two - three <br>four</a> five</p>', 5
),
'<p>one <a href="#">two - three <br>four</a></p>',
'<p>one <a href="#">two - three <br>four</a></p>',
smithdc1 marked this conversation as resolved.
Show resolved Hide resolved
)

def test_truncate4(self):
Expand All @@ -53,7 +53,7 @@ def test_truncate_complex(self):
truncatewords_html(
"<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo est&aacute;?</i>", 3
),
"<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo …</i>",
"<i>Buenos días! ¿Cómo …</i>",
smithdc1 marked this conversation as resolved.
Show resolved Hide resolved
)

def test_invalid_arg(self):
Expand Down