Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor clean_content #2850

Merged
merged 1 commit into from
Jul 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 4 additions & 98 deletions integreat_cms/cms/forms/custom_content_model_form.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,14 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from urllib.parse import urlparse

from django import forms
from django.conf import settings
from django.contrib import messages
from django.core.exceptions import ObjectDoesNotExist
from django.db.models import Q
from django.utils.translation import gettext_lazy as _
from lxml.etree import LxmlError
from lxml.html import fromstring, tostring

from ..constants import status
from ..models import MediaFile
from ..utils import internal_link_utils
from ..utils.linkcheck_utils import fix_content_link_encoding
from ..utils.content_utils import clean_content
from ..utils.slug_utils import generate_unique_slug_helper
from .custom_model_form import CustomModelForm

Expand Down Expand Up @@ -98,104 +91,17 @@ def clean(self) -> dict[str, Any]:

return cleaned_data

# pylint: disable=too-many-branches, too-many-locals
def clean_content(self) -> str:
    """
    Validate the content field (see :ref:`overriding-modelform-clean-method`) and apply changes
    to any element to match the guidelines.

    The clean-up itself is delegated to ``clean_content`` from
    ``..utils.content_utils``, which receives the submitted content and the
    slug of the language of the form's instance.

    :return: The valid content
    """
    content = self.cleaned_data["content"]
    return clean_content(content, language_slug=self.instance.language.slug)

def clean_slug(self) -> str:
"""
Expand Down
156 changes: 156 additions & 0 deletions integreat_cms/cms/utils/content_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import logging
from urllib.parse import urlparse

from django.conf import settings
from django.db.models import Q
from lxml.etree import LxmlError
from lxml.html import fromstring, HtmlElement, tostring

from ..models import MediaFile
from ..utils import internal_link_utils
from ..utils.linkcheck_utils import fix_content_link_encoding


def clean_content(content: str, language_slug: str) -> str:
    """
    Run all content clean-up steps on an HTML string and return the result.

    The content is parsed, then headings, monospaced tags, links and image
    alt texts are processed (in that order). The resulting tree is serialized
    and passed through ``fix_content_link_encoding``.

    :param content: the body of content that should be cleaned
    :param language_slug: Slug of the current language
    :return: The cleaned content, or the unchanged input if it cannot be parsed
    """
    try:
        tree = fromstring(content)
    except LxmlError:
        # The content is not guaranteed to be valid html (it may for example
        # be empty), in which case it is handed back untouched
        return content

    convert_heading(tree)
    convert_monospaced_tags(tree)
    update_links(tree, language_slug)
    fix_alt_texts(tree)

    return fix_content_link_encoding(
        tostring(tree, encoding="unicode", with_tail=False)
    )


def convert_heading(content: HtmlElement) -> None:
    """
    Turn each ``h1`` element of the content into an ``h2`` element (for SEO purposes).

    The elements are renamed in place, nothing is returned.

    :param content: the parsed content whose ``h1`` elements should become ``h2`` elements
    """
    for h1_element in content.iter("h1"):
        h1_element.tag = "h2"
        # Serialize after renaming so the log shows the resulting element
        serialized = tostring(h1_element, encoding="unicode")
        logging.debug("Replaced heading 1 with heading 2: %r", serialized)


def convert_monospaced_tags(content: HtmlElement) -> None:
    """
    Rename each ``pre`` and ``code`` element of the content to ``p``.

    The elements are renamed in place, nothing is returned.

    :param content: the parsed content whose ``pre`` and ``code`` elements should be renamed
    """
    for element in content.iter("pre", "code"):
        # Remember the old tag name for the log message before overwriting it
        previous_tag, element.tag = element.tag, "p"
        logging.debug(
            "Replaced %r tag with p tag: %r",
            previous_tag,
            tostring(element, encoding="unicode"),
        )


def update_links(content: HtmlElement, language_slug: str) -> None:
    """
    Apply all link-related clean-up steps to every ``a`` element of the content.

    Each link is passed, in this order, to ``mark_external_links``,
    ``remove_target_attribute`` and ``update_internal_links``.

    :param content: The content whose links should be updated
    :param language_slug: Slug of the current language
    """
    for anchor in content.iter("a"):
        mark_external_links(anchor)
        remove_target_attribute(anchor)
        update_internal_links(anchor, language_slug)


def mark_external_links(link: HtmlElement) -> None:
    """
    Keep the ``link-external`` class of a link in sync with its ``href``.

    A link is treated as internal when any entry of ``settings.INTERNAL_URLS``
    occurs in its ``href``. External links get the class added, internal links
    get it removed. Links without a ``href`` are left untouched.

    :param link: the link whose classes should be adjusted
    """
    href = link.get("href")
    if not href:
        return

    is_internal = any(url in href for url in settings.INTERNAL_URLS)
    has_marker = "link-external" in link.classes

    if not has_marker and not is_internal:
        link.classes.add("link-external")
        logging.debug(
            "Added class 'link-external' to %r",
            tostring(link, encoding="unicode"),
        )
    elif has_marker and is_internal:
        link.classes.remove("link-external")
        logging.debug(
            "Removed class 'link-external' from %r",
            tostring(link, encoding="unicode"),
        )


def remove_target_attribute(link: HtmlElement) -> None:
    """
    Remove the ``target`` attribute from a link, if it has one.

    The attribute is dropped from every link passed in, regardless of whether
    the link is internal or external.

    :param link: the link whose ``target`` attribute should be removed
    """
    # Only report a removal when an attribute was actually present, otherwise
    # the debug log would claim a change for every single link
    if link.attrib.pop("target", None) is not None:
        logging.debug(
            "Removed target attribute from link: %r",
            tostring(link, encoding="unicode"),
        )


def update_internal_links(link: HtmlElement, language_slug: str) -> None:
    """
    Rewrite the ``href`` (and, if available, the text) of a link for the given language.

    The link's ``href`` and text are handed to
    ``internal_link_utils.update_link_language``. If that returns a result,
    the link is updated with the returned url and text. Links without a
    ``href`` are left untouched.

    :param link: link which should be checked for an internal link and then be updated
    :param language_slug: Slug of the current language
    """
    href = link.attrib.get("href")
    if not href:
        return

    translation = internal_link_utils.update_link_language(
        href, link.text, language_slug
    )
    if not translation:
        return

    translated_url, translated_text = translation
    link.set("href", translated_url)
    # translated_text might be None if the link tag consists of other tags instead of plain text
    if translated_text:
        link.text = translated_text
    logging.debug("Updated link url from %s to %s", href, translated_url)


def fix_alt_texts(content: HtmlElement) -> None:
    """
    Replace the ``alt`` attribute of images with the alt text stored for their media file.

    For every ``img`` element with a ``src``, the path of the source url
    (without the ``settings.MEDIA_URL`` prefix) is looked up among the files
    and thumbnails of all media files. If a match with a non-empty alt text
    exists, that text is written to the ``alt`` attribute.

    :param content: The body of content of which the images should be processed.
    """
    for image in content.iter("img"):
        src = image.attrib.get("src")
        if not src:
            # NOTE(review): the warning is assumed to belong to the missing-src
            # case - confirm against the properly indented file
            logging.warning("Empty img tag was found.")
            continue

        logging.debug("Image tag found in content (src: %s)", src)
        # Keep only the path component of the url (drops the host)
        media_path = urlparse(src).path
        # Remove media url prefix if exists
        if media_path.startswith(settings.MEDIA_URL):
            media_path = media_path[len(settings.MEDIA_URL) :]
        # Check whether media file exists in database
        matches_path = Q(file=media_path) | Q(thumbnail=media_path)
        media_file = MediaFile.objects.filter(matches_path).first()
        # Replace alternative text
        if media_file and media_file.alt_text:
            logging.debug("Image alt text replaced: %r", media_file.alt_text)
            image.attrib["alt"] = media_file.alt_text
37 changes: 37 additions & 0 deletions tests/cms/utils/test_content_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import pytest
from django.test.client import Client

from integreat_cms.cms.utils.content_utils import clean_content
from tests.conftest import EDITOR, MANAGEMENT, PRIV_STAFF_ROLES


@pytest.mark.parametrize(
    "login_role_user", PRIV_STAFF_ROLES + [MANAGEMENT, EDITOR], indirect=True
)
@pytest.mark.django_db
def test_clean_content(
    load_test_data: None,
    login_role_user: tuple[Client, str],
) -> None:
    """
    Check that ``clean_content`` converts headings and monospaced tags and
    adjusts the ``link-external`` class of links.

    :param load_test_data: Fixture, unused in the test body
    :param login_role_user: Parametrized fixture, unused in the test body
    """
    raw_content = (
        "<h1>Das ist eine H1</h1>"
        "<pre>Das ist vordefinierter Text</pre>"
        "<code>Das ist vordefinierter Code</code>"
        '<a href="https://www.integreat-app.de"></a>'
        '<a href="http://localhost:8000/augsburg/pages/de/5" class="link-external"></a>'
    )
    cleaned_content = clean_content(raw_content, "de")

    # convert_heading: the h1 must be gone and replaced by an h2
    assert "<h1>Das ist eine H1</h1>" not in cleaned_content
    assert "<h2>Das ist eine H1</h2>" in cleaned_content

    # convert_monospaced_tags: pre and code must both have become p
    assert "<pre>Das ist vordefinierter Text</pre>" not in cleaned_content
    assert "<code>Das ist vordefinierter Code</code>" not in cleaned_content
    assert "<p>Das ist vordefinierter Text</p>" in cleaned_content
    assert "<p>Das ist vordefinierter Code</p>" in cleaned_content

    # update_links: the first link gains the class ...
    assert (
        'a href="https://www.integreat-app.de" class="link-external"' in cleaned_content
    )
    # ... and the second link loses it
    assert (
        '<a href="http://localhost:8000/augsburg/pages/de/5" class="link-external">'
        not in cleaned_content
    )
    assert '<a href="http://localhost:8000/augsburg/pages/de/5"></a>' in cleaned_content
Loading