digitalfabrik · JoeyStk · Jul 10, 2024 · Jun 24, 2024
diff --git a/integreat_cms/cms/forms/custom_content_model_form.py b/integreat_cms/cms/forms/custom_content_model_form.py
@@ -1,21 +1,14 @@
 from __future__ import annotations
 
 from typing import TYPE_CHECKING
-from urllib.parse import urlparse
 
 from django import forms
-from django.conf import settings
 from django.contrib import messages
 from django.core.exceptions import ObjectDoesNotExist
-from django.db.models import Q
 from django.utils.translation import gettext_lazy as _
-from lxml.etree import LxmlError
-from lxml.html import fromstring, tostring
 
 from ..constants import status
-from ..models import MediaFile
-from ..utils import internal_link_utils
-from ..utils.linkcheck_utils import fix_content_link_encoding
+from ..utils.content_utils import clean_content
 from ..utils.slug_utils import generate_unique_slug_helper
 from .custom_model_form import CustomModelForm
 
@@ -98,104 +91,17 @@ def clean(self) -> dict[str, Any]:
 
         return cleaned_data
 
-    # pylint: disable=too-many-branches, too-many-locals
     def clean_content(self) -> str:
         """
         Validate the content field (see :ref:`overriding-modelform-clean-method`) and applies changes
-        to ``<img>``- and ``<a>``-Tags to match the guidelines.
+        to headings, monospaced tags, links and images to match the guidelines.
 
         :raises ~django.core.exceptions.ValidationError: When a heading 1 (``<h1>``) is used in the text content
 
         :return: The valid content
         """
-        try:
-            content = fromstring(self.cleaned_data["content"])
-        except LxmlError:
-            # The content is not guaranteed to be valid html, for example it may be empty
-            return self.cleaned_data["content"]
-
-        # Convert heading 1 to heading 2
-        for heading in content.iter("h1"):
-            heading.tag = "h2"
-            self.logger.debug(
-                "Replaced heading 1 with heading 2: %r",
-                tostring(heading, encoding="unicode"),
-            )
-
-        # Convert pre and code tags to p tags
-        for monospaced in content.iter("pre", "code"):
-            tag_type = monospaced.tag
-            monospaced.tag = "p"
-            self.logger.debug(
-                "Replaced %r tag with p tag: %r",
-                tag_type,
-                tostring(monospaced, encoding="unicode"),
-            )
-
-        # Set link-external as class for external links
-        for link in content.iter("a"):
-            if href := link.get("href"):
-                is_external = not any(url in href for url in settings.INTERNAL_URLS)
-                if "link-external" not in link.classes and is_external:
-                    link.classes.add("link-external")
-                    self.logger.debug(
-                        "Added class 'link-external' to %r",
-                        tostring(link, encoding="unicode"),
-                    )
-                elif "link-external" in link.classes and not is_external:
-                    link.classes.remove("link-external")
-                    self.logger.debug(
-                        "Removed class 'link-external' from %r",
-                        tostring(link, encoding="unicode"),
-                    )
-
-        # Remove external links
-        for link in content.iter("a"):
-            link.attrib.pop("target", None)
-            self.logger.debug(
-                "Removed target attribute from link: %r",
-                tostring(link, encoding="unicode"),
-            )
-
-        # Update internal links
-        for link in content.iter("a"):
-            if href := link.attrib.get("href"):
-                if translation := internal_link_utils.update_link_language(
-                    href, link.text, self.instance.language.slug
-                ):
-                    translated_url, translated_text = translation
-                    link.set("href", translated_url)
-                    # translated_text might be None if the link tag consists of other tags instead of plain text
-                    if translated_text:
-                        link.text = translated_text
-                    self.logger.debug(
-                        "Updated link url from %s to %s", href, translated_url
-                    )
-
-        # Scan for media files in content and replace alt texts
-        for image in content.iter("img"):
-            if src := image.attrib.get("src"):
-                self.logger.debug("Image tag found in content (src: %s)", src)
-                # Remove host
-                relative_url = urlparse(src).path
-                # Remove media url prefix if exists
-                if relative_url.startswith(settings.MEDIA_URL):
-                    relative_url = relative_url[len(settings.MEDIA_URL) :]
-                # Check whether media file exists in database
-                media_file = MediaFile.objects.filter(
-                    Q(file=relative_url) | Q(thumbnail=relative_url)
-                ).first()
-                # Replace alternative text
-                if media_file and media_file.alt_text:
-                    self.logger.debug(
-                        "Image alt text replaced: %r", media_file.alt_text
-                    )
-                    image.attrib["alt"] = media_file.alt_text
-            else:
-                self.logger.warning("Empty img tag was found.")
-
-        content_str = tostring(content, encoding="unicode", with_tail=False)
-        return fix_content_link_encoding(content_str)
+        content = self.cleaned_data["content"]
+        return clean_content(content, language_slug=self.instance.language.slug)
 
     def clean_slug(self) -> str:
         """

diff --git a/integreat_cms/cms/utils/content_utils.py b/integreat_cms/cms/utils/content_utils.py
@@ -0,0 +1,162 @@
+import logging
+from urllib.parse import urlparse
+
+from django.conf import settings
+from django.db.models import Q
+from lxml.etree import LxmlError
+from lxml.html import fromstring, HtmlElement, tostring
+
+from ..models import MediaFile
+from ..utils import internal_link_utils
+from ..utils.linkcheck_utils import fix_content_link_encoding
+
+logger = logging.getLogger(__name__)
+
+
+def clean_content(content: str, language_slug: str) -> str:
+    """
+    Cleans the HTML content: parses it, converts headings and monospaced tags, updates the links,
+    fixes the alt texts of images and serialises the result again.
+
+    :param content: the body of content that should be cleaned
+    :param language_slug: Slug of the current language
+    :return: The cleaned content, or the unchanged input if it cannot be parsed as HTML
+    """
+    try:
+        tree = fromstring(content)
+    except LxmlError:
+        # The content is not guaranteed to be valid html, for example it may be empty
+        return content
+
+    convert_heading(tree)
+    convert_monospaced_tags(tree)
+    update_links(tree, language_slug)
+    fix_alt_texts(tree)
+
+    content_str = tostring(tree, encoding="unicode", with_tail=False)
+    return fix_content_link_encoding(content_str)
+
+
+def convert_heading(content: HtmlElement) -> None:
+    """
+    Converts every ``h1`` tag in the content to a ``h2`` for SEO purposes.
+
+    :param content: the body of content of which every ``h1`` should be converted to an ``h2``.
+    """
+    for heading in content.iter("h1"):
+        heading.tag = "h2"
+        logger.debug(
+            "Replaced heading 1 with heading 2: %r",
+            tostring(heading, encoding="unicode"),
+        )
+
+
+def convert_monospaced_tags(content: HtmlElement) -> None:
+    """
+    Converts ``pre`` and ``code`` tags to ``p`` tags.
+
+    :param content: the body of content of which every ``pre`` and ``code`` tag should be transformed
+    """
+    for monospaced in content.iter("pre", "code"):
+        tag_type = monospaced.tag
+        monospaced.tag = "p"
+        logger.debug(
+            "Replaced %r tag with p tag: %r",
+            tag_type,
+            tostring(monospaced, encoding="unicode"),
+        )
+
+
+def update_links(content: HtmlElement, language_slug: str) -> None:
+    """
+    Applies all link-related clean-up steps to every ``a`` tag of the content
+
+    :param content: The content whose links should be updated
+    :param language_slug: Slug of the current language
+    """
+    for link in content.iter("a"):
+        mark_external_links(link)
+        remove_target_attribute(link)
+        update_internal_links(link, language_slug)
+
+
+def mark_external_links(link: HtmlElement) -> None:
+    """
+    Adds the class ``link-external`` to a link whose ``href`` contains none of
+    ``settings.INTERNAL_URLS`` and removes the class from any other link with an ``href``
+
+    :param link: the link which classes should be adjusted.
+    """
+    if href := link.get("href"):
+        is_external = not any(url in href for url in settings.INTERNAL_URLS)
+        if "link-external" not in link.classes and is_external:
+            link.classes.add("link-external")
+            logger.debug(
+                "Added class 'link-external' to %r",
+                tostring(link, encoding="unicode"),
+            )
+        elif "link-external" in link.classes and not is_external:
+            link.classes.remove("link-external")
+            logger.debug(
+                "Removed class 'link-external' from %r",
+                tostring(link, encoding="unicode"),
+            )
+
+
+def remove_target_attribute(link: HtmlElement) -> None:
+    """
+    Removes the ``target`` attribute of a link, no matter whether the link is internal or external
+
+    :param link: the link whose ``target`` attribute should be removed
+    """
+    if link.attrib.pop("target", None) is not None:
+        # Only log when an attribute was actually removed
+        logger.debug(
+            "Removed target attribute from link: %r",
+            tostring(link, encoding="unicode"),
+        )
+
+
+def update_internal_links(link: HtmlElement, language_slug: str) -> None:
+    """
+    Updates internal links by adding the language slug of the translation
+
+    :param link: link which should be checked for an internal link and then be updated
+    :param language_slug: Slug of the current language
+    """
+    if href := link.attrib.get("href"):
+        if translation := internal_link_utils.update_link_language(
+            href, link.text, language_slug
+        ):
+            translated_url, translated_text = translation
+            link.set("href", translated_url)
+            # translated_text might be None if the link tag consists of other tags instead of plain text
+            if translated_text:
+                link.text = translated_text
+            logger.debug("Updated link url from %s to %s", href, translated_url)
+
+
+def fix_alt_texts(content: HtmlElement) -> None:
+    """
+    This function processes images by scanning for media files and replacing alt texts.
+
+    :param content: The body of content of which the images should be processed.
+    """
+    for image in content.iter("img"):
+        if src := image.attrib.get("src"):
+            logger.debug("Image tag found in content (src: %s)", src)
+            # Remove host
+            relative_url = urlparse(src).path
+            # Remove media url prefix if exists
+            if relative_url.startswith(settings.MEDIA_URL):
+                relative_url = relative_url[len(settings.MEDIA_URL) :]
+            # Check whether media file exists in database
+            media_file = MediaFile.objects.filter(
+                Q(file=relative_url) | Q(thumbnail=relative_url)
+            ).first()
+            # Replace alternative text
+            if media_file and media_file.alt_text:
+                logger.debug("Image alt text replaced: %r", media_file.alt_text)
+                image.attrib["alt"] = media_file.alt_text
+        else:
+            logger.warning("Empty img tag was found.")
diff --git a/tests/cms/utils/test_content_utils.py b/tests/cms/utils/test_content_utils.py
@@ -0,0 +1,37 @@
+import pytest
+from django.test.client import Client
+
+from integreat_cms.cms.utils.content_utils import clean_content
+from tests.conftest import EDITOR, MANAGEMENT, PRIV_STAFF_ROLES
+
+
+@pytest.mark.parametrize(
+    "login_role_user", PRIV_STAFF_ROLES + [MANAGEMENT, EDITOR], indirect=True
+)
+@pytest.mark.django_db
+def test_clean_content(
+    load_test_data: None,
+    login_role_user: tuple[Client, str],
+) -> None:
+    raw_content = '<h1>Das ist eine H1</h1><pre>Das ist vordefinierter Text</pre><code>Das ist vordefinierter Code</code><a href="https://www.integreat-app.de"></a><a href="http://localhost:8000/augsburg/pages/de/5" class="link-external"></a>'
+    cleaned_content = clean_content(raw_content, "de")
+
+    # convert_heading: the h1 must have been replaced by an h2 with the same text
+    assert "<h1>Das ist eine H1</h1>" not in cleaned_content
+    assert "<h2>Das ist eine H1</h2>" in cleaned_content
+
+    # convert_monospaced_tags: the pre and code tags must have been replaced by p tags
+    assert "<pre>Das ist vordefinierter Text</pre>" not in cleaned_content
+    assert "<code>Das ist vordefinierter Code</code>" not in cleaned_content
+    assert "<p>Das ist vordefinierter Text</p>" in cleaned_content
+    assert "<p>Das ist vordefinierter Code</p>" in cleaned_content
+
+    # update_links: link-external is added to the integreat-app.de link and removed from the localhost link
+    assert (
+        'a href="https://www.integreat-app.de" class="link-external"' in cleaned_content
+    )
+    assert (
+        '<a href="http://localhost:8000/augsburg/pages/de/5" class="link-external">'
+        not in cleaned_content
+    )
+    assert '<a href="http://localhost:8000/augsburg/pages/de/5"></a>' in cleaned_content