common/app/views/support/HtmlCleaner.scala

package views.support

import java.net.URI
import java.util.regex.{Matcher, Pattern}
import common.{Edition, GuLogging, LinkTo}
import conf.Configuration.affiliateLinks._
import conf.Configuration.site.host
import conf.switches.Switches._
import layout.ContentWidths
import layout.ContentWidths._
import model._
import model.content._
import model.dotcomrendering.pageElements.TextBlockElement
import navigation.ReaderRevenueSite
import org.joda.time.DateTime
import org.jsoup.Jsoup
import org.jsoup.nodes.{Document, Element, TextNode}
import play.api.mvc.RequestHeader
import services.SkimLinksCache
import scala.jdk.CollectionConverters._
import scala.collection.mutable
import scala.util.Try

/** A single transformation step applied to a parsed jsoup [[org.jsoup.nodes.Document]]. */
trait HtmlCleaner {

  /** Applies this cleaner to `d` and returns the resulting document.
    *
    * @param d the parsed document to clean
    * @return the cleaned document
    */
  def clean(d: Document): Document
}

/** Converts `<!-- Block N -->` marker nodes into an `id="block-N"` attribute on the node that follows them. */
object BlockNumberCleaner extends HtmlCleaner {

  // Must match the whole trimmed rendering of a child node; captures the block number.
  private val Block = """<!-- Block (\d*) -->""".r

  /** Scans the children of every element, tags the sibling following each marker and removes the marker.
    *
    * @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    for (element <- document.getAllElements.asScala) {
      val markers = element.childNodes.asScala.filter { child =>
        child.toString.trim match {
          case Block(blockNumber) =>
            // nextSibling is null when the marker is the last child
            Option(child.nextSibling).foreach(_.attr("id", s"block-$blockNumber"))
            true
          case _ => false
        }
      }
      // Markers are removed only once the scan of this element's children has finished.
      markers.foreach(_.remove())
    }
    document
  }
}

/** Wraps the child elements of every `blockquote.quoted` in a `div.quoted__contents` and prepends a quote icon. */
object BlockquoteCleaner extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    val quoteSvg = views.html.fragments.inlineSvg("quote", "icon").toString()

    document
      .getElementsByTag("blockquote")
      .asScala
      .filter(_.hasClass("quoted"))
      .foreach { blockquote =>
        // Capture the existing child elements before the wrapper is inserted
        val existingChildren = blockquote.children()
        val contents = document.createElement("div").addClass("quoted__contents")

        blockquote.prependChild(contents)
        contents.insertChildren(0, existingChildren)

        // The icon goes in last so that it ends up ahead of the wrapper
        blockquote.prepend(quoteSvg)
      }

    document
  }
}

/** Decorates `aside.element-pullquote` elements with a quote icon and styling classes. */
object PullquoteCleaner extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    val openingQuoteSvg = views.html.fragments.inlineSvg("quote", "icon", List("inline-tone-fill")).toString()
    val pullquotes = document.getElementsByTag("aside").asScala.filter(_.hasClass("element-pullquote"))

    for (pullquote <- pullquotes) {
      pullquote.prepend(openingQuoteSvg)
      // Tag every paragraph and citation inside the pullquote
      pullquote.getElementsByTag("p").addClass("pullquote-paragraph")
      pullquote.getElementsByTag("cite").addClass("pullquote-cite")
    }

    document
  }
}

/** Wraps `video.gu-video` elements whose parent is not a `figure` in a video figure element. */
case object R2VideoCleaner extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    val legacyVideos = document
      .getElementsByTag("video")
      .asScala
      .filter(video => video.hasClass("gu-video") && video.parent().tagName() != "figure")

    for (video <- legacyVideos) {
      video.wrap("""<figure class="test element element-video"></figure>""")
    }

    document
  }

}

/** For recipe articles, strips body images, pullquote asides and rich links from the document.
  *
  * @param isRecipeArticle when false the cleaner leaves the document untouched
  */
case class RecipeBodyImage(isRecipeArticle: Boolean) extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    if (isRecipeArticle) {
      document.getElementsByClass("element-image").remove()
      document
        .getElementsByTag("aside")
        .asScala
        .filter(_.hasClass("element-pullquote"))
        .foreach(_.remove())
      document.getElementsByClass("element-rich-link").remove()
    }
    document
  }
}

/** Replaces body `figure` elements that contain an image with the rendered `imageFigure` fragment.
  *
  * Figures carrying the `element-comment`, `element-witness` or `element-atom` class are skipped, as are
  * figures for which no matching image container (or no largest image) can be found.
  *
  * @param article the article whose body images, lightbox and share links are used for rendering
  */
case class PictureCleaner(article: Article)(implicit request: RequestHeader)
    extends HtmlCleaner
    with implicits.Numbers {

  // Figures with any of these classes are not handled by this cleaner.
  private val skippedFigureClasses = List("element-comment", "element-witness", "element-atom")

  /** @param body the document to mutate
    * @return the same `body` instance
    */
  def clean(body: Document): Document = {
    for {
      figure <- body.getElementsByTag("figure").asScala
      imgElement <- figure.getElementsByTag("img").asScala.headOption
      if !skippedFigureClasses.exists(figure.hasClass)
      container <- findContainerFromId(figure.attr("data-media-id"), imgElement.attr("src"))
      image <- container.images.largestImage
    } {
      val hinting = findBreakpointWidths(figure)

      val relation =
        if (article.isLiveBlog) LiveBlogMedia
        else if (article.isTheMinute) MinuteMedia
        else if (article.isImmersive) ImmersiveMedia
        else BodyMedia

      val widths = ContentWidths.getWidthsFromContentElement(hinting, relation)

      val orientationClass = Some(image.orientation match {
        case Portrait => "img--portrait"
        case _        => "img--landscape"
      })

      // Thumbnails never receive the inline class, whatever the image width
      val smallImageClass =
        if (hinting != Thumbnail && image.width <= 220) Some("img--inline") else None

      val inlineClass =
        if (!article.isTheMinute || figure.hasClass("element--thumbnail")) None else Some("element--inline")

      val figureClasses =
        List(orientationClass, smallImageClass, hinting.className, inlineClass).flatten.mkString(" ")

      // lightbox uses the images in the order mentioned in the header array
      val lightboxInfo: Option[(Int, ImageAsset)] = for {
        index <- Some(article.lightbox.lightboxImages.indexOf(container)).filter(_ != -1).map(_ + 1)
        crop <- container.images.largestEditorialCrop
        if !article.isLiveBlog
      } yield (index, crop)

      val shareInfo = lightboxInfo.map {
        case (index, crop) =>
          (
            article.sharelinks.elementShares(s"img-$index", crop.url),
            article.metadata.contentType.getOrElse(DotcomContentType.Unknown),
          )
      }

      val html = views.html.fragments
        .imageFigure(
          container.images,
          lightboxIndex = lightboxInfo.map(_._1),
          widthsByBreakpoint = widths,
          image_figureClasses = Some(image, figureClasses),
          shareInfo = shareInfo,
        )
        .toString()

      figure.replaceWith(Jsoup.parseBodyFragment(html).body().child(0))
    }

    body
  }

  /** Finds the body image container for a figure.
    *
    * Containers are first narrowed by media id; among those, one with a crop whose URL contains the path of
    * `src` is preferred, otherwise the first container with that id is returned.
    *
    * @param id  the media id to look up
    * @param src the `src` of the image, used to disambiguate repeated ids
    */
  def findContainerFromId(id: String, src: String): Option[ImageElement] = {
    // It is possible that a single data media id can appear multiple times in the elements array.
    val sameIdContainers = article.elements.bodyImages.filter(_.properties.id == id)
    // An unparseable src simply disables the URL-based match
    val maybeSrcImagePath = Try(new URI(src.trim).getPath).toOption

    // Try to match the container based on both URL and media ID.
    val fullyMatched = sameIdContainers.find { container =>
      container.images.imageCrops.exists { asset =>
        asset.url.exists(url => maybeSrcImagePath.exists(url.contains))
      }
    }

    fullyMatched orElse sameIdContainers.headOption
  }

  /** Maps the classes on `figure` to a content hinting, checking the candidates in a fixed order and
    * falling back to `Inline` when none of them is present.
    */
  def findBreakpointWidths(figure: Element): ContentHinting = {
    val figureClasses = figure.classNames().asScala

    def isHinted(hinting: ContentHinting): Boolean = hinting.className.exists(figureClasses.contains)

    List[ContentHinting](Supporting, Showcase, Thumbnail, Immersive, Halfwidth)
      .find(isHinted)
      .getOrElse(Inline)
  }
}

/** Wraps every bullet character in a `span.bullet`. */
object BulletCleaner {
  private val Bullet = "•"
  private val WrappedBullet = s"""<span class="bullet">$Bullet</span>"""

  /** @param body the markup to process
    * @return `body` with each bullet character replaced by its wrapped form
    */
  def apply(body: String): String = body.replace(Bullet, WrappedBullet)
}

/** Mixin for rewriting `http:` to `https:` in URLs. */
trait HttpsUrl {

  /** Replaces every occurrence of `http:` in `url` with `https:`.
    *
    * Note that the replacement is not limited to the leading scheme.
    */
  def ensureHttps(url: String): String = {
    val insecureScheme = "http:"
    val secureScheme = "https:"
    url.replace(insecureScheme, secureScheme)
  }
}

/** Cleans video encoding URLs: drops newline characters and rewrites `http:` to `https:`. */
object VideoEncodingUrlCleaner extends HttpsUrl {

  /** @param url the raw encoding URL
    * @return the URL without `\n` characters and passed through `ensureHttps`
    */
  def apply(url: String): String = {
    val withoutNewlines = url.filterNot(_ == '\n')
    ensureHttps(withoutNewlines)
  }
}

/** Rewrites media sources for AMP, where all media sources need to start with https.
  *
  * Temporary code until all media urls returned from CAPI are https.
  */
object AmpSrcCleaner extends HttpsUrl {

  /** @param videoSrc the media source URL
    * @return the URL passed through `ensureHttps`
    */
  def apply(videoSrc: String): String = ensureHttps(videoSrc)
}

/** Rewrites in-body links and neutralises named anchors.
  *
  * @param dataLinkName value written to the `data-link-name` attribute of every `a[href]`
  */
case class InBodyLinkCleaner(dataLinkName: String)(implicit val edition: Edition, implicit val request: RequestHeader)
    extends HtmlCleaner {

  /** @param body the document to mutate
    * @return the same `body` instance
    */
  def clean(body: Document): Document = {
    for (link <- body.getElementsByAttribute("href").asScala) {
      if (link.tagName == "a") {
        link.attr("href", LinkTo(link.attr("href"), edition))
        link.attr("data-link-name", dataLinkName)
        link.addClass("u-underline")
      }
      // Checked for every element with an href (not only anchors), after any rewrite above
      if (ReaderRevenueSite.isReaderRevenueSiteUrl(link.attr("href"))) {
        link.addClass("js-acquisition-link")
      }
    }

    // Prevent text in non clickable anchors from looking like links
    // <a name="foo">bar</a> -> <a name="foo"></a>bar
    // NOTE(review): `after(String)` takes HTML, so the anchor's own text is re-parsed as markup — confirm
    // that is acceptable for text containing `<` or `&`.
    body
      .getElementsByAttribute("name")
      .asScala
      .filter(_.tagName == "a")
      .foreach { anchor =>
        val ownText = anchor.ownText()
        anchor.empty().after(ownText)
      }

    body
  }
}

/** Truncates the text of a document to a character budget, walking text nodes in document order.
  *
  * @param limit the number of characters of text to keep
  */
case class TruncateCleaner(limit: Int)(implicit val edition: Edition, implicit val request: RequestHeader)
    extends HtmlCleaner {

  /** @param body the document to mutate
    * @return the same `body` instance
    */
  def clean(body: Document): Document = {

    // Shortens `textNode` if it exceeds the budget; returns the budget left afterwards (may be negative).
    def truncateTextNode(charLimit: Int, textNode: TextNode): Int = {
      val content = textNode.text
      val remaining = charLimit - content.length
      if (remaining < 0) {
        val kept = content.take(charLimit.max(0)).trim.stripSuffix(".")
        // An ellipsis is only added to the node in which the budget actually runs out
        val ellipsis = if (charLimit > 0) "…" else ""
        textNode.text(kept + ellipsis)
      }
      remaining
    }

    // Threads the remaining budget through every child node, recursing into child elements.
    def truncateElement(charLimit: Int, element: Element): Int =
      element.childNodes.asScala.foldLeft(charLimit) {
        case (remaining, text: TextNode) => truncateTextNode(remaining, text)
        case (remaining, child: Element) => truncateElement(remaining, child)
        case (remaining, _)              => remaining
      }

    truncateElement(limit, body)
    body
  }
}

/** Rebuilds embedded `twitter-tweet` markup into the `js-tweet tweet` structure.
  *
  * @param content the content whose `tweets` are searched for the tweet's data (used for the main image)
  */
class TweetCleaner(content: Content) extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {

    for (tweet <- document.getElementsByClass("element-tweet").asScala) {
      // The tweet id is the last path segment of the canonical URL
      val tweetData: Option[Tweet] = for {
        url <- Option(tweet.attr("data-canonical-url"))
        id <- url.split('/').lastOption
        matched <- content.tweets.find(_.id == id)
      } yield matched

      val tweetImage = tweetData.flatMap(_.firstImage)

      for (element <- tweet.getElementsByClass("twitter-tweet").asScala) {
        // Work on a copy so the original can be emptied and refilled below
        val copy = element.clone()

        if (copy.children.size > 1) {
          val body = copy.child(0).attr("class", "tweet-body")
          val date = copy.child(1).attr("class", "tweet-date")

          // Remove the '-' and split at the '(@' username but keep delimiter
          val userParts = copy.ownText().replaceFirst("— ", "").split("""(?=\(@)""")
          val userName = userParts.headOption.getOrElse("")
          val userId = userParts.lift(1).getOrElse("")

          val userNameEl = document.createElement("span").attr("class", "tweet__user-name").text(userName)
          val userIdEl = document.createElement("span").attr("class", "tweet__user-id").text(userId)
          val link = document.createElement("a").attr("href", date.attr("href")).attr("style", "display: none;")

          element.empty().removeClass("twitter-tweet").addClass("js-tweet tweet")

          // The main image, when there is one, comes first
          for (image <- tweetImage) {
            val img = document.createElement("img")
            img.attr("src", image)
            img.attr("alt", "")
            img.attr("rel", "nofollow")
            img.addClass("js-tweet-main-image tweet-main-image")
            element.appendChild(img)
          }

          List(userNameEl, userIdEl, body, link, date).foreach(element.appendChild)
        }
      }
    }
    document
  }
}

/** Auto-links the first plain-text mention of each of the article's keyword tags to the tag page.
  *
  * Only paragraphs outside pullquotes and Instagram blocks, and without existing links, are considered.
  * Each keyword is linked at most once per document.
  *
  * @param article the article whose keyword tags are linked; nothing happens unless
  *                `article.content.showInRelated` is true
  */
case class TagLinker(article: Article)(implicit val edition: Edition, implicit val request: RequestHeader)
    extends HtmlCleaner {

  // Back-references used when rebuilding the paragraph around the inserted link.
  private val group1 = "$1"
  private val group2 = "$2"
  private val group4 = "$4"
  private val group5 = "$5"

  private val dot = Pattern.quote(".")
  private val question = Pattern.quote("?")

  // Groups: (1) everything before, (2) a leading space or start of input, (3) the tag name,
  // (4) a trailing space, comma, full stop, question mark or end of input, (5) everything after.
  private def keywordRegex(tag: Tag) = {
    // The name is matched literally. Pattern.quote alone is the correct escaping for a pattern;
    // Matcher.quoteReplacement is for replacement strings and would stop names containing '$' or '\' matching.
    val tagName = Pattern.quote(tag.name)
    s"""(.*)( |^)($tagName)( |,|$$|$dot|$question)(.*)""".r
  }

  /** @param doc the document to mutate
    * @return the same `doc` instance
    */
  def clean(doc: Document): Document = {

    if (article.content.showInRelated) {

      // Get all paragraphs which are not contained in a pullquote or in an instagram caption
      val paragraphs = doc
        .getElementsByTag("p")
        .asScala
        .filterNot(p =>
          p.parents.asScala.exists { ancestor =>
            val inPullquote = ancestor.tagName() == "aside" && ancestor.hasClass("element-pullquote")
            val inInstagramBlock = ancestor.hasClass("instagram-media")
            inPullquote || inInstagramBlock
          },
        )

      // order by length of name so we do not make simple match errors
      // e.g 'Northern Ireland' & 'Ireland'
      article.tags.keywords.filterNot(_.isSectionTag).sortBy(_.name.length).reverse.foreach { keyword =>
        // don't link again in paragraphs that already have links
        // (re-evaluated per keyword because earlier keywords may have inserted links)
        val unlinkedParas = paragraphs.filterNot(_.html.contains("<a"))

        // pre-filter paragraphs so we do not do multiple regexes on every single paragraph in every single article
        val candidateParagraphs = unlinkedParas.filter(_.html.contains(keyword.name))

        if (candidateParagraphs.nonEmpty) {
          val regex = keywordRegex(keyword)
          // Only the first paragraph that matches is linked
          val firstMatch =
            candidateParagraphs.map(p => (regex.pattern.matcher(p.html), p)).find(_._1.matches())

          firstMatch.foreach {
            case (matcher, p) =>
              val tagLink = doc.createElement("a")
              tagLink.attr("href", LinkTo(keyword.metadata.url, edition))
              tagLink.text(keyword.name)
              tagLink.attr("data-link-name", "auto-linked-tag")
              tagLink.attr("data-component", "auto-linked-tag")
              tagLink.addClass("u-underline")
              // The link markup is literal text in the replacement, so '$' and '\' must not be
              // interpreted as group references or escapes.
              val tagLinkHtml = Matcher.quoteReplacement(tagLink.toString)
              val newHtml = matcher.replaceFirst(s"$group1$group2$tagLinkHtml$group4$group5")
              p.html(newHtml)
          }
        }
      }
    }
    doc
  }
}

/** Removes `figure.element` embeds of unsupported types while `ShowAllArticleEmbedsSwitch` is off. */
object InBodyElementCleaner extends HtmlCleaner {

  // A figure is kept when it carries at least one of these classes.
  private val supportedElements = Set(
    "element-tweet",
    "element-video",
    "element-image",
    "element-witness",
    "element-comment",
    "element-interactive",
  )

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    // this code REMOVES unsupported embeds
    if (ShowAllArticleEmbedsSwitch.isSwitchedOff) {
      def isSupported(figure: Element): Boolean = supportedElements.exists(figure.hasClass)

      document
        .getElementsByTag("figure")
        .asScala
        .filter(_.hasClass("element"))
        .filterNot(isSupported)
        .foreach(_.remove())
    }
    document
  }
}

/** Cuts the document body down to its leading elements.
  *
  * When the body has top-level paragraphs, the `amount`-th one (or the last one if there are fewer) and
  * every top-level element from it onwards are removed. When there are no top-level paragraphs (could be
  * a blog), everything after the first `amount` top-level elements is removed.
  *
  * @param amount the paragraph / element count that drives the cut-off
  */
case class Summary(amount: Int) extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    val topLevel = document.body().children().asScala.toList
    val cutOffParagraph: Option[Element] = topLevel.filter(_.nodeName() == "p").take(amount).lastOption

    val surplus = cutOffParagraph.fold(topLevel.drop(amount)) { p =>
      topLevel.drop(topLevel.indexOf(p))
    }
    surplus.foreach(_.remove())

    document
  }
}

/** For photo essays, marks image figures with a photo-essay class and removes article share blocks.
  *
  * @param isPhotoEssay when false the cleaner leaves the document untouched
  */
case class PhotoEssayImages(isPhotoEssay: Boolean) extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    if (isPhotoEssay) {
      document
        .getElementsByTag("figure")
        .asScala
        .filter(_.hasClass("element-image"))
        .foreach(_.addClass("element-image--photo-essay"))

      document.getElementsByClass("block-share--article").remove()
    }
    document
  }
}

/** For photo essays, adds a photo-essay modifier class to every `element-pullquote`.
  *
  * @param isPhotoEssay when false the cleaner leaves the document untouched
  */
case class PhotoEssayQuotes(isPhotoEssay: Boolean) extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    if (isPhotoEssay) {
      document.getElementsByClass("element-pullquote").addClass("element-pullquote--photo-essay")
    }
    document
  }
}

/** For photo essays, removes every element with the `caption--img` class.
  *
  * @param isPhotoEssay when false the cleaner leaves the document untouched
  */
case class PhotoEssayCaptions(isPhotoEssay: Boolean) extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    if (isPhotoEssay) {
      document.getElementsByClass("caption--img").remove()
    }
    document
  }
}

/** For photo essays, adds `half-width-odd` to the 1st, 3rd, 5th, ... `figure.element--halfWidth`.
  *
  * @param isPhotoEssay when false the cleaner leaves the document untouched
  */
case class PhotoEssayHalfWidth(isPhotoEssay: Boolean) extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    if (isPhotoEssay) {
      document
        .getElementsByTag("figure")
        .asScala
        .filter(_.hasClass("element--halfWidth"))
        .zipWithIndex
        // zero-based even positions are the odd-numbered figures
        .collect { case (figure, position) if position % 2 == 0 => figure }
        .foreach(_.addClass("half-width-odd"))
    }
    document
  }
}

/** For photo essays, adds `photo-essay-block-quote` to blockquotes that have no `.pullquote-paragraph` child.
  *
  * @param isPhotoEssay when false the cleaner leaves the document untouched
  */
case class PhotoEssayBlockQuote(isPhotoEssay: Boolean) extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    if (isPhotoEssay) {
      for {
        blockquote <- document.getElementsByTag("blockquote").asScala
        if !blockquote.children().is(".pullquote-paragraph")
      } blockquote.addClass("photo-essay-block-quote")
    }
    document
  }
}

/** For immersive content, adds `in-body-link--immersive` to every `a` element.
  *
  * @param isImmersive when false the cleaner leaves the document untouched
  */
case class ImmersiveLinks(isImmersive: Boolean) extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    if (isImmersive) {
      document.getElementsByTag("a").addClass("in-body-link--immersive")
    }
    document
  }
}

/** For immersive content, folds an `h2` into the immersive image that directly precedes it.
  *
  * When the previous element sibling of an `h2` has both the `element--immersive` and `element-image`
  * classes, that sibling gets the `section-image` class and a new `h2.section-title` carrying the heading
  * text as its first child, and the original `h2` is removed.
  *
  * @param isImmersive when false the cleaner leaves the document untouched
  */
case class ImmersiveHeaders(isImmersive: Boolean) extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    if (isImmersive) {
      for {
        h2 <- document.getElementsByTag("h2").asScala
        // previousElementSibling is null for a first child
        precedingElement <- Option(h2.previousElementSibling())
        if precedingElement.hasClass("element--immersive") && precedingElement.hasClass("element-image")
      } {
        precedingElement.addClass("section-image")
        // Build the title as an element and set its text, rather than concatenating the text into an
        // HTML string, so that characters such as '<' and '&' in the heading are escaped.
        precedingElement.prependElement("h2").addClass("section-title").text(h2.text())
        h2.remove()
      }
    }
    document
  }
}

/** Adds drop-cap markup to opening paragraphs and turns `* * *` headings into section rules.
  *
  * @param isFeature       enables the drop cap on the first body element (when it is a paragraph)
  * @param isImmersive     enables the `* * *` heading handling
  * @param isRecipeArticle suppresses the first-paragraph drop cap
  */
case class DropCaps(isFeature: Boolean, isImmersive: Boolean, isRecipeArticle: Boolean = false) extends HtmlCleaner {

  // Returns the paragraph's inner HTML, with the leading letter (and any opening quote marks) wrapped in
  // drop-cap spans when the paragraph text is longer than 199 characters and the pattern matches.
  private def setDropCap(p: Element): String = {
    val markup = p.html
    if (p.text.length > 199)
      markup.replaceFirst(
        "^([\"'“‘]*[a-zA-Z])(.{199,})",
        """<span class="drop-cap"><span class="drop-cap__inner">$1</span></span>$2""",
      )
    else markup
  }

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    if (isFeature && !isRecipeArticle) {
      document
        .body()
        .children()
        .asScala
        .headOption
        .filter(_.nodeName() == "p")
        .foreach(first => first.html(setDropCap(first)))
    }

    if (isImmersive) {
      for (h2 <- document.getElementsByTag("h2").asScala if h2.text() == "* * *") {
        h2.before("""<hr class="section-rule" />""")

        // The paragraph directly after the divider, if any, opens the new section
        Option(h2.nextElementSibling())
          .filter(_.nodeName() == "p")
          .foreach(next => next.html(setDropCap(next)))

        h2.remove()
      }
    }
    document
  }
}

/** Adds styling classes used by numbered-list articles.
  *
  * @param isNumberedList when false the cleaner leaves the document untouched
  */
case class NumberedListFurniture(isNumberedList: Boolean) extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    if (isNumberedList) {
      // Adds yellow styling to star ratings mid article
      document.select("p:containsOwn(★)").addClass("stars")

      // Styled link/section end
      for (onlyItem <- document.select("ul > li:only-child").asScala) {
        onlyItem.parent().addClass("article-link")
      }

      // Faux h3 headings, for second level of heading hierarchy in numbered list articles
      for (strong <- document.select("p > strong").asScala) {
        val paragraph = strong.parent()
        val hasNoOwnText = paragraph.is("p:matchesOwn(^$)")
        if (hasNoOwnText && !paragraph.children().is("a")) {
          paragraph.addClass("falseH3")
        }
      }
    }
    document
  }
}

// Gallery captions don't come back as structured data.
// This is a hack to serve the correct html.
object GalleryCaptionCleaner extends HtmlCleaner {

  /** Strips all `br` tags and promotes the first `strong` element to a leading `h2.gallery__caption__title`.
    *
    * A title element is always prepended; it is empty when the caption has no `strong` element.
    *
    * @param galleryCaption the caption document to mutate
    * @return the same `galleryCaption` instance
    */
  override def clean(galleryCaption: Document): Document = {
    // There is an inconsistent number of <br> tags in gallery captions.
    // To create some consistency, we will remove them all.
    galleryCaption.getElementsByTag("br").remove()

    val firstStrong = Option(galleryCaption.getElementsByTag("strong").first())
    val captionTitleText = firstStrong.fold("")(_.html())

    // <strong> is removed in place of having a <h2> element
    firstStrong.foreach(_.remove())

    val captionTitle = galleryCaption
      .createElement("h2")
      .addClass("gallery__caption__title")
      .html(captionTitleText)

    galleryCaption.body.prependChild(captionTitle)

    galleryCaption
  }
}

/** While `interactivePressing` is on, replaces the first `iframe` that has a non-empty `srcdoc` with the
  * contents of the `noscript` elements found inside that `srcdoc`.
  */
object InteractiveSrcdocCleaner extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    if (interactivePressing.isSwitchedOn) {
      Option(document.getElementsByTag("iframe").first()).foreach { iframe =>
        val srcdoc = iframe.attr("srcdoc")
        if (srcdoc.nonEmpty) {
          // noscript is added for immersive interactives, no idea why
          // see https://github.com/guardian/flexible-content/pull/1597
          // hopefully we can remove all of this soon anyway
          val noscriptHtml = Jsoup.parse(srcdoc).getElementsByTag("noscript").html()
          iframe.after(noscriptHtml).remove()
        }
      }
    }
    document
  }
}

/** Adds the image caption classes to every `figcaption`. */
object FigCaptionCleaner extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    val captions = document.getElementsByTag("figcaption")
    captions.addClass("caption caption--img")
    document
  }
}

/** Adds the main-media image caption classes to every `figcaption`. */
object MainFigCaptionCleaner extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    val captions = document.getElementsByTag("figcaption")
    captions.addClass("caption caption--img caption--main")
    document
  }
}

/** Prepares `element-rich-link` elements: adds the not-upgraded class and tracking attributes, then
  * replaces each element's content with the `richLinkDefault` fragment built from its first link.
  */
case class RichLinkCleaner()(implicit val request: RequestHeader) extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {

    val richLinks = document.getElementsByClass("element-rich-link")
    val total = richLinks.size

    richLinks
      .addClass("element-rich-link--not-upgraded")
      .attr("data-component", "rich-link")

    // data-link-name has the form "rich-link-<total> | <1-based position>"
    richLinks.asScala.zipWithIndex.foreach {
      case (el, index) => el.attr("data-link-name", s"rich-link-$total | ${index + 1}")
    }

    richLinks.asScala.foreach { richLink =>
      // first() is null when the rich link contains no anchor; such elements are left as they are
      // instead of failing the whole clean with a NullPointerException.
      Option(richLink.getElementsByTag("a").first()).foreach { link =>
        val html = views.html.fragments.richLinkDefault(link.text(), link.attr("href")).toString()
        richLink.empty().prepend(html)
      }
    }
    document
  }
}

/** Adds the not-upgraded class and tracking attributes to every `element-membership` element. */
object MembershipEventCleaner extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    val membershipEvents = document.getElementsByClass("element-membership")

    membershipEvents.addClass("element-membership--not-upgraded")
    membershipEvents.attr("data-component", "membership-events")

    // data-link-name has the form "membership-event-<total> | <1-based position>"
    val total = membershipEvents.asScala.length
    for ((event, position) <- membershipEvents.asScala.zipWithIndex) {
      event.attr("data-link-name", s"membership-event-$total | ${position + 1}")
    }

    document
  }
}

/** Renders `gu-atom` placeholders inside `element-atom` containers using the `atoms.atom` fragment.
  *
  * @param atoms               the atoms available for lookup by id
  * @param shouldFence         passed through to the atom fragment
  * @param mediaWrapper        passed through to the atom fragment; `MainMedia` also adds a container class
  * @param posterImageOverride passed through to the atom fragment
  */
case class AtomsCleaner(
    atoms: Option[Atoms],
    shouldFence: Boolean = true,
    mediaWrapper: Option[MediaWrapper] = None,
    posterImageOverride: Option[ImageMedia] = None,
)(implicit val request: RequestHeader, context: ApplicationContext)
    extends HtmlCleaner {

  // Looks an atom up by id; None when there are no atoms or no atom has that id.
  private def findAtom(id: String): Option[Atom] =
    for {
      available <- atoms
      atom <- available.all.find(_.id == id)
    } yield atom

  /** Does nothing unless `UseAtomsSwitch` is on.
    *
    * @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    if (UseAtomsSwitch.isSwitchedOn) {

      for {
        atomContainer <- document.getElementsByClass("element-atom").asScala
        bodyElement <- atomContainer.getElementsByTag("gu-atom").asScala
      } {
        val atomId = bodyElement.attr("data-atom-id")
        val atomType = bodyElement.attr("data-atom-type")

        // Audio atoms are only processed while their dedicated switch is on
        val shouldProcess = atomType != "audio" || RenderInArticleAudioAtomSwitch.isSwitchedOn

        if (shouldProcess) {
          findAtom(atomId) match {
            case None =>
              // No data for this atom: drop the whole container
              atomContainer.remove()

            case Some(atomData) =>
              if (mediaWrapper.contains(MediaWrapper.MainMedia)) {
                atomContainer.addClass("element-atom--main-media")
              }
              atomData match {
                case _: MediaAtom => atomContainer.addClass("element-atom--media")
                case _            =>
              }

              atomContainer.attr("data-atom-id", atomId)
              atomContainer.attr("data-atom-type", atomType)

              val html = views.html.fragments.atoms
                .atom(
                  atomData,
                  shouldFence,
                  mediaWrapper,
                  posterImageOverride,
                )
                .toString()

              // Swap the placeholder for the rendered atom
              bodyElement.remove()
              atomContainer.append(html)
          }
        }
      }
    }
    document
  }
}

/** Adds suffixed class names, and optionally an `aria-label`, to the `svg` elements of a markup string. */
object setSvgClasses {

  /** @param svg     markup containing the `svg` element(s)
    * @param classes class names; each is suffixed with `__svg` before being added
    * @param label   when defined, written to the `aria-label` attribute
    * @return the string rendering of the matched `svg` elements
    */
  def apply(svg: String, classes: Seq[String] = List(), label: Option[String] = None): String = {
    val svgElements = Jsoup.parse(svg).getElementsByTag("svg")
    val suffixedClasses = classes.map(_.concat("__svg")).mkString(" ")

    svgElements.addClass(suffixedClasses)
    label.foreach(text => svgElements.attr("aria-label", text))

    svgElements.toString
  }
}

/** Inserts MPU ad slots into a front: mobile slots after selected containers, desktop slots into
  * `fc-slice__item--mpu-candidate` slices.
  */
case class CommercialMPUForFronts()(implicit val request: RequestHeader) extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {

    def hasFirstContainerThrasher(element: Element, index: Int): Boolean =
      index == 0 && element.hasClass("fc-container--thrasher")

    def hasAdjacentCommercialContainer(element: Element): Boolean =
      element.hasClass("fc-container--commercial") ||
        Option(element.nextElementSibling()).exists(_.hasClass("fc-container--commercial"))

    def hasAdjacentThrasher(element: Element): Boolean =
      Option(element.nextElementSibling()).exists(_.hasClass("fc-container--thrasher"))

    def isMostViewedContainer(element: Element): Boolean =
      Option(element.id()).exists(id => id.contains("most-viewed") || id.contains("popular-in"))

    val sliceSlot = views.html.fragments.items.facia_cards.sliceSlot

    val containers: List[Element] = document.getElementsByClass("fc-container").asScala.toList

    // On mobile, we remove the first container if it is a thrasher
    // and remove a container if it, or the next sibling, is a commercial container
    // we also exclude any containers that are directly before a thrasher
    // then we take every other container, up to a maximum of 10, for targeting MPU insertion
    // (eligibility is decided for all containers before any slot is inserted)
    val containersForCommercialMPUs = containers.zipWithIndex
      .collect {
        case (x, i)
            if !hasFirstContainerThrasher(x, i) && !hasAdjacentCommercialContainer(x) && !hasAdjacentThrasher(
              x,
            ) && !isMostViewedContainer(x) =>
          x
      }
      .zipWithIndex
      .collect {
        case (x, i) if i % 2 == 0 => x
      }
      .take(10)

    // Positions come from zipWithIndex rather than indexOf, so there is no repeated linear search and no
    // dependence on how elements compare for equality. Mobile slots are numbered from 0.
    containersForCommercialMPUs.zipWithIndex.foreach {
      case (container, index) =>
        container.after(s"""<section class="fc-container__mpu--mobile">${sliceSlot(
          index,
          isMobile = true,
        )}</section>""")
    }

    // On desktop, a MPU slot is simply inserted when there is a slice available
    val slices: List[Element] = document.getElementsByClass("fc-slice__item--mpu-candidate").asScala.toList

    // Desktop slots are numbered from 1
    slices.zipWithIndex.foreach {
      case (slice, index) => slice.append(s"${sliceSlot(index + 1)}")
    }

    document
  }
}

/** Inserts the `commercialComponentHigh` ad slot, wrapped in a commercial container, after one of the
  * front's `fc-container` elements.
  *
  * @param isPaidContent  lowers the minimum container count from 2 to 1; also passed to the fragment
  * @param isNetworkFront with 4 or more containers, selects the container at index 3 instead of index 2
  * @param hasPageSkin    passed to the fragment
  */
case class CommercialComponentHigh(isPaidContent: Boolean, isNetworkFront: Boolean, hasPageSkin: Boolean)(
    implicit val edition: Edition,
    implicit val request: RequestHeader,
) extends HtmlCleaner {

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {

    val containers: List[Element] = document.getElementsByClass("fc-container").asScala.toList
    val minContainers = if (isPaidContent) 1 else 2

    if (containers.length >= minContainers) {
      // Fronts with fewer than 4 containers always use the first one
      val containerIndex =
        if (containers.length < 4) 0
        else if (isNetworkFront) 3
        else 2

      val adSlotHtml = views.html.fragments.commercial.commercialComponentHigh(isPaidContent, hasPageSkin)

      val adSlot: Option[Element] =
        Jsoup.parseBodyFragment(adSlotHtml.toString).body().children().asScala.headOption

      for {
        container <- containers.lift(containerIndex)
        slot <- adSlot
      } {
        container.after(slot)
        slot.wrap("""<div class="fc-container fc-container--commercial"></div>""")
      }
    }
    document
  }

}

/** Replaces every `inline-quote` element with the `garnett-quote` inline SVG. */
object GarnettQuoteCleaner extends HtmlCleaner {
  // Rendered once and reused for every replacement
  val garnettQuote = views.html.fragments.inlineSvg("garnett-quote", "icon").toString

  /** @param document the document to mutate
    * @return the same `document` instance
    */
  override def clean(document: Document): Document = {
    document.getElementsByClass("inline-quote").asScala.foreach { inlineQuote =>
      inlineQuote.before(garnettQuote)
      inlineQuote.remove()
    }

    document
  }
}

/** Rewrites affiliatable links in a document to Skimlinks URLs when the page qualifies.
  *
  * @param pageUrl            the page's URL, used for the allow list check and in the rewritten links
  * @param sectionId          the page's section
  * @param showAffiliateLinks explicit per-content override, if any
  * @param appendDisclaimer   not read by this cleaner
  * @param tags               the page's tag paths
  * @param publishedDate      first published date, if known
  * @param contentType        the page's content type
  */
case class AffiliateLinksCleaner(
    pageUrl: String,
    sectionId: String,
    showAffiliateLinks: Option[Boolean],
    appendDisclaimer: Option[Boolean] = None,
    tags: List[String],
    publishedDate: Option[DateTime],
    contentType: String,
) extends HtmlCleaner
    with GuLogging {

  // True when the switch is on and the page passes the affiliate link rules.
  private def shouldRewriteLinks: Boolean =
    AffiliateLinks.isSwitchedOn && AffiliateLinksCleaner.shouldAddAffiliateLinks(
      switchedOn = AffiliateLinks.isSwitchedOn,
      section = sectionId,
      showAffiliateLinks = showAffiliateLinks,
      supportedSections = affiliateLinkSections,
      defaultOffTags = defaultOffTags,
      alwaysOffTags = alwaysOffTags,
      tagPaths = tags,
      firstPublishedDate = publishedDate,
      pageUrl = pageUrl,
      contentType = contentType,
    )

  /** @param document the document to mutate
    * @return the document, with links rewritten when the page qualifies
    */
  override def clean(document: Document): Document =
    if (shouldRewriteLinks) AffiliateLinksCleaner.replaceLinksInHtml(document, pageUrl, skimlinksId)
    else document
}

/** Helpers for detecting affiliatable links and rewriting them to Skimlinks redirect URLs. */
object AffiliateLinksCleaner {

  /** Returns the elements of `html` that have an `href` and satisfy [[isAffiliatable]]. */
  def getAffiliateableLinks(html: Document): mutable.Seq[Element] =
    html.getElementsByAttribute("href").asScala.filter(isAffiliatable)

  // Rewrites the href of every affiliatable link in `doc` in place and returns the rewritten elements.
  private def rewriteAffiliateableLinks(doc: Document, pageUrl: String, skimlinksId: String): mutable.Seq[Element] = {
    val linksToReplace = getAffiliateableLinks(doc)
    linksToReplace.foreach { el => el.attr("href", linkToSkimLink(el.attr("href"), pageUrl, skimlinksId)) }
    linksToReplace
  }

  /** Rewrites every affiliatable link in `html` in place.
    *
    * @return the same `html` instance
    */
  def replaceLinksInHtml(
      html: Document,
      pageUrl: String,
      skimlinksId: String,
  ): Document = {
    rewriteAffiliateableLinks(html, pageUrl, skimlinksId)
    html
  }

  /** Rewrites affiliatable links in an HTML fragment, using the configured `skimlinksId`.
    *
    * @return a text block with the re-serialised fragment when at least one link was rewritten,
    *         otherwise a text block with the original `html` string unchanged
    */
  def replaceLinksInElement(html: String, pageUrl: String): TextBlockElement = {
    val doc = Jsoup.parseBodyFragment(html)
    val replacedLinks = rewriteAffiliateableLinks(doc, pageUrl, skimlinksId)

    if (replacedLinks.nonEmpty) {
      TextBlockElement(doc.body().html())
    } else {
      TextBlockElement(html)
    }
  }

  /** True for `a` elements whose `href` is recognised by `SkimLinksCache.isSkimLink`. */
  def isAffiliatable(element: Element): Boolean =
    element.tagName == "a" && SkimLinksCache.isSkimLink(element.attr("href"))

  /** Builds the Skimlinks redirect URL for `link`; only the `url` parameter is URL-encoded here. */
  def linkToSkimLink(link: String, pageUrl: String, skimlinksId: String): String = {
    val urlEncodedLink = URLEncode(link)
    s"https://go.skimresources.com/?id=$skimlinksId&url=$urlEncodedLink&sref=$host$pageUrl"
  }

  /** True when any of `tagPaths` is in `alwaysOffTags`. */
  def contentHasAlwaysOffTag(tagPaths: List[String], alwaysOffTags: Set[String]): Boolean = {
    tagPaths.exists(path => alwaysOffTags.contains(path))
  }

  /** Decides whether affiliate links should be added to a piece of content.
    *
    * @param switchedOn         whether the affiliate links switch is on
    * @param section            the content's section
    * @param showAffiliateLinks explicit per-content override; when defined it decides the outcome for
    *                           eligible content
    * @param supportedSections  sections in which links are on by default
    * @param defaultOffTags     tags that turn the default off
    * @param alwaysOffTags      tags that always prevent affiliate links
    * @param tagPaths           the content's tag paths
    * @param firstPublishedDate first published date, if known
    * @param pageUrl            the page URL, with or without a leading slash; may be empty
    * @param contentType        the content type; `"gallery"` bypasses the date check
    */
  def shouldAddAffiliateLinks(
      switchedOn: Boolean,
      section: String,
      showAffiliateLinks: Option[Boolean],
      supportedSections: Set[String],
      defaultOffTags: Set[String],
      alwaysOffTags: Set[String],
      tagPaths: List[String],
      firstPublishedDate: Option[DateTime],
      pageUrl: String,
      contentType: String,
  ): Boolean = {
    val publishedCutOffDate = new DateTime(2020, 8, 14, 0, 0)

    // Allow list entries have no leading slash. stripPrefix is also safe for an empty URL,
    // where indexing the first character would throw.
    val cleanedPageUrl = pageUrl.stripPrefix("/")

    val affiliateLinksAllowList = List(
      "lifeandstyle/2024/jan/03/six-winter-warmers-tried-and-tested-the-heated-poncho-has-changed-me-i-will-never-have-sex-again",
      "lifeandstyle/2024/mar/11/im-south-asian-and-have-dark-eye-circles-what-can-i-do",
      "fashion/2024/mar/08/the-four-makeup-staples-i-cant-live-without",
      "travel/2023/mar/03/readers-favourite-budget-beach-campsites-hotels-in-europe",
      "travel/2024/feb/25/10-of-the-best-places-in-the-uk-to-see-them-bloom",
      "lifeandstyle/2023/dec/10/with-christmas-around-the-corner-what-to-give-the-gardener-in-your-life-",
      "fashion/2024/mar/01/spring-is-around-the-corner-time-to-soothe-and-restore-your-cracked-heels",
      "fashion/2024/mar/10/compact-and-bijou-why-women-need-a-pocket-mirror",
      "fashion/2024/mar/03/how-to-reset-your-wardrobe-for-spring",
      "lifeandstyle/2024/mar/03/beauty-spot-eyebrow-essentials-10-of-the-best",
      "fashion/2024/mar/17/beauty-spot-10-best-root-cover-ups",
      "fashion/2024/apr/05/peptides-help-with-good-looking-skin-but-dont-expect-botox-in-a-bottle",
      "fashion/2024/apr/13/sali-hughes-top-50-beauty-products-for-under-20-pounds",
    )

    val urlIsInAllowList = affiliateLinksAllowList.contains(cleanedPageUrl)

    // Content is eligible only when it has no always-off tag AND at least one of the following holds:
    // it was first published before the cut off date, its URL is on the allow list, or it is a gallery.
    // The cut off date is temporary while we are working on improving the compliance of affiliate links.
    // NOTE(review): an earlier version of this comment said content published *before* the cut off date is
    // never eligible, which is the opposite of what the condition does — confirm the intended direction.
    val publishedBeforeCutOff = firstPublishedDate.exists(_.isBefore(publishedCutOffDate))
    val isEligible =
      !contentHasAlwaysOffTag(tagPaths, alwaysOffTags) &&
        (publishedBeforeCutOff || urlIsInAllowList || contentType == "gallery")

    if (isEligible) {
      // An explicit override wins; otherwise fall back to the switch, section and default-off tags
      showAffiliateLinks.getOrElse(
        switchedOn && supportedSections.contains(section) && !tagPaths.exists(path => defaultOffTags.contains(path)),
      )
    } else false
  }

  /** True when the HTML fragment `s` contains at least one affiliatable link. */
  def stringContainsAffiliateableLinks(s: String): Boolean = {
    getAffiliateableLinks(Jsoup.parseBodyFragment(s)).nonEmpty
  }
}