Skip to content

Commit

Permalink
fix: Use CssParser to correctly pass options to wkhtmltopdf
Browse files Browse the repository at this point in the history
- Regex incorrectly fetches .print-format's child styles and also extracts the wrong attribute value
- A CssParser is more maintainable and more readable as well as less prone to errors while extracting values
- Method: We extract style tag contents out of the html and tokenize them. We then filter the styles for the right selector and extract the attributes we want from them.
- This ensures that the correct value is extracted, and only from rules that apply to .print-format directly

(cherry picked from commit 5dbcbbb)
  • Loading branch information
marination authored and mergify[bot] committed Mar 27, 2024
1 parent c15b47a commit e9811ea
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 9 deletions.
53 changes: 44 additions & 9 deletions frappe/utils/pdf.py
Expand Up @@ -5,10 +5,10 @@
import io
import mimetypes
import os
import re
import subprocess
from urllib.parse import parse_qs, urlparse

import cssutils
import pdfkit
from bs4 import BeautifulSoup
from packaging.version import Version
Expand Down Expand Up @@ -206,7 +206,8 @@ def read_options_from_html(html):

toggle_visible_pdf(soup)

# use regex instead of soup-parser
valid_styles = get_print_format_styles(soup)

for attr in (
"margin-top",
"margin-bottom",
Expand All @@ -218,17 +219,51 @@ def read_options_from_html(html):
"page-width",
"page-height",
):
try:
pattern = re.compile(r"(\.print-format)([\S|\s][^}]*?)(" + str(attr) + r":)(.+)(mm;)")
match = pattern.findall(html)
if match:
options[attr] = str(match[-1][3]).strip()
except Exception:
pass
for style in valid_styles:
if attr == style.name:
options[attr] = style.value

return str(soup), options


def get_print_format_styles(soup: BeautifulSoup) -> list[cssutils.css.Property]:
    """
    Collect the CSS declarations that target the class 'print-format' itself.

    The contents of every <style> tag in `soup` are concatenated, parsed with
    cssutils, and filtered down to the style rules whose comma-separated
    selector list contains exactly `.print-format`.

    Valid:
        1) .print-format { ... }
        2) .print-format, p { ... } | p, .print-format { ... }
    Invalid (applied on child elements):
        1) .print-format p { ... } | .print-format > p { ... }
        2) .print-format #abc { ... }

    Returns:
        [cssutils.css.Property(name='margin-top', value='50mm', priority=''), ...]
        in the order the matching rules and their properties are encountered;
        an empty list when no rule matches.
    """
    # `Tag.string` is None for an empty <style> tag (or one holding more than
    # a single child node); treat that as "no CSS" instead of failing with a
    # TypeError on `str + None`.
    stylesheet = "".join(style_tag.string or "" for style_tag in soup.find_all("style"))

    # Use css parser to tokenize the classes and their styles
    parsed_sheet = cssutils.parseString(stylesheet)

    # Get all styles that are only for .print-format
    valid_styles = []
    for rule in parsed_sheet:
        # Skip anything that is not a plain style rule (e.g. @media, comments).
        if not isinstance(rule, cssutils.css.CSSStyleRule):
            continue

        # Allow only .print-format { ... } and .print-format, p { ... }
        # Disallow .print-format p { ... } and .print-format > p { ... }
        selectors = [selector.strip() for selector in rule.selectorText.split(",")]
        if ".print-format" in selectors:
            valid_styles.extend(rule.style)

    return valid_styles


def inline_private_images(html) -> str:
soup = BeautifulSoup(html, "html.parser")
for img in soup.find_all("img"):
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Expand Up @@ -35,6 +35,7 @@ dependencies = [
"chardet~=5.1.0",
"croniter~=2.0.1",
"cryptography~=42.0.0",
"cssutils~=2.9.0",
"email-reply-parser~=0.5.12",
"git-url-parse~=1.2.2",
"gunicorn~=21.2.0",
Expand Down

0 comments on commit e9811ea

Please sign in to comment.