Skip to content

Commit

Permalink
Dom: Proper handling for Unicode in HTML documents
Browse files Browse the repository at this point in the history
Modifications were needed in two different places because:
1. `libxml` needs to be told to parse the input as UTF-8. A `<meta>` tag would create an empty `<head>` and converting everything to entities is a hack. The XML declaration works great and is the easiest to remove right afterwards.
2. If the `<meta>` tag is used and kept in the document, additional processing (e.g. in `$dom->sanitize()`) will think that it's part of the input. So everything we add needs to be removed immediately after parsing.
3. On output we need the `<meta>` tag again as it's the only way to avoid an export as `ISO-8859-1` with entities.

Fixes #3798.
  • Loading branch information
lukasbestle committed Oct 31, 2021
1 parent 3c82f94 commit 036a270
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 5 deletions.
34 changes: 30 additions & 4 deletions src/Toolkit/Dom.php
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,17 @@ public function __construct(string $code, string $type = 'HTML')
$this->type = strtoupper($type);
if ($this->type === 'HTML') {
// the loadHTML() method expects ISO-8859-1 by default;
// convert every native UTF-8 character to an entity
$load = $this->doc->loadHTML(mb_convert_encoding($code, 'HTML-ENTITIES', 'UTF-8'));
// force parsing as UTF-8 by injecting an XML declaration
$xmlId = 'id="' . Str::random(10) . '"';
$load = $this->doc->loadHTML('<?xml encoding="UTF-8" ' . $xmlId . '>' . $code);

// remove the injected XML declaration again
$pis = $this->query('//processing-instruction()');
foreach (iterator_to_array($pis) as $pi) {
if (Str::contains($pi->data, $xmlId)) {
static::remove($pi);
}
}

// remove the default doctype
if (Str::contains($code, '<!DOCTYPE ') === false) {
Expand Down Expand Up @@ -545,8 +554,25 @@ public function toString(bool $xmlDecl = true): string
return $result;
}

$method = 'save' . $this->type;
return $this->doc->$method();
if ($this->type === 'HTML') {
// enforce export as UTF-8 by injecting a <meta> tag
// at the beginning of the document
$metaTag = $this->doc->createElement('meta');
$metaTag->setAttribute('http-equiv', 'Content-Type');
$metaTag->setAttribute('content', 'text/html; charset=utf-8');
$metaTag->setAttribute('id', $metaId = Str::random(10));
$this->doc->insertBefore($metaTag, $this->doc->documentElement);

$html = $this->doc->saveHTML();

// remove the <meta> tag from the document and from the output
static::remove($metaTag);
$html = str_replace($this->doc->saveHTML($metaTag), '', $html);

return $html;
}

return $this->doc->saveXML();
}

/**
Expand Down
47 changes: 46 additions & 1 deletion tests/Toolkit/DomTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,52 @@
*/
class DomTest extends TestCase
{
public function urlProvider()
/**
 * Data sets for testParseHtml().
 *
 * Each entry is `[input]` or `[input, expected]`; when the
 * second element is missing, the test expects the input to be
 * exported unchanged.
 */
public function parseHtmlProvider(): array
{
    // both doctype variants are expected to be exported identically
    $doctypeOutput = "<!DOCTYPE html>\n<html><body><p>Lorem ipsum</p></body></html>";

    $cases = [];

    // full document with doctype
    $cases[] = [
        '<!DOCTYPE html><html><body><p>Lorem ipsum</p></body></html>',
        $doctypeOutput,
    ];

    // full document with doctype (with whitespace)
    $cases[] = [
        "<!DOCTYPE html>\n\n<html><body><p>Lorem ipsum</p></body></html>",
        $doctypeOutput,
    ];

    // Unicode string
    $cases[] = ['<html><body><p>TEST — jūsų šildymo sistemai</p></body></html>'];

    // Unicode string with entities
    $cases[] = [
        '<html><body><p>TEST &mdash;&nbsp;jūsų šildymo sistemai</p></body></html>',
        '<html><body><p>TEST — jūsų šildymo sistemai</p></body></html>',
    ];

    // weird whitespace
    $cases[] = ["<html>\n <body>\n <p>Lorem ipsum\n</p>\n </body>\n</html>"];

    // partial document with syntax issue
    $cases[] = [
        '<p>This is <strong>important</strong!</p>',
        '<html><body><p>This is <strong>important</strong>!</p></body></html>',
    ];

    return $cases;
}

/**
 * Parses the given HTML and checks the string export.
 *
 * The exported document is expected to equal `$expected`
 * (or `$html` itself when no expectation is given),
 * followed by a trailing newline.
 *
 * @dataProvider parseHtmlProvider
 * @covers ::__construct
 *
 * @param string $html HTML code passed to the Dom constructor
 * @param string|null $expected Expected export without the trailing
 *                              newline; `null` means "same as `$html`"
 */
public function testParseHtml(string $html, ?string $expected = null): void
{
    $dom = new Dom($html, 'HTML');
    $this->assertSame(($expected ?? $html) . "\n", $dom->toString());
}

public function urlProvider(): array
{
return [
// allowed empty url
Expand Down

0 comments on commit 036a270

Please sign in to comment.